In [44]:
# Dependencies

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.layers import Activation, Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Flatten, GRU, Conv1D, MaxPooling1D, Bidirectional


# NLP

from sklearn.feature_extraction .text import TfidfVectorizer

# Processing

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [45]:
df = pd.read_csv(r"../Data/processedData.csv")

X_train, X_test, y_train, y_test = train_test_split(df[['processedMessage','fingers','tail']],
                                                    df['species_group'])

# Vectorization #

In [46]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),max_features=128)
vectorizer.fit(df['processedMessage'])

tfidf_X_train = (vectorizer.transform(X_train['processedMessage'])).toarray()
tfidf_X_test = (vectorizer.transform(X_test['processedMessage'])).toarray()

label_encoder = LabelEncoder()
label_encoder.fit(df['species_group'])

y_train = to_categorical(label_encoder.transform(y_train),num_classes=12)
y_test = to_categorical(label_encoder.transform(y_test),num_classes=12)

# 1-D CNN #

In [47]:
model_CNN = Sequential()
model_CNN.add(Conv1D(filters=32, kernel_size=3, activation='leaky_relu', input_shape=(tfidf_X_train.shape[1],1)))
model_CNN.add(MaxPooling1D(pool_size=3))
model_CNN.add(Flatten())
model_CNN.add(Dense(units=32, activation = 'leaky_relu'))
model_CNN.add(Dense(units = 16, activation = 'leaky_relu'))
model_CNN.add(Dropout(0.2))
model_CNN.add(Dense(units=12, activation='softmax'))

model_CNN.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model_CNN.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [48]:
# Training

model_CNN.fit(tfidf_X_train, y_train, epochs=200, validation_data=(tfidf_X_test, y_test))

Epoch 1/200


In [18]:
from sklearn.metrics import classification_report

# Testing

y_pred_msg_CNN = model_CNN.predict(tfidf_X_test)
y_pred_classes = y_pred_msg_CNN.argmax(axis=1)
y_test_classes = (y_test).argmax(axis=1)

print(classification_report(y_test_classes, y_pred_classes))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
              precision    recall  f1-score   support

           0       0.59      0.62      0.61        16
           1       0.53      0.47      0.50        17
           2       0.33      0.33      0.33         6
           3       0.44      0.50      0.47         8
           4       0.00      0.00      0.00         4
           5       1.00      0.14      0.25         7
           6       0.62      0.53      0.57        15
           7       0.28      1.00      0.43         5
           8       0.40      0.33      0.36        12
           9       0.43      0.60      0.50        10
          10       0.50      0.42      0.45        12
          11       0.56      0.38      0.45        13

    accuracy                           0.46       125
   macro avg       0.47      0.44      0.41       125
weighted avg       0.51      0.46      0.46       125



# Gaussian Classifier #

In [19]:
# Finger probability array

from sklearn.naive_bayes import GaussianNB

y_train_class = np.argmax(y_train,axis=1)
y_test_class = np.argmax(y_test,axis=1)

gnb_fingers = GaussianNB()
gnb_fingers.fit(X_train['fingers'].values.reshape(-1, 1),y_train_class)

y_pred_fingers = gnb_fingers.predict_proba(X_test['fingers'].values.reshape(-1, 1))

In [20]:
# Tail probability array

from sklearn.naive_bayes import MultinomialNB

label_encoder = LabelEncoder()
label_encoder.fit(df['tail'])

X_train_tail = (label_encoder.transform(X_train['tail'])).reshape(-1,1)
X_test_tail = label_encoder.transform(X_test['tail']).reshape(-1,1)

mnb_tail = MultinomialNB()
mnb_tail.fit(X_train_tail, y_train_class)

y_pred_tail = mnb_tail.predict_proba(X_test_tail)

y_pred_tail

array([[0.08533333, 0.10133333, 0.03733333, ..., 0.13333333, 0.08533333,
        0.10133333],
       [0.08533333, 0.10133333, 0.03733333, ..., 0.13333333, 0.08533333,
        0.10133333],
       [0.08533333, 0.10133333, 0.03733333, ..., 0.13333333, 0.08533333,
        0.10133333],
       ...,
       [0.08533333, 0.10133333, 0.03733333, ..., 0.13333333, 0.08533333,
        0.10133333],
       [0.08533333, 0.10133333, 0.03733333, ..., 0.13333333, 0.08533333,
        0.10133333],
       [0.08533333, 0.10133333, 0.03733333, ..., 0.13333333, 0.08533333,
        0.10133333]])

In [21]:
y_pred_msg_log = np.log(y_pred_msg_CNN)
y_pred_fingers_log = np.log(y_pred_fingers)
y_pred_tail_log = np.log(y_pred_tail)

arrays = (y_pred_msg_log, y_pred_fingers_log, y_pred_tail_log)
final_probabilities = np.prod(arrays, axis=0)

y_pred_final_classes = np.argmax(final_probabilities, axis=1)
print(classification_report(y_test_classes,y_pred_final_classes))

              precision    recall  f1-score   support

           0       0.59      0.62      0.61        16
           1       0.53      0.47      0.50        17
           2       0.33      0.33      0.33         6
           3       0.57      0.50      0.53         8
           4       0.00      0.00      0.00         4
           5       1.00      0.14      0.25         7
           6       0.62      0.53      0.57        15
           7       0.28      1.00      0.43         5
           8       0.40      0.33      0.36        12
           9       0.43      0.60      0.50        10
          10       0.58      0.58      0.58        12
          11       0.56      0.38      0.45        13

    accuracy                           0.48       125
   macro avg       0.49      0.46      0.43       125
weighted avg       0.53      0.48      0.47       125

