In [58]:
# Dependencies

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.layers import Activation, Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Flatten, GRU, Conv1D, MaxPooling1D, Bidirectional


# NLP

from sklearn.feature_extraction .text import TfidfVectorizer

# Processing

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [59]:
df = pd.read_csv(r"../Data/processedData.csv")

X_train, X_test, y_train, y_test = train_test_split(df[['processedMessage','fingers','tail']],
                                                    df['species_group'])

# Vectorization #

In [60]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),max_features=128)
vectorizer.fit(df['processedMessage'])

tfidf_X_train = (vectorizer.transform(X_train['processedMessage'])).toarray()
tfidf_X_test = (vectorizer.transform(X_test['processedMessage'])).toarray()

label_encoder = LabelEncoder()
label_encoder.fit(df['species_group'])

y_train = to_categorical(label_encoder.transform(y_train),num_classes=12)
y_test = to_categorical(label_encoder.transform(y_test),num_classes=12)

# 1-D CNN #

In [61]:
model_CNN = Sequential()
model_CNN.add(Conv1D(filters=32, kernel_size=3, activation='leaky_relu', input_shape=(tfidf_X_train.shape[1],1)))
model_CNN.add(MaxPooling1D(pool_size=3))
model_CNN.add(Flatten())
model_CNN.add(Dense(units=32, activation = 'leaky_relu'))
model_CNN.add(Dense(units = 16, activation = 'leaky_relu'))
model_CNN.add(Dropout(0.2))
model_CNN.add(Dense(units=12, activation='softmax'))

model_CNN.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model_CNN.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [62]:
# Training

model_CNN.fit(tfidf_X_train, y_train, epochs=100, validation_data=(tfidf_X_test, y_test))

Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.1256 - loss: 2.4778 - val_accuracy: 0.2320 - val_loss: 2.4359
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2247 - loss: 2.4119 - val_accuracy: 0.1440 - val_loss: 2.3498
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2321 - loss: 2.2949 - val_accuracy: 0.1680 - val_loss: 2.2457
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2354 - loss: 2.1517 - val_accuracy: 0.2240 - val_loss: 2.1310
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2881 - loss: 2.0236 - val_accuracy: 0.2960 - val_loss: 2.0150
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3888 - loss: 1.9038 - val_accuracy: 0.3120 - val_loss: 1.8706
Epoch 7/100
[1m12/12[0m [3

<keras.src.callbacks.history.History at 0x247cdfbb7d0>

In [63]:
from sklearn.metrics import classification_report

# Testing

y_pred_msg_CNN = model_CNN.predict(tfidf_X_test)
y_pred_classes = y_pred_msg_CNN.argmax(axis=1)
y_test_classes = (y_test).argmax(axis=1)

print(classification_report(y_test_classes, y_pred_classes))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
              precision    recall  f1-score   support

           0       0.55      0.60      0.57        10
           1       0.50      0.69      0.58        13
           2       0.00      0.00      0.00         6
           3       0.50      0.33      0.40         6
           4       0.29      0.18      0.22        11
           5       0.00      0.00      0.00         5
           6       0.43      0.43      0.43         7
           7       0.50      0.53      0.51        19
           8       0.60      0.38      0.46        16
           9       0.54      0.54      0.54        13
          10       0.38      0.86      0.52         7
          11       0.50      0.50      0.50        12

    accuracy                           0.46       125
   macro avg       0.40      0.42      0.39       125
weighted avg       0.45      0.46      0.44       125



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Gaussian Classifier #

In [64]:
# Finger probability array

from sklearn.naive_bayes import GaussianNB

y_train_class = np.argmax(y_train,axis=1)
y_test_class = np.argmax(y_test,axis=1)

gnb_fingers = GaussianNB()
gnb_fingers.fit(X_train['fingers'].values.reshape(-1, 1),y_train_class)

y_pred_fingers = gnb_fingers.predict_proba(X_test['fingers'].values.reshape(-1, 1))

In [65]:
# Tail probability array

from sklearn.naive_bayes import MultinomialNB

label_encoder = LabelEncoder()
label_encoder.fit(df['tail'])

X_train_tail = (label_encoder.transform(X_train['tail'])).reshape(-1,1)
X_test_tail = label_encoder.transform(X_test['tail']).reshape(-1,1)

mnb_tail = MultinomialNB()
mnb_tail.fit(X_train_tail, y_train_class)

y_pred_tail = mnb_tail.predict_proba(X_test_tail)

y_pred_tail

array([[0.10133333, 0.112     , 0.03733333, ..., 0.12533333, 0.09866667,
        0.104     ],
       [0.10133333, 0.112     , 0.03733333, ..., 0.12533333, 0.09866667,
        0.104     ],
       [0.10133333, 0.112     , 0.03733333, ..., 0.12533333, 0.09866667,
        0.104     ],
       ...,
       [0.10133333, 0.112     , 0.03733333, ..., 0.12533333, 0.09866667,
        0.104     ],
       [0.10133333, 0.112     , 0.03733333, ..., 0.12533333, 0.09866667,
        0.104     ],
       [0.10133333, 0.112     , 0.03733333, ..., 0.12533333, 0.09866667,
        0.104     ]])

In [67]:
y_pred_msg_log = np.log(y_pred_msg_CNN)
y_pred_fingers_log = np.log(y_pred_fingers)
y_pred_tail_log = np.log(y_pred_tail)

class_counts = df['species_group'].value_counts()
total_samples = len(df)
class_probabilities = np.array(class_counts / total_samples)

final_probabilities = y_pred_msg_log + y_pred_fingers_log + y_pred_tail_log - np.log(class_probabilities)
y_pred_final_classes = np.argmax(final_probabilities, axis=1)
print(classification_report(y_test_classes,y_pred_final_classes))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60        10
           1       0.67      0.77      0.71        13
           2       1.00      0.50      0.67         6
           3       1.00      0.67      0.80         6
           4       0.67      0.18      0.29        11
           5       0.00      0.00      0.00         5
           6       0.50      0.57      0.53         7
           7       0.57      0.84      0.68        19
           8       0.81      0.81      0.81        16
           9       0.60      0.69      0.64        13
          10       0.54      1.00      0.70         7
          11       0.60      0.50      0.55        12

    accuracy                           0.64       125
   macro avg       0.63      0.59      0.58       125
weighted avg       0.64      0.64      0.61       125



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
