# Model Training

## Splitting Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

## K-Fold Cross-Validation using Classifiers

### Random Forest Classifier

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 10-fold cross-validation of a random forest on the rows of X whose label in y
# is present. Prints macro-averaged metrics, averaged over the folds.
#
# assuming X and y are already defined as NumPy arrays and y is numeric

# Locate missing labels BEFORE zero-filling them. Testing for NaN keeps rows
# whose label is legitimately 0; a nonzero test on the zero-filled array would
# discard those rows together with the missing ones.
missing_mask = np.isnan(np.asarray(y, dtype=float))

# Later cells index `y` directly, so it is still rebound to a NaN-free copy.
# NOTE(review): for those cells a missing label therefore becomes class 0;
# consider pointing them at X_clean / y_clean instead.
y = np.nan_to_num(y, nan=0)

# keep only the rows whose label was present
X_clean = X[~missing_mask]
y_clean = y[~missing_mask]

# No NaN is left in y_clean, so there is nothing to impute (and a mean-imputed
# class label would not be a real class anyway). The name is kept as an alias.
y_filled = y_clean

# initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# initialize the KFold instance
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# per-fold metric values, one entry appended per split
accuracy = []
precision = []
recall = []
f1 = []

# iterate over each fold
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier
# hold-out split to the current fold's subsets.
for train_index, test_index in kfold.split(X_clean):
    # split the data into train and test data
    X_train, X_test = X_clean[train_index], X_clean[test_index]
    y_train, y_test = y_filled[train_index], y_filled[test_index]

    # fit the model on the train data
    model.fit(X_train, y_train)

    # make predictions on the test data
    predictions = model.predict(X_test)

    # macro averaging weights every class equally, regardless of its frequency
    accuracy.append(accuracy_score(y_test, predictions))
    precision.append(precision_score(y_test, predictions, average='macro'))
    recall.append(recall_score(y_test, predictions, average='macro'))
    f1.append(f1_score(y_test, predictions, average='macro'))

# print average metrics over all folds
print("Average accuracy: ", sum(accuracy)/len(accuracy))
print("Average precision: ", sum(precision)/len(precision))
print("Average recall: ", sum(recall)/len(recall))
print("Average f1_score: ", sum(f1)/len(f1))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# 10-fold cross-validation of a decision tree on X / y. Prints the mean of the
# weighted-average metrics, then a classification report over all folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
accuracy = []
precision = []
recall = []
f1 = []

model = DecisionTreeClassifier(random_state=42)

# Out-of-fold labels and predictions, collected so the final report covers
# every sample once instead of only the last fold's test split.
oof_true = []
oof_pred = []

# NOTE: this rebinds X_train / X_test / y_train / y_test, which the Gradient
# Boosting and Naive Bayes cells below read.
for train_index, test_index in kfold.split(X):
    # split the data into train and test data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit the model on the train data
    model.fit(X_train, y_train)

    # make predictions on the test data
    predictions = model.predict(X_test)

    # calculate accuracy, precision, recall, and f1_score
    accuracy.append(accuracy_score(y_test, predictions))
    precision.append(precision_score(y_test, predictions, average='weighted'))
    recall.append(recall_score(y_test, predictions, average='weighted'))
    f1.append(f1_score(y_test, predictions, average='weighted'))

    # remember this fold's held-out labels and predictions for the report
    oof_true.append(y_test)
    oof_pred.append(predictions)

print("Accuracy: ", np.mean(accuracy))
print("Precision: ", np.mean(precision))
print("Recall: ", np.mean(recall))
print("F1-score: ", np.mean(f1))

# Pool the held-out predictions of all folds; each was made by a model that
# had not seen that sample during fitting.
y_true = np.concatenate(oof_true)
y_pred = np.concatenate(oof_pred)

print(classification_report(y_true, y_pred))

### Gradient Boosting Classifier

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# 10-fold cross-validation of a gradient-boosting classifier on X_train / y_train.
# assuming X_train and y_train are already defined

# Shuffled 10-fold splitter and the classifier, both seeded with 42
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
clf = GradientBoostingClassifier(random_state=42)

# Per-fold scores; one value is appended to each list per split
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in kfold.split(X_train):
    # Carve this fold's fitting and validation subsets out of the training data
    X_train_kfold = X_train[train_index]
    X_test_kfold = X_train[test_index]
    y_train_kfold = y_train[train_index]
    y_test_kfold = y_train[test_index]

    # Fit on the fold's training part, then predict its held-out part
    clf.fit(X_train_kfold, y_train_kfold)
    y_pred = clf.predict(X_test_kfold)

    # Accuracy takes no averaging mode; the other three use weighted averaging
    accuracy = accuracy_score(y_test_kfold, y_pred)
    precision, recall, f1 = [
        score_fn(y_test_kfold, y_pred, average='weighted')
        for score_fn in (precision_score, recall_score, f1_score)
    ]

    # Record this fold's results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Report each metric averaged over the folds
print(f"Mean accuracy: {np.mean(accuracy_scores)}")
print(f"Mean precision: {np.mean(precision_scores)}")
print(f"Mean recall: {np.mean(recall_scores)}")
print(f"Mean f1_score: {np.mean(f1_scores)}")

### Naive Bayes Classifier

In [None]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# 10-fold cross-validation of a Gaussian naive Bayes classifier on X_train / y_train.
# assuming X_train and y_train are already defined

# Shuffled 10-fold splitter (seeded with 42) and the classifier
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
clf = GaussianNB()

# Per-fold scores; one value is appended to each list per split
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in kfold.split(X_train):
    # Carve this fold's fitting and validation subsets out of the training data
    X_train_kfold = X_train[train_index]
    X_test_kfold = X_train[test_index]
    y_train_kfold = y_train[train_index]
    y_test_kfold = y_train[test_index]

    # Fit on the fold's training part, then predict its held-out part
    clf.fit(X_train_kfold, y_train_kfold)
    y_pred = clf.predict(X_test_kfold)

    # Accuracy takes no averaging mode; the other three use weighted averaging
    accuracy = accuracy_score(y_test_kfold, y_pred)
    precision, recall, f1 = [
        score_fn(y_test_kfold, y_pred, average='weighted')
        for score_fn in (precision_score, recall_score, f1_score)
    ]

    # Record this fold's results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Report each metric averaged over the folds
print(f"Mean accuracy: {np.mean(accuracy_scores)}")
print(f"Mean precision: {np.mean(precision_scores)}")
print(f"Mean recall: {np.mean(recall_scores)}")
print(f"Mean f1_score: {np.mean(f1_scores)}")

### BiLSTM Model using Keras Classifier

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import numpy as np

# Hyperparameters read by create_model():
# `units` is the size of the LSTM wrapped by the Bidirectional layer,
# `embedding_dim` is the output width of the Embedding layer.
units = 76
embedding_dim = 16

def create_model():
    """Build and compile the Embedding -> BiLSTM -> Conv1D -> dense classifier.

    Reads the module-level names ``vocab_size``, ``embedding_dim``, ``units``
    and ``X`` (whose second dimension is used as the input length).

    Returns:
        A compiled ``tf.keras.Sequential`` model ending in a 5-unit softmax.
    """
    layers = tf.keras.layers

    # Sequence feature extractor: token embedding, bidirectional LSTM that
    # keeps the per-step outputs, then a 1-D convolution with max pooling.
    feature_layers = [
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=X.shape[1]),
        layers.Bidirectional(layers.LSTM(units=units, return_sequences=True)),
        layers.Conv1D(filters=64, kernel_size=3),
        layers.MaxPooling1D(),
        layers.Flatten(),
    ]

    # Classification head: two ReLU dense layers, each preceded by dropout,
    # and a final dropout before the 5-way softmax output.
    head_layers = [
        layers.Dropout(rate=0.2),
        layers.Dense(units=128, activation='relu'),
        layers.Dropout(rate=0.2),
        layers.Dense(units=64, activation='relu'),
        layers.Dropout(rate=0.2),
        layers.Dense(units=5, activation='softmax'),
    ]

    network = tf.keras.Sequential(feature_layers + head_layers)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Train the BiLSTM on an 80/20 split, predict the held-out part, then run
# 5-fold cross-validation on the full data.

# Encode the labels as integers; the encoded labels are used for cross-validation
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y)

# scikeras takes the model-building function via `model=`;
# `build_fn` is its deprecated alias.
model = KerasClassifier(model=create_model, epochs=5, batch_size=125, verbose=1)

# NOTE: this rebinds X_train / X_test / y_train / y_test to a new 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

# make prediction
y_pred = model.predict(X_test)

# The wrapper's predict() already returns one class label per sample, not a
# row of probabilities. Taking argmax of each scalar item would turn every
# prediction into 0, so the labels are used as they are.
y_pred_classes = list(np.ravel(y_pred))

scores = cross_val_score(model, X, encoded_Y, cv=5)

# show the cross-validation result instead of discarding it
print(f"Mean accuracy: {np.mean(scores)}")

## Parameters

1) EPOCHS = 2
2) embedding_dim = 16
3) Batch_Size = 125
4) Units = 76
5) val_split = 0.2