In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
data = pd.read_csv('balanced_dataset_50000.csv')

# Drop rows with NaN values
data = data.dropna()

# Preprocess the dataset
X = data['comment']
y = data['label']

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to strings to handle potential float values
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# Tokenize and pad sequences
max_words = 10000  # Maximum number of words to keep based on frequency
maxlen = 100  # Maximum length of sequences

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)


# LSTM

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, classification_report

# LSTM Model
lstm_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate the model
y_pred_lstm = (lstm_model.predict(X_test_pad) > 0.5).astype("int32")
print('LSTM Model')
print('Accuracy:', accuracy_score(y_test, y_pred_lstm))
print('Precision:', precision_score(y_test, y_pred_lstm))
print('Classification Report:', classification_report(y_test, y_pred_lstm))


Epoch 1/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 193ms/step - accuracy: 0.5711 - loss: 0.6715 - val_accuracy: 0.6697 - val_loss: 0.6075
Epoch 2/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 189ms/step - accuracy: 0.7132 - loss: 0.5554 - val_accuracy: 0.6515 - val_loss: 0.6206
Epoch 3/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 188ms/step - accuracy: 0.7707 - loss: 0.4778 - val_accuracy: 0.6439 - val_loss: 0.6652
Epoch 4/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 192ms/step - accuracy: 0.8016 - loss: 0.4198 - val_accuracy: 0.6323 - val_loss: 0.7301
Epoch 5/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 186ms/step - accuracy: 0.8299 - loss: 0.3647 - val_accuracy: 0.6258 - val_loss: 0.8527
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 48ms/step
LSTM Model
Accuracy: 0.6298989898989898
Precision: 0.6400583576490204
Classification Report:      

# Random Forest Model

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_pad, y_train)

# Evaluate the model
y_pred_rf = rf_model.predict(X_test_pad)
print('Random Forest Model')
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Precision:', precision_score(y_test, y_pred_rf))
print('Classification Report:', classification_report(y_test, y_pred_rf))


Random Forest Model
Accuracy: 0.5583838383838384
Precision: 0.5615801704105344
Classification Report:               precision    recall  f1-score   support

           0       0.55      0.54      0.55      4892
           1       0.56      0.58      0.57      5008

    accuracy                           0.56      9900
   macro avg       0.56      0.56      0.56      9900
weighted avg       0.56      0.56      0.56      9900



# Neural Network

In [4]:
# Neural Network Model
nn_model = Sequential([
    Dense(512, activation='relu', input_shape=(maxlen,)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate the model
y_pred_nn = (nn_model.predict(X_test_pad) > 0.5).astype("int32")
print('Neural Network Model')
print('Accuracy:', accuracy_score(y_test, y_pred_nn))
print('Precision:', precision_score(y_test, y_pred_nn))
print('Classification Report:', classification_report(y_test, y_pred_nn))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.4977 - loss: 43.1391 - val_accuracy: 0.4932 - val_loss: 0.6977
Epoch 2/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.5023 - loss: 0.8707 - val_accuracy: 0.4899 - val_loss: 0.6942
Epoch 3/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.4997 - loss: 0.7362 - val_accuracy: 0.4899 - val_loss: 0.6978
Epoch 4/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.5078 - loss: 0.7166 - val_accuracy: 0.4909 - val_loss: 0.6932
Epoch 5/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.5044 - loss: 0.7064 - val_accuracy: 0.4904 - val_loss: 0.6934
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Neural Network Model
Accuracy: 0.5059595959595959
Precision: 0.5059096878472573
Classification Report:               pr

# SVM Model

In [5]:
#IMPROVED



from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, classification_report

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# SVM Model with GridSearchCV for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf']  # Kernel type
}

# Number of cross-validation folds (epochs)
num_epochs = 5

svm_model = GridSearchCV(SVC(random_state=42), param_grid, cv=num_epochs, scoring='accuracy')
svm_model.fit(X_train_tfidf, y_train)

# Best parameters and best score from grid search
print("Best Parameters: ", svm_model.best_params_)
print("Best CV Accuracy: {:.2f}%".format(svm_model.best_score_ * 100))

# Training accuracy and precision
train_predictions_svm = svm_model.predict(X_train_tfidf)
train_accuracy_svm = accuracy_score(y_train, train_predictions_svm)
train_precision_svm = precision_score(y_train, train_predictions_svm)

# Test accuracy and precision
test_predictions_svm = svm_model.predict(X_test_tfidf)
test_accuracy_svm = accuracy_score(y_test, test_predictions_svm)
test_precision_svm = precision_score(y_test, test_predictions_svm)

# Print results
print('\nSVM Model with Tuning and Epochs (Cross-Validation Folds)')
print(f'Training Accuracy: {train_accuracy_svm * 100:.2f}%')
print(f'Training Precision: {train_precision_svm * 100:.2f}%')
print(f'Test Accuracy: {test_accuracy_svm * 100:.2f}%')
print(f'Test Precision: {test_precision_svm * 100:.2f}%')
print('Classification Report:')
print(classification_report(y_test, test_predictions_svm))


Best Parameters:  {'C': 1, 'kernel': 'rbf'}
Best CV Accuracy: 64.94%

SVM Model with Tuning and Epochs (Cross-Validation Folds)
Training Accuracy: 88.35%
Training Precision: 90.35%
Test Accuracy: 65.22%
Test Precision: 67.61%
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.71      0.67      4892
           1       0.68      0.60      0.64      5008

    accuracy                           0.65      9900
   macro avg       0.65      0.65      0.65      9900
weighted avg       0.65      0.65      0.65      9900



# GRU Model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from sklearn.metrics import accuracy_score, precision_score, classification_report

# Assuming you have X_train_pad, X_test_pad, y_train, and y_test ready

# Create the GRU model
gru_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),  # Remove input_length
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 5  # Number of epochs to train
batch_size = 64  # Batch size for training
validation_split = 0.1  # Fraction of training data to use for validation

history = gru_model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

# Evaluate the model
y_pred_gru = (gru_model.predict(X_test_pad) > 0.5).astype("int32")
print('GRU Model')
print('Accuracy:', accuracy_score(y_test, y_pred_gru))
print('Precision:', precision_score(y_test, y_pred_gru))
print('Classification Report:', classification_report(y_test, y_pred_gru))


Epoch 1/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 81ms/step - accuracy: 0.5826 - loss: 0.6637 - val_accuracy: 0.6667 - val_loss: 0.6169
Epoch 2/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 84ms/step - accuracy: 0.7293 - loss: 0.5406 - val_accuracy: 0.6409 - val_loss: 0.6350
Epoch 3/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 81ms/step - accuracy: 0.7682 - loss: 15641.9834 - val_accuracy: 0.6167 - val_loss: 0.6925
Epoch 4/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 80ms/step - accuracy: 0.8050 - loss: 0.4241 - val_accuracy: 0.6111 - val_loss: 0.7327
Epoch 5/5
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 80ms/step - accuracy: 0.8289 - loss: 0.3825 - val_accuracy: 0.6068 - val_loss: 0.7798
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step
GRU Model
Accuracy: 0.6173737373737374
Precision: 0.6219024780175859
Classification Report:               

# Decision Tree Model

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_pad, y_train)

# Evaluate the model
y_pred_dt = dt_model.predict(X_test_pad)
print('Decision Tree Model')
print('Accuracy:', accuracy_score(y_test, y_pred_dt))
print('Precision:', precision_score(y_test, y_pred_dt))
print('Classification Report:', classification_report(y_test, y_pred_dt))


Decision Tree Model
Accuracy: 0.5374747474747474
Precision: 0.5440723238134374
Classification Report:               precision    recall  f1-score   support

           0       0.53      0.55      0.54      4892
           1       0.54      0.53      0.54      5008

    accuracy                           0.54      9900
   macro avg       0.54      0.54      0.54      9900
weighted avg       0.54      0.54      0.54      9900



# XGBoost Model

In [8]:
from xgboost import XGBClassifier

# XGBoost Model
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_pad, y_train)

# Evaluate the model
y_pred_xgb = xgb_model.predict(X_test_pad)
print('XGBoost Model')
print('Accuracy:', accuracy_score(y_test, y_pred_xgb))
print('Precision:', precision_score(y_test, y_pred_xgb))
print('Classification Report:', classification_report(y_test, y_pred_xgb))


XGBoost Model
Accuracy: 0.565959595959596
Precision: 0.5693658536585365
Classification Report:               precision    recall  f1-score   support

           0       0.56      0.55      0.56      4892
           1       0.57      0.58      0.58      5008

    accuracy                           0.57      9900
   macro avg       0.57      0.57      0.57      9900
weighted avg       0.57      0.57      0.57      9900



# Logistic Regression Model

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression Model
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_pad, y_train)

# Evaluate the model
y_pred_lr = lr_model.predict(X_test_pad)
print('Logistic Regression Model')
print('Accuracy:', accuracy_score(y_test, y_pred_lr))
print('Precision:', precision_score(y_test, y_pred_lr))
print('Classification Report:', classification_report(y_test, y_pred_lr))


Logistic Regression Model
Accuracy: 0.5194949494949495
Precision: 0.5190122708680502
Classification Report:               precision    recall  f1-score   support

           0       0.52      0.35      0.42      4892
           1       0.52      0.68      0.59      5008

    accuracy                           0.52      9900
   macro avg       0.52      0.52      0.50      9900
weighted avg       0.52      0.52      0.51      9900



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
