In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the preprocessed comments CSV into a DataFrame
DATASET_CSV = 'processed_data2.csv'
data = pd.read_csv(DATASET_CSV)

# Preview the leading rows as a quick sanity check
data.head()

Unnamed: 0,text,labels,cleaned_comment
0,<user> thanks for showing up for our appointme...,1,user thanks showing appointment today
1,haha . # lol,1,haha lol
2,i love waiting <num> min for a cab - such shor...,1,love waiting num min cab shortage user please ...
3,22 super funny quotes # funnyquotes # funnysa...,1,22 super funny quote funnyquotes funnysayings ...
4,goog morning # sorrynotsorry # morning,1,goog morning sorrynotsorry morning


In [None]:
# Map the raw values of the `labels` column to integer class ids, then expand
# those ids into one-hot rows (one column per class).
label_encoder = LabelEncoder()
labels = to_categorical(label_encoder.fit_transform(data['labels']))


In [None]:
# Missing comments become empty strings so the vectorizer only receives text
cleaned_comments = data['cleaned_comment'].fillna('')
data['cleaned_comment'] = cleaned_comments

# TF-IDF representation limited to a 5000-term vocabulary, densified to an array
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(cleaned_comments)
features = tfidf_matrix.toarray()


In [None]:
# Split the raw text rather than the pre-computed `features` matrix: that matrix
# comes from a vectorizer fit on every row, so its vocabulary and IDF weights
# include test-set statistics (data leakage). Here the vectorizer is fit on the
# training text only and merely applied to the test text.
# The partition depends only on the row count, test_size and random_state, so
# the same rows land in each split and y_train / y_test are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_comment'].fillna(''),  # fillna keeps this cell safe on its own
    labels,
    test_size=0.2,
    random_state=42,
)

# Learn vocabulary + IDF from the training text, then reuse them for the test text
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (15568, 5000)
Testing set shape: (3893, 5000)


## Dense NN model

In [None]:
# Single-hidden-layer classifier over the TF-IDF features:
# 1024 ReLU units -> dropout 0.5 -> softmax with one unit per label column.
# (A deeper 512/256/128/64 stack with dropout 0.4/0.3/0.2/0.1 is not part of
# this configuration.)
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

dense_model = Sequential([
    Dense(1024, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot targets; accuracy is tracked per epoch
dense_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs; the test split is passed as validation data for per-epoch monitoring
dense_history = dense_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluation of `dense_model` on the held-out split.
# classification_report, numpy, seaborn and matplotlib come from the notebook's
# top import cell; they are no longer re-imported here.

# Overall loss / accuracy as computed by Keras
loss, accuracy = dense_model.evaluate(X_test, y_test, verbose=0)
print(f"Dense Model Accuracy: {accuracy:.4f}")

# Collapse softmax probabilities and one-hot targets to class indices
y_pred_probs = dense_model.predict(X_test, verbose=0)  # quiet, like evaluate() above
y_pred = np.argmax(y_pred_probs, axis=1)
y_test_true = np.argmax(y_test, axis=1)

# One report row per encoded class. Names follow the fitted encoder instead of a
# hard-coded two-item list, so they stay in step with the softmax width
# (y_train.shape[1]) used when the model was built.
class_names = [f'Class {c}' for c in label_encoder.classes_]
class_report = classification_report(y_test_true, y_pred, target_names=class_names)
print("\nClassification Report:\n", class_report)

Dense Model Accuracy: 0.7488

Classification Report:
               precision    recall  f1-score   support

     Class 0       0.76      0.81      0.78      2182
     Class 1       0.73      0.67      0.70      1711

    accuracy                           0.75      3893
   macro avg       0.75      0.74      0.74      3893
weighted avg       0.75      0.75      0.75      3893



### Experiment with different numbers of dense layers and dropout values

In [None]:
# Experiment variant of the single-hidden-layer classifier:
# 2048 ReLU units -> dropout 0.9 -> softmax with one unit per label column.
# (A deeper 512/256/128/64 stack with dropout 0.4/0.3/0.2/0.1 is not part of
# this configuration.)
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

dense_model = Sequential([
    Dense(2048, input_shape=(n_features,), activation='relu'),
    Dropout(0.9),
    Dense(n_classes, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot targets; accuracy is tracked per epoch
dense_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs; the test split is passed as validation data for per-epoch monitoring
dense_history = dense_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Classification report

In [None]:
# Evaluation of the current `dense_model` on the held-out split.

# Overall loss / accuracy as computed by Keras
loss, accuracy = dense_model.evaluate(X_test, y_test, verbose=0)
print(f"Dense Model Accuracy: {accuracy:.4f}")

# Collapse softmax probabilities and one-hot targets to class indices
y_pred_probs = dense_model.predict(X_test, verbose=0)  # quiet, like evaluate() above
y_pred = np.argmax(y_pred_probs, axis=1)
y_test_true = np.argmax(y_test, axis=1)

# One report row per encoded class. Names follow the fitted encoder instead of a
# hard-coded two-item list, so they stay in step with the softmax width
# (y_train.shape[1]) used when the model was built.
class_names = [f'Class {c}' for c in label_encoder.classes_]
class_report = classification_report(y_test_true, y_pred, target_names=class_names)
print("\nClassification Report:\n", class_report)

Dense Model Accuracy: 0.7588

Classification Report:
               precision    recall  f1-score   support

     Class 0       0.76      0.82      0.79      2182
     Class 1       0.75      0.68      0.71      1711

    accuracy                           0.76      3893
   macro avg       0.76      0.75      0.75      3893
weighted avg       0.76      0.76      0.76      3893

