# Importing the libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, GRU, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from sklearn.metrics import f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


# Loading the dataset and tokenization

In [21]:
# Load the dataset
df = pd.read_csv('final.csv')
text_column = 'tweet'  # Adjust if the column name is different

# Replace missing tweets with empty strings BEFORE casting to str.
# Casting first would turn NaN into the literal text 'nan', which fillna can no
# longer detect and which the tokenizer would then treat as a real word.
df[text_column] = df[text_column].fillna('').astype(str)

# Encode the values of the 'class' column as integer labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['class'])

# Split the data (80% train / 20% test, fixed seed so the split is repeatable)
X = df[text_column].values
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hand the tokenizer plain Python lists of str rather than numpy object arrays
X_train = [str(text) for text in X_train]
X_test = [str(text) for text in X_test]

# Tokenize the text.
# The vocabulary is fitted on the training texts only, so no information from
# the test split leaks into the word index.
tokenizer = Tokenizer(num_words=10000)  # Adjust num_words as needed
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (or truncate) every sequence to a fixed length so they stack into a 2-D array
max_length = 100  # Adjust max_length as needed
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

print("Shape of training data:", X_train_pad.shape)
print("Shape of test data:", X_test_pad.shape)


Shape of training data: (19826, 100)
Shape of test data: (4957, 100)


# Model Definitions

In [22]:
# Define LSTM model
def create_lstm_model(vocab_size=10000, embedding_dim=128, lstm_units=128, num_classes=3):
    """Build and compile an Embedding -> LSTM -> softmax text classifier.

    Args:
        vocab_size: ``input_dim`` of the embedding layer. Should match the
            ``num_words`` limit used by the tokenizer.
        embedding_dim: Length of each learned word vector.
        lstm_units: Number of units in the LSTM layer.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled ``Sequential`` model that expects integer-encoded sequences
        of length ``max_length`` and integer class labels.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
    model.add(LSTM(lstm_units))
    model.add(Dense(num_classes, activation='softmax'))
    # Sparse loss: labels are integer class ids, not one-hot vectors
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define Bidirectional LSTM model
def create_bidirectional_lstm_model(vocab_size=10000, embedding_dim=128, lstm_units=128, num_classes=3):
    """Build and compile an Embedding -> Bidirectional(LSTM) -> softmax classifier.

    Args:
        vocab_size: ``input_dim`` of the embedding layer. Should match the
            ``num_words`` limit used by the tokenizer.
        embedding_dim: Length of each learned word vector.
        lstm_units: Number of units in the wrapped LSTM layer.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled ``Sequential`` model that expects integer-encoded sequences
        of length ``max_length`` and integer class labels.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dense(num_classes, activation='softmax'))
    # Sparse loss: labels are integer class ids, not one-hot vectors
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define CNN model
def create_cnn_model(vocab_size=10000, embedding_dim=128, filters=128, kernel_size=5,
                     pool_size=4, num_classes=3):
    """Build and compile an Embedding -> Conv1D -> pooling -> softmax classifier.

    Args:
        vocab_size: ``input_dim`` of the embedding layer. Should match the
            ``num_words`` limit used by the tokenizer.
        embedding_dim: Length of each learned word vector.
        filters: Number of convolution filters.
        kernel_size: Width of the 1-D convolution window, in tokens.
        pool_size: Window size of the local max-pooling layer.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled ``Sequential`` model that expects integer-encoded sequences
        of length ``max_length`` and integer class labels.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    # Collapse the remaining time axis to one value per filter
    model.add(GlobalMaxPooling1D())
    model.add(Dense(num_classes, activation='softmax'))
    # Sparse loss: labels are integer class ids, not one-hot vectors
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# Training and Evaluation

In [32]:
# Early stopping: watch the validation loss, give up after 3 epochs without
# improvement, and roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Function to train and evaluate a model
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test,
                             epochs=10, batch_size=32, validation_split=0.2,
                             callbacks=None):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        model: A compiled Keras model whose ``predict`` returns one score per class.
        X_train: Padded training sequences.
        y_train: Integer class labels for ``X_train``.
        X_test: Padded test sequences.
        y_test: Integer class labels for ``X_test``.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size used during fitting.
        validation_split: Fraction of the training data held out by Keras for
            validation (this is what the early-stopping callback monitors).
        callbacks: Keras callbacks to use while fitting. When ``None`` the
            notebook-level ``early_stopping`` callback is used.

    Returns:
        The weighted-average F1 score of the model's predictions on the test set.
    """
    if callbacks is None:
        # Default to the shared notebook-level early-stopping callback
        callbacks = [early_stopping]
    model.fit(X_train, y_train, validation_split=validation_split, epochs=epochs,
              batch_size=batch_size, callbacks=callbacks, verbose=2)
    # Turn per-class probabilities into the index of the most likely class
    y_pred = np.argmax(model.predict(X_test), axis=1)
    return f1_score(y_test, y_pred, average='weighted')

# Build one fresh, untrained network per architecture
lstm_model = create_lstm_model()
bidirectional_lstm_model = create_bidirectional_lstm_model()
cnn_model = create_cnn_model()

# Fit each network in turn and record its test-set F1 score under its display name
f1_scores = {}
for model_name, network in (
    ('LSTM', lstm_model),
    ('Bidirectional LSTM', bidirectional_lstm_model),
    ('CNN', cnn_model),
):
    f1_scores[model_name] = train_and_evaluate_model(
        network, X_train_pad, y_train, X_test_pad, y_test
    )

print("F1 Scores:", f1_scores)


Epoch 1/10




496/496 - 42s - 85ms/step - accuracy: 0.8578 - loss: 0.3885 - val_accuracy: 0.9090 - val_loss: 0.2748
Epoch 2/10
496/496 - 43s - 88ms/step - accuracy: 0.9262 - loss: 0.2134 - val_accuracy: 0.8996 - val_loss: 0.2849
Epoch 3/10
496/496 - 42s - 85ms/step - accuracy: 0.9510 - loss: 0.1416 - val_accuracy: 0.8908 - val_loss: 0.3565
Epoch 4/10
496/496 - 37s - 75ms/step - accuracy: 0.9656 - loss: 0.0982 - val_accuracy: 0.8782 - val_loss: 0.4494
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step
Epoch 1/10
496/496 - 54s - 108ms/step - accuracy: 0.8586 - loss: 0.3941 - val_accuracy: 0.9060 - val_loss: 0.2725
Epoch 2/10
496/496 - 51s - 102ms/step - accuracy: 0.9227 - loss: 0.2202 - val_accuracy: 0.8966 - val_loss: 0.2897
Epoch 3/10
496/496 - 51s - 103ms/step - accuracy: 0.9491 - loss: 0.1467 - val_accuracy: 0.8853 - val_loss: 0.3446
Epoch 4/10
496/496 - 50s - 101ms/step - accuracy: 0.9656 - loss: 0.1008 - val_accuracy: 0.8822 - val_loss: 0.4051
[1m155/155[0m [32m━━━━━━

# Saving the Best Model

In [33]:
# Pick the name with the highest F1 score, then look up the matching model object
best_model_name = max(f1_scores, key=f1_scores.get)
best_model = {
    'LSTM': lstm_model,
    'Bidirectional LSTM': bidirectional_lstm_model,
    'CNN': cnn_model,
}.get(best_model_name)  # None if the name is not one of the three known models

# Persist the winning model to disk
best_model.save('best_deep_learning_model.h5')
print(f'The best model is {best_model_name} and it has been saved as best_deep_learning_model.h5')




The best model is CNN and it has been saved as best_deep_learning_model.h5


#  Making Predictions

In [34]:
# Load the best model
best_model = load_model('best_deep_learning_model.h5')

# Load the final dataset
final_df = pd.read_csv('final.csv')
# Fill missing tweets BEFORE casting to str; casting first would turn NaN into
# the literal text 'nan' and leave nothing for fillna to replace.
final_df[text_column] = final_df[text_column].fillna('').astype(str)
X_new = final_df[text_column]

# Tokenize and pad the new data with the already-fitted tokenizer and the same
# sequence length the model was trained on
X_new_seq = tokenizer.texts_to_sequences(X_new)
X_new_pad = pad_sequences(X_new_seq, maxlen=max_length)

# Predict using the best model: one row of class probabilities per tweet,
# reduced to the index of the most likely class
new_predictions = best_model.predict(X_new_pad)
new_predictions_labels = np.argmax(new_predictions, axis=1)

# Map the integer predictions back to the original class values
new_predictions_labels = label_encoder.inverse_transform(new_predictions_labels)

# Add predictions to the dataframe
final_df['predictions'] = new_predictions_labels

# Save the predictions to a CSV file
final_df.to_csv('final_with_deep_learning_predictions.csv', index=False)

# Print the predictions (use text_column so this stays correct if the column is renamed)
print(final_df[[text_column, 'predictions']])




[1m775/775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step
                                                   tweet  predictions
0      woman complain cleaning house man always take ...            2
1                  boy coldtyga bad cuffin hoe 1st place            1
2         friend ever fuck bitch start cry confused shit            1
3                                       look like tranny            2
4            shit hear might true might faker bitch told            1
...                                                  ...          ...
24778  yous muthafin lie right tl trash mine bible sc...            2
24779    gone broke wrong heart baby drove redneck crazy            2
24780        young buck wanna eat nigguh like fuckin dis            1
24781                  youu got wild bitches tellin lies            1
24782  ruffled ntac eileen dahlia beautiful color com...            2

[24783 rows x 2 columns]
