# **Deep Learning - D1 CrackingArena**

## **Importing Libraries**

In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

## **Loading Data**

In [2]:
df = pd.read_csv('D1CrackingArena.csv')

df.head()

Unnamed: 0,content,dataset,label
0,I'll check,D1CrackingArena,NO
1,I used to think this Putin was a bad man until...,D1CrackingArena,NO
2,Android Os - suck,D1CrackingArena,NO
3,check this thread before applying: Apply For C...,D1CrackingArena,NO
4,Happy birthday have a nice day,D1CrackingArena,NO


## **Text Preprocessing**

In [3]:
df = df.drop(columns=['dataset'])

df.head()

Unnamed: 0,content,label
0,I'll check,NO
1,I used to think this Putin was a bad man until...,NO
2,Android Os - suck,NO
3,check this thread before applying: Apply For C...,NO
4,Happy birthday have a nice day,NO


### **Step 1: Text Normalisation**

In [4]:
def normalize_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)     # Remove links
    text = re.sub(r'[^a-zA-Z\s]', '', text)                 # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()                # Remove extra spaces
    return text.lower()                                     # Convert to lowercase

df['content'] = df['content'].apply(normalize_text)
df.head()

Unnamed: 0,content,label
0,ill check,NO
1,i used to think this putin was a bad man until...,NO
2,android os suck,NO
3,check this thread before applying apply for cr...,NO
4,happy birthday have a nice day,NO


### **Step 2: Stopwords Removal**

In [5]:
stop_words = set(stopwords.words('english'))

df['content'] = df['content'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

### **Step 3: Tokenisation and Lemmatization**

In [6]:
lemmatizer = WordNetLemmatizer()

df['content'] = df['content'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

df.head()

Unnamed: 0,content,label
0,ill check,NO
1,used think putin bad man ravishing russian man...,NO
2,android o suck,NO
3,check thread applying apply cracker rank,NO
4,happy birthday nice day,NO


## **Model Training**

In [7]:
# Tokenizer for text
tokenizer = Tokenizer(num_words=10000)  # Use top 10,000 most frequent words
tokenizer.fit_on_texts(df['content'])

# Convert text to sequences
X_seq = tokenizer.texts_to_sequences(df['content'])

# Pad sequences for LSTM and CNN
X_pad = pad_sequences(X_seq, padding='post', maxlen=100)  # Padding sequences to max length of 100 words
y = df['label']  # Target variable

# Initialize LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split dataset into training and testing sets (80-20 split)
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train_dl)}")
print(f"Testing set size: {len(X_test_dl)}")

Training set size: 1345
Testing set size: 337


### **1 - Bidirectional Long-Short Term Memory (Bi-LSTM)**

In [8]:
# Build BiLSTM model
bilstm_model = Sequential()

# Embedding layer
bilstm_model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))

# Bidirectional LSTM layer
bilstm_model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Dense layer
bilstm_model.add(Dense(1, activation='sigmoid'))

# Compile model
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
bilstm_history = bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl))

# Evaluate the model
bilstm_loss, bilstm_acc = bilstm_model.evaluate(X_test_dl, y_test_dl)
print(f"BiLSTM Model Accuracy: {bilstm_acc:.2f}")



Epoch 1/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 667ms/step - accuracy: 0.7638 - loss: 0.5287 - val_accuracy: 0.9258 - val_loss: 0.3268
Epoch 2/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 478ms/step - accuracy: 0.9114 - loss: 0.3300 - val_accuracy: 0.9169 - val_loss: 0.3190
Epoch 3/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 442ms/step - accuracy: 0.9114 - loss: 0.2654 - val_accuracy: 0.8249 - val_loss: 0.3430
Epoch 4/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 462ms/step - accuracy: 0.8832 - loss: -0.0015 - val_accuracy: 0.7982 - val_loss: 0.6471
Epoch 5/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 420ms/step - accuracy: 0.8914 - loss: -0.5284 - val_accuracy: 0.8576 - val_loss: 0.7379
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 89ms/step - accuracy: 0.8517 - loss: 0.6012
BiLSTM Model Accuracy: 0.86


### **2 - Convolutional Neural Network (CNN)**

In [9]:
# Build CNN model
cnn_model = Sequential()

# Embedding layer
cnn_model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))

# Convolutional layer
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

# Max pooling layer
cnn_model.add(MaxPooling1D(pool_size=4))

# Global Max Pooling
cnn_model.add(GlobalMaxPooling1D())

# Dense layer
cnn_model.add(Dense(1, activation='sigmoid'))

# Compile model
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
cnn_history = cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl))

# Evaluate the model
cnn_loss, cnn_acc = cnn_model.evaluate(X_test_dl, y_test_dl)
print(f"CNN Model Accuracy: {cnn_acc:.2f}")

Epoch 1/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 139ms/step - accuracy: 0.7365 - loss: 0.5573 - val_accuracy: 0.9258 - val_loss: 0.3484
Epoch 2/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 104ms/step - accuracy: 0.9093 - loss: 0.3606 - val_accuracy: 0.9199 - val_loss: 0.3257
Epoch 3/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 126ms/step - accuracy: 0.9027 - loss: 0.3007 - val_accuracy: 0.8991 - val_loss: 0.2925
Epoch 4/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 123ms/step - accuracy: 0.9063 - loss: 0.1745 - val_accuracy: 0.8665 - val_loss: 0.2767
Epoch 5/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 107ms/step - accuracy: 0.8545 - loss: 0.0790 - val_accuracy: 0.8754 - val_loss: 0.2310
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8698 - loss: 0.2403
CNN Model Accuracy: 0.88


## **Model Performance Comparison**

In [10]:
# Print both model's accuracy
print(f"BiLSTM Model Accuracy: {bilstm_acc:.2f}")
print(f"CNN Model Accuracy: {cnn_acc:.2f}")

BiLSTM Model Accuracy: 0.86
CNN Model Accuracy: 0.88


## **Hyperparameter Tuning**

### **1 - Bidirectional Long-Short Term Memory (Bi-LSTM)**

In [11]:
# # Hyperparameter tuning function for BiLSTM model
# def build_bilstm_model(hp):
#     model = Sequential()
#     model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
    
#     # Bidirectional LSTM layer with hyperparameters
#     model.add(Bidirectional(LSTM(
#         units=hp.Int('units', min_value=64, max_value=256, step=64),
#         dropout=hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1),
#         recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.1, max_value=0.5, step=0.1)
#     )))
    
#     # Dense layer
#     model.add(Dense(1, activation='sigmoid'))
    
#     # Compile model with dynamic learning rate
#     model.compile(
#         loss='binary_crossentropy',
#         optimizer=tf.keras.optimizers.Adam(
#             learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='log')
#         ),
#         metrics=['accuracy']
#     )
#     return model

# # Create a tuner for BiLSTM
# bilstm_tuner = kt.Hyperband(
#     build_bilstm_model,
#     objective='val_accuracy',
#     max_epochs=5,
#     factor=3,
#     directory='bilstm_tuner',
#     project_name='bilstm_hyperparam_tuning'
# )

# # Start hyperparameter tuning
# bilstm_tuner.search(X_train_dl, y_train_dl, epochs=5, validation_data=(X_test_dl, y_test_dl))

### **2 - Convolutional Neural Network (CNN)**

In [12]:
# # Hyperparameter tuning function for CNN model
# def build_cnn_model(hp):
#     model = Sequential()
#     model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
    
#     # Convolutional layer with hyperparameters
#     model.add(Conv1D(
#         filters=hp.Int('filters', min_value=64, max_value=256, step=64),
#         kernel_size=hp.Int('kernel_size', min_value=3, max_value=7, step=1),
#         activation='relu'
#     ))
    
#     # MaxPooling layer
#     model.add(MaxPooling1D(pool_size=4))
    
#     # Global Max Pooling
#     model.add(GlobalMaxPooling1D())
    
#     # Dense layer
#     model.add(Dense(1, activation='sigmoid'))
    
#     # Compile model with dynamic learning rate
#     model.compile(
#         loss='binary_crossentropy',
#         optimizer=tf.keras.optimizers.Adam(
#             learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='log')
#         ),
#         metrics=['accuracy']
#     )
#     return model

# # Create a tuner for CNN
# cnn_tuner = kt.Hyperband(
#     build_cnn_model,
#     objective='val_accuracy',
#     max_epochs=5,
#     factor=3,
#     directory='cnn_tuner',
#     project_name='cnn_hyperparam_tuning'
# )

# # Start hyperparameter tuning
# cnn_tuner.search(X_train_dl, y_train_dl, epochs=5, validation_data=(X_test_dl, y_test_dl))

## **Model Performance Comparison**

In [13]:
# # Get the best BiLSTM model and evaluate
# best_bilstm_model = bilstm_tuner.get_best_models(num_models=1)[0]
# bilstm_loss, bilstm_acc = best_bilstm_model.evaluate(X_test_dl, y_test_dl)
# print(f"Best BiLSTM Model Accuracy: {bilstm_acc:.2f}")

# # Get the best CNN model and evaluate
# best_cnn_model = cnn_tuner.get_best_models(num_models=1)[0]
# cnn_loss, cnn_acc = best_cnn_model.evaluate(X_test_dl, y_test_dl)
# print(f"Best CNN Model Accuracy: {cnn_acc:.2f}")