# **Convolutional Neural Networks**

## **Importing Libraries**

In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D

## **Loading Data**

In [2]:
df = pd.read_csv('datasets/combined_dataset.csv')

df.head(10)

Unnamed: 0,content,dataset,label
0,PrestaShop Recipes A Problem Solution Approach...,D3DreamMarket,NO
1,Nice post bond. Just remove typo - Or...,D4Garage4hackers,NO
2,Latest version of wifite (v2) has also inclu...,D4Garage4hackers,YES
3,"On Sunday, the 28th of November 2010 around ...",D4Garage4hackers,YES
4,I have to strongly disagree with the statement...,D2Twitter,NO
5,[RT] [USERNAME] Hack Remote Windows 10 Passwor...,D2Twitter,YES
6,how to crack realityking.com,D2CrackingFire,NO
7,THE BEST PRO WIFI HACKING TOOLS PACK 2017 Her...,D3DreamMarket,YES
8,[RT] [USERNAME] Analysing the NULL SecurityDes...,D2Twitter,YES
9,Quote: Originally Posted by ShockiNN Well the ...,D1CrackingArena,NO


## **Text Preprocessing**

In [3]:
df = df.drop(columns=['dataset'])

df.head(10)

Unnamed: 0,content,label
0,PrestaShop Recipes A Problem Solution Approach...,NO
1,Nice post bond. Just remove typo - Or...,NO
2,Latest version of wifite (v2) has also inclu...,YES
3,"On Sunday, the 28th of November 2010 around ...",YES
4,I have to strongly disagree with the statement...,NO
5,[RT] [USERNAME] Hack Remote Windows 10 Passwor...,YES
6,how to crack realityking.com,NO
7,THE BEST PRO WIFI HACKING TOOLS PACK 2017 Her...,YES
8,[RT] [USERNAME] Analysing the NULL SecurityDes...,YES
9,Quote: Originally Posted by ShockiNN Well the ...,NO


### **Step 1: Text Normalisation**

In [4]:
def normalisation(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)     # Remove links
    text = re.sub(r'[^a-zA-Z\s]', '', text)                 # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()                # Remove extra spaces
    return text.lower()                                     # Convert to lowercase

df['content'] = df['content'].apply(normalisation)
df.head(10)

Unnamed: 0,content,label
0,prestashop recipes a problem solution approach...,NO
1,nice post bond just remove typo originally pos...,NO
2,latest version of wifite v has also included t...,YES
3,on sunday the th of november around utc the ma...,YES
4,i have to strongly disagree with the statement...,NO
5,rt username hack remote windows password in pl...,YES
6,how to crack realitykingcom,NO
7,the best pro wifi hacking tools pack here we f...,YES
8,rt username analysing the null securitydescrip...,YES
9,quote originally posted by shockinn well the s...,NO


### **Step 2: Stopwords Removal**

In [5]:
stop_words = set(stopwords.words('english'))

df['content'] = df['content'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

### **Step 3: Tokenisation and Lemmatization**

In [6]:
lemmatizer = WordNetLemmatizer()

df['content'] = df['content'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

df.head(10)

Unnamed: 0,content,label
0,prestashop recipe problem solution approach cu...,NO
1,nice post bond remove typo originally posted b...,NO
2,latest version wifite v also included attack v...,YES
3,sunday th november around utc main brdistribut...,YES
4,strongly disagree statement story phone hacked...,NO
5,rt username hack remote window password plain ...,YES
6,crack realitykingcom,NO
7,best pro wifi hacking tool pack figured three ...,YES
8,rt username analysing null securitydescriptor ...,YES
9,quote originally posted shockinn well site gav...,NO


## **Model Training**

In [7]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['content'])

X_seq = tokenizer.texts_to_sequences(df['content'])
X_pad = pad_sequences(X_seq, padding='post', maxlen=100)
y = df['label']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 1692
Testing set size: 424


In [8]:
# Build CNN model
cnn = Sequential()

# Embedding layer
cnn.add(Embedding(input_dim=10000, output_dim=128, input_length=100))

# Convolutional layer
cnn.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

# Max pooling layer
cnn.add(MaxPooling1D(pool_size=4))

# Global Max Pooling
cnn.add(GlobalMaxPooling1D())

# Dense layer
cnn.add(Dense(1, activation='sigmoid'))

# Compile model
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
cnn_history = cnn.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
cnn_loss, cnn_acc = cnn.evaluate(X_test, y_test)
print(f"CNN Model Accuracy: {cnn_acc:.2f}")



Epoch 1/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 129ms/step - accuracy: 0.5484 - loss: 0.6744 - val_accuracy: 0.7665 - val_loss: 0.6039
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 106ms/step - accuracy: 0.8871 - loss: 0.5318 - val_accuracy: 0.8585 - val_loss: 0.4313
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 96ms/step - accuracy: 0.9332 - loss: 0.3056 - val_accuracy: 0.8962 - val_loss: 0.3137
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 107ms/step - accuracy: 0.9708 - loss: 0.1373 - val_accuracy: 0.9104 - val_loss: 0.2669
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 110ms/step - accuracy: 0.9869 - loss: 0.0660 - val_accuracy: 0.9269 - val_loss: 0.2456
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - accuracy: 0.9915 - loss: 0.0384 - val_accuracy: 0.9269 - val_loss: 0.2502
Epoch 7/10
[1m27/27[0m [32m