# **Machine Learning - D1 CrackingArena**

## **Importing Libraries**

In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

## **Loading Data**

In [2]:
df = pd.read_csv('D1CrackingArena.csv')

df.head()

Unnamed: 0,content,dataset,label
0,I'll check,D1CrackingArena,NO
1,I used to think this Putin was a bad man until...,D1CrackingArena,NO
2,Android Os - suck,D1CrackingArena,NO
3,check this thread before applying: Apply For C...,D1CrackingArena,NO
4,Happy birthday have a nice day,D1CrackingArena,NO


## **Text Preprocessing**

In [3]:
df = df.drop(columns=['dataset'])

df.head()

Unnamed: 0,content,label
0,I'll check,NO
1,I used to think this Putin was a bad man until...,NO
2,Android Os - suck,NO
3,check this thread before applying: Apply For C...,NO
4,Happy birthday have a nice day,NO


### **Step 1: Text Normalisation**

In [4]:
def normalize_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)     # Remove links
    text = re.sub(r'[^a-zA-Z\s]', '', text)                 # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()                # Remove extra spaces
    return text.lower()                                     # Convert to lowercase

df['content'] = df['content'].apply(normalize_text)
df.head()

Unnamed: 0,content,label
0,ill check,NO
1,i used to think this putin was a bad man until...,NO
2,android os suck,NO
3,check this thread before applying apply for cr...,NO
4,happy birthday have a nice day,NO


### **Step 2: Stopwords Removal**

In [5]:
stop_words = set(stopwords.words('english'))

df['content'] = df['content'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

### **Step 3: Tokenisation and Lemmatization**

In [6]:
lemmatizer = WordNetLemmatizer()

df['content'] = df['content'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

df.head()

Unnamed: 0,content,label
0,ill check,NO
1,used think putin bad man ravishing russian man...,NO
2,android o suck,NO
3,check thread applying apply cracker rank,NO
4,happy birthday nice day,NO


## **Model Training**

In [7]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=1000)  # Limit vocabulary size

# Convert text to feature vectors
X = vectorizer.fit_transform(df['content']).toarray()
y = df['label']  # Target variable

# Initialize LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 1345
Testing set size: 337


### **1 - Naive Bayes**

In [8]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict on test set
nb_predictions = nb_model.predict(X_test)

# Evaluate performance
print("Naive Bayes Performance:")
print(classification_report(y_test, nb_predictions))
print(f"Accuracy: {accuracy_score(y_test, nb_predictions):.2f}")

Naive Bayes Performance:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       312
           1       0.25      0.14      0.18         7
           2       0.40      0.33      0.36        18

    accuracy                           0.92       337
   macro avg       0.53      0.48      0.50       337
weighted avg       0.91      0.92      0.91       337

Accuracy: 0.92


### **2 - Support Vector Machine (SVM)**

In [9]:
# Initialize and train SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Predict on test set
svm_predictions = svm_model.predict(X_test)

# Evaluate performance
print("SVM Performance:")
print(classification_report(y_test, svm_predictions))
print(f"Accuracy: {accuracy_score(y_test, svm_predictions):.2f}")

SVM Performance:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       312
           1       0.00      0.00      0.00         7
           2       0.28      0.39      0.33        18

    accuracy                           0.89       337
   macro avg       0.41      0.44      0.42       337
weighted avg       0.90      0.89      0.89       337

Accuracy: 0.89


### **3 - Random Forest (RF)**

In [10]:
# Initialize and train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test set
rf_predictions = rf_model.predict(X_test)

# Evaluate performance
print("Random Forest Performance:")
print(classification_report(y_test, rf_predictions))
print(f"Accuracy: {accuracy_score(y_test, rf_predictions):.2f}")

Random Forest Performance:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       312
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00        18

    accuracy                           0.91       337
   macro avg       0.31      0.33      0.32       337
weighted avg       0.86      0.91      0.89       337

Accuracy: 0.91


### **4 - Extreme Gradient Boosting (XGBoost)**

In [11]:
# Initialize and train XGBoost model
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on test set
xgb_predictions = xgb_model.predict(X_test)

# Evaluate performance
print("XGBoost Performance:")
print(classification_report(y_test, xgb_predictions))
print(f"Accuracy: {accuracy_score(y_test, xgb_predictions):.2f}")

XGBoost Performance:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       312
           1       0.00      0.00      0.00         7
           2       0.60      0.33      0.43        18

    accuracy                           0.93       337
   macro avg       0.52      0.44      0.46       337
weighted avg       0.91      0.93      0.92       337

Accuracy: 0.93


## **Model Performance Comparison**

In [12]:
# Collect accuracy scores
results = {
    "Model": ["Naive Bayes", "SVM", "Random Forest", "XGBoost"],
    "Accuracy": [
        accuracy_score(y_test, nb_predictions),
        accuracy_score(y_test, svm_predictions),
        accuracy_score(y_test, rf_predictions),
        accuracy_score(y_test, xgb_predictions),
    ],
}

results_df = pd.DataFrame(results)
print(results_df)

           Model  Accuracy
0    Naive Bayes  0.919881
1            SVM  0.890208
2  Random Forest  0.913947
3        XGBoost  0.925816


## **Hyperparameter Tuning**

### **1 - Naive Bayes**

In [13]:
# Naive Bayes hyperparameter grid
nb_param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]  # Smoothing parameter
}

# Initialize Naive Bayes model
nb_model = MultinomialNB()

# GridSearchCV for Naive Bayes
nb_grid_search = GridSearchCV(estimator=nb_model, param_grid=nb_param_grid, cv=5, n_jobs=-1, verbose=1)
nb_grid_search.fit(X_train, y_train)

# Best parameters and model
print(f"Best parameters for Naive Bayes: {nb_grid_search.best_params_}")
best_nb_model = nb_grid_search.best_estimator_

# Predict and evaluate
nb_predictions_tuned = best_nb_model.predict(X_test)
print("Naive Bayes Performance after Hyperparameter Tuning:")
print(classification_report(y_test, nb_predictions_tuned))
print(f"Accuracy: {accuracy_score(y_test, nb_predictions_tuned):.2f}")

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters for Naive Bayes: {'alpha': 2.0}
Naive Bayes Performance after Hyperparameter Tuning:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       312
           1       0.25      0.14      0.18         7
           2       0.45      0.28      0.34        18

    accuracy                           0.93       337
   macro avg       0.55      0.47      0.50       337
weighted avg       0.91      0.93      0.92       337

Accuracy: 0.93


### **2 - Support Vector Machine (SVM)**

In [14]:
# SVM hyperparameter grid
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize SVM model
svm_model = SVC(random_state=42)

# GridSearchCV for SVM
svm_grid_search = GridSearchCV(estimator=svm_model, param_grid=svm_param_grid, cv=5, n_jobs=-1, verbose=1)
svm_grid_search.fit(X_train, y_train)

# Best parameters and model
print(f"Best parameters for SVM: {svm_grid_search.best_params_}")
best_svm_model = svm_grid_search.best_estimator_

# Predict and evaluate
svm_predictions_tuned = best_svm_model.predict(X_test)
print("SVM Performance after Hyperparameter Tuning:")
print(classification_report(y_test, svm_predictions_tuned))
print(f"Accuracy: {accuracy_score(y_test, svm_predictions_tuned):.2f}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for SVM: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
SVM Performance after Hyperparameter Tuning:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       312
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00        18

    accuracy                           0.92       337
   macro avg       0.31      0.33      0.32       337
weighted avg       0.86      0.92      0.89       337

Accuracy: 0.92


### **3 - Random Forest (RF)**

In [15]:
# Random Forest hyperparameter grid
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# GridSearchCV for Random Forest
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, n_jobs=-1, verbose=1)
rf_grid_search.fit(X_train, y_train)

# Best parameters and model
print(f"Best parameters for Random Forest: {rf_grid_search.best_params_}")
best_rf_model = rf_grid_search.best_estimator_

# Predict and evaluate
rf_predictions_tuned = best_rf_model.predict(X_test)
print("Random Forest Performance after Hyperparameter Tuning:")
print(classification_report(y_test, rf_predictions_tuned))
print(f"Accuracy: {accuracy_score(y_test, rf_predictions_tuned):.2f}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Performance after Hyperparameter Tuning:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       312
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00        18

    accuracy                           0.92       337
   macro avg       0.31      0.33      0.32       337
weighted avg       0.86      0.92      0.89       337

Accuracy: 0.92


### **4 - Extreme Gradient Boosting (XGBoost)**

In [16]:
# XGBoost hyperparameter grid
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# GridSearchCV for XGBoost
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, n_jobs=-1, verbose=1)
xgb_grid_search.fit(X_train, y_train)

# Best parameters and model
print(f"Best parameters for XGBoost: {xgb_grid_search.best_params_}")
best_xgb_model = xgb_grid_search.best_estimator_

# Predict and evaluate
xgb_predictions_tuned = best_xgb_model.predict(X_test)
print("XGBoost Performance after Hyperparameter Tuning:")
print(classification_report(y_test, xgb_predictions_tuned))
print(f"Accuracy: {accuracy_score(y_test, xgb_predictions_tuned):.2f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
XGBoost Performance after Hyperparameter Tuning:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       312
           1       0.00      0.00      0.00         7
           2       0.70      0.39      0.50        18

    accuracy                           0.93       337
   macro avg       0.55      0.46      0.49       337
weighted avg       0.91      0.93      0.92       337

Accuracy: 0.93


## **Model Performance Comparison**

In [17]:
# Collect accuracy scores after tuning
tuned_results = {
    "Model": ["Naive Bayes", "SVM", "Random Forest", "XGBoost"],
    "Accuracy": [
        accuracy_score(y_test, nb_predictions_tuned),
        accuracy_score(y_test, svm_predictions_tuned),
        accuracy_score(y_test, rf_predictions_tuned),
        accuracy_score(y_test, xgb_predictions_tuned),
    ],
}

tuned_results_df = pd.DataFrame(tuned_results)
print(tuned_results_df)

           Model  Accuracy
0    Naive Bayes  0.925816
1            SVM  0.916914
2  Random Forest  0.916914
3        XGBoost  0.934718
