In [38]:
# Loading Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from joblib import load
from sklearn.model_selection import cross_val_score, StratifiedKFold
import joblib

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rittique/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rittique/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#Path to the directory where the data is saved
data_path = "./data/"
os.listdir(data_path)

['test.csv', 'train.csv', '.ipynb_checkpoints', 'sample_submission.csv']

In [15]:
train_data = os.path.join(data_path, os.listdir(data_path)[1])
test_data = os.path.join(data_path, os.listdir(data_path)[0])

df_train = pd.read_csv(train_data)
df_test = pd.read_csv(test_data)

print(f"Size of Training Set: {df_train.size}")
print(f"Size of Testing Set: {df_test.size}")

print(f"Length of Training Set: {len(df_train)}")
print(f"Length of Testing Set: {len(df_test)}")

Size of Training Set: 60000
Size of Testing Set: 40632
Length of Training Set: 15000
Length of Testing Set: 13544


# Helper Functions

In [17]:
def extract_subject_and_email(df):
    subjects = []
    bodies = []
    for i in range(len(df)):
        subjects.append(df.email[i].split("\n")[0].split("Subject: ")[1])
        bodies.append(" ".join(df.email[i].split("\n")[1:]))

    df["subjects"] = subjects
    df["email_bodies"] = bodies

    return df

def email_check(df):
    has_email_id = []
    for i in range(len(df)):
        if " @ " and " . com" in df.email_bodies[i]:
            has_email_id.append(1)
        else:
            has_email_id.append(0)
    
    return has_email_id

def clean_text(text):
    # Remove URLs with gaps
    text = re.sub(r'https\s+www\s+\S+\s+\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'www\s+\S+\s+\S+', '', text, flags=re.MULTILINE)
    
    # Remove emails with gaps
    text = re.sub(r'\S+\s+\S+\s+@\s+\S+\s+\S+', '', text)
    
    # Remove phone numbers (adjust pattern if needed)
    text = re.sub(r'\b\d{10,15}\b', '', text)
    
    # Remove punctuation and numbers
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(words)

def complete_preprocessing(df):
    df = extract_subject_and_email(df)
    df["has_email_id"] = email_check(df)
    df["subjects"] = df["subjects"].apply(lambda x: clean_text(x))
    df["email_bodies"] = df["email_bodies"].apply(lambda x: clean_text(x))
    
    # Tfidf Vectorization
    subject_tfidf = TfidfVectorizer(max_features=1000).fit_transform(df['subjects'])
    email_tfidf = TfidfVectorizer(max_features=5000).fit_transform(df['email_bodies'])
    
    # Combine all features
    X = hstack([subject_tfidf, email_tfidf, df[['source', 'has_email_id']].values])
    
    return X

In [19]:
X_train = complete_preprocessing(df_train)
y_train = df_train['class']

# Train-validation split 95% training and 5% validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=42)

# Training on base state of three Models: Random Forest, SVM, Logistics Regression

In [21]:
# Initialize classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    'SVM' : SVC(class_weight='balanced', random_state=42),
    'Logistic Regression' : LogisticRegression(class_weight='balanced', solver='saga', max_iter=500, random_state=42)
}

# Train classifiers and evaluate
for name, classifier in classifiers.items():
    print(f"Training {name}...")
    classifier.fit(X_train, y_train)
    
    # Predictions and evaluation
    y_pred = classifier.predict(X_val)
    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_val, y_pred))
    print(f"{name} Accuracy: {accuracy_score(y_val, y_pred):.4f}")

Training Random Forest...

Random Forest Classification Report:

              precision    recall  f1-score   support

    not_spam       0.99      0.97      0.98       357
        spam       0.97      0.99      0.98       393

    accuracy                           0.98       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.98      0.98      0.98       750

Random Forest Accuracy: 0.9800
Training SVM...

SVM Classification Report:

              precision    recall  f1-score   support

    not_spam       0.99      0.97      0.98       357
        spam       0.97      0.99      0.98       393

    accuracy                           0.98       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.98      0.98      0.98       750

SVM Accuracy: 0.9827
Training Logistic Regression...

Logistic Regression Classification Report:

              precision    recall  f1-score   support

    not_spam       0.99      0.97      0.98       35

### Doing Cross validation to check for model generalisation

In [25]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train classifiers and evaluate
for name, classifier in classifiers.items():
    print(f"Training {name}...")
    
    # Perform cross-validation
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    
    # Fit the classifier on the full training set
    classifier.fit(X_train, y_train)
    
    # Predictions and evaluation
    y_pred = classifier.predict(X_val)
    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_val, y_pred))
    print(f"{name} Accuracy: {accuracy_score(y_val, y_pred):.4f}\n")

Training Random Forest...
Random Forest Cross-Validation Accuracy: 0.9817 ± 0.0026

Random Forest Classification Report:

              precision    recall  f1-score   support

    not_spam       0.99      0.98      0.98       357
        spam       0.98      0.99      0.98       393

    accuracy                           0.98       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.98      0.98      0.98       750

Random Forest Accuracy: 0.9827

Training SVM...
SVM Cross-Validation Accuracy: 0.9818 ± 0.0025

SVM Classification Report:

              precision    recall  f1-score   support

    not_spam       0.99      0.97      0.98       357
        spam       0.97      0.99      0.98       393

    accuracy                           0.98       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.98      0.98      0.98       750

SVM Accuracy: 0.9827

Training Logistic Regression...
Logistic Regression Cross-Validation Accuracy:

### Hyper Parameter Tuning

In [33]:
# Hyperparameter grids
param_dist_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

param_dist_svc = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]  # Only for 'poly' kernel
}

param_dist_logreg = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.1, 0.5, 0.7]  # Only relevant when 'penalty' is 'elasticnet'
}

In [34]:
# RandomizedSearchCV for RandomForest
random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_distributions=param_dist_rf,
    n_iter=50,  # Number of different combinations to try
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=42
)

# RandomizedSearchCV for SVC
random_search_svc = RandomizedSearchCV(
    estimator=SVC(class_weight='balanced', random_state=42),
    param_distributions=param_dist_svc,
    n_iter=50,  # Number of different combinations to try
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# RandomizedSearchCV for LogisticRegression
random_search_logreg = RandomizedSearchCV(
    estimator=LogisticRegression(class_weight='balanced', max_iter=500, random_state=42),
    param_distributions=param_dist_logreg,
    n_iter=50,  # Number of different combinations to try
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [35]:
print("Tuning Random Forest...")
random_search_rf.fit(X_train, y_train)
print(f"Best parameters for Random Forest: {random_search_rf.best_params_}")
print(f"Best cross-validation score for Random Forest: {random_search_rf.best_score_:.4f}\n")

print("Tuning SVM...")
random_search_svc.fit(X_train, y_train)
print(f"Best parameters for SVM: {random_search_svc.best_params_}")
print(f"Best cross-validation score for SVM: {random_search_svc.best_score_:.4f}\n")

print("Tuning Logistic Regression...")
random_search_logreg.fit(X_train, y_train)
print(f"Best parameters for Logistic Regression: {random_search_logreg.best_params_}")
print(f"Best cross-validation score for Logistic Regression: {random_search_logreg.best_score_:.4f}\n")

Tuning Random Forest...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Random Forest: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Best cross-validation score for Random Forest: 0.9848

Tuning SVM...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for SVM: {'kernel': 'rbf', 'gamma': 0.1, 'degree': 4, 'C': 10}
Best cross-validation score for SVM: 0.9869

Tuning Logistic Regression...
Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'l1_ratio': 0.1, 'C': 100}
Best cross-validation score for Logistic Regression: 0.9867





In [36]:
# Dictionaries to store best hyperparameters and accuracy
best_hyperparams_rf = {}
best_hyperparams_svc = {}
best_hyperparams_logreg = {}

accuracy_rf = 0.0
accuracy_svc = 0.0
accuracy_logreg = 0.0

# Evaluate the best RandomForest model
best_rf = random_search_rf.best_estimator_
best_hyperparams_rf = random_search_rf.best_params_
y_pred_rf = best_rf.predict(X_val)
accuracy_rf = accuracy_score(y_val, y_pred_rf)
print("\nRandom Forest Test Set Evaluation:\n")
print(classification_report(y_val, y_pred_rf))
print(f"Random Forest Test Accuracy: {accuracy_rf:.4f}")

# Evaluate the best SVC model
best_svc = random_search_svc.best_estimator_
best_hyperparams_svc = random_search_svc.best_params_
y_pred_svc = best_svc.predict(X_val)
accuracy_svc = accuracy_score(y_val, y_pred_svc)
print("\nSVM Test Set Evaluation:\n")
print(classification_report(y_val, y_pred_svc))
print(f"SVM Test Accuracy: {accuracy_svc:.4f}")

# Evaluate the best Logistic Regression model
best_logreg = random_search_logreg.best_estimator_
best_hyperparams_logreg = random_search_logreg.best_params_
y_pred_logreg = best_logreg.predict(X_val)
accuracy_logreg = accuracy_score(y_val, y_pred_logreg)
print("\nLogistic Regression Test Set Evaluation:\n")
print(classification_report(y_val, y_pred_logreg))
print(f"Logistic Regression Test Accuracy: {accuracy_logreg:.4f}")

# Store the best accuracy and hyperparameters for each model
best_model_info = {
    'RandomForest': {'best_hyperparams': best_hyperparams_rf, 'accuracy': accuracy_rf},
    'SVC': {'best_hyperparams': best_hyperparams_svc, 'accuracy': accuracy_svc},
    'LogisticRegression': {'best_hyperparams': best_hyperparams_logreg, 'accuracy': accuracy_logreg}
}

print("\nBest Model Information:\n", best_model_info)



Random Forest Test Set Evaluation:

              precision    recall  f1-score   support

    not_spam       0.99      0.97      0.98       357
        spam       0.98      0.99      0.98       393

    accuracy                           0.98       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.98      0.98      0.98       750

Random Forest Test Accuracy: 0.9827

SVM Test Set Evaluation:

              precision    recall  f1-score   support

    not_spam       1.00      0.98      0.99       357
        spam       0.98      1.00      0.99       393

    accuracy                           0.99       750
   macro avg       0.99      0.99      0.99       750
weighted avg       0.99      0.99      0.99       750

SVM Test Accuracy: 0.9880

Logistic Regression Test Set Evaluation:

              precision    recall  f1-score   support

    not_spam       0.99      0.97      0.98       357
        spam       0.98      0.99      0.99       393

    accuracy 

In [39]:
# Ensure the directories exist
os.makedirs('models', exist_ok=True)
os.makedirs('result', exist_ok=True)

# Save the trained models
joblib.dump(best_rf, 'models/random_forest_model.pkl')
joblib.dump(best_svc, 'models/svc_model.pkl')
joblib.dump(best_logreg, 'models/logistic_regression_model.pkl')

# Determine the best model based on accuracy
best_model_name = max(best_model_info, key=lambda model: best_model_info[model]['accuracy'])
best_model_params = best_model_info[best_model_name]['best_hyperparams']
best_model_accuracy = best_model_info[best_model_name]['accuracy']

# Write best model parameters and metrics to a text file
result_text = (
    f"Best Model: {best_model_name}\n"
    f"Best Hyperparameters: {best_model_params}\n"
    f"Test Accuracy: {best_model_accuracy:.4f}\n"
)

with open('result/best_model_info.txt', 'w') as file:
    file.write(result_text)

print(f"\nModel and results saved successfully.")


Model and results saved successfully.
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   3.4s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   2.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   6.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   6.9s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.9s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.6s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.0s
[CV] END bootst

# Predicting on the Test Dataset

In [40]:
# Load the best model based on accuracy
best_model_filename = {
    'RandomForest': 'models/random_forest_model.pkl',
    'SVC': 'models/svc_model.pkl',
    'LogisticRegression': 'models/logistic_regression_model.pkl'
}[best_model_name]

best_model = joblib.load(best_model_filename)

In [41]:
X_test = complete_preprocessing(df_test)

In [42]:
y_pred_test = best_model.predict(X_test)

In [43]:
print(f"\nPredictions from the best model ({best_model_name}):\n")
print(y_pred_test)


Predictions from the best model (SVC):

['spam' 'spam' 'spam' ... 'not_spam' 'not_spam' 'spam']


In [44]:
data = {
    "id_": df_test["id_"],
    "class": y_pred_test
}
submission_df = pd.DataFrame(data)
submission_df

Unnamed: 0,id_,class
0,b4c16282-2934-49c9-ae12-99ad8ca3c960,spam
1,cbe49c1c-b328-4716-9b78-9169c6111e80,spam
2,0fa05eb2-f8cd-4cbf-b48e-bbc925b2baac,spam
3,b696569e-f7de-4771-9946-be5dd477b2f6,spam
4,d9bd17c0-28ec-43f2-b29a-2b9f25089a85,not_spam
...,...,...
13539,c03b5042-4fbd-4756-86aa-7edd82777094,spam
13540,39841b62-79c2-4b9f-b00d-a72902490244,spam
13541,d5af0144-7b08-4695-8269-5b1507816b91,not_spam
13542,65735e11-cf67-4739-964e-67b1021c7153,not_spam


[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.2s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   3.5s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   6.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   3.6s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.9s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.6s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.0s
[CV] END bootstrap=True, max_depth=None, min_samples_l

In [45]:
submission_df.to_csv("result/submission_Md_Rittique_alam.csv", index=False)