In [None]:
# Attach the user's Google Drive to the Colab VM so the dataset archive stored
# there can be read from the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip /content/drive/MyDrive/emailspam_splits.zip

Archive:  /content/drive/MyDrive/emailspam_splits.zip
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# Feature tables for the three pre-made splits (each is later indexed by its
# 'preprocessed_text' column).
train_data, test_data, validation_data = [
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv', '/content/validation.csv')
]

# Matching label tables (each is later indexed by its 'spam' column).
train_labels, test_labels, validation_labels = [
    pd.read_csv(csv_path)
    for csv_path in ('/content/train_labels.csv', '/content/test_labels.csv',
                     '/content/validation_labels.csv')
]

# Fitting a model on train data

In [None]:
# Seed for the estimators that use randomness (tree ensembles). Without it the
# Random Forest / Gradient Boosting scores change on every Restart & Run All.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in every results dict.
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Initialize TF-IDF
tfidf = TfidfVectorizer()

# The vocabulary and IDF weights are learned from the training text only; the
# test text is projected with that already-fitted vectorizer.
X_train = tfidf.fit_transform(train_data['preprocessed_text'])
X_test = tfidf.transform(test_data['preprocessed_text'])
y_train = train_labels['spam']
y_test = test_labels['spam']

# name -> {'precision', 'recall', 'f1-score', 'accuracy'}.
# precision / recall / f1-score are the *weighted averages over all classes*
# taken from classification_report, not the metrics of the spam class alone.
model_metrics = {}

for name, model in models.items():
    # Fit on the training split, then score the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    weighted_avg = report['weighted avg']
    model_metrics[name] = {
        'precision': weighted_avg['precision'],
        'recall': weighted_avg['recall'],
        'f1-score': weighted_avg['f1-score'],
        'accuracy': accuracy
    }

# Scoring models on given data


In [None]:
# Show the weighted-average precision / recall / f1-score and the accuracy
# recorded for each model on the test split.
model_metrics

{'Naive Bayes': {'precision': 0.8943115183246072,
  'recall': 0.8769633507853403,
  'f1-score': 0.8615968192553174,
  'accuracy': 0.8769633507853403},
 'SVM': {'precision': 0.986984867077805,
  'recall': 0.9869109947643979,
  'f1-score': 0.9868247635518099,
  'accuracy': 0.9869109947643979},
 'Logistic Regression': {'precision': 0.980453554312941,
  'recall': 0.9799301919720768,
  'f1-score': 0.9796438812250124,
  'accuracy': 0.9799301919720768},
 'Random Forest': {'precision': 0.9690470347170281,
  'recall': 0.9677137870855148,
  'f1-score': 0.9669418450039897,
  'accuracy': 0.9677137870855148},
 'Gradient Boosting': {'precision': 0.9739014846252999,
  'recall': 0.9738219895287958,
  'f1-score': 0.9735339602470555,
  'accuracy': 0.9738219895287958}}

# Evaluating the model predictions

Interpretation in the Context of Non-Spam Classification:
--------------------------------------------------------

When classifying email on a skewed (class-imbalanced) dataset, the primary concern is to avoid misclassifying legitimate (non-spam) emails as spam, so the following metrics come into focus:

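1. Precision (weighted average over both classes, as stored in `model_metrics`):
   - SVM (Support Vector Machine) demonstrates the highest precision, at approximately 98.70%.
   - Logistic Regression closely follows with a precision of around 98.05%.
   - High precision means that when a model predicts an email as spam, it is very likely to be spam. This is crucial for preventing false alarms and preserving the integrity of non-spam emails. Note that the figures above are weighted averages across classes, not the precision of the spam class alone; the per-class values are available in the full `classification_report`.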

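2. Recall (weighted average over both classes, as stored in `model_metrics`):
   - Capturing actual spam is important to maintain the effectiveness of the spam filter.
   - SVM and Logistic Regression perform well in this aspect, with recall rates of approximately 98.69% and 97.99%, respectively.
   - With a weighted average, recall equals overall accuracy, so these values show that the models classify a high percentage of all emails correctly; spam-class recall should be read from the per-class rows of `classification_report`.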

Overall, SVM emerges as an excellent choice for spam classification in this scenario:
- It exhibits both the highest weighted-average precision and the highest weighted-average recall, balancing the need to avoid misclassifying non-spam while effectively capturing spam.
- SVM also maintains a high overall accuracy rate, demonstrating its robustness in classifying all emails accurately.

The other two potential candidates are Logistic Regression and Gradient Boosting.


# Validating the model

In [None]:
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space per model; keys must match the keys of `models`.
hyperparameter_grids = {
    'Naive Bayes': {'alpha': [0.1, 0.5, 1.0, 1.5]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']},
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Random Forest': {'n_estimators': [50, 100, 200]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
}

# Per-model results, all keyed by model name:
#   best weighted precision on the validation split,
#   the refitted best estimator, and
#   the hyperparameters that produced it.
best_precision_scores_with_validation = {}
best_models_with_validation = {}
best_hyperparameters_with_validation = {}

# The TF-IDF features do not depend on the classifier, so vectorize once here
# instead of re-fitting the vectorizer on every loop iteration.
# NOTE(review): the vectorizer is fitted on the full training text before
# GridSearchCV splits it into folds, so every fold shares the same vocabulary
# and IDF statistics. Putting the vectorizer and classifier in one Pipeline
# would avoid that, at the cost of changing what `best_models_with_validation`
# stores.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['preprocessed_text'])
X_validation_tfidf = tfidf_vectorizer.transform(validation_data['preprocessed_text'])

for model_name, model in models.items():
    # Hyperparameters are selected by cross-validation on the training split
    # (cv=None uses GridSearchCV's default splitter), ranked by weighted precision.
    # The validation split plays no part in the search itself.
    grid_search = GridSearchCV(estimator=model, param_grid=hyperparameter_grids[model_name],
                               scoring='precision_weighted', cv=None, n_jobs=-1)
    grid_search.fit(X_train_tfidf, train_labels['spam'])

    # Keep the winning configuration and the estimator fitted with it.
    best_model = grid_search.best_estimator_
    best_models_with_validation[model_name] = best_model
    best_hyperparameters_with_validation[model_name] = grid_search.best_params_

    # Score the selected estimator on the untouched validation split.
    y_pred = best_model.predict(X_validation_tfidf)
    precision = precision_score(validation_labels['spam'], y_pred, average='weighted')
    best_precision_scores_with_validation[model_name] = precision

best_precision_scores_with_validation

{'Naive Bayes': 0.9869701797900545,
 'SVM': 0.9939025473570182,
 'Logistic Regression': 0.9921376865859194,
 'Random Forest': 0.9766855497564325,
 'Gradient Boosting': 0.9711114372532532}

In [None]:
# Show the hyperparameters GridSearchCV selected for each model.
best_hyperparameters_with_validation

{'Naive Bayes': {'alpha': 0.1},
 'SVM': {'C': 1, 'kernel': 'linear'},
 'Logistic Regression': {'C': 10},
 'Random Forest': {'n_estimators': 200},
 'Gradient Boosting': {'learning_rate': 0.1, 'n_estimators': 200}}

# Scoring 3 benchmark models on test data and selecting the best one

- Rank 1: SVM
- Rank 2: Logistic Regression
- Rank 3: Naive Bayes