# Applied Machine Learning - Assignment 1
#### Submitted by 
- Anusha R
- MDS202212
- anushar@cmi.ac.in, r.anusha27@gmail.com

In [52]:
# importing necessary libraries

import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [12]:
import warnings

# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# after this cell (from any library) is hidden — consider a narrower filter.
warnings.filterwarnings("ignore")

In [13]:
# Working directory of the running kernel; the dataset path below is built from it.
current_directory = os.getcwd()

In [14]:
# Execute prepare.ipynb inside this kernel so that the names it defines are usable here.
# NOTE(review): load_data, preprocess_data, split_data and save_data are called below but
# are not defined in this notebook — presumably they come from prepare.ipynb; confirm.

%run prepare.ipynb

In [15]:
# fit a model on train data

def train_model(model, vectorizer, train_df):
    """Fit a two-step text-classification pipeline on the training frame.

    Parameters
    ----------
    model : estimator
        Classifier placed in the pipeline step named ``'model'``.
    vectorizer : transformer
        Text vectorizer placed in the pipeline step named ``'vectorizer'``.
    train_df : pandas.DataFrame
        Training data; the ``'text'`` column is used as input and the
        ``'spam'`` column as the target.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (vectorizer followed by model).
    """
    # Pipeline.fit fits the vectorizer itself, so a separate
    # vectorizer.fit_transform pass beforehand would only repeat the work.
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('model', model),
    ])

    pipeline.fit(train_df['text'], train_df['spam'])

    return pipeline

In [16]:
# score a model on given data

def score_model(model, X_data, y_data):
    """Return the accuracy of ``model``'s predictions for ``X_data`` against ``y_data``."""
    predicted_labels = model.predict(X_data)
    return accuracy_score(y_data, predicted_labels)

In [17]:
# evaluate the model predictions

def evaluate_model(model, X_data, y_data):
    """Return the ``classification_report`` for ``model``'s predictions on ``X_data``.

    ``y_data`` holds the true labels the predictions are compared against.
    """
    return classification_report(y_data, model.predict(X_data))

In [18]:
# fine-tune using training data

def fine_tune_model(model, param_grid, X_train, y_train, cv=5, scoring=None):
    """Grid-search ``param_grid`` with cross-validation and return the best estimator.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) to tune.
    param_grid : dict
        Parameter names mapped to the candidate values to try.
    X_train, y_train
        Training inputs and targets used for the cross-validated search.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str or callable, optional
        Selection metric forwarded to ``GridSearchCV``; ``None`` keeps the
        estimator's default scorer.

    Returns
    -------
    estimator
        ``GridSearchCV.best_estimator_`` after fitting the search.
    """
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_

## Preprocessing the dataset

In [19]:
# Load the data
# The CSV is expected at Dataset/emails.csv under the kernel's working directory.

file_path = os.path.join(current_directory, 'Dataset/emails.csv')
data = load_data(file_path)

In [20]:
# Preprocess the data
# The result is kept under a new name so the raw `data` stays available.

preprocessed_data = preprocess_data(data)

In [21]:
# Split the data as train/validation/test
# NOTE(review): split proportions and any random seed are decided inside split_data,
# which is not defined in this notebook — confirm the split is reproducible.

train_data, validation_data, test_data = split_data(preprocessed_data)

In [22]:
# Save data
# Hands the three splits to save_data (output location is decided by that helper).

save_data(train_data, validation_data, test_data)

## Naive Bayes Model

In [23]:
# Train Naive Bayes (MultinomialNB) model
# Baseline: default MultinomialNB on TF-IDF features of the training text.

nb_model = MultinomialNB()
nb_pipeline = train_model(nb_model, TfidfVectorizer(), train_data)

In [24]:
# score and evaluation on train data
# Baseline Naive Bayes pipeline: accuracy plus per-class classification report.

print("Accuracy:", score_model(nb_pipeline, train_data['text'], train_data['spam']))
print("Evaluation report:\n", evaluate_model(nb_pipeline, train_data['text'], train_data['spam']))

Accuracy: 0.889697322467986
Evaluation report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93      2616
           1       1.00      0.54      0.70       820

    accuracy                           0.89      3436
   macro avg       0.94      0.77      0.82      3436
weighted avg       0.90      0.89      0.88      3436



In [26]:
# score and evaluation on validation data
# Baseline Naive Bayes pipeline: accuracy plus per-class classification report.

print("Accuracy:", score_model(nb_pipeline, validation_data['text'], validation_data['spam']))
print("Evaluation report:\n", evaluate_model(nb_pipeline, validation_data['text'], validation_data['spam']))

Accuracy: 0.868237347294939
Evaluation report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       872
           1       1.00      0.45      0.62       274

    accuracy                           0.87      1146
   macro avg       0.93      0.72      0.77      1146
weighted avg       0.89      0.87      0.85      1146



### Hyperparameter Tuning - Naive Bayes Model

In [27]:
# Fine-tune Naive Bayes model
# 'model__alpha' addresses the alpha parameter of the pipeline step named 'model'.
# The search runs on the training split only; the last line displays the chosen pipeline.

nb_param_grid = {'model__alpha': [0.1, 0.5, 1.0]}
best_nb_model = fine_tune_model(nb_pipeline, nb_param_grid, train_data['text'], train_data['spam'])
best_nb_model

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model', MultinomialNB(alpha=0.1))])

In [28]:
# Score and evaluate the tuned Naive Bayes model on the train set

nb_train_accuracy = score_model(best_nb_model, train_data['text'], train_data['spam'])
nb_train_report = evaluate_model(best_nb_model, train_data['text'], train_data['spam'])

# Print Naive Bayes results (accuracy to 4 decimals, then the per-class report)
print("Naive Bayes Results:")
print(f"Training data Accuracy: {nb_train_accuracy:.4f}")
print("Classification Report:\n", nb_train_report)

Naive Bayes Results:
Training data Accuracy: 0.9988
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2616
           1       1.00      1.00      1.00       820

    accuracy                           1.00      3436
   macro avg       1.00      1.00      1.00      3436
weighted avg       1.00      1.00      1.00      3436



In [29]:
# Score and evaluate the tuned Naive Bayes model on the validation set

nb_validation_accuracy = score_model(best_nb_model, validation_data['text'], validation_data['spam'])
nb_validation_report = evaluate_model(best_nb_model, validation_data['text'], validation_data['spam'])

# Print Naive Bayes results (accuracy to 4 decimals, then the per-class report)
print("Naive Bayes Results:")
print(f"Validation Accuracy: {nb_validation_accuracy:.4f}")
print("Classification Report:\n", nb_validation_report)

Naive Bayes Results:
Validation Accuracy: 0.9817
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       872
           1       1.00      0.92      0.96       274

    accuracy                           0.98      1146
   macro avg       0.99      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146



## Logistic Regression Model

In [30]:
# Train logistic regression model
# Baseline: default LogisticRegression on TF-IDF features of the training text.

lr_model = LogisticRegression()
lr_pipeline = train_model(lr_model, TfidfVectorizer(), train_data)

In [31]:
# score and evaluation on train data
# Baseline Logistic Regression pipeline: accuracy plus per-class classification report.

print("Accuracy:", score_model(lr_pipeline, train_data['text'], train_data['spam']))
print("Evaluation report:\n", evaluate_model(lr_pipeline, train_data['text'], train_data['spam']))

Accuracy: 0.9944703143189756
Evaluation report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      2616
           1       1.00      0.98      0.99       820

    accuracy                           0.99      3436
   macro avg       1.00      0.99      0.99      3436
weighted avg       0.99      0.99      0.99      3436



In [32]:
# score and evaluation on validation data
# Baseline Logistic Regression pipeline: accuracy plus per-class classification report.

print("Accuracy:", score_model(lr_pipeline, validation_data['text'], validation_data['spam']))
print("Evaluation report:\n", evaluate_model(lr_pipeline, validation_data['text'], validation_data['spam']))

Accuracy: 0.9825479930191972
Evaluation report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       872
           1       0.99      0.93      0.96       274

    accuracy                           0.98      1146
   macro avg       0.99      0.97      0.98      1146
weighted avg       0.98      0.98      0.98      1146



### Hyperparameter Tuning - Logistic Regression Model

In [33]:
# Fine-tune Logistic Regression model
# 'model__C' addresses the C parameter of the pipeline step named 'model'.
# The search runs on the training split only; the last line displays the chosen pipeline.

lr_param_grid = {'model__C': [0.1, 1.0, 10.0]}
best_lr_model = fine_tune_model(lr_pipeline, lr_param_grid, train_data['text'], train_data['spam'])
best_lr_model

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model', LogisticRegression(C=10.0))])

In [34]:
# Score and evaluate the tuned Logistic Regression model on the train set

lr_train_accuracy = score_model(best_lr_model, train_data['text'], train_data['spam'])
lr_train_report = evaluate_model(best_lr_model, train_data['text'], train_data['spam'])

# Print Logistic Regression results (accuracy to 4 decimals, then the per-class report)
print("Logistic Regression Results:")
print(f"Training data Accuracy: {lr_train_accuracy:.4f}")
print("Classification Report:\n", lr_train_report)

Logistic Regression Results:
Training data Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2616
           1       1.00      1.00      1.00       820

    accuracy                           1.00      3436
   macro avg       1.00      1.00      1.00      3436
weighted avg       1.00      1.00      1.00      3436



In [35]:
# Score and evaluate the tuned Logistic Regression model on the validation set

lr_validation_accuracy = score_model(best_lr_model, validation_data['text'], validation_data['spam'])
lr_validation_report = evaluate_model(best_lr_model, validation_data['text'], validation_data['spam'])

# Print Logistic Regression results (accuracy to 4 decimals, then the per-class report)
print("Logistic Regression Results:")
print(f"Validation Accuracy: {lr_validation_accuracy:.4f}")
print("Classification Report:\n", lr_validation_report)

Logistic Regression Results:
Validation Accuracy: 0.9904
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       872
           1       0.99      0.97      0.98       274

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.99      1146
weighted avg       0.99      0.99      0.99      1146



## Random Forest Classifier Model


In [45]:
# Train Random Forest model
# Baseline: default RandomForestClassifier on TF-IDF features of the training text.
# NOTE(review): no random_state is passed, so the fitted forest (and the scores
# reported below) can differ from run to run — consider fixing a seed.

rf_model = RandomForestClassifier()
tfidf_vectorizer = TfidfVectorizer()
# Pass the vectorizer declared above instead of constructing a second, identical one.
rf_pipeline = train_model(rf_model, tfidf_vectorizer, train_data)

In [47]:
# score and evaluation on train data
# Baseline Random Forest pipeline: accuracy plus per-class classification report.

print("Accuracy:", score_model(rf_pipeline, train_data['text'], train_data['spam']))
print("Evaluation report:\n", evaluate_model(rf_pipeline, train_data['text'], train_data['spam']))

Accuracy: 1.0
Evaluation report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2616
           1       1.00      1.00      1.00       820

    accuracy                           1.00      3436
   macro avg       1.00      1.00      1.00      3436
weighted avg       1.00      1.00      1.00      3436



In [48]:
# score and evaluation on validation data
# Baseline Random Forest pipeline: accuracy plus per-class classification report.

print("Accuracy:", score_model(rf_pipeline, validation_data['text'], validation_data['spam']))
print("Evaluation report:\n", evaluate_model(rf_pipeline, validation_data['text'], validation_data['spam']))

Accuracy: 0.9694589877835951
Evaluation report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       872
           1       0.99      0.88      0.93       274

    accuracy                           0.97      1146
   macro avg       0.98      0.94      0.96      1146
weighted avg       0.97      0.97      0.97      1146



### Hyperparameter Tuning - Random Forest Model

In [49]:
# Fine-tune the Random Forest Classifier
# Each 'model__*' key addresses a parameter of the pipeline step named 'model'.
# The grid has 3 x 3 x 3 x 3 = 81 combinations, each cross-validated by
# fine_tune_model, so this cell is by far the slowest in the notebook.

rf_param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}
best_rf_model = fine_tune_model(rf_pipeline, rf_param_grid, train_data['text'], train_data['spam'])
best_rf_model

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model',
                 RandomForestClassifier(min_samples_split=5,
                                        n_estimators=200))])

In [50]:
# Score and evaluate the tuned Random Forest model on the train set

rf_train_accuracy = score_model(best_rf_model, train_data['text'], train_data['spam'])
rf_train_report = evaluate_model(best_rf_model, train_data['text'], train_data['spam'])

# Print Random Forest results (accuracy to 4 decimals, then the per-class report)
print("Random Forest Results:")
print(f"Training data Accuracy: {rf_train_accuracy:.4f}")
print("Classification Report:\n", rf_train_report)

Random Forest Results:
Training data Accuracy: 0.9997
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2616
           1       1.00      1.00      1.00       820

    accuracy                           1.00      3436
   macro avg       1.00      1.00      1.00      3436
weighted avg       1.00      1.00      1.00      3436



In [51]:
# Score and evaluate the tuned Random Forest model on the validation set

rf_validation_accuracy = score_model(best_rf_model, validation_data['text'], validation_data['spam'])
rf_validation_report = evaluate_model(best_rf_model, validation_data['text'], validation_data['spam'])

# Print Random Forest results (accuracy to 4 decimals, then the per-class report)
print("Random Forest Results:")
print(f"Validation Accuracy: {rf_validation_accuracy:.4f}")
print("Classification Report:\n", rf_validation_report)

Random Forest Results:
Validation Accuracy: 0.9634
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       872
           1       0.99      0.85      0.92       274

    accuracy                           0.96      1146
   macro avg       0.97      0.93      0.95      1146
weighted avg       0.96      0.96      0.96      1146



## Predicting the results of test data and choosing the best model

In [53]:
# Predict labels for the test split with each of the three tuned pipelines.
predictions_nb = best_nb_model.predict(test_data['text'])
predictions_lr = best_lr_model.predict(test_data['text'])
predictions_rf = best_rf_model.predict(test_data['text'])

In [54]:
# Create a DataFrame to store the metrics
# One row per model; the metric columns are added in a later cell.

metrics_df = pd.DataFrame(index=['Naive Bayes', 'Logistic Regression', 'Random Forest'])


In [56]:
# Fill one column per metric, scoring each model's test predictions against the
# true test labels. Predictions are listed as Naive Bayes, Logistic Regression,
# Random Forest so the values line up with the rows of metrics_df.

metrics_df['Accuracy'] = [
    accuracy_score(test_data['spam'], preds)
    for preds in (predictions_nb, predictions_lr, predictions_rf)
]

metrics_df['Precision'] = [
    precision_score(test_data['spam'], preds)
    for preds in (predictions_nb, predictions_lr, predictions_rf)
]

metrics_df['Recall'] = [
    recall_score(test_data['spam'], preds)
    for preds in (predictions_nb, predictions_lr, predictions_rf)
]

metrics_df['F1 Score'] = [
    f1_score(test_data['spam'], preds)
    for preds in (predictions_nb, predictions_lr, predictions_rf)
]


In [57]:
# Display the metrics table
# Printed as plain text: one row per model, one column per test-set metric.

print(metrics_df)

                     Accuracy  Precision    Recall  F1 Score
Naive Bayes          0.987784   0.996183  0.952555  0.973881
Logistic Regression  0.993019   0.988971  0.981752  0.985348
Random Forest        0.959860   0.987179  0.843066  0.909449


On the test set, the fine-tuned logistic regression model has the best accuracy (0.993), recall (0.982) and F1 score (0.985) of the three models. Its precision (0.989) is slightly lower than that of Naive Bayes (0.996) but still high. Hence, I am choosing this model as the best one for spam email detection.

## End of Assignment 1