In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
def fit_model(X_train, y_train, model_name):
    """Fit a TF-IDF + classifier pipeline chosen by ``model_name``.

    Parameters
    ----------
    X_train
        Training inputs, passed unchanged to the pipeline's ``fit``.
    y_train
        Training targets, passed unchanged to the pipeline's ``fit``.
    model_name : str
        One of ``'logistic_regression'``, ``'naive_bayes'`` or
        ``'random_forest'``.

    Returns
    -------
    Pipeline
        The fitted pipeline with steps ``'tfidf'`` and ``'classifier'``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Only the final estimator differs between the supported models, so pick
    # it first and assemble the (otherwise identical) pipeline once.
    if model_name == 'logistic_regression':
        classifier = LogisticRegression(random_state=42)
    elif model_name == 'naive_bayes':
        classifier = MultinomialNB()
    elif model_name == 'random_forest':
        classifier = RandomForestClassifier(random_state=42)
    else:
        raise ValueError("Model name not found")

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', classifier),
    ])
    pipeline.fit(X_train, y_train)
    return pipeline

In [3]:
def score_model(model, data, y_true):
    """Return the accuracy of ``model`` on ``data``.

    Predictions are obtained with ``model.predict(data)`` and compared with
    ``y_true`` via ``accuracy_score``.
    """
    predictions = model.predict(data)
    accuracy = accuracy_score(y_true, predictions)
    return accuracy

In [4]:
def evaluate_model(y_true, y_pred):
    """Print the ``classification_report`` for the given true/predicted labels.

    Nothing is returned; the report is written to stdout.
    """
    report = classification_report(y_true, y_pred)
    print(report)

In [5]:
def validate_model(model, train_data, y_train, val_data, y_val):
    """Print the accuracy of ``model`` on the training and validation data.

    Both figures come from ``score_model``; nothing is returned.
    """
    train_accuracy = score_model(model, train_data, y_train)
    print("Train score:", train_accuracy)

    val_accuracy = score_model(model, val_data, y_val)
    print("Validation score:", val_accuracy)

In [6]:
# Load the three pre-split CSV files (train / validation / test).
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/dataset1/train (1).csv')
val_df = pd.read_csv(filepath_or_buffer='/kaggle/input/dataset1/validation (1).csv')
test_df = pd.read_csv(filepath_or_buffer='/kaggle/input/dataset1/test (1).csv')

In [7]:
# Training split: `processed_text` is the model input, `spam` is the target.
X_train, y_train = train_df['processed_text'], train_df['spam']

In [8]:
# Validation split: `processed_text` is the model input, `spam` is the target.
X_val, y_val = val_df['processed_text'], val_df['spam']

In [9]:
# Test split: `processed_text` is the model input, `spam` is the target.
X_test, y_test = test_df['processed_text'], test_df['spam']

Model 1: Logistic Regression

In [10]:
# Baseline logistic-regression pipeline, fitted on the training split.
model1 = fit_model(X_train=X_train, y_train=y_train, model_name='logistic_regression')
# Accuracy on the training and validation splits.
validate_model(
    model=model1,
    train_data=X_train,
    y_train=y_train,
    val_data=X_val,
    y_val=y_val,
)



Train score: 0.9956350938454823
Validation score: 0.9748908296943232


In [11]:
# Classification report of the baseline logistic regression on the training split.
y_pred_train = model1.predict(X_train)
print("Training:\n")
evaluate_model(y_true=y_train, y_pred=y_pred_train)


Training:

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3504
           1       1.00      0.98      0.99      1078

    accuracy                           1.00      4582
   macro avg       1.00      0.99      0.99      4582
weighted avg       1.00      1.00      1.00      4582



In [12]:
# Classification report of the baseline logistic regression on the validation split.
y_pred_val = model1.predict(X_val)
print("Validation:\n")
evaluate_model(y_true=y_val, y_pred=y_pred_val)

Validation:

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       690
           1       0.99      0.91      0.95       226

    accuracy                           0.97       916
   macro avg       0.98      0.95      0.97       916
weighted avg       0.98      0.97      0.97       916



In [13]:
def fine_tune_model1(model, X_train, y_val, labels=None):
    """Tune the logistic-regression pipeline with a 3-fold grid search.

    Parameters
    ----------
    model
        Pipeline whose final step is named ``classifier``; the grid sets that
        step's ``C``, ``solver`` and ``penalty``.
    X_train
        Training inputs handed to ``GridSearchCV.fit``.
    y_val
        Ignored. Kept only so existing positional calls keep working; the
        search is never fitted on this argument.
    labels : optional
        Training targets aligned with ``X_train``. When omitted, the
        notebook-level ``y_train`` is used, which is what this function
        previously read implicitly.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``labels`` and ``X_train`` do not have the same length.
    """
    if labels is None:
        # Backward-compatible default: earlier versions silently fitted on the
        # global `y_train` regardless of what was passed as `y_val`.
        labels = y_train

    # Fail early with a clear message instead of deep inside the grid search.
    if len(labels) != len(X_train):
        raise ValueError(
            "labels and X_train must have the same length "
            f"(got {len(labels)} and {len(X_train)})"
        )

    param_grid = {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs'],
        'classifier__penalty': ['l2']
    }
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train, labels)
    finetuned_model = grid_search.best_estimator_

    return finetuned_model

In [14]:
# NOTE: `y_val` is accepted by fine_tune_model1 but is not used for fitting.
finetuned_model1 = fine_tune_model1(model=model1, X_train=X_train, y_val=y_val)

In [16]:
# Validation accuracy of the tuned logistic regression.
score_model(model=finetuned_model1, data=X_val, y_true=y_val)

0.990174672489083

Model 2: Naive Bayes

In [15]:
# Baseline naive-Bayes pipeline, fitted on the training split.
model2 = fit_model(X_train=X_train, y_train=y_train, model_name='naive_bayes')
# Accuracy on the training and validation splits.
validate_model(
    model=model2,
    train_data=X_train,
    y_train=y_train,
    val_data=X_val,
    y_val=y_val,
)


Train score: 0.9408555216062855
Validation score: 0.888646288209607


In [16]:
# Classification report of the baseline naive Bayes on the training split.
y_pred_train = model2.predict(X_train)
print("Training:\n")
evaluate_model(y_true=y_train, y_pred=y_pred_train)

Training:

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3504
           1       1.00      0.75      0.86      1078

    accuracy                           0.94      4582
   macro avg       0.96      0.87      0.91      4582
weighted avg       0.94      0.94      0.94      4582



In [19]:
# Classification report of the baseline naive Bayes on the validation split.
y_pred_val = model2.predict(X_val)
print("Validation:\n")
evaluate_model(y_true=y_val, y_pred=y_pred_val)

Validation:

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       690
           1       1.00      0.55      0.71       226

    accuracy                           0.89       916
   macro avg       0.94      0.77      0.82       916
weighted avg       0.90      0.89      0.88       916



In [17]:
def fine_tune_model2(model, X_train, y_val, labels=None):
    """Tune the naive-Bayes pipeline with a 3-fold grid search.

    Parameters
    ----------
    model
        Pipeline whose final step is named ``classifier``; the grid sets that
        step's ``alpha``.
    X_train
        Training inputs handed to ``GridSearchCV.fit``.
    y_val
        Ignored. Kept only so existing positional calls keep working; the
        search is never fitted on this argument.
    labels : optional
        Training targets aligned with ``X_train``. When omitted, the
        notebook-level ``y_train`` is used, which is what this function
        previously read implicitly.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``labels`` and ``X_train`` do not have the same length.
    """
    if labels is None:
        # Backward-compatible default: earlier versions silently fitted on the
        # global `y_train` regardless of what was passed as `y_val`.
        labels = y_train

    # Fail early with a clear message instead of deep inside the grid search.
    if len(labels) != len(X_train):
        raise ValueError(
            "labels and X_train must have the same length "
            f"(got {len(labels)} and {len(X_train)})"
        )

    param_grid = {
        'classifier__alpha': [0.01, 0.1, 1]
    }
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train, labels)
    finetuned_model = grid_search.best_estimator_

    return finetuned_model

In [18]:
# NOTE: `y_val` is accepted by fine_tune_model2 but is not used for fitting.
finetuned_model2 = fine_tune_model2(model=model2, X_train=X_train, y_val=y_val)

In [19]:
# Validation accuracy of the tuned naive Bayes.
score_model(model=finetuned_model2, data=X_val, y_true=y_val)

0.9836244541484717

Model 3: Random forest

In [23]:
# Baseline random-forest pipeline, fitted on the training split.
model3 = fit_model(X_train=X_train, y_train=y_train, model_name='random_forest')
# Accuracy on the training and validation splits.
validate_model(
    model=model3,
    train_data=X_train,
    y_train=y_train,
    val_data=X_val,
    y_val=y_val,
)


Train score: 1.0
Validation score: 0.972707423580786


In [24]:
# Classification report of the baseline random forest on the training split.
y_pred_train = model3.predict(X_train)
print("Training:\n")
evaluate_model(y_true=y_train, y_pred=y_pred_train)

Training:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3504
           1       1.00      1.00      1.00      1078

    accuracy                           1.00      4582
   macro avg       1.00      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582



In [25]:
# Classification report of the baseline random forest on the validation split.
y_pred_val = model3.predict(X_val)
print("Validation:\n")
evaluate_model(y_true=y_val, y_pred=y_pred_val)

Validation:

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       690
           1       1.00      0.89      0.94       226

    accuracy                           0.97       916
   macro avg       0.98      0.94      0.96       916
weighted avg       0.97      0.97      0.97       916



In [26]:
def fine_tune_model3(model, X_train, y_val, labels=None):
    """Tune the random-forest pipeline with a 3-fold grid search.

    Parameters
    ----------
    model
        Pipeline whose final step is named ``classifier``; the grid sets that
        step's ``n_estimators`` and ``max_depth``.
    X_train
        Training inputs handed to ``GridSearchCV.fit``.
    y_val
        Ignored. Kept only so existing positional calls keep working; the
        search is never fitted on this argument.
    labels : optional
        Training targets aligned with ``X_train``. When omitted, the
        notebook-level ``y_train`` is used, which is what this function
        previously read implicitly.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``labels`` and ``X_train`` do not have the same length.
    """
    if labels is None:
        # Backward-compatible default: earlier versions silently fitted on the
        # global `y_train` regardless of what was passed as `y_val`.
        labels = y_train

    # Fail early with a clear message instead of deep inside the grid search.
    if len(labels) != len(X_train):
        raise ValueError(
            "labels and X_train must have the same length "
            f"(got {len(labels)} and {len(X_train)})"
        )

    param_grid = {'classifier__n_estimators': [50, 100, 150],
                  'classifier__max_depth': [None,10, 20, 30]}
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train, labels)
    finetuned_model = grid_search.best_estimator_

    return finetuned_model

In [27]:
# NOTE: `y_val` is accepted by fine_tune_model3 but is not used for fitting.
finetuned_model3 = fine_tune_model3(model=model3, X_train=X_train, y_val=y_val)

In [28]:
# Validation accuracy of the tuned random forest.
score_model(model=finetuned_model3, data=X_val, y_true=y_val)

0.972707423580786

Evaluation on test data

In [29]:
# Test-set accuracy of the three baseline (un-tuned) pipelines.
score_logistic = score_model(model=model1, data=X_test, y_true=y_test)
print("Logistic regression classifier score on test data:", score_logistic)

score_naiveb = score_model(model=model2, data=X_test, y_true=y_test)
print("Naive Bayes classifier score on test data:", score_naiveb)

score_rf = score_model(model=model3, data=X_test, y_true=y_test)
print("Random forest classifier score on test data:", score_rf)


Logistic regression classifier score on test data: 0.9739130434782609
Naive Bayes classifier score on test data: 0.8782608695652174
Random forest classifier score on test data: 0.9695652173913043


In [30]:
# Test-set accuracy of the three grid-searched pipelines.
score_logistic2 = score_model(model=finetuned_model1, data=X_test, y_true=y_test)
print("Finetuned Logistic regression classifier score on test data:", score_logistic2)

score_naiveb2 = score_model(model=finetuned_model2, data=X_test, y_true=y_test)
print("Finetuned Naive Bayes classifier score on test data:", score_naiveb2)

score_rf2 = score_model(model=finetuned_model3, data=X_test, y_true=y_test)
print("Finetuned Random forest classifier score on test data:", score_rf2)

Finetuned Logistic regression classifier score on test data: 0.9956521739130435
Finetuned Naive Bayes classifier score on test data: 0.9739130434782609
Finetuned Random forest classifier score on test data: 0.9695652173913043


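After evaluating the three models on the test data, the fine-tuned logistic regression model gives the best results (test accuracy ≈ 0.996, versus ≈ 0.974 for fine-tuned Naive Bayes and ≈ 0.970 for fine-tuned random forest).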

In [20]:
import os
import pickle

# Persist the selected model (the tuned logistic-regression pipeline).
model_path="/kaggle/working/models/finetunedlogistic.pkl"
# No cell shown here creates the `models/` directory, and open(..., 'wb') does
# not create missing parent directories, so make sure it exists first.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as file:
    pickle.dump(finetuned_model1, file)