# Importing libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import  accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import  cross_val_score
from itertools import product


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\j25sr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\j25sr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\j25sr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Function to load data

In [26]:
def load_data(file_path):
    """Load the six pre-split CSV files (messages and labels for train/val/test).

    Each file is first looked up at ``file_path + name`` (the original
    behaviour, so a path ending in a separator or a filename prefix keeps
    working). If that file does not exist but ``os.path.join(file_path, name)``
    does, the joined path is used instead, so a plain directory path without a
    trailing separator also works.

    Parameters
    ----------
    file_path : str or os.PathLike
        Directory (with or without a trailing separator) or path prefix that
        the file names are appended to.

    Returns
    -------
    tuple
        ``(msg_train, msg_val, msg_test, label_train, label_val, label_test)``,
        each being ``pd.read_csv(...).squeeze()`` of the corresponding file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a file cannot be found.
    """
    import os  # local import keeps this cell self-contained

    base = os.fspath(file_path)

    def _read(name):
        # Prefer the original concatenated location; only switch to the joined
        # path when the concatenated one is missing and the joined one exists.
        candidate = base + name
        joined = os.path.join(base, name)
        if not os.path.exists(candidate) and os.path.exists(joined):
            candidate = joined
        return pd.read_csv(candidate).squeeze()

    file_names = (
        'msg_train.csv',
        'msg_val.csv',
        'msg_test.csv',
        'label_train.csv',
        'label_val.csv',
        'label_test.csv',
    )
    return tuple(_read(name) for name in file_names)

In [27]:
# NOTE(review): absolute, machine-specific location with no trailing separator.
# load_data() appends the CSV file names to this string, so confirm it resolves
# to the files on disk; a configurable, relative data directory would be more portable.
path = "C:/Users/j25sr/OneDrive/Desktop/AML 1/split_dataset/split_dataset"
# Unpack the six objects returned by load_data(): messages and labels for train/val/test.
msg_train, msg_val, msg_test, label_train, label_val, label_test = load_data(path)

## Fitting Naive Bayes

In [28]:
# Naive Bayes pipeline: TF-IDF features followed by a multinomial Naive Bayes classifier.
# The tokenizer is the identity function, so each document is used directly as
# its own token sequence; lowercase=False leaves token case untouched.
# NOTE(review): the messages come from pd.read_csv in load_data(); if they arrive
# as plain strings, iterating them yields single characters — confirm the
# intended token format.
pipeline_NB = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=lambda x: x, preprocessor=None, lowercase=False)),
    ('classifier', MultinomialNB())  # Classifier
])

In [30]:
def validation_accuracy(msg_train, label_train, msg_val, label_val):
    """Fit the Naive Bayes pipeline on the training split and score it on validation.

    Uses the module-level ``pipeline_NB`` (fitted in place as a side effect).

    Parameters
    ----------
    msg_train, label_train
        Training messages and their labels, passed to ``pipeline_NB.fit``.
    msg_val, label_val
        Validation messages and their labels.

    Returns
    -------
    The value of ``accuracy_score(label_val, predictions)`` on the validation split.
    """
    pipeline_NB.fit(msg_train, label_train)
    predicted_labels = pipeline_NB.predict(msg_val)
    return accuracy_score(label_val, predicted_labels)

# Baseline Naive Bayes validation accuracy with the pipeline's default settings,
# scaled by 100 for display as a percentage.
val_accuracy = validation_accuracy(msg_train, label_train, msg_val, label_val)*100

print(f"Validation Accuracy: {val_accuracy:.2f}%")


Validation Accuracy: 93.63%




Hyperparameter tuning to find the best parameters for Naive Bayes

In [31]:
# Ignore UserWarning messages originating from sklearn modules (presumably to keep
# the per-combination search output readable). This filter stays active for the
# rest of the kernel session.
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

def param_search(msg_train, label_train, msg_val, label_val):
    """Grid-search the Naive Bayes pipeline on the validation split.

    Every combination of the grid below is set on the module-level
    ``pipeline_NB``, which is then refitted on the training data and scored on
    the validation data. One line is printed per combination.

    Note: ``pipeline_NB`` is mutated in place and is left configured and fitted
    with the LAST combination tried, not the best one. A later cell in this
    notebook defines another function with the same name for Logistic
    Regression, which shadows this one once that cell runs.

    Parameters
    ----------
    msg_train, label_train
        Training messages and labels.
    msg_val, label_val
        Validation messages and labels.

    Returns
    -------
    tuple
        ``(best_params, best_accuracy_percent)`` where ``best_params`` is a
        tuple of values ordered as ``(tfidf__use_idf, classifier__alpha,
        classifier__fit_prior)`` (``None`` if no combination scored above 0)
        and the accuracy is multiplied by 100.
    """
    param_grid = {
        'tfidf__use_idf': [True, False],
        'classifier__alpha': [0.1, 0.5, 1.0, 2.0],
        'classifier__fit_prior': [True, False]
    }
    # Key order defines the order of the values in each combination tuple.
    param_names = list(param_grid)

    best_accuracy = 0
    best_params = None

    for params in product(*param_grid.values()):
        # Pair names with values instead of relying on hard-coded indices, so
        # the mapping cannot drift if the grid is edited.
        pipeline_NB.set_params(**dict(zip(param_names, params)))
        pipeline_NB.fit(msg_train, label_train)

        val_predictions = pipeline_NB.predict(msg_val)
        val_accuracy = accuracy_score(label_val, val_predictions)

        print(f"Params: {params} → Validation Accuracy: {val_accuracy:.4f}")

        # Strict '>' keeps the first combination on ties.
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_params = params

    return best_params, best_accuracy*100

# Run the Naive Bayes grid search; best_params is a tuple of values and
# best_accuracy is already scaled to a percentage by param_search().
best_params, best_accuracy = param_search(msg_train, label_train, msg_val, label_val)

print("\nBest Parameters:", best_params)
print(f"Best Validation Accuracy: {best_accuracy:.2f}%")

Params: (True, 0.1, True) → Validation Accuracy: 0.9435
Params: (True, 0.1, False) → Validation Accuracy: 0.9650
Params: (True, 0.5, True) → Validation Accuracy: 0.9399
Params: (True, 0.5, False) → Validation Accuracy: 0.9632
Params: (True, 1.0, True) → Validation Accuracy: 0.9363
Params: (True, 1.0, False) → Validation Accuracy: 0.9632
Params: (True, 2.0, True) → Validation Accuracy: 0.9327
Params: (True, 2.0, False) → Validation Accuracy: 0.9632
Params: (False, 0.1, True) → Validation Accuracy: 0.8834
Params: (False, 0.1, False) → Validation Accuracy: 0.9668
Params: (False, 0.5, True) → Validation Accuracy: 0.8807
Params: (False, 0.5, False) → Validation Accuracy: 0.9677
Params: (False, 1.0, True) → Validation Accuracy: 0.8780
Params: (False, 1.0, False) → Validation Accuracy: 0.9659
Params: (False, 2.0, True) → Validation Accuracy: 0.8780
Params: (False, 2.0, False) → Validation Accuracy: 0.9686

Best Parameters: (False, 2.0, False)
Best Validation Accuracy: 96.86%


In [46]:
# Best Naive Bayes settings written out by hand as keyword parameters for
# Pipeline.set_params. The values mirror the winning combination printed by the
# search above; this rebinds best_params (previously the tuple returned by
# param_search) and must be updated manually if the search result changes.
best_params = {
    'tfidf__use_idf': False,
    'classifier__alpha': 2.0,
    'classifier__fit_prior': False
}

In [47]:
# Apply the chosen settings, refit on the training split and score on the test split.
# Per scikit-learn, set_params returns the estimator it was called on, so
# best_model refers to pipeline_NB itself rather than a copy.
best_model = pipeline_NB.set_params(**best_params)
best_model.fit(msg_train, label_train)
test_predictions = best_model.predict(msg_test)
# Scaled by 100 for display as a percentage.
test_accuracy = accuracy_score(label_test, test_predictions)*100

print(f"Test Accuracy: {test_accuracy:.2f}%")

Test Accuracy: 96.68%


## Fitting SVM

In [34]:
# SVM pipeline: TF-IDF features followed by a support vector classifier with
# its default settings.
# The tokenizer is the identity function, so each document is used directly as
# its own token sequence; lowercase=False leaves token case untouched.
# NOTE(review): if the messages are plain strings, iterating them yields single
# characters — confirm the intended token format.
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=lambda x: x, preprocessor=None, lowercase=False)),
    ('classifier', SVC()),
])


In [35]:
from sklearn.metrics import accuracy_score

def validation_accuracy_svm(msg_train, label_train, msg_val, label_val):
    """Fit the SVM pipeline on the training split and score it on validation.

    Uses the module-level ``pipeline_svm`` (fitted in place as a side effect).

    Parameters
    ----------
    msg_train, label_train
        Training messages and their labels, passed to ``pipeline_svm.fit``.
    msg_val, label_val
        Validation messages and their labels.

    Returns
    -------
    The value of ``accuracy_score(label_val, predictions)`` on the validation split.
    """
    pipeline_svm.fit(msg_train, label_train)
    predicted_labels = pipeline_svm.predict(msg_val)
    return accuracy_score(label_val, predicted_labels)

# Baseline SVM validation accuracy with the pipeline's default settings,
# scaled by 100 for display as a percentage.
val_accuracy_svm = validation_accuracy_svm(msg_train, label_train, msg_val, label_val)*100

print(f"Validation Accuracy (SVM): {val_accuracy_svm:.2f}%")


Validation Accuracy (SVM): 97.22%


Hyperparameter tuning to find the best parameters for SVM

In [None]:

def param_search_svm(msg_train, label_train, msg_val, label_val):
    """Grid-search the SVM pipeline on the validation split.

    Every combination of the grid below is set on the module-level
    ``pipeline_svm``, which is then refitted on the training data and scored on
    the validation data. Nothing is printed per combination.

    Note: ``pipeline_svm`` is mutated in place and is left configured and
    fitted with the LAST combination tried, not the best one.

    Parameters
    ----------
    msg_train, label_train
        Training messages and labels.
    msg_val, label_val
        Validation messages and labels.

    Returns
    -------
    tuple
        ``(best_params, best_accuracy_percent)`` where ``best_params`` is a
        tuple of values ordered as ``(tfidf__use_idf, classifier__C,
        classifier__kernel)`` (``None`` if no combination scored above 0) and
        the accuracy is multiplied by 100.
    """
    param_grid = {
        'tfidf__use_idf': (True, False),
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    }
    # Key order defines the order of the values in each combination tuple.
    param_names = list(param_grid)

    best_accuracy = 0
    best_params = None

    for params in product(*param_grid.values()):
        # Pair names with values instead of relying on hard-coded indices, so
        # the mapping cannot drift if the grid is edited.
        pipeline_svm.set_params(**dict(zip(param_names, params)))
        pipeline_svm.fit(msg_train, label_train)

        val_predictions = pipeline_svm.predict(msg_val)
        val_accuracy = accuracy_score(label_val, val_predictions)

        # Strict '>' keeps the first combination on ties.
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_params = params

    return best_params, best_accuracy*100

# Run the SVM grid search; best_params_svm is a tuple of values and
# best_accuracy_svm is already scaled to a percentage by param_search_svm().
best_params_svm, best_accuracy_svm = param_search_svm(msg_train, label_train, msg_val, label_val)

print("\nBest Parameters (SVM):", best_params_svm)
print(f"Best Validation Accuracy (SVM): {best_accuracy_svm:.2f}%")



Best Parameters (SVM): (True, 10, 'rbf')
Best Validation Accuracy (SVM): 97.9372


In [48]:
# Best SVM settings written out by hand as keyword parameters for
# Pipeline.set_params. The values mirror the winning combination printed by the
# SVM search above; this rebinds the shared best_params name and must be
# updated manually if the search result changes.
best_params = {
    'tfidf__use_idf': True,
    'classifier__C': 10,
    'classifier__kernel': 'rbf'
}

In [49]:
# Apply the chosen settings, refit on the training split and score on the test split.
# Per scikit-learn, set_params returns the estimator it was called on, so
# best_model refers to pipeline_svm itself rather than a copy. The names
# best_model, test_predictions and test_accuracy are reused from the Naive Bayes section.
best_model = pipeline_svm.set_params(**best_params)
best_model.fit(msg_train, label_train)
test_predictions = best_model.predict(msg_test)
# Scaled by 100 for display as a percentage.
test_accuracy = accuracy_score(label_test, test_predictions)*100

print(f"Test Accuracy: {test_accuracy:.2f}%")

Test Accuracy: 97.49%


## Fitting Logistic Regression

In [39]:
# Logistic Regression pipeline: TF-IDF features followed by a liblinear-solved
# logistic regression classifier.
# NOTE(review): the identity tokenizer uses each document directly as its token
# sequence; if the messages are plain strings, iterating them yields single
# characters — confirm the intended token format.
pipeline_logistic = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=lambda x: x, preprocessor=None, lowercase=False)),  # Directly handle tokenized data
    ('classifier', LogisticRegression(solver='liblinear')),
])

In [40]:
def validation_accuracy(msg_train, label_train, msg_val, label_val):
    """Fit the Logistic Regression pipeline on the training split and score it on validation.

    Uses the module-level ``pipeline_logistic`` (fitted in place as a side
    effect). This definition replaces the earlier Naive Bayes function of the
    same name once this cell runs.

    Parameters
    ----------
    msg_train, label_train
        Training messages and their labels, passed to ``pipeline_logistic.fit``.
    msg_val, label_val
        Validation messages and their labels.

    Returns
    -------
    The value of ``accuracy_score(label_val, predictions)`` on the validation split.
    """
    pipeline_logistic.fit(msg_train, label_train)
    predicted_labels = pipeline_logistic.predict(msg_val)
    return accuracy_score(label_val, predicted_labels)

# Baseline Logistic Regression validation accuracy, scaled by 100 for display;
# this rebinds val_accuracy from the Naive Bayes section.
val_accuracy = validation_accuracy(msg_train, label_train, msg_val, label_val)*100
print(f"Validation Accuracy: {val_accuracy:.2f}%")

Validation Accuracy: 95.43%


Hyperparameter tuning to find the best parameters for Logistic Regression

In [41]:
def param_search(msg_train, label_train, msg_val, label_val):
    """Grid-search the Logistic Regression pipeline on the validation split.

    Every combination of the grid below is set on the module-level
    ``pipeline_logistic``, which is then refitted on the training data and
    scored on the validation data. One line is printed per combination; a
    combination that raises while being set, fitted or scored is reported and
    skipped (best-effort search).

    Note: ``pipeline_logistic`` is mutated in place and is left configured with
    the LAST combination tried, not the best one. This definition replaces the
    earlier Naive Bayes function of the same name once this cell runs.

    Parameters
    ----------
    msg_train, label_train
        Training messages and labels.
    msg_val, label_val
        Validation messages and labels.

    Returns
    -------
    tuple
        ``(best_params, best_accuracy_percent)`` where ``best_params`` is a
        tuple of values ordered as ``(tfidf__use_idf, classifier__C,
        classifier__penalty)`` (``None`` if no combination scored above 0) and
        the accuracy is multiplied by 100.
    """
    param_grid = {
        'tfidf__use_idf': (True, False),
        'classifier__C': [0.01, 0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
    }
    # Key order defines the order of the values in each combination tuple.
    param_names = list(param_grid)

    best_accuracy = 0
    best_params = None

    for params in product(*param_grid.values()):
        try:
            # Pair names with values instead of relying on hard-coded indices,
            # so the mapping cannot drift if the grid is edited.
            pipeline_logistic.set_params(**dict(zip(param_names, params)))
            pipeline_logistic.fit(msg_train, label_train)
            val_predictions = pipeline_logistic.predict(msg_val)
            val_accuracy = accuracy_score(label_val, val_predictions)
        except Exception as e:
            # Deliberately broad: an unusable combination should not abort the search.
            print(f"Skipping parameters {params} due to error: {e}")
            continue

        print(f"Params: {params} → Validation Accuracy: {val_accuracy:.4f}")

        # Strict '>' keeps the first combination on ties.
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_params = params

    return best_params, best_accuracy*100

# Run the Logistic Regression grid search; best_params is a tuple of values and
# best_accuracy is already scaled to a percentage by param_search().
best_params, best_accuracy = param_search(msg_train, label_train, msg_val, label_val)

print("\nBest Parameters:", best_params)
print(f"Best Validation Accuracy: {best_accuracy:.2f}%")


Params: (True, 0.01, 'l1') → Validation Accuracy: 0.8744
Params: (True, 0.01, 'l2') → Validation Accuracy: 0.8744
Params: (True, 0.1, 'l1') → Validation Accuracy: 0.9426
Params: (True, 0.1, 'l2') → Validation Accuracy: 0.9094
Params: (True, 1.0, 'l1') → Validation Accuracy: 0.9668
Params: (True, 1.0, 'l2') → Validation Accuracy: 0.9543
Params: (True, 10.0, 'l1') → Validation Accuracy: 0.9794
Params: (True, 10.0, 'l2') → Validation Accuracy: 0.9695
Params: (False, 0.01, 'l1') → Validation Accuracy: 0.8744
Params: (False, 0.01, 'l2') → Validation Accuracy: 0.8744
Params: (False, 0.1, 'l1') → Validation Accuracy: 0.9184
Params: (False, 0.1, 'l2') → Validation Accuracy: 0.8744
Params: (False, 1.0, 'l1') → Validation Accuracy: 0.9578
Params: (False, 1.0, 'l2') → Validation Accuracy: 0.9229
Params: (False, 10.0, 'l1') → Validation Accuracy: 0.9722
Params: (False, 10.0, 'l2') → Validation Accuracy: 0.9543

Best Parameters: (True, 10.0, 'l1')
Best Validation Accuracy: 97.94%


In [44]:
# Best Logistic Regression settings written out by hand as keyword parameters
# for Pipeline.set_params. The values mirror the winning combination printed by
# the search above; this rebinds best_params (previously the tuple returned by
# param_search) and must be updated manually if the search result changes.
best_params = {
    'tfidf__use_idf': True,
    'classifier__C': 10.0,
    'classifier__penalty': 'l1'
}

In [45]:
# Apply the chosen settings, refit on the training split and score on the test split.
# Per scikit-learn, set_params returns the estimator it was called on, so
# best_model refers to pipeline_logistic itself rather than a copy. The names
# best_model, test_predictions and test_accuracy are reused from the earlier sections.
best_model = pipeline_logistic.set_params(**best_params)
best_model.fit(msg_train, label_train)
test_predictions = best_model.predict(msg_test)
# Scaled by 100 for display as a percentage.
test_accuracy = accuracy_score(label_test, test_predictions)*100

print(f"Test Accuracy: {test_accuracy:.2f}%")

Test Accuracy: 97.76%


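Based on the validation set, Logistic Regression achieves the best accuracy (97.94%), essentially tied with the SVM (97.94%) and ahead of Naive Bayes (96.86%). So, we can use Logistic Regression as our final model.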
