In [3]:
from pathlib import Path

import lightgbm as lgb
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
from xgboost import XGBClassifier

In [24]:
from sklearn.model_selection import GridSearchCV

In [4]:
def load_split_data(file_path):
    """Read one dataset split from a CSV file.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Location of the CSV file; handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    split_frame = pd.read_csv(file_path)
    return split_frame

In [52]:
def fit_model(train_data, y_train, model_name='logistic_regression'):
    """Fit a TF-IDF + classifier pipeline and return it.

    Parameters
    ----------
    train_data
        Training documents, passed unchanged to the pipeline's ``fit``.
    y_train
        Training labels, passed unchanged to the pipeline's ``fit``.
    model_name : str, default 'logistic_regression'
        Which classifier to put behind the TF-IDF step:
        ``'logistic_regression'``, ``'naive_bayes'`` or ``'lightgbm'``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, with steps named ``'tfidf'`` and ``'clf'``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the three supported names.
    """
    # Each classifier sits behind a factory so that only the requested one is
    # ever instantiated.
    classifier_factories = (
        ('logistic_regression', lambda: LogisticRegression(random_state=42)),
        ('naive_bayes', lambda: MultinomialNB()),
        ('lightgbm', lambda: lgb.LGBMClassifier(random_state=42,force_row_wise=True)),
    )
    make_classifier = next(
        (factory for known_name, factory in classifier_factories if model_name == known_name),
        None,
    )
    if make_classifier is None:
        raise ValueError("Model name not recognized. Choose 'logistic_regression', 'naive_bayes', or 'lightgbm'")

    # All models share the same English-stop-word TF-IDF front end.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', make_classifier()),
    ])
    pipeline.fit(train_data, y_train)
    return pipeline

In [39]:
def score_model(model, data, y_true):
    """Return the accuracy of ``model``'s predictions on ``data``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    data
        Inputs to predict on.
    y_true
        Ground-truth labels to compare the predictions against.

    Returns
    -------
    The value of ``accuracy_score(y_true, model.predict(data))``.
    """
    return accuracy_score(y_true, model.predict(data))

In [7]:
def evaluate_model(y_true, y_pred):
    """Print the classification report for the given labels and predictions.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, aligned with ``y_true``.
    """
    report_text = classification_report(y_true, y_pred)
    print(report_text)

In [8]:
def validate_model(model, train_data, y_train, validation_data, y_val):
    """Print the accuracy of a fitted model on the train and validation splits.

    Parameters
    ----------
    model
        Fitted estimator, scored through ``score_model``.
    train_data, y_train
        Training inputs and labels.
    validation_data, y_val
        Validation inputs and labels.
    """
    train_accuracy = score_model(model, train_data, y_train)
    print("Train score:", train_accuracy)

    validation_accuracy = score_model(model, validation_data, y_val)
    print("Validation score:", validation_accuracy)

In [50]:
# Model selection: the candidate names accepted by fit_model, in training order
model_names = [
    'logistic_regression',
    'naive_bayes',
    'lightgbm',
]

In [12]:
# Load data
# All three splits live in one directory. Point DATA_DIR at your local copy of
# the dataset instead of editing every path below.
DATA_DIR = Path(r'C:\CMI\Applied ML\ASS_1\data\dataset')

train_data = load_split_data(DATA_DIR / 'train.csv')
validation_data = load_split_data(DATA_DIR / 'validation.csv')
test_data = load_split_data(DATA_DIR / 'test.csv')


In [40]:
# For every split, the model input is the `text` column and the target is the
# `spam` column.
X_train, y_train = train_data['text'], train_data['spam']
X_val, y_val = validation_data['text'], validation_data['spam']
X_test, y_test = test_data['text'], test_data['spam']

## Training and evaluation

In [53]:
# Training and evaluation: fit each candidate with its default settings, then
# report accuracy and a full classification report on train and validation.
for model_name in tqdm(model_names, desc="Training Models"):
    print(f"\nTraining with {model_name}:")
    model = fit_model(X_train, y_train, model_name)

    # Accuracy on both splits
    validate_model(model, X_train, y_train, X_val, y_val)

    # Per-class precision / recall / F1 on both splits
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    for heading, truth, predicted in (
        ("For training.\n", y_train, y_pred_train),
        ("For validation.\n", y_val, y_pred_val),
    ):
        print(heading)
        evaluate_model(truth, predicted)

Training Models:   0%|          | 0/3 [00:00<?, ?it/s]


Training with logistic_regression:
Train score: 0.9949803579223047
Validation score: 0.9825378346915018


Training Models:  33%|███▎      | 1/3 [00:01<00:03,  1.69s/it]

For training.

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3504
           1       1.00      0.98      0.99      1078

    accuracy                           0.99      4582
   macro avg       1.00      0.99      0.99      4582
weighted avg       0.99      0.99      0.99      4582

For validation.

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       645
           1       1.00      0.93      0.96       214

    accuracy                           0.98       859
   macro avg       0.99      0.97      0.98       859
weighted avg       0.98      0.98      0.98       859


Training with naive_bayes:
Train score: 0.9423832387603667
Validation score: 0.889406286379511


Training Models:  67%|██████▋   | 2/3 [00:03<00:01,  1.50s/it]

For training.

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3504
           1       1.00      0.76      0.86      1078

    accuracy                           0.94      4582
   macro avg       0.96      0.88      0.91      4582
weighted avg       0.95      0.94      0.94      4582

For validation.

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       645
           1       1.00      0.56      0.71       214

    accuracy                           0.89       859
   macro avg       0.94      0.78      0.82       859
weighted avg       0.90      0.89      0.88       859


Training with lightgbm:
[LightGBM] [Info] Number of positive: 1078, number of negative: 3504
[LightGBM] [Info] Total Bins 101384
[LightGBM] [Info] Number of data points in the train set: 4582, number of used features: 3092
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.235268 -> initscore=-1.178798
[LightGBM]

Training Models: 100%|██████████| 3/3 [00:06<00:00,  2.12s/it]

For training.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3504
           1       1.00      1.00      1.00      1078

    accuracy                           1.00      4582
   macro avg       1.00      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582

For validation.

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       645
           1       0.96      0.98      0.97       214

    accuracy                           0.98       859
   macro avg       0.98      0.98      0.98       859
weighted avg       0.98      0.98      0.98       859






### Without grid search CV, LightGBM scores best on the validation set (highest spam-class recall and F1).

## Now using grid search CV to find the best hyper-parameters for fine-tuning

In [27]:

# Combine the training and validation sets for final fine-tuning.
# Each split was read from its own CSV and so carries its own 0..n-1 row labels;
# ignore_index=True gives the combined series a fresh, unique index instead of
# repeating those labels.
X_train_val = pd.concat([X_train, X_val], ignore_index=True)
y_train_val = pd.concat([y_train, y_val], ignore_index=True)

In [61]:
# Define the base models: every pipeline shares the same TF-IDF step and
# differs only in the classifier behind it.
_base_classifiers = {
    'logistic_regression': LogisticRegression(random_state=42),
    'naive_bayes': MultinomialNB(),
    'lightgbm': lgb.LGBMClassifier(random_state=42, force_row_wise=True),
}
models_gcv = {
    name: Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', classifier),
    ])
    for name, classifier in _base_classifiers.items()
}


def _with_tfidf_grid(classifier_grid):
    """Return the shared TF-IDF search space followed by ``classifier_grid``."""
    return {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        **classifier_grid,
    }


# Define the parameter grids for each model (keys use the pipeline's
# `<step>__<parameter>` naming).
param_grids = {
    'logistic_regression': _with_tfidf_grid({'clf__C': [0.1, 1, 10]}),
    'naive_bayes': _with_tfidf_grid({'clf__alpha': [0.01, 0.1, 1]}),
    'lightgbm': _with_tfidf_grid({
        'clf__learning_rate': [0.01, 0.1, 0.5],
        'clf__num_leaves': [15, 31, 63],
        'clf__max_depth': [6, 8],
    }),
}

In [62]:
# Tune every base pipeline with a 3-fold grid search on train + validation and
# keep the best estimator found for each model name.
best_models = {}
for model_name, base_pipeline in tqdm(models_gcv.items(), desc="Grid Searching Models"):
    print(f"Starting grid search for {model_name}...")
    grid_search = GridSearchCV(
        estimator=base_pipeline,
        param_grid=param_grids[model_name],
        cv=3,
        n_jobs=-1,  # use all available cores
        verbose=1,
    )
    grid_search.fit(X_train_val, y_train_val)

    print(f"Best parameters for {model_name}:", grid_search.best_params_, sep="\n")

    # Store the best model
    best_models[model_name] = grid_search.best_estimator_

Grid Searching Models:   0%|          | 0/3 [00:00<?, ?it/s]

Starting grid search for logistic_regression...
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Grid Searching Models:  33%|███▎      | 1/3 [00:37<01:15, 37.73s/it]

Best parameters for logistic_regression:
{'clf__C': 10, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}
Starting grid search for naive_bayes...
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Grid Searching Models:  67%|██████▋   | 2/3 [00:47<00:21, 21.01s/it]

Best parameters for naive_bayes:
{'clf__alpha': 0.01, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}
Starting grid search for lightgbm...
Fitting 3 folds for each of 72 candidates, totalling 216 fits
[LightGBM] [Info] Number of positive: 1292, number of negative: 4149
[LightGBM] [Info] Total Bins 120518
[LightGBM] [Info] Number of data points in the train set: 5441, number of used features: 3458
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.237456 -> initscore=-1.166676
[LightGBM] [Info] Start training from score -1.166676


Grid Searching Models: 100%|██████████| 3/3 [05:28<00:00, 109.67s/it]

Best parameters for lightgbm:
{'clf__learning_rate': 0.5, 'clf__max_depth': 8, 'clf__num_leaves': 15, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}





In [63]:
# Display the best estimator that the grid search stored for each model name
best_models

{'logistic_regression': Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5, stop_words='english')),
                 ('clf', LogisticRegression(C=10, random_state=42))]),
 'naive_bayes': Pipeline(steps=[('tfidf',
                  TfidfVectorizer(max_df=0.5, ngram_range=(1, 2),
                                  stop_words='english')),
                 ('clf', MultinomialNB(alpha=0.01))]),
 'lightgbm': Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5, stop_words='english')),
                 ('clf',
                  LGBMClassifier(force_row_wise=True, learning_rate=0.5,
                                 max_depth=8, num_leaves=15, random_state=42))])}

In [64]:
# Test-set accuracy of each tuned pipeline, keyed by model name
test_accuracies = {}

for model_name, model in tqdm(best_models.items()):
    # score_model predicts on the test inputs and returns the accuracy
    accuracy = score_model(model, X_test, y_test)
    test_accuracies[model_name] = accuracy

    print(f"{model_name} Test Accuracy: {accuracy:.8f}")


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00, 25.35it/s]

logistic_regression Test Accuracy: 0.98954704
naive_bayes Test Accuracy: 0.98257840
lightgbm Test Accuracy: 0.98606272





### Logistic regression gives the best test accuracy after grid search CV.

-- This ranking may change depending on the random state and the data split; the test accuracies of the three models differ by less than one percentage point.

In [65]:
# Determine the model with the best test accuracy (on a tie, the first model in
# insertion order wins).
# NOTE(review): this picks the winner on the test set, so the test metrics
# reported for the chosen model are optimistic; consider selecting on
# cross-validation scores instead.
best_model_name = max(test_accuracies.items(), key=lambda item: item[1])[0]

## Classification report on Test Data for best model

In [66]:
# Look up the tuned pipeline for the model name selected in the previous step
best_model = best_models[best_model_name]

# (features, labels) for each split to report on; the 'Train' entry is the
# combined train + validation data used for the grid search.
datasets = dict(
    Train=(X_train_val, y_train_val),
    Test=(X_test, y_test),
)

for phase, (features, labels) in tqdm(datasets.items(), desc="Evaluating Best Model"):
    report = classification_report(labels, best_model.predict(features))
    print(f"{phase} Classification Report for {best_model_name}:\n{report}\n")


Evaluating Best Model: 100%|██████████| 2/2 [00:00<00:00,  3.77it/s]

Train Classification Report for logistic_regression:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4149
           1       1.00      1.00      1.00      1292

    accuracy                           1.00      5441
   macro avg       1.00      1.00      1.00      5441
weighted avg       1.00      1.00      1.00      5441


Test Classification Report for logistic_regression:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       211
           1       0.99      0.97      0.98        76

    accuracy                           0.99       287
   macro avg       0.99      0.98      0.99       287
weighted avg       0.99      0.99      0.99       287







-------------------------------------------------------------------------------------------------------------------------------