In [18]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import os
from sklearn.model_selection import GridSearchCV

In [13]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources needed by the text preprocessing further down:
# tokenizer models, the English stopword list, and the WordNet data used by
# the lemmatizer. Each call is a no-op when the package is already up to date.
# NOTE(review): recent NLTK releases look up 'punkt_tab' for word_tokenize —
# confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rickc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rickc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rickc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Location of the raw CSV. Defaults to the author's local checkout; set the
# RAW_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
data_path = os.environ.get(
    'RAW_DATA_PATH',
    r'C:\CMI\Applied ML\applied-ML\ASS_2\data\raw_data.csv',
)

In [3]:
# Load the raw dataset; later cells read its 'text' and 'spam' columns.
df = pd.read_csv(data_path)

In [12]:
def preprocess_data(data):
    """Return a copy of ``data`` whose ``text`` column has been cleaned.

    Cleaning steps, applied in this order to every entry of ``data['text']``:

    1. lowercase;
    2. strip HTML tags, ``http(s)://`` URLs and tokens containing ``@``;
    3. drop every character that is not ``a-z``, ``0-9`` or whitespace;
    4. tokenize with ``word_tokenize``;
    5. remove English stopwords;
    6. lemmatize each remaining token with ``WordNetLemmatizer``;
    7. join the tokens back into a single space-separated string.

    The input frame is left untouched so that re-running a cell always starts
    from the raw data. Missing ``text`` values are treated as empty strings
    instead of raising ``AttributeError`` on ``.lower()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and the cleaned ``text`` column.
    """
    # Build the expensive helpers once rather than once per row.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        # Lowercasing
        text = text.lower()
        # Removing HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Removing URLs
        text = re.sub(r'http[s]?://\S+', '', text)
        # Removing email addresses
        text = re.sub(r'\S*@\S*\s?', '', text)
        # Removing punctuation and special characters
        text = re.sub(r'[^a-z0-9\s]', '', text)
        # Tokenization, stopword removal (on the raw token) and lemmatization
        tokens = [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(text)
            if token not in stop_words
        ]
        # Joining tokens back to string
        return ' '.join(tokens)

    # Work on a copy so the caller's frame keeps its raw text.
    cleaned = data.copy()
    cleaned['text'] = cleaned['text'].fillna('').astype(str).apply(_clean)
    return cleaned

In [16]:
# Run the text-cleaning function on the loaded frame; the result is written
# to CSV and used for model training in the cells below.
pr_df = preprocess_data(df)

In [15]:
def store_pre_processed_data(train_data, dir_path=r'C:\CMI\Applied ML\applied-ML\ASS_3\data'):
    """Write ``train_data`` to ``train.csv`` inside ``dir_path``.

    The frame is saved without its index. ``dir_path`` (including any missing
    parent directories) is created first, so the call does not fail on a
    fresh checkout where the output folder does not exist yet.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to persist.
    dir_path : str
        Target directory for ``train.csv``.
    """
    os.makedirs(dir_path, exist_ok=True)
    train_data.to_csv(os.path.join(dir_path, 'train.csv'), index=False)

In [17]:
# Persist the cleaned frame as train.csv in the function's default directory.
store_pre_processed_data(pr_df)

In [23]:
# Features are the cleaned text; the target is the 'spam' column.
# NOTE(review): the whole frame is used for fitting — no hold-out split is
# made in the cells shown here.
X_train, y_train = pr_df['text'], pr_df['spam']

In [21]:
def fit_model(train_data, y_train, model_name='logistic_regression'):
    """Grid-search a TF-IDF + classifier pipeline and return the best one.

    Parameters
    ----------
    train_data : iterable of str
        Documents passed to the ``TfidfVectorizer`` step.
    y_train : array-like
        Labels aligned with ``train_data``.
    model_name : str
        One of ``'logistic_regression'``, ``'naive_bayes'`` or ``'lightgbm'``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of a ``GridSearchCV`` run with ``cv=5``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # model name -> (classifier factory, classifier-specific search grid).
    # Factories are lazy so only the requested estimator is ever constructed.
    model_specs = {
        'logistic_regression': (
            lambda: LogisticRegression(random_state=42),
            {'clf__C': [0.1, 1, 10]},
        ),
        'naive_bayes': (
            lambda: MultinomialNB(),
            {'clf__alpha': [0.1, 1, 10]},
        ),
        'lightgbm': (
            lambda: lgb.LGBMClassifier(random_state=42, force_row_wise=True),
            {
                'clf__num_leaves': [31, 50],
                'clf__learning_rate': [0.1, 0.01],
                'clf__n_estimators': [100, 200],
            },
        ),
    }

    if model_name not in model_specs:
        raise ValueError("Model name not recognized. Choose 'logistic_regression', 'naive_bayes', or 'lightgbm'")

    make_classifier, classifier_grid = model_specs[model_name]

    # Every model shares the same vectorizer step and n-gram search space.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', make_classifier()),
    ])
    parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)]}
    parameters.update(classifier_grid)

    # Hyperparameter tuning
    search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
    search.fit(train_data, y_train)
    return search.best_estimator_


In [22]:
# Model Selection
model_names = ['logistic_regression', 'naive_bayes', 'lightgbm']

In [25]:
# Tune every candidate model and pickle the best pipeline returned by fit_model.
# The output directory does not depend on the model, so it is resolved and
# created once up front; exist_ok=True replaces the check-then-create pattern.
models_dir = r"C:\CMI\Applied ML\applied-ML\ASS_3\best_models"
os.makedirs(models_dir, exist_ok=True)

for model_name in tqdm(model_names, desc="Training Models"):
    print(f"\nTraining with {model_name}:")
    model = fit_model(X_train, y_train, model_name)

    model_path = os.path.join(models_dir, f"{model_name}_best_model.pkl")
    with open(model_path, 'wb') as file:
        pickle.dump(model, file)

    # fit_model returns the best pipeline itself, not the GridSearchCV object,
    # so there is no best_params_ attribute available to report here.
    print(f"Model saved to {model_path}")

Training Models:   0%|          | 0/3 [00:00<?, ?it/s]


Training with logistic_regression:
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Training Models:  33%|███▎      | 1/3 [00:23<00:47, 23.79s/it]

Model saved to C:\CMI\Applied ML\applied-ML\ASS_3\best_models\logistic_regression_best_model.pkl

Training with naive_bayes:
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Training Models:  67%|██████▋   | 2/3 [00:35<00:16, 16.47s/it]

Model saved to C:\CMI\Applied ML\applied-ML\ASS_3\best_models\naive_bayes_best_model.pkl

Training with lightgbm:
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[LightGBM] [Info] Number of positive: 1368, number of negative: 4360
[LightGBM] [Info] Total Bins 126660
[LightGBM] [Info] Number of data points in the train set: 5728, number of used features: 3556
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.238827 -> initscore=-1.159122
[LightGBM] [Info] Start training from score -1.159122


Training Models: 100%|██████████| 3/3 [06:44<00:00, 134.82s/it]

Model saved to C:\CMI\Applied ML\applied-ML\ASS_3\best_models\lightgbm_best_model.pkl



