In [2]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from lib_ml.preprocessing import preprocess_dataset
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

## Transforming the data
- We read the `.tsv` file and split the data into X and y.
- Then we clean the data using the lib_ml library.
- Then we transform the cleaned data into a bag-of-words matrix and save the fitted CountVectorizer so it can be uploaded later.

In [3]:
# Load the raw reviews. quoting=3 is csv.QUOTE_NONE, so quote characters in the
# text are treated as ordinary characters instead of field delimiters.
dataset = pd.read_csv('../data/a1_RestaurantReviews_HistoricDump.tsv', delimiter = '\t', quoting = 3)

# Clean the reviews with the shared lib_ml preprocessing helper.
corpus, labels = preprocess_dataset(dataset)

# Fit the bag-of-words vocabulary (capped at 1420 features) and materialize a
# dense array for the classifiers trained below.
cv = CountVectorizer(max_features = 1420)
data = cv.fit_transform(corpus).toarray()

# Persist the fitted vectorizer. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
bow_path = '../../models/c1_BoW_Sentiment_Model.pkl'
with open(bow_path, "wb") as bow_file:
    pickle.dump(cv, bow_file)

## Training the model
- Using the labels and data, the model is trained with a grid search over the given parameter grid, using stratified cross-validation with the specified number of folds.
- The best score and the best model are returned.

In [4]:
def create_pipeline_and_train(data, labels, classifier, param_grid, cv_folds):
    """Tune ``classifier`` with an exhaustive grid search and report the winner.

    The classifier is wrapped in a single-step pipeline whose step is named
    ``'classifier'``, so keys in ``param_grid`` must use the
    ``classifier__<param>`` form.

    Args:
        data: Feature matrix passed to ``GridSearchCV.fit``.
        labels: Target labels passed to ``GridSearchCV.fit``.
        classifier: Unfitted estimator to tune.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.
        cv_folds: Number of splits for the shuffled ``StratifiedKFold``.

    Returns:
        A ``(best_score, best_estimator)`` tuple taken from the fitted search
        (``best_score_`` and ``best_estimator_``).
    """
    scoring = 'accuracy'

    # Fixed seed keeps the fold assignment identical across runs.
    folds = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    model = Pipeline([('classifier', classifier)])

    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=folds,
        scoring=scoring,
        n_jobs=1,
        return_train_score=False,
        verbose=1,
    )

    print(f"Starting GridSearchCV with {cv_folds} folds...")
    print(f"Parameter Grid: {param_grid}")

    search.fit(data, labels)

    winning_params = search.best_params_
    winning_score = search.best_score_
    winning_model = search.best_estimator_

    print("\n--- GridSearchCV Complete ---")
    print(f"Best Parameters Found: {winning_params}")
    print(f"Best Cross-Validation Score ({scoring}): {winning_score:.6f}")
    print("-" * 29)

    return winning_score, winning_model

## Classifiers
- We train different classifiers with parameter grids as shown below.

In [5]:
def GaussuanNB_Classify(data, labels, cv_folds):
    """Grid-search a Gaussian Naive Bayes classifier over ``var_smoothing``.

    Args:
        data: Feature matrix.
        labels: Target labels.
        cv_folds: Number of cross-validation folds.

    Returns:
        ``(best_score, best_estimator)`` as produced by
        ``create_pipeline_and_train``.
    """
    # Nine candidates, one per decade from 1e-9 up to 1e-1.
    smoothing_values = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    grid = {'classifier__var_smoothing': smoothing_values}

    score, estimator = create_pipeline_and_train(
        data, labels, GaussianNB(), grid, cv_folds
    )
    return score, estimator

In [6]:
def SGD_Classify(data, labels, cv_folds):
    """Grid-search an ``SGDClassifier`` over loss, regularization and stopping settings.

    Args:
        data: Feature matrix.
        labels: Target labels.
        cv_folds: Number of cross-validation folds.

    Returns:
        ``(best_score, best_estimator)`` as produced by
        ``create_pipeline_and_train``.
    """
    classifier = SGDClassifier(random_state=42)
    param_grid = {
        # 'log_loss' is the current name of the logistic-regression loss; the
        # old alias 'log' was deprecated in scikit-learn 1.1 and removed in 1.3,
        # where every candidate using it would fail to fit.
        'classifier__loss': ['hinge', 'log_loss', 'squared_hinge', 'perceptron'],
        'classifier__alpha': [1e-4, 1e-3, 1e-2, 1e-1],
        'classifier__penalty': ['l2', 'l1', 'elasticnet'],
        'classifier__max_iter': [1000, 2000, 3000],
        'classifier__tol': [1e-3, 1e-4, 1e-5]
    }

    best_score, best_estimator = create_pipeline_and_train(
        data,
        labels,
        classifier,
        param_grid,
        cv_folds
    )

    return best_score, best_estimator

In [7]:
def Logistic_Classify(data, labels, cv_folds):
    """Grid-search a ``LogisticRegression`` classifier.

    The grid covers regularization strength ``C``, the penalty type, the
    solver and the iteration budget.

    Args:
        data: Feature matrix.
        labels: Target labels.
        cv_folds: Number of cross-validation folds.

    Returns:
        ``(best_score, best_estimator)`` as produced by
        ``create_pipeline_and_train``.
    """
    grid = dict(
        classifier__C=[0.01, 0.1, 1, 10, 100],
        classifier__penalty=['l2', 'l1'],
        classifier__solver=['liblinear', 'saga'],
        classifier__max_iter=[100, 200, 300],
    )
    estimator = LogisticRegression(random_state=42)

    score, fitted = create_pipeline_and_train(data, labels, estimator, grid, cv_folds)
    return score, fitted

In [8]:
def SVM_Classify(data, labels, cv_folds):
    """Grid-search a support vector classifier (``SVC``).

    NOTE(review): the grid is a full cross product, so ``degree`` is also
    varied for the non-polynomial kernels, where scikit-learn documents it as
    ignored. Those candidates are redundant fits and could be pruned.

    Args:
        data: Feature matrix.
        labels: Target labels.
        cv_folds: Number of cross-validation folds.

    Returns:
        ``(best_score, best_estimator)`` as produced by
        ``create_pipeline_and_train``.
    """
    grid = dict(
        classifier__C=[0.01, 0.1, 1, 10, 100],
        classifier__kernel=['linear', 'rbf', 'poly'],
        classifier__gamma=['scale', 'auto'],
        classifier__degree=[2, 3, 4],
    )
    estimator = SVC(random_state=42)

    score, fitted = create_pipeline_and_train(data, labels, estimator, grid, cv_folds)
    return score, fitted

In [9]:
def KNN_Classify(data, labels, cv_folds):
    """Grid-search a ``KNeighborsClassifier``.

    The grid covers the neighbor count, the vote weighting, the
    neighbor-search algorithm and the tree leaf size.

    Args:
        data: Feature matrix.
        labels: Target labels.
        cv_folds: Number of cross-validation folds.

    Returns:
        ``(best_score, best_estimator)`` as produced by
        ``create_pipeline_and_train``.
    """
    neighbor_counts = [3, 5, 7, 9]
    vote_weights = ['uniform', 'distance']
    search_algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
    leaf_sizes = [10, 20, 30]

    grid = {
        'classifier__n_neighbors': neighbor_counts,
        'classifier__weights': vote_weights,
        'classifier__algorithm': search_algorithms,
        'classifier__leaf_size': leaf_sizes,
    }

    score, fitted = create_pipeline_and_train(
        data, labels, KNeighborsClassifier(), grid, cv_folds
    )
    return score, fitted

In [10]:
def RandomForest_Classify(data, labels, cv_folds):
    """Grid-search a ``RandomForestClassifier``.

    The grid covers forest size, tree depth, the split and leaf minimums, and
    whether bootstrap sampling is used.

    Args:
        data: Feature matrix.
        labels: Target labels.
        cv_folds: Number of cross-validation folds.

    Returns:
        ``(best_score, best_estimator)`` as produced by
        ``create_pipeline_and_train``.
    """
    grid = dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__max_depth=[None, 10, 20, 30],
        classifier__min_samples_split=[2, 5, 10],
        classifier__min_samples_leaf=[1, 2, 4],
        classifier__bootstrap=[True, False],
    )
    forest = RandomForestClassifier(random_state=42)

    score, fitted = create_pipeline_and_train(data, labels, forest, grid, cv_folds)
    return score, fitted

## Checking the models
- We compare the outcomes of training all the models to see which one obtained the best cross-validated accuracy (no separate held-out test set is used here).

In [11]:
# Module-level record of the best result so far. The check_* functions update
# both names through `global` whenever a model beats the current best score.
best_score, best_estimator = 0.0, None


In [12]:
def check_GaussianNB(data, labels):
    """Tune GaussianNB with 5 folds and record it if it beats the current best.

    Side effect: overwrites the module-level ``best_score`` and
    ``best_estimator`` when the new score is strictly greater.
    """
    global best_score, best_estimator
    score, estimator = GaussuanNB_Classify(data, labels, 5)
    print(f"GaussianNB best score: {score}")
    if not (score > best_score):
        return
    best_score, best_estimator = score, estimator


In [13]:
def check_SGD(data, labels):
    """Tune SGDClassifier with 5 folds and record it if it beats the current best.

    Side effect: overwrites the module-level ``best_score`` and
    ``best_estimator`` when the new score is strictly greater.
    """
    global best_score, best_estimator
    score, estimator = SGD_Classify(data, labels, 5)
    print(f"SGDClassifier best score: {score}")
    if not (score > best_score):
        return
    best_score, best_estimator = score, estimator

In [14]:
def check_Logistic(data, labels):
    """Tune LogisticRegression with 5 folds and record it if it beats the current best.

    Side effect: overwrites the module-level ``best_score`` and
    ``best_estimator`` when the new score is strictly greater.
    """
    global best_score, best_estimator
    score, estimator = Logistic_Classify(data, labels, 5)
    print(f"LogisticRegression best score: {score}")
    if not (score > best_score):
        return
    best_score, best_estimator = score, estimator

In [15]:
def check_SVM(data, labels):
    """Tune SVC with 5 folds and record it if it beats the current best.

    Side effect: overwrites the module-level ``best_score`` and
    ``best_estimator`` when the new score is strictly greater.
    """
    global best_score, best_estimator
    score, estimator = SVM_Classify(data, labels, 5)
    print(f"SVC best score: {score}")
    if not (score > best_score):
        return
    best_score, best_estimator = score, estimator

In [16]:
def check_KNN(data, labels):
    """Tune KNeighborsClassifier with 5 folds and record it if it beats the current best.

    Side effect: overwrites the module-level ``best_score`` and
    ``best_estimator`` when the new score is strictly greater.
    """
    global best_score, best_estimator
    score, estimator = KNN_Classify(data, labels, 5)
    print(f"KNeighborsClassifier best score: {score}")
    if not (score > best_score):
        return
    best_score, best_estimator = score, estimator

In [17]:
def check_RandomForest(data, labels):
    """Tune RandomForestClassifier with 5 folds and record it if it beats the current best.

    Side effect: overwrites the module-level ``best_score`` and
    ``best_estimator`` when the new score is strictly greater.
    """
    global best_score, best_estimator
    score, estimator = RandomForest_Classify(data, labels, 5)
    print(f"RandomForestClassifier best score: {score}")
    if not (score > best_score):
        return
    best_score, best_estimator = score, estimator

In [18]:
def find_best_model(data, labels):
    """Run the enabled model checks and return the best estimator found.

    Each check updates the module-level ``best_score`` / ``best_estimator``.
    This function does not reset them first, so results recorded by earlier
    check_* calls in the same kernel session still take part in the comparison.

    Args:
        data: Feature matrix handed to every check.
        labels: Target labels handed to every check.

    Returns:
        The module-level ``best_estimator`` after all enabled checks have run.
    """
    # Uncomment an entry to include that model family in the comparison.
    enabled_checks = (
        check_GaussianNB,
        # check_SGD,
        # check_Logistic,
        # check_SVM,
        # check_KNN,
        # check_RandomForest,
    )
    for run_check in enabled_checks:
        run_check(data, labels)

    print(f"Best score: {best_score}")
    print(f"Best estimator: {best_estimator}")

    return best_estimator

## Saving the model
- Save the model in a joblib file

In [19]:
def save_model(model, filename):
    """
    Serialize ``model`` with joblib into the file at ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, 'wb') as sink:
        joblib.dump(model, sink)

In [20]:
# Run the model comparison, then persist the estimator it selected.
# find_best_model returns the module-level best_estimator, so this saves the
# same object the global holds.
selected_model = find_best_model(data, labels)
save_model(selected_model, '../../models/c2_Classifier_Sentiment_Model.joblib')

Starting GridSearchCV with 5 folds...
Parameter Grid: {'classifier__var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]}
Fitting 5 folds for each of 9 candidates, totalling 45 fits

--- GridSearchCV Complete ---
Best Parameters Found: {'classifier__var_smoothing': 0.1}
Best Cross-Validation Score (accuracy): 0.741111
-----------------------------
GaussianNB best score: 0.7411111111111112
Best score: 0.7411111111111112
Best estimator: Pipeline(steps=[('classifier', GaussianNB(var_smoothing=0.1))])
