# NLTK TOKENIZER + LEMMATIZATION AND TF-IDF VECTORIZER

In [2]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer
from hyperparameters import grid_search_predict, grid_search, random_search, random_search_predict, custom_score
import pandas as pd
import numpy as np
import sklearn as sk
from scipy.stats import uniform
from cross_validation import cross_validate_torch
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

### Load the dataset

In [3]:
data = pd.read_csv('../../_data/Reviews.csv')
X, y = data['Text'], data['Score']

In [4]:
# FOR TESTING : only select first 20000 samples
X, y = X[:], y[:]

### Tokenize the dataset with NLTK + Lemmatization

In [5]:
tokenized_documents = tokenizer(X.copy())

Number of tokens:  23767229
Number of sentences:  2832806


### Vectorize the dataset with TF-IDF 

In [6]:
X, vect = vectorizer(tokenized_documents)



### Some Statistiscs

In [7]:
# Most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# Least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '00' '000' '0000' '000001' '00001' '000013' '0000soo' '0001'
 '000111052']
Top 10 least frequent words in the dataset
['¾' 'â' 'çay' 'çaykur' 'çelem' 'être' 'île' 'ît' 'ø' 'þ']


### Split Dataset

In [8]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(454763, 111813) (113691, 111813) (454763,) (113691,)


# Logistic Regression
### SciKit Learn Model

In [10]:
def evaluate_model_performance(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)
    classification_rep = classification_report(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)

In [11]:
logreg = LogisticRegression(max_iter=5000)
logreg.fit(X_train, y_train)

# Predict the test set results
y_pred = logreg.predict(X_test)

evaluate_model_performance(y_test, y_pred)

Precision: 0.7201525864381245
Recall: 0.752161560721605
F1 Score: 0.7214709235058786
Accuracy: 0.752161560721605
Classification Report:
               precision    recall  f1-score   support

           1       0.68      0.69      0.69     10326
           2       0.54      0.26      0.35      5855
           3       0.51      0.33      0.40      8485
           4       0.55      0.28      0.37     16123
           5       0.80      0.95      0.87     72902

    accuracy                           0.75    113691
   macro avg       0.62      0.50      0.54    113691
weighted avg       0.72      0.75      0.72    113691

Confusion Matrix:
 [[ 7176   509   441   210  1990]
 [ 1452  1544   772   349  1738]
 [  827   461  2790  1125  3282]
 [  398   166   959  4454 10146]
 [  744   166   522  1920 69550]]


## Hyperparameters Tuning

In [12]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Penalty type
    'solver': ['liblinear']  # Solver supporting 'l1' penalty
}

### Grid Search 

In [13]:
logreg = LogisticRegression(max_iter=5000)
custom_scorer = make_scorer(custom_score)
grid_search_result = grid_search(logreg, param_grid, 'accuracy', X_train, y_train)
best_params_grid_search, best_model_grid_search = grid_search_predict(grid_search_result)

print("Best parameters found by grid search:")
print(best_params_grid_search)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found by grid search:
{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


In [14]:
# Train the best model on the entire training set
best_model_grid_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model_grid_search.predict(X_test)

evaluate_model_performance(y_true=y_test, y_pred=y_pred)

Precision: 0.7378305035051417
Recall: 0.7631738660052246
F1 Score: 0.7422680011623713
Accuracy: 0.7631738660052246
Classification Report:
               precision    recall  f1-score   support

           1       0.71      0.70      0.70     10326
           2       0.56      0.34      0.43      5855
           3       0.54      0.37      0.44      8485
           4       0.55      0.36      0.43     16123
           5       0.82      0.94      0.88     72902

    accuracy                           0.76    113691
   macro avg       0.64      0.54      0.58    113691
weighted avg       0.74      0.76      0.74    113691

Confusion Matrix:
 [[ 7226   507   486   322  1785]
 [ 1159  2002   668   469  1557]
 [  715   475  3164  1308  2823]
 [  352   256   911  5725  8879]
 [  735   304   682  2532 68649]]


## Random Search 

In [15]:
param_distributions = {
    'C': uniform(0.001, 100),  # Uniform distribution for regularization strength
    'penalty': ['l1', 'l2']  # Penalty type
}

In [None]:
logreg = LogisticRegression(max_iter=5000)
custom_scorer = make_scorer(custom_score)
grid_search_result = random_search(logreg, param_distributions, 'accuracy', X_train, y_train)
best_params_random_search, best_model_random_search = random_search_predict(grid_search_result)

print("Best parameters found by random search:")
print(best_params_random_search)

In [None]:
# Train the best model on the entire training set
best_model_random_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model_random_search.predict(X_test)

evaluate_model_performance(y_true=y_test, y_pred=y_pred)