# BPE AND COUNT VECTORIZER

In [2]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer
from hyperparameters import grid_search_predict, grid_search, random_search, random_search_predict, custom_score
import pandas as pd
import numpy as np
import sklearn as sk
from scipy.stats import uniform
from cross_validation import cross_validate_torch
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

### Load the dataset

In [3]:
data = pd.read_csv('../../_data/Reviews.csv')
X, y = data['Text'], data['Score']

In [4]:
# FOR TESTING : only select first 20000 samples
X, y = X[:], y[:]

### Tokenize the dataset with Byte-Pair Encoding

In [5]:
tokenized_documents = tokenizer(X.copy())

Number of tokens:  58325048
Number of sentences:  3661772


### Vectorize the dataset with Count Vectorizer 

In [6]:
X, vect = vectorizer(tokenized_documents)



### Some Statistiscs

In [7]:
# Most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# Least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset


['0' '1' '10' '100004' '100005' '10001' '100011' '100012' '100014'
 '100016']
Top 10 least frequent words in the dataset
['99975' '9998' '99981' '99984' '99985' '99986' '9999' '99990' '99992'
 '99994']


### Split Dataset

In [8]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(454763, 53351) (113691, 53351) (454763,) (113691,)


# Logistic Regression
### SciKit Learn Model

In [10]:
def evaluate_model_performance(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)
    classification_rep = classification_report(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)

In [11]:
logreg = LogisticRegression(max_iter=5000)
logreg.fit(X_train, y_train)

# Predict the test set results
y_pred = logreg.predict(X_test)

evaluate_model_performance(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision: 0.7503198243628754
Recall: 0.7713979118839662
F1 Score: 0.7553350686960411
Accuracy: 0.7713979118839662
Classification Report:
               precision    recall  f1-score   support

           1       0.71      0.72      0.72     10326
           2       0.50      0.40      0.45      5855
           3       0.54      0.44      0.49      8485
           4       0.57      0.37      0.45     16123
           5       0.84      0.94      0.89     72902

    accuracy                           0.77    113691
   macro avg       0.63      0.57      0.60    113691
weighted avg       0.75      0.77      0.76    113691

Confusion Matrix:
 [[ 7450   868   455   236  1317]
 [ 1239  2342   829   337  1108]
 [  689   746  3762  1110  2178]
 [  334   340  1146  5894  8409]
 [  744   353   813  2739 68253]]


## Hyperparameters Tuning

In [13]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Penalty type
    'solver': ['liblinear']  # Solver supporting 'l1' penalty
}

### Grid Search 

In [14]:
logreg = LogisticRegression(max_iter=5000)
custom_scorer = make_scorer(custom_score)
grid_search_result = grid_search(logreg, param_grid, 'accuracy', X_train, y_train)
best_params_grid_search, best_model_grid_search = grid_search_predict(grid_search_result)

print("Best parameters found by grid search:")
print(best_params_grid_search)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
# Train the best model on the entire training set
best_model_grid_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model_grid_search.predict(X_test)

evaluate_model_performance(y_true=y_test, y_pred=y_pred)

## Random Search 

In [None]:
param_distributions = {
    'C': uniform(0.001, 100),  # Uniform distribution for regularization strength
    'penalty': ['l1', 'l2']  # Penalty type
}

In [None]:
logreg = LogisticRegression(max_iter=1000)
custom_scorer = make_scorer(custom_score)
grid_search_result = random_search(logreg, param_distributions, 'accuracy', X_train, y_train)
best_params_random_search, best_model_random_search = random_search_predict(grid_search_result)

print("Best parameters found by random search:")
print(best_params_random_search)

In [None]:
# Train the best model on the entire training set
best_model_random_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model_random_search.predict(X_test)

evaluate_model_performance(y_true=y_test, y_pred=y_pred)