# WORDPIECE TOKENIZER AND WORD2VEC

In [1]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer
from hyperparameters import grid_search_predict, grid_search, random_search, random_search_predict, custom_score
import pandas as pd
import numpy as np
from scipy.stats import uniform
from cross_validation import cross_validate_torch
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


### Load the dataset

In [2]:
# Read the reviews CSV, then pull out the review text (model input) and the
# 'Score' column (prediction target) as separate Series.
data = pd.read_csv('../../_data/Reviews.csv')
X = data['Text']
y = data['Score']

In [3]:
# Optional row cap for quick experiments: set N_SAMPLES to an int (e.g. 20000)
# to keep only the first N rows. None keeps every row, which is exactly what the
# previous `X[:]` slice did even though its comment claimed a 20000-row subset.
N_SAMPLES = None
X, y = X[:N_SAMPLES], y[:N_SAMPLES]

### Tokenize the dataset with WordPiece

In [4]:
# Tokenize a copy of the text Series with the project `tokenizer` helper
# (passing a copy keeps X itself out of the helper's reach).
tokenized_documents = tokenizer(X.copy())
# Preview only the first few documents: echoing the whole tokenized corpus
# floods the notebook output and bloats the saved file.
tokenized_documents[:3]

Number of tokens:  0
Number of sentences:  0


[['I',
  'bought',
  'several',
  'Vitality',
  'canned',
  'dog',
  'food',
  'products',
  'found',
  'good',
  'quality',
  '.',
  'The',
  'product',
  'looks',
  'like',
  'stew',
  'processed',
  'meat',
  'smells',
  'better',
  '.',
  'My',
  'Labrador',
  'finicky',
  'appreciates',
  'product',
  'better',
  ' ',
  '.'],
 ['Product',
  'arrived',
  'labeled',
  'Jumbo',
  'Salted',
  'Peanuts',
  '...',
  'peanuts',
  'actually',
  'small',
  'sized',
  'unsalted',
  '.',
  'Not',
  'sure',
  'error',
  'vendor',
  'intended',
  'represent',
  'product',
  '"',
  'Jumbo',
  '"',
  '.'],
 ['This',
  'confection',
  'around',
  'centuries',
  '.',
  ' ',
  'It',
  'light',
  ',',
  'pillowy',
  'citrus',
  'gelatin',
  'nuts',
  '-',
  'case',
  'Filberts',
  '.',
  'And',
  'cut',
  'tiny',
  'squares',
  'liberally',
  'coated',
  'powdered',
  'sugar',
  '.',
  ' ',
  'And',
  'tiny',
  'mouthful',
  'heaven',
  '.',
  ' ',
  'Not',
  'chewy',
  ',',
  'flavorful',
  '.',
  

### Vectorize the dataset with Word2Vec 

In [5]:
# Convert the tokenized documents into numeric features with the project
# `vectorizer` helper. Note that X is rebound here: it stops being the raw text
# Series and becomes the feature matrix used by every later cell.
# `vect` is presumably the fitted Word2Vec model — TODO confirm in vectorizer.
X, vect = vectorizer(tokenized_documents)

### Split Dataset

In [6]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible. Uses the `train_test_split` imported in the first cell —
# the previous `sk.model_selection...` spelling referenced a name (`sk`) that is
# never imported and raised NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Sanity check: show the shape of each piece of the train/test split.
split_shapes = [part.shape for part in (X_train, X_test, y_train, y_test)]
print(*split_shapes)

(454763, 300) (113691, 300) (454763,) (113691,)


# Logistic Regression
### scikit-learn Model

In [8]:
def evaluate_model_performance(y_true, y_pred):
    """Print a classification summary for a set of predictions.

    Computes weighted-average precision, recall and F1, plain accuracy, the
    per-class classification report and the confusion matrix, then prints
    them in that order. Nothing is returned.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels, aligned with ``y_true``.
    """
    # The three scores that share the same 'weighted' averaging mode.
    weighted_scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )

    # Compute every metric before printing anything, so a failing metric
    # does not leave a half-printed report behind.
    report_entries = [
        (label, scorer(y_true, y_pred, average='weighted'))
        for label, scorer in weighted_scorers
    ]
    report_entries.append(("Accuracy:", accuracy_score(y_true, y_pred)))
    report_entries.append(("Classification Report:\n", classification_report(y_true, y_pred)))
    report_entries.append(("Confusion Matrix:\n", confusion_matrix(y_true, y_pred)))

    for label, value in report_entries:
        print(label, value)

In [9]:
# Baseline: logistic regression with default settings apart from a raised
# iteration cap, fitted on the training split.
logreg = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Score the held-out test split and print the metric summary.
y_pred = logreg.predict(X_test)
evaluate_model_performance(y_test, y_pred)

Precision: 0.6151169065628436
Recall: 0.6878204959055686
F1 Score: 0.6180023358708377
Accuracy: 0.6878204959055686
Classification Report:
               precision    recall  f1-score   support

           1       0.55      0.53      0.54     10326
           2       0.35      0.08      0.13      5855
           3       0.33      0.12      0.18      8485
           4       0.39      0.08      0.13     16123
           5       0.73      0.96      0.83     72902

    accuracy                           0.69    113691
   macro avg       0.47      0.35      0.36    113691
weighted avg       0.62      0.69      0.62    113691

Confusion Matrix:
 [[ 5438   334   314   130  4110]
 [ 1449   449   547   213  3197]
 [ 1057   273  1036   628  5491]
 [  587   108   766  1244 13418]
 [ 1312   112   499   947 70032]]


## Hyperparameter Tuning

In [None]:
# Exhaustive search space for logistic regression:
#   C       - regularization strength values to try
#   penalty - penalty type
#   solver  - liblinear, a solver that supports the 'l1' penalty
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

### Grid Search 

In [None]:
# NOTE(review): custom_scorer is built here but the search below is scored with
# the string 'accuracy' — confirm which metric is actually intended.
custom_scorer = make_scorer(custom_score)

# Run the project's grid-search helper over param_grid on the training split,
# then extract the winning parameters and the corresponding model.
grid_search_result = grid_search(logreg, param_grid, 'accuracy', X_train, y_train)
best_params_grid_search, best_model_grid_search = grid_search_predict(grid_search_result)

print("Best parameters found by grid search:", best_params_grid_search, sep="\n")

In [None]:
# Train the best model on the entire training set
best_model_grid_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model_grid_search.predict(X_test)

evaluate_model_performance(y_true=y_test, y_pred=y_pred)

### Random Search

In [None]:
# Sampling space for the randomized search.
param_distributions = {
    # scipy's uniform(loc, scale) draws from [loc, loc + scale], i.e. C in [0.001, 100.001].
    'C': uniform(0.001, 100),  # Regularization strength
    'penalty': ['l1', 'l2'],  # Penalty type
    # LogisticRegression's default solver (lbfgs) rejects the 'l1' penalty, so
    # every sampled 'l1' candidate would fail to fit. Pin liblinear, which
    # supports both penalties, mirroring the solver used in param_grid.
    'solver': ['liblinear'],
}

In [None]:
# Fresh, unfitted estimator for the randomized search.
logreg = LogisticRegression(max_iter=1000)

# NOTE(review): custom_scorer is built but the search below is scored with the
# string 'accuracy' — confirm which metric is actually intended.
custom_scorer = make_scorer(custom_score)

# NOTE(review): the random-search result is stored under the name
# grid_search_result, overwriting the earlier grid-search result object.
grid_search_result = random_search(logreg, param_distributions, 'accuracy', X_train, y_train)
best_params_random_search, best_model_random_search = random_search_predict(grid_search_result)

print("Best parameters found by random search:", best_params_random_search, sep="\n")

In [None]:
# Train the best model on the entire training set
best_model_random_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model_random_search.predict(X_test)

evaluate_model_performance(y_true=y_test, y_pred=y_pred)