In [71]:
import pandas as pd
import numpy as np
import torch
import nltk
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

nltk.download('punkt')
#import fasttext
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import torch.nn.functional as F
from gensim.models import Word2Vec
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to /Users/romain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
df = pd.read_csv("../_data/Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## My Model

In [17]:
class MyLogisticRegression(torch.nn.Module):
    def __init__(self, input_size, num_classes):
        super(MyLogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.linear(x)

## My Dataset

In [18]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
      text = self.X[index]
      label = self.y.iloc[index]

      text = torch.tensor(text, dtype=torch.float)
      label = torch.tensor(label, dtype=torch.long)

      return text, label

In [51]:
X = df['Text']
y = df['Score']
y = y.map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})
train_len = 10#int(0.8 * len(y))

## Tokenize (NLTK)

In [52]:
token_train = [nltk.tokenize.word_tokenize(s) for s in X.iloc[:train_len]]
# token_test = [nltk.tokenize.word_tokenize(s) for s in X.iloc[train_len:]]
tokenized_train = [" ".join(tokens) for tokens in token_train]
# tokenized_test = [" ".join(tokens) for tokens in token_test]


## Vectorize the text

### Bag of Words Vectorizer

In [48]:
vectorizer = CountVectorizer(stop_words='english', max_features=500)
X_train = vectorizer.fit_transform(tokenized_train)
# X_test = vectorizer.transform(tokenized_test)

### TF-IDF Vectorizer

In [54]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = vectorizer.fit_transform(tokenized_train)
# X_test = vectorizer.transform(tokenized_test)

### Word2Vec

In [60]:
training_algorithm = 0 # 0 is for CBOW and 1 is for Skip gram
model = Word2Vec(sentences=X, min_count=5, vector_size=300, sg=training_algorithm)

# Create a vector for each text in the training/test set
X_train = np.array([np.mean([model.wv[word] for word in doc if word in model.wv.key_to_index], axis=0) for doc in tokenized_train])
X_test = np.array([np.mean([model.wv[word] for word in doc if word in model.wv.key_to_index], axis=0) for doc in tokenized_test])


### FastText

In [None]:
ft_model = fasttext.load_model('cc.en.300.bin')

def get_fasttext_vector(text, model):
    return model.get_sentence_vector(text)

X_train = np.array([get_fasttext_vector(text, ft_model) for text in X.iloc[:train_len]])
X_test = np.array([get_fasttext_vector(text, ft_model) for text in X.iloc[train_len:]])

## Dataloader Split

In [82]:
train_dataset = CustomDataset(X_train.toarray(), y.iloc[:train_len])
test_dataset = CustomDataset(X_test.toarray(), y.iloc[train_len:])

batch_size = 2

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

TEST [0 1]
TEST 2 4


### Classic Split

In [63]:
vectorizer = CountVectorizer(stop_words='english', max_features=500)
X2 = vectorizer.fit_transform(X.copy())
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y.copy(), test_size=0.2, random_state=11)

In [None]:
def custom_score(gt, y_pred):
    precision = precision_score(gt, y_pred)
    recall = recall_score(gt, y_pred)
    f1 = f1_score(gt, y_pred)
    combined_score = precision + recall + f1
    return combined_score

def grid_search(model, param_grid, scoring, X_train, y_train, cv=5, verbose=1, n_jobs=-1):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring=scoring, verbose=verbose, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)
    return grid_search
# combined_scorer = make_scorer(custom_score)

def grid_search_predict(grid_search):
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    return best_params, best_model

In [64]:
# Define the parameter grid for grid search
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
combined_scorer = make_scorer(custom_score)

# Grid Search
grid_search_logistic = grid_search(LogisticRegression(), param_grid, 'accuracy', X_train2, y_train2)
best_params, best_model = grid_search_predict(grid_search_logistic)
best_model.fit(X_train2, y_train2)
y_pred = best_model.predict(X_test)

# Randomized Search
random_search_logistic = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, scoring=combined_scorer, cv=5)
random_search.fit(X_train, y_train2)
best_params2, best_model2 = grid_search_predict(random_search_logistic)
best_model2.fit(X_train2, y_train2)
y_pred_randomized = best_model.predict(X_test2)

# Evaluate the model
precision = precision_score(y_test2, y_pred)
recall = recall_score(y_test2, y_pred)
f1 = f1_score(y_test2, y_pred)
accuracy = accuracy_score(y_test2, y_pred)
conf_matrix = confusion_matrix(y_test2, y_pred)
classification_rep = classification_report(y_test2, y_pred)

print("Best Parameters:", best_params)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

## Scikit-Learn Logistic Regression

In [None]:
logreg = LogisticRegression()
# Train the model using the training sets
logreg.fit(X_train2, y_train2)

# Predict the test set results
y_pred = logreg.predict(X_test2)

# Print the accuracy
print("Accuracy: ", accuracy_score(y_test2, y_pred))

# Print the classification report
print("Classification Report: \n", classification_report(y_test2, y_pred))

# Print the confusion matrix
print("Confusion Matrix: \n", confusion_matrix(y_test2, y_pred))

Accuracy:  0.6379748616865011


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     10411
           1       0.00      0.00      0.00      5855
           2       0.00      0.00      0.00      8624
           3       0.21      0.00      0.00     16225
           4       0.64      1.00      0.78     72576

    accuracy                           0.64    113691
   macro avg       0.17      0.20      0.16    113691
weighted avg       0.44      0.64      0.50    113691

Confusion Matrix: 
 [[    0     0     0    11 10400]
 [    0     0     0     3  5852]
 [    0     0     0     8  8616]
 [    0     0     0    23 16202]
 [    0     0     0    67 72509]]


## Pytorch Logistic Regression

In [83]:
input_dim = X_train.shape[1]
output_dim = 5

model = MyLogisticRegression(input_dim, output_dim)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Step 4: Train the model
num_epochs = 1
for epoch in range(num_epochs):
    for text, labels in train_loader:
        # Forward pass
        outputs = model(text)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TEST 2 1
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [13  3]
TEST 2 4
TEST [15  4]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 1]
TEST 2 4
TEST [2 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [8 2]
TEST 2 3
TEST [0 0]
TEST 2 0
TEST [3 0]
TEST 2 3
TEST [0 1]
TEST 2 2
TEST [4 0]
TEST 2 4
TEST [0 1]
TEST 2 4
TEST [0 1]
TEST 2 3
TEST [0 0]
TEST 2 4
TEST [1 0]
TEST 2 0
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [4 1]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [3 2]
TEST 2 4
TEST [0 0]
TEST 2 3
TEST [2 0]
TEST 2 4
TEST [0 0]
TEST 2 3
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 3
TEST [0 1]
TEST 2 4
TEST [0 2]
TEST 2 2
TEST [2 0]
TEST 2 4
TEST [0 0]
TEST 2 3
TEST [0 2]
TEST 2 2
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [2 0]
TEST 2 4
TEST [4 4]
TEST 2 3
TEST [2 1]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 1]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TE

In [85]:
true_labels = []
predicted_labels = []

correct = 0
total = 0
with torch.no_grad():
    for text, labels in test_loader:
        true_labels.extend(labels.numpy())  # Convert tensor to numpy array for sklearn metrics
        outputs = model(text)
        _, predicted = torch.max(outputs, dim=1)
        predicted_labels.extend(predicted.numpy())  # Convert tensor to numpy array for sklearn metrics
        total += labels.size(0)
        correct += (predicted == labels).sum().item()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 1]
TEST 2 0
TEST [0 0]
TEST 2 1
TEST [0 1]
TEST 2 1
TEST [1 0]
TEST 2 4
TEST [0 0]
TEST 2 2
TEST [0 0]
TEST 2 3
TEST [0 1]
TEST 2 2
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [1 2]
TEST 2 2
TEST [0 1]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [30  1]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [2 0]
TEST 2 3
TEST [4 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 1]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [0 2]
TEST 2 0
TEST [0 0]
TEST 2 4
TEST [0 0]
TEST 2 0
TEST [6 0]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [4 1]
TEST 2 3
TEST [0 1]
TEST 2 4
TEST [2 0]
TEST 2 3
TEST [0 0]
TEST 2 1
TEST [0 1]
TEST 2 4
TEST [4 0]
TEST 2 4
TEST [6 1]
TEST 2 4
TEST [0 0]
TEST 2 4
TEST [4 1]
TEST 2 2
TEST [0 1]
TE

In [86]:
# Calculate accuracy
accuracy = correct / total
print(f'Test accuracy: {accuracy:.4f}')

# Compute confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate F1 score, recall, and precision
f1 = f1_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
precision = precision_score(true_labels, predicted_labels, average='weighted')

print(f'F1 Score: {f1:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Precision: {precision:.4f}')


Test accuracy: 0.6462
Confusion Matrix:
[[    0     0     0     2  9953]
 [    0     0     0     3  5814]
 [    0     0     0    11  8396]
 [    0     0     0     9 16017]
 [    0     0     3    21 73462]]
F1 Score: 0.5077
Recall: 0.6462
Precision: 0.4454


  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
test = np.array([[1,2], [3,4]])
test

array([[1, 2],
       [3, 4]])

In [4]:
np.mean(test, axis=0)

array([2., 3.])