In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [4]:
df = pd.read_csv("./drive/MyDrive/Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## My Model

In [5]:
class MyLogisticRegression(torch.nn.Module):
    def __init__(self, input_size, num_classes):
        super(MyLogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.linear(x)

## My Dataset

In [6]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
      text = self.data.iloc[index, :-1].values
      label = self.data.iloc[index, -1]

      # Convert label to integer explicitly
      label = int(label)

      text = torch.tensor(text, dtype=torch.float)
      label = torch.tensor(label, dtype=torch.long)

      return text, label

## Vectorize the text

In [7]:
X = df['Text']
y = df['Score']
y = y.map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})
train_len = int(0.8 * len(y))

In [8]:
vectorizer = CountVectorizer(stop_words='english', max_features=2)
X_train = vectorizer.fit_transform(X.iloc[:train_len])
X_test = vectorizer.transform(X.iloc[train_len:])

## Dataloader Split

In [9]:
train_dataset = pd.concat([pd.DataFrame(X_train.toarray()), y.iloc[:train_len]], axis=1)
test_dataset = pd.concat([pd.DataFrame(X_test.toarray()), y.iloc[train_len:]], axis=1)

train_dataset = train_dataset.sample(frac=1)
test_dataset = test_dataset.sample(frac=1)

train_dataset = CustomDataset(train_dataset)
test_dataset = CustomDataset(test_dataset)


batch_size = 2

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

### Classic Split

In [10]:
vectorizer = CountVectorizer(stop_words='english', max_features=1)
X2 = vectorizer.fit_transform(X.copy())
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y.copy(), test_size=0.2, random_state=11)

## Scikit-Learn Logistic Regression

In [None]:
logreg = LogisticRegression()
# Train the model using the training sets
logreg.fit(X_train2, y_train2)

# Predict the test set results
y_pred = logreg.predict(X_test2)

# Print the accuracy
print("Accuracy: ", accuracy_score(y_test2, y_pred))

# Print the classification report
print("Classification Report: \n", classification_report(y_test2, y_pred))

# Print the confusion matrix
print("Confusion Matrix: \n", confusion_matrix(y_test2, y_pred))

Accuracy:  0.6379748616865011


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     10411
           1       0.00      0.00      0.00      5855
           2       0.00      0.00      0.00      8624
           3       0.21      0.00      0.00     16225
           4       0.64      1.00      0.78     72576

    accuracy                           0.64    113691
   macro avg       0.17      0.20      0.16    113691
weighted avg       0.44      0.64      0.50    113691

Confusion Matrix: 
 [[    0     0     0    11 10400]
 [    0     0     0     3  5852]
 [    0     0     0     8  8616]
 [    0     0     0    23 16202]
 [    0     0     0    67 72509]]


## Pytorch Logistic Regression

In [None]:
input_dim = X_train.shape[1]
output_dim = 5

model = MyLogisticRegression(input_dim, output_dim)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Step 4: Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for text, labels in train_loader:
        # Forward pass
        outputs = model(text)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Step 5: Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for text, labels in test_loader:
        outputs = model(text)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test accuracy: {accuracy:.4f}')


In [None]:
# Sample text data and labels (replace with your data)
texts = df['Text'].iloc[:200]
df['Score'] = df['Score'].apply(lambda x: x - 1)
labels = df['Score'].iloc[:200]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Vectorizing the text data
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
from torch.utils.data import TensorDataset
import torch.nn as nn

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_vec.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vec.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

input_size = X_train_tensor.shape[1]
hidden_size = 64
num_classes = 5
num_epochs = 10
batch_size = 32
learning_rate = 0.001


# Define the model architecture
class FFNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Initialize the model, loss function, and optimizer
model = FFNN(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training
num_epochs = 100
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Testing
model.eval()
all_predictions = []
all_targets = []
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_predictions.extend(predicted.tolist())
        all_targets.extend(labels.tolist())

    print(f"Test Accuracy: {correct / total:.4f}")

# Calculate accuracy
accuracy = sum(1 for p, t in zip(all_predictions, all_targets) if p == t) / len(all_targets)
print(f'Accuracy: {accuracy:.2f}')

# Calculate additional performance metrics
print(classification_report(all_targets, all_predictions))

# Confusion matrix
conf_matrix = confusion_matrix(all_targets, all_predictions)
print('Confusion Matrix:')
print(conf_matrix)


Epoch [1/100], Loss: 1.8946
Epoch [2/100], Loss: 1.1895
Epoch [3/100], Loss: 1.1174
Epoch [4/100], Loss: 0.8594
Epoch [5/100], Loss: 0.0340
Epoch [6/100], Loss: 1.0243
Epoch [7/100], Loss: 0.2062
Epoch [8/100], Loss: 0.0127
Epoch [9/100], Loss: 0.0240
Epoch [10/100], Loss: 0.0115
Epoch [11/100], Loss: 0.0021
Epoch [12/100], Loss: 0.0153
Epoch [13/100], Loss: 0.0173
Epoch [14/100], Loss: 0.0041
Epoch [15/100], Loss: 0.0045
Epoch [16/100], Loss: 0.0001
Epoch [17/100], Loss: 0.0002
Epoch [18/100], Loss: 0.0000
Epoch [19/100], Loss: 0.0031
Epoch [20/100], Loss: 0.0010
Epoch [21/100], Loss: 0.0024
Epoch [22/100], Loss: 0.0004
Epoch [23/100], Loss: 0.0008
Epoch [24/100], Loss: 0.0235
Epoch [25/100], Loss: 0.0000
Epoch [26/100], Loss: 0.0002
Epoch [27/100], Loss: 0.0001
Epoch [28/100], Loss: 0.0001
Epoch [29/100], Loss: 0.0009
Epoch [30/100], Loss: 0.0000
Epoch [31/100], Loss: 0.0006
Epoch [32/100], Loss: 0.0089
Epoch [33/100], Loss: 0.0000
Epoch [34/100], Loss: 0.0005
Epoch [35/100], Loss: 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
