In [64]:
!pip install pandas numpy scikit-learn torch transformers datasets nltk tqdm streamlit pyngrok

In [14]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
df = pd.read_csv("/Amazon_Reviews.csv", engine='python', encoding='utf-8')

In [16]:
df = df[['Review Text', 'Rating']].dropna()
df['Review Text'] = df['Review Text'].astype(str)

In [17]:
df = df.rename(columns={
    'Review Text': 'text',
    'Rating': 'rating'
})

In [18]:
df.head()

Unnamed: 0,text,rating
0,"I registered on the website, tried to order a ...",Rated 1 out of 5 stars
1,Had multiple orders one turned up and driver h...,Rated 1 out of 5 stars
2,I informed these reprobates that I WOULD NOT B...,Rated 1 out of 5 stars
3,I have bought from Amazon before and no proble...,Rated 1 out of 5 stars
4,If I could give a lower rate I would! I cancel...,Rated 1 out of 5 stars


In [19]:
import re
def extract_rating(text):
    match = re.search(r'Rated\s+(\d)', str(text))
    if match:
        return int(match.group(1))
    return None

In [20]:
df['rating'] = df['rating'].apply(extract_rating)

In [21]:
df.head(3)

Unnamed: 0,text,rating
0,"I registered on the website, tried to order a ...",1
1,Had multiple orders one turned up and driver h...,1
2,I informed these reprobates that I WOULD NOT B...,1


In [22]:
def rating_to_label(r):
    if r <= 2:
        return 0   # Negative
    elif r == 3:
        return 1   # Neutral
    else:
        return 2   # Positive

In [23]:
df['label'] = df['rating'].apply(rating_to_label)

In [24]:
df = df[['text', 'label']]
df.head(3)

Unnamed: 0,text,label
0,"I registered on the website, tried to order a ...",0
1,Had multiple orders one turned up and driver h...,0
2,I informed these reprobates that I WOULD NOT B...,0


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(X_train_vec, y_train)

preds = baseline_model.predict(X_test_vec)
print("Baseline Results:")
print(classification_report(y_test, preds))

Baseline Results:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2870
           1       0.25      0.01      0.01       177
           2       0.88      0.88      0.88      1164

    accuracy                           0.90      4211
   macro avg       0.68      0.62      0.61      4211
weighted avg       0.88      0.90      0.88      4211



In [27]:
from datasets import Dataset
from transformers import DistilBertTokenizerFast

hf_df = df[['text', 'label']]
dataset = Dataset.from_pandas(hf_df)

In [28]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [29]:
def tokenize(batch):
    return tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=256
    )

In [30]:
dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/21055 [00:00<?, ? examples/s]

In [31]:
dataset = dataset.rename_column('label', 'labels')
dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels']
)

In [32]:
train_test = dataset.train_test_split(test_size=0.2)

In [33]:
import torch
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# !pip install -U transformers accelerate

In [35]:
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

In [36]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50
)

In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test['train'],
    eval_dataset=train_test['test'],
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 1


[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m Invalid API key: API key must have 40+ characters, has 1.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
50,0.6804
100,0.3474
150,0.3037
200,0.2799
250,0.3794
300,0.3004
350,0.3675
400,0.3518
450,0.352
500,0.3465


TrainOutput(global_step=4212, training_loss=0.26939181551860836, metrics={'train_runtime': 955.8645, 'train_samples_per_second': 35.243, 'train_steps_per_second': 4.406, 'total_flos': 2231320654688256.0, 'train_loss': 0.26939181551860836, 'epoch': 2.0})

In [61]:
import torch
from nltk.tokenize import sent_tokenize

ASPECTS = {
    'delivery': ['delivery', 'shipping', 'late'],
    'price': ['price', 'cost', 'expensive', 'cheap'],
    'quality': ['quality', 'build', 'durable'],
    'service': ['service', 'support', 'customer'],
    'packaging': ['package', 'packaging']
}

LABELS = ['Negative', 'Neutral', 'Positive']

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return LABELS[torch.argmax(logits).item()]

def aspect_sentiment(review):
    sentences = sent_tokenize(review)
    result = {}
    for aspect, keywords in ASPECTS.items():
        for s in sentences:
            if any(k in s.lower() for k in keywords):
                result[aspect] = predict_sentiment(s)
    return result

# Test
review = "Delivery was late but the product quality is excellent"
aspect_sentiment(review)

{'delivery': 'Positive', 'quality': 'Positive'}

In [63]:
reviews = [
    "Delivery was late but the product quality is excellent",
    "The price is too high for the quality",
    "Customer service was very helpful"
]

for r in reviews:
    print(aspect_sentiment(r))


{'delivery': 'Positive', 'quality': 'Positive'}
{'price': 'Neutral', 'quality': 'Negative'}
{'service': 'Positive'}
