In [None]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import random
from tqdm import tqdm
from torch.amp import GradScaler, autocast
import nlpaug.augmenter.word as naw
import os

In [None]:
# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): presumably this has to be set before the first CUDA allocation
# to take effect, so keep this cell ahead of any GPU work — TODO confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Fetch the NLTK resources needed later in the notebook:
#   'punkt'     - tokenizer models (word_tokenize is used in preprocess_text)
#   'stopwords' - the English stop-word list read by stopwords.words('english')
#   'wordnet'   - not referenced directly in the visible code; presumably kept
#                 for the augmentation library - TODO confirm it is still needed
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Args:
        seed: Integer passed unchanged to Python's ``random``, NumPy, and
            PyTorch (CPU generator and all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)


set_seed(42)

In [None]:
# Text preprocessing function
stop_words = set(stopwords.words('english'))
# Built once instead of on every call: preprocess_text runs per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')


def preprocess_text(text):
    """Normalise a raw review into a space-joined string of content tokens.

    Steps, in order: lower-case, strip ASCII punctuation, remove digit runs,
    tokenize with NLTK's ``word_tokenize`` and drop English stop words.

    Args:
        text: The raw text. ``None`` and NaN are treated as an empty string;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    if not isinstance(text, str):
        # Missing values from pandas arrive as None / float('nan'); calling
        # .lower() on them would raise AttributeError.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = '' if is_missing else str(text)

    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = _DIGIT_RUN.sub('', text)
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

In [None]:
# Function to predict sentiment for a single query
def predict_sentiment(model, tokenizer, text, max_length=64, device='cuda'):
    """Classify one piece of text as Negative / Neutral / Positive.

    The text goes through ``preprocess_text`` first so that inference sees
    the same normalisation as the training data.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``
            with three classes (0=Negative, 1=Neutral, 2=Positive).
        tokenizer: Tokenizer callable matching ``model``.
        text: Raw input string.
        max_length: Truncation length passed to the tokenizer.
        device: Device (string or ``torch.device``) the model lives on.

    Returns:
        dict with keys ``original_text``, ``processed_text``,
        ``predicted_sentiment`` (label string) and ``probabilities``
        (label -> float probability).
    """
    model.eval()
    processed_text = preprocess_text(text)
    encodings = tokenizer(
        [processed_text],
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    # Mixed precision only makes sense on a CUDA device; on CPU it is disabled
    # explicitly instead of relying on autocast's warning-and-disable fallback.
    use_amp = torch.device(device).type == 'cuda'
    with torch.no_grad():
        with autocast('cuda', enabled=use_amp):
            outputs = model(input_ids, attention_mask=attention_mask)
        # Softmax in float32: under autocast the logits may be half precision.
        probs = torch.softmax(outputs.logits.float(), dim=1).cpu().numpy()[0]

    # Index of the most probable class. This was previously never computed,
    # so the label lookup below raised NameError.
    pred = int(probs.argmax())

    label_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    return {
        'original_text': text,
        'processed_text': processed_text,
        'predicted_sentiment': label_map[pred],
        'probabilities': {
            'Negative': float(probs[0]),
            'Neutral': float(probs[1]),
            'Positive': float(probs[2])
        }
    }


In [None]:
import os
import shutil

# Where the Kaggle client looks for credentials, and where the uploaded token
# file sits after being uploaded to the Colab session.
KAGGLE_CONFIG_DIR = '/root/.kaggle'
KAGGLE_TOKEN_PATH = os.path.join(KAGGLE_CONFIG_DIR, 'kaggle.json')
UPLOADED_TOKEN_PATH = '/content/kaggle (1).json'

os.makedirs(KAGGLE_CONFIG_DIR, exist_ok=True)
os.environ['KAGGLE_CONFIG_DIR'] = KAGGLE_CONFIG_DIR

# Move the token only when the upload is still present, so re-running this
# cell after a successful first run does not fail on the missing source file.
if os.path.exists(UPLOADED_TOKEN_PATH):
    shutil.move(UPLOADED_TOKEN_PATH, KAGGLE_TOKEN_PATH)
elif not os.path.exists(KAGGLE_TOKEN_PATH):
    raise FileNotFoundError(
        f"Kaggle API token not found at {UPLOADED_TOKEN_PATH!r} or {KAGGLE_TOKEN_PATH!r}"
    )
# The token is a credential: restrict it to the owner.
os.chmod(KAGGLE_TOKEN_PATH, 0o600)

# Imported only after the config dir and token are in place.
# NOTE(review): the kaggle package presumably reads credentials at import
# time, which is why this import is not at the top of the notebook - confirm.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
api.dataset_download_files('arhamrumi/amazon-product-reviews', path='.', unzip=True)


print("Dataset downloaded and extracted!")


Dataset URL: https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews
Dataset downloaded and extracted!


In [None]:
# Load the raw reviews table - the first step of the training pipeline.
# 'Reviews.csv' is read from the current working directory; later cells rely
# on its 'Text' and 'Score' columns.
df = pd.read_csv('Reviews.csv')

In [None]:
print("Handling missing values...")
# Missing review bodies become empty strings so the text column is always str.
df['Text'] = df['Text'].fillna('')
# 'Score' is the label the sentiment classes are derived from. Interpolating
# it would invent ratings (possibly fractional ones that match no sentiment
# class), so rows without a score are discarded instead.
df = df.dropna(subset=['Score'])
print("Missing values per column before dropping:")
print(df.isnull().sum())
# Drop rows that still have a missing value in any remaining column.
df = df.dropna()
print("Missing values after handling and dropping:", df.isnull().sum().sum())
print("Dataset shape after handling missing values:", df.shape)


Handling missing values...
Missing values per column before dropping:
Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64
Missing values after handling and dropping: 0
Dataset shape after handling missing values: (568401, 10)


In [None]:
# Sanity check: per-column count of remaining nulls, then drop any stragglers.
null_counts = df.isnull().sum()
print(null_counts)
df = df.dropna()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64


In [None]:
print("Removing duplicates...")
# A row is a repeat when an earlier row has the same review text and score;
# the first occurrence of each (Text, Score) pair is kept.
is_repeat = df.duplicated(subset=['Text', 'Score'])
df = df[~is_repeat]
print("Dataset shape after deduplication:", df.shape)

Removing duplicates...
Dataset shape after deduplication: (393656, 10)


In [None]:
print("Transforming sentiment scores...")
def map_sentiment(score):
    """Collapse a 1-5 star rating into a sentiment class id.

    Returns 0 (Negative) for 1-2 stars, 1 (Neutral) for 3 stars,
    2 (Positive) for 4-5 stars, and None for any other value.
    """
    star_buckets = ((1, 2), (3,), (4, 5))  # position in the tuple == class id
    for class_id, stars in enumerate(star_buckets):
        if score in stars:
            return class_id
    return None
df['Sentiment'] = df['Score'].apply(map_sentiment)

print("Selecting only Text and Sentiment columns...")
df = df[['Text', 'Sentiment']]
print("Dataset shape after column selection:", df.shape)
print("Columns in dataset:", df.columns.tolist())

print("Balancing dataset...")
# Draw the same number of reviews from each class (0=Negative, 1=Neutral,
# 2=Positive), using the same sample size and seed for every class.
negative_df, neutral_df, positive_df = (
    df[df['Sentiment'] == class_id].sample(n=1000, random_state=42)
    for class_id in (0, 1, 2)
)
class_samples = [negative_df, neutral_df, positive_df]
balanced_df = pd.concat(class_samples).reset_index(drop=True)
print("Balanced dataset shape:", balanced_df.shape)

Transforming sentiment scores...
Selecting only Text and Sentiment columns...
Dataset shape after column selection: (393656, 2)
Columns in dataset: ['Text', 'Sentiment']
Balancing dataset...
Balanced dataset shape: (3000, 2)


In [None]:
# Additional tokenizer resource, fetched before the preprocessing cell runs.
# NOTE(review): presumably required by word_tokenize on newer NLTK releases;
# consider moving it next to the other nltk.download calls - TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
print("Preprocessing text...")
# One cleaned string per review, in row order.
cleaned_reviews = [preprocess_text(raw_review) for raw_review in balanced_df['Text']]
balanced_df['Processed_Text'] = cleaned_reviews

Preprocessing text...


In [None]:
print("Applying contextual data augmentation...")
aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute", aug_p=0.4, device='cuda' if torch.cuda.is_available() else 'cpu')
augmented_texts = []
augmented_labels = []
# Every source review contributes exactly two consecutive rows: the original
# text followed by its augmented variant, both with the original label.
# NOTE(review): because a review and its variant are near-duplicates, any
# later train/test split should keep each pair on the same side.
for text, label in zip(balanced_df['Processed_Text'], balanced_df['Sentiment']):
    # A review can be empty after preprocessing (only stop words / digits /
    # punctuation); there is nothing to substitute in that case.
    candidates = aug.augment(text) if text.strip() else []
    if isinstance(candidates, str):
        # NOTE(review): some nlpaug releases presumably return a bare string
        # for a single input instead of a list - normalise to a list.
        candidates = [candidates]
    # Fall back to the original text when the augmenter yields nothing, so the
    # two-rows-per-review layout is preserved and indexing cannot fail.
    aug_text = candidates[0] if candidates else text
    augmented_texts.extend((text, aug_text))
    augmented_labels.extend((label, label))

augmented_df = pd.DataFrame({
    'Processed_Text': augmented_texts,
    'Sentiment': augmented_labels
})
print("Augmented dataset shape:", augmented_df.shape)


Applying contextual data augmentation...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Augmented dataset shape: (6000, 2)


In [None]:
def prepare_data(df, tokenizer, max_length=64):
    """Tokenize a split and package it as a ``TensorDataset``.

    Args:
        df: DataFrame with a 'Processed_Text' column (strings) and a
            'Sentiment' column (class ids).
        tokenizer: Callable tokenizer returning a mapping with 'input_ids'
            and 'attention_mask' tensors.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, label)`` triples.
    """
    texts = df['Processed_Text'].tolist()
    encoded = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    targets = torch.tensor(df['Sentiment'].values)
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], targets)

In [None]:
# augmented_df is built two rows per source review: row 2k is the original
# text and row 2k+1 its augmented variant. Splitting row by row would put a
# review in the training set and its near-duplicate in the test set, leaking
# test information into training. Split on the source-review pair instead, so
# both rows of a pair always land in the same subset.
assert len(augmented_df) % 2 == 0, "expected (original, augmented) row pairs"
row_labels = augmented_df['Sentiment'].to_numpy()
assert (row_labels[0::2] == row_labels[1::2]).all(), "rows are not aligned as label-sharing pairs"

pair_of_row = np.arange(len(augmented_df)) // 2   # pair id of every row
pair_labels = row_labels[0::2]                    # one label per pair
all_pairs = np.arange(len(pair_labels))

train_pairs, test_pairs = train_test_split(all_pairs, test_size=0.2, random_state=42, stratify=pair_labels)
train_pairs, val_pairs = train_test_split(train_pairs, test_size=0.1, random_state=42, stratify=pair_labels[train_pairs])

train_df = augmented_df[np.isin(pair_of_row, train_pairs)]
val_df = augmented_df[np.isin(pair_of_row, val_pairs)]
test_df = augmented_df[np.isin(pair_of_row, test_pairs)]
print("Train set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

Train set shape: (4320, 2)
Validation set shape: (480, 2)
Test set shape: (1200, 2)


In [None]:
# One tokenizer shared by all splits; each split becomes a TensorDataset.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataset, val_dataset, test_dataset = (
    prepare_data(split_df, roberta_tokenizer)
    for split_df in (train_df, val_df, test_df)
)

In [None]:
batch_size = 4
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(held_out, batch_size=batch_size)
    for held_out in (val_dataset, test_dataset)
)


In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=7, patience=2, accumulation_steps=4):
    """Fine-tune ``model`` with mixed precision, gradient accumulation and early stopping.

    Args:
        model: Model whose forward output exposes ``.logits``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_loader: Same batch layout, used for the per-epoch validation loss.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler, stepped once per optimizer step.
        epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without a new best
            validation loss.
        accumulation_steps: Number of batches whose gradients are accumulated
            before each optimizer step.

    Returns:
        ``model`` with the weights of the epoch that had the lowest validation
        loss (or the final weights if no epoch produced a comparable loss).
    """
    model.to(device)
    scaler = GradScaler('cuda')
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model_state = None
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    num_batches = len(train_loader)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()
        for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            with autocast('cuda'):
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                # Scale so the accumulated gradient is an average over the group.
                loss = loss / accumulation_steps
            scaler.scale(loss).backward()
            # Step after every full accumulation group, and also on the last
            # batch of the epoch: otherwise a trailing partial group's
            # gradients are thrown away by the zero_grad() at the next epoch.
            is_last_batch = (i + 1) == num_batches
            if (i + 1) % accumulation_steps == 0 or is_last_batch:
                # Unscale first so clipping sees the true gradient norm.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()
            # Undo the accumulation scaling so the logged loss is per batch.
            total_loss += loss.item() * accumulation_steps
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}")

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                with autocast('cuda'):
                    outputs = model(input_ids, attention_mask=attention_mask)
                    val_loss += criterion(outputs.logits, labels).item()
        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live tensors, which
            # keep changing as training continues. Take an independent copy
            # (on CPU, to avoid holding a second model on the GPU) so that
            # the checkpoint really is the best epoch and not the last one.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                break

    # No snapshot exists when epochs == 0 or every validation loss was NaN.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model

print("Training Optimized RoBERTa model...")
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3, hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)

# Training hyper-parameters, defined once so the scheduler length and the
# train_model call cannot drift apart.
EPOCHS = 7
PATIENCE = 2
ACCUMULATION_STEPS = 4
WARMUP_STEPS = 100
BASE_LR = 1e-5

# Parameters whose names contain one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
# NOTE(review): a decayed parameter is only picked up if its name contains
# 'roberta.encoder.layer' or 'classifier'. Any other decayed parameter
# (presumably the embedding weight matrices) is in no group and is therefore
# never updated - confirm that freezing them is intentional.
optimizer_grouped_parameters = [
    {
        # Encoder layers: slightly lower learning rate than the head.
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and 'roberta.encoder.layer' in n],
        'weight_decay': 0.01,
        'lr': BASE_LR * 0.8
    },
    {
        # Classification head.
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and 'classifier' in n],
        'weight_decay': 0.01,
        'lr': BASE_LR
    },
    {
        # Biases and LayerNorm weights, wherever they live: no weight decay.
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
        'lr': BASE_LR
    }
]
optimizer = AdamW(optimizer_grouped_parameters)
# Optimizer steps over the whole run: one step per ACCUMULATION_STEPS batches.
total_steps = len(train_loader) * EPOCHS // ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)
model = train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=EPOCHS, patience=PATIENCE, accumulation_steps=ACCUMULATION_STEPS)


Using device: cuda
Training Optimized RoBERTa model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/7: 100%|██████████| 1080/1080 [00:53<00:00, 20.07it/s]


Epoch 1/7, Train Loss: 1.0719
Validation Loss: 0.9039


Epoch 2/7: 100%|██████████| 1080/1080 [00:57<00:00, 18.92it/s]


Epoch 2/7, Train Loss: 0.9188
Validation Loss: 0.8538


Epoch 3/7: 100%|██████████| 1080/1080 [00:55<00:00, 19.64it/s]


Epoch 3/7, Train Loss: 0.8607
Validation Loss: 0.8625


Epoch 4/7: 100%|██████████| 1080/1080 [00:54<00:00, 19.99it/s]


Epoch 4/7, Train Loss: 0.8382
Validation Loss: 0.8203


Epoch 5/7: 100%|██████████| 1080/1080 [00:53<00:00, 20.17it/s]


Epoch 5/7, Train Loss: 0.8010
Validation Loss: 0.8206


Epoch 6/7: 100%|██████████| 1080/1080 [00:53<00:00, 20.26it/s]


Epoch 6/7, Train Loss: 0.7801
Validation Loss: 0.8083


Epoch 7/7: 100%|██████████| 1080/1080 [00:53<00:00, 20.29it/s]


Epoch 7/7, Train Loss: 0.7543
Validation Loss: 0.8202


In [None]:
#Function to evaluate model
def evaluate_model(model, test_loader, device='cuda'):
    """Score ``model`` on every batch of ``test_loader``.

    Args:
        model: Model whose forward output exposes ``.logits``.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the model and batches are moved to.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are
        support-weighted averages over the classes.
    """
    model.to(device)
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            with autocast('cuda'):
                logits = model(input_ids, attention_mask=attention_mask).logits
                batch_preds = torch.argmax(logits, dim=1)
            predictions.extend(batch_preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    weighted_scores = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    # weighted_scores is (precision, recall, f1, support); support is dropped.
    return (accuracy_score(true_labels, predictions), *weighted_scores[:3])

In [None]:
# Evaluate model on test set
print("Evaluating model on test set...")
# evaluate_model returns a 4-tuple: (accuracy, precision, recall, f1).
metrics = evaluate_model(model, test_loader, device=device)
print("RoBERTa Evaluation Metrics:")
# Index order below follows the tuple layout noted above.
print(f"Accuracy: {metrics[0]:.4f}, Precision: {metrics[1]:.4f}, Recall: {metrics[2]:.4f}, F1-Score: {metrics[3]:.4f}")

Evaluating model on test set...


Evaluating: 100%|██████████| 300/300 [00:03<00:00, 84.42it/s]

RoBERTa Evaluation Metrics:
Accuracy: 0.6708, Precision: 0.6673, Recall: 0.6708, F1-Score: 0.6680





In [None]:
# Sample queries
queries = [
    "This product is amazing! It works perfectly and exceeded my expectations.",
    "Terrible purchase. The item broke after one use and was poorly made.",
    "The product is okay, does the job but nothing special about it.",
    "I love this! Super fast delivery and great quality for the price.",
    "Really disappointed with this. It stopped working and customer service was unhelpful."
]
print("\nProcessing sample queries...")
model.to(device)
for query in queries:
    result = predict_sentiment(model, roberta_tokenizer, query, max_length=64, device=device)
    # Echo the input, its preprocessed form and the winning label ...
    print("\nQuery:", result['original_text'])
    print("Processed Text:", result['processed_text'])
    print("Predicted Sentiment:", result['predicted_sentiment'])
    # ... followed by the per-class probabilities in a fixed order.
    print("Probabilities:")
    class_probabilities = result['probabilities']
    for sentiment in ('Negative', 'Neutral', 'Positive'):
        print(f"  {sentiment}: {class_probabilities[sentiment]:.4f}")


Processing sample queries...

Query: This product is amazing! It works perfectly and exceeded my expectations.
Processed Text: product amazing works perfectly exceeded expectations
Predicted Sentiment: Positive
Probabilities:
  Negative: 0.0379
  Neutral: 0.0212
  Positive: 0.9409

Query: Terrible purchase. The item broke after one use and was poorly made.
Processed Text: terrible purchase item broke one use poorly made
Predicted Sentiment: Negative
Probabilities:
  Negative: 0.8745
  Neutral: 0.0839
  Positive: 0.0415

Query: The product is okay, does the job but nothing special about it.
Processed Text: product okay job nothing special
Predicted Sentiment: Neutral
Probabilities:
  Negative: 0.2759
  Neutral: 0.6318
  Positive: 0.0923

Query: I love this! Super fast delivery and great quality for the price.
Processed Text: love super fast delivery great quality price
Predicted Sentiment: Positive
Probabilities:
  Negative: 0.0174
  Neutral: 0.0241
  Positive: 0.9585

Query: Really di

## UI: Flask web app exposed through ngrok

In [None]:
!pip install flask-ngrok
from flask import Flask, request, jsonify, render_template_string
from flask_ngrok import run_with_ngrok
import torch.nn.functional as F


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
# Remove ngrok configuration files left over from a previous run so the
# authtoken can be written to a clean config. Two locations are cleared;
# -f keeps the commands quiet when a file does not exist.
!rm -rf /root/.ngrok2/ngrok.yml
!rm -rf /root/.config/ngrok/ngrok.yml


In [None]:
!pip install flask flask-ngrok pyngrok --quiet


In [None]:

!wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz

!tar -xvzf ngrok-v3-stable-linux-amd64.tgz


--2025-05-10 23:34:40--  https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
Resolving bin.equinox.io (bin.equinox.io)... 13.248.244.96, 99.83.220.108, 35.71.179.82, ...
Connecting to bin.equinox.io (bin.equinox.io)|13.248.244.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9395172 (9.0M) [application/octet-stream]
Saving to: ‘ngrok-v3-stable-linux-amd64.tgz.2’


2025-05-10 23:34:41 (12.8 MB/s) - ‘ngrok-v3-stable-linux-amd64.tgz.2’ saved [9395172/9395172]

ngrok


In [None]:
import getpass
import os
import subprocess

# The ngrok authtoken is a credential and must not be stored in the notebook.
# It is taken from the NGROK_AUTHTOKEN environment variable, or prompted for
# without echo. Any token that was previously committed in this cell should
# be treated as compromised and rotated in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")

# Argument-list invocation: the token never passes through a shell.
registration = subprocess.run(
    ["./ngrok", "authtoken", ngrok_authtoken],
    check=True,
    capture_output=True,
    text=True,
)
print(registration.stdout)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from flask import Flask, request, render_template_string
from threading import Thread
import torch.nn.functional as F
import torch

# Define Flask app
# Module-level application object: the route below is registered on it with
# @app.route and it is served by app.run() from a background thread.
app = Flask(__name__)

# HTML template for the app
# Rendered with render_template_string; expects `result` to be either None or
# a mapping with 'Negative' / 'Neutral' / 'Positive' probabilities in [0, 1].
# The percentages are parenthesised before `| round(2)` because a Jinja filter
# binds tighter than `*`: without the parentheses only the literal 100 would
# be rounded and the product would be printed unrounded.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sentiment Prediction</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            background-color: #f4f7fc;
            margin: 0;
            padding: 0;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
        }

        .container {
            background-color: #fff;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
            width: 400px;
        }

        h2 {
            text-align: center;
            color: #333;
            margin-bottom: 20px;
        }

        form {
            display: flex;
            flex-direction: column;
            gap: 15px;
        }

        input[type="text"] {
            padding: 10px;
            font-size: 16px;
            border: 1px solid #ddd;
            border-radius: 4px;
            width: 100%;
            box-sizing: border-box;
        }

        input[type="submit"] {
            padding: 10px;
            background-color: #4CAF50;
            color: white;
            border: none;
            border-radius: 4px;
            cursor: pointer;
            font-size: 16px;
            transition: background-color 0.3s ease;
        }

        input[type="submit"]:hover {
            background-color: #45a049;
        }

        .result {
            margin-top: 30px;
        }

        .result h3 {
            text-align: center;
            color: #333;
        }

        .result ul {
            list-style-type: none;
            padding: 0;
        }

        .result li {
            background-color: #f1f1f1;
            padding: 10px;
            margin-bottom: 10px;
            border-radius: 5px;
            font-size: 18px;
            display: flex;
            justify-content: space-between;
        }

        .result li span {
            font-weight: bold;
        }

        .query {
            margin-top: 20px;
            font-size: 18px;
            text-align: center;
            font-weight: bold;
            color: #333;
        }

    </style>
</head>
<body>
    <div class="container">
        <h2>Sentiment Prediction</h2>
        <form method="post">
            <input name="text" type="text" placeholder="Enter your text here..." value="{{ request.form['text'] if request.method == 'POST' else '' }}" required>
            <input type="submit" value="Predict">
        </form>

        {% if result %}
            <div class="query">
                <p><strong>Your Query:</strong> "{{ request.form['text'] }}"</p>
            </div>
            <div class="result">
                <h3>Prediction Result</h3>
                <ul>
                    <li>
                        <span>Negative:</span> {{ (result['Negative']*100) | round(2) }}%
                    </li>
                    <li>
                        <span>Neutral:</span> {{ (result['Neutral']*100) | round(2) }}%
                    </li>
                    <li>
                        <span>Positive:</span> {{ (result['Positive']*100) | round(2) }}%
                    </li>
                </ul>
            </div>
        {% endif %}
    </div>
</body>
</html>

"""

# Sentiment prediction used by the web app.
# NOTE(review): this definition replaces the earlier predict_sentiment in the
# kernel namespace; its parameter order and return shape differ (flat
# probability dict only), so cells written against the earlier version will
# not work after this cell has run.
def predict_sentiment(model, tokenizer, text, device='cuda', max_length=64):
    """Return class probabilities for one piece of text.

    The text is normalised with ``preprocess_text`` before tokenization, which
    is the same preprocessing the training data went through; feeding raw
    text would give the model inputs unlike anything it was trained on.

    Args:
        model: Sequence-classification model exposing ``.logits`` (3 classes).
        tokenizer: Tokenizer callable matching ``model``.
        text: Raw input string.
        device: Device the model lives on.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        dict mapping 'Negative', 'Neutral' and 'Positive' to float probabilities.
    """
    model.eval()
    processed_text = preprocess_text(text)
    inputs = tokenizer(processed_text, return_tensors='pt', truncation=True, padding=True, max_length=max_length).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1).cpu().numpy()[0]
    # Plain Python floats, so the result can be rendered or serialised directly.
    return {
        'Negative': float(probs[0]),
        'Neutral': float(probs[1]),
        'Positive': float(probs[2])
    }

# Flask route for handling form and displaying results
@app.route("/", methods=["GET", "POST"])
def home():
    """Serve the form; on POST, also show the prediction for the submitted text."""
    if request.method != "POST":
        # Plain page load: render the empty form without a result block.
        return render_template_string(html_template, result=None)

    submitted_text = request.form['text']
    # Uses the notebook-level `model`, `roberta_tokenizer` and `device`.
    prediction = predict_sentiment(model, roberta_tokenizer, submitted_text, device=device)
    return render_template_string(html_template, result=prediction)

# Function to run Flask app in a thread
def run_flask():
    """Serve ``app`` on port 5000; blocks, so it is used as a thread target."""
    app.run(port=5000)

# Start Flask app in a separate thread - but only once. Re-running this cell
# while the previous server thread is alive would try to bind port 5000 a
# second time and fail with "Address already in use".
if 'thread' in globals() and thread.is_alive():
    print("Flask server thread is already running; not starting a second one.")
else:
    # daemon=True: the server thread must not keep the kernel process alive.
    thread = Thread(target=run_flask, daemon=True)
    thread.start()


 * Serving Flask app '__main__'
 * Debug mode: off


Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


In [None]:
from pyngrok import ngrok

# Open ngrok tunnel for Flask app running on port 5000
# ngrok.connect returns a tunnel object; its `public_url` attribute holds the
# address to share. Printing the object itself shows its repr, not a clean URL.
public_url = ngrok.connect(5000, bind_tls=True)
print(f"Public URL: {public_url.public_url}")


Public URL: NgrokTunnel: "https://e48e-34-16-228-130.ngrok-free.app" -> "http://localhost:5000"
