In [5]:
!pip install pandas scikit-learn numpy torch transformers tqdm
!pip install datasets




In [6]:
!pip install -q pandas scikit-learn numpy torch transformers tqdm


In [7]:
import pandas as pd

# Upload reviews.csv through the Colab file picker and load it into a DataFrame.
import io

from google.colab import files

uploaded = files.upload()  # choose reviews.csv

if uploaded:
    # Parse the uploaded bytes instead of a hard-coded path on disk: when a file
    # with the same name already exists, Colab saves the new upload under a
    # different name (e.g. "reviews (1).csv"), so "reviews.csv" on disk may be a
    # stale copy from an earlier run.
    uploaded_bytes = next(iter(uploaded.values()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (picker cancelled): fall back to a copy that is
    # already present in the working directory.
    df = pd.read_csv("reviews.csv")

print(df.head())
print("Columns:", df.columns.tolist())


Saving reviews.csv to reviews (1).csv
                                              review     label
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Columns: ['review', 'label']


In [8]:
import re

# Encode the sentiment strings as integers: "positive" -> 1, "negative" -> 0.
# A label that is not exactly one of these two strings is absent from the
# mapping and therefore becomes NaN in "label_num".
label_to_id = {"negative": 0, "positive": 1}
df["label_num"] = df["label"].map(label_to_id)

def clean_text(text: str) -> str:
    """Normalize a raw review into lowercase alphanumeric words separated by single spaces.

    Steps, in order: lowercase; replace ``<br>`` / ``<br />`` tags with a space;
    delete URLs (runs of non-whitespace starting with ``http`` or ``www``);
    replace every character that is not ``a-z``, ``0-9`` or whitespace with a
    space; collapse whitespace runs and trim both ends.

    Args:
        text: Raw review text. Other non-missing values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned text, or ``""`` when the input is missing.
    """
    # Without this guard str() would turn a missing value into the literal word
    # "none" / "nan", which would then be processed as if it were review text.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"<br\s*/?>", " ", text)          # remove HTML breaks
    text = re.sub(r"http\S+|www\S+", "", text)      # remove URLs
    text = re.sub(r"[^a-z0-9\s]", " ", text)        # keep letters/numbers/spaces
    text = re.sub(r"\s+", " ", text).strip()        # collapse spaces
    return text

# Drop rows with a missing review or an unmapped label BEFORE cleaning.
# clean_text() always returns a string, so a missing review would never show up
# as NaN in "clean_review" and a dropna on that column could not remove it.
df = df.dropna(subset=["review", "label_num"]).copy()

# With the NaN labels gone the column can hold plain integers (0/1) again.
df["label_num"] = df["label_num"].astype(int)

df["clean_review"] = df["review"].apply(clean_text)

# Keep only the first occurrence of each cleaned review text.
df = df.drop_duplicates(subset=["clean_review"])

print(df.head())
print(df["label_num"].value_counts())


                                              review     label  label_num  \
0  One of the other reviewers has mentioned that ...  positive          1   
1  A wonderful little production. <br /><br />The...  positive          1   
2  I thought this was a wonderful way to spend ti...  positive          1   
3  Basically there's a family where a little boy ...  negative          0   
4  Petter Mattei's "Love in the Time of Money" is...  positive          1   

                                        clean_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production the filming tech...  
2  i thought this was a wonderful way to spend ti...  
3  basically there s a family where a little boy ...  
4  petter mattei s love in the time of money is a...  
label_num
1    24881
0    24695
Name: count, dtype: int64


In [9]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the test set, stratified on the label.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label_num"]
)

# Stage 2: split 10% of the remaining rows off as the validation set, again
# stratified on the label. train_df is rebound to the rows that are left.
train_df, val_df = train_test_split(
    train_df, test_size=0.1, random_state=42, stratify=train_df["label_num"]
)

# Show the resulting (rows, columns) of each split.
train_df.shape, val_df.shape, test_df.shape


((35694, 4), (3966, 4), (9916, 4))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Unigram + bigram TF-IDF features, capped at 5000 terms. The vocabulary is
# fitted on the training split only and then re-used for validation and test.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tf = vectorizer.fit_transform(train_df["clean_review"])
X_val_tf, X_test_tf = (
    vectorizer.transform(part["clean_review"]) for part in (val_df, test_df)
)

# Integer targets for each split.
y_train, y_val, y_test = (
    part["label_num"].values for part in (train_df, val_df, test_df)
)

# Baseline model: logistic regression on the TF-IDF features.
baseline_clf = LogisticRegression(max_iter=1000)
baseline_clf.fit(X_train_tf, y_train)

val_preds = baseline_clf.predict(X_val_tf)
test_preds = baseline_clf.predict(X_test_tf)

print("Validation performance (TF-IDF baseline):")
print(classification_report(y_val, val_preds))

print("Test performance (TF-IDF baseline):")
print(classification_report(y_test, test_preds))


Validation performance (TF-IDF baseline):
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1976
           1       0.89      0.89      0.89      1990

    accuracy                           0.89      3966
   macro avg       0.89      0.89      0.89      3966
weighted avg       0.89      0.89      0.89      3966

Test performance (TF-IDF baseline):
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4939
           1       0.89      0.91      0.90      4977

    accuracy                           0.89      9916
   macro avg       0.89      0.89      0.89      9916
weighted avg       0.89      0.89      0.89      9916



In [11]:
import torch
import numpy as np
from transformers import DistilBertTokenizerFast, DistilBertModel
from tqdm.auto import tqdm

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Load the tokenizer and the encoder from the same pretrained checkpoint, then
# move the encoder onto the selected device.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
bert_model = DistilBertModel.from_pretrained(checkpoint)
bert_model = bert_model.to(device)

def get_bert_embeddings(text_list, batch_size=32):
    """
    Convert a list of texts into a numpy array of BERT [CLS] embeddings.

    Texts are tokenized and sent through ``bert_model`` in batches instead of
    one forward pass per text. Every text is truncated / padded to 128 tokens.

    Args:
        text_list: Iterable of strings to embed.
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        2-D array with one row per input text; row ``i`` is the last hidden
        state at token position 0 for text ``i``. An empty input yields an
        array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(text_list)
    if not texts:
        # np.vstack([]) raises ValueError, so return a well-formed empty matrix.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    # The progress bar advances once per batch.
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # CLS token representation (first token) for every text in the batch
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(cls_embeddings)

    return np.vstack(all_embeddings)


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
!pip install -q sentence-transformers

import torch
from sentence_transformers import SentenceTransformer

# Pick the device name for the sentence encoder: GPU if CUDA is available, else CPU.
# Note that this rebinds `device` to a plain string.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Load the pretrained MiniLM sentence encoder onto that device.
st_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)


Using device: cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
def _encode_reviews(frame):
    """Embed the ``clean_review`` column of ``frame`` with the MiniLM sentence encoder."""
    return st_model.encode(
        frame["clean_review"].tolist(),
        batch_size=128,
        show_progress_bar=True,
    )


# One embedding matrix per split (much faster than the per-text DistilBERT loop).
X_train_bert = _encode_reviews(train_df)
X_val_bert = _encode_reviews(val_df)
X_test_bert = _encode_reviews(test_df)

# Integer targets for each split.
y_train, y_val, y_test = (
    part["label_num"].values for part in (train_df, val_df, test_df)
)

# Show the (rows, embedding_dim) of each matrix.
X_train_bert.shape, X_val_bert.shape, X_test_bert.shape


Batches:   0%|          | 0/279 [00:00<?, ?it/s]

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

((35694, 384), (3966, 384), (9916, 384))

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic-regression classifier trained on the sentence embeddings.
bert_clf = LogisticRegression(max_iter=2000)
bert_clf.fit(X_train_bert, y_train)

val_preds_bert = bert_clf.predict(X_val_bert)
test_preds_bert = bert_clf.predict(X_test_bert)

# Report precision / recall / F1 on validation first, then on test.
for split_name, y_true, y_pred in (
    ("Validation", y_val, val_preds_bert),
    ("Test", y_test, test_preds_bert),
):
    print(f"{split_name} performance (BERT embeddings):")
    print(classification_report(y_true, y_pred))


Validation performance (BERT embeddings):
              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1976
           1       0.83      0.82      0.83      1990

    accuracy                           0.83      3966
   macro avg       0.83      0.83      0.83      3966
weighted avg       0.83      0.83      0.83      3966

Test performance (BERT embeddings):
              precision    recall  f1-score   support

           0       0.83      0.82      0.83      4939
           1       0.83      0.83      0.83      4977

    accuracy                           0.83      9916
   macro avg       0.83      0.83      0.83      9916
weighted avg       0.83      0.83      0.83      9916



In [20]:
import random

def summarize_review(text, pred_label):
    """Return a one-sentence templated summary for a predicted sentiment.

    The sentence is a fixed template that varies only with ``pred_label``:
    a value equal to 1 yields "positive", anything else yields "negative".
    ``text`` is accepted but not used when building the sentence.
    """
    if pred_label == 1:
        sentiment_word = "positive"
    else:
        sentiment_word = "negative"

    opening = f"This review expresses a {sentiment_word} opinion about the movie, "
    closing = f"highlighting aspects related to its story, characters, and overall experience."
    return opening + closing

# Demo: draw one review at random from the held-out test set.
sample = test_df.sample(1).iloc[0]
sample_text, true_label = sample["review"], sample["label_num"]

# encode() is given a one-element list of texts; [0] unwraps the single prediction.
sample_embedding = st_model.encode([sample["clean_review"]])
pred_label = bert_clf.predict(sample_embedding)[0]

true_name = "Positive" if true_label == 1 else "Negative"
pred_name = "Positive" if pred_label == 1 else "Negative"

print("Original review:\n", sample_text)
print("\nTrue sentiment:", true_name)
print("Predicted sentiment:", pred_name)
print("\nGenerated summary:")
print(summarize_review(sample_text, pred_label))


Original review:
 I can't believe people are looking for a plot in this film. This is Laural and Hardy. Lighten up already. These two were a riot. Their comic genius is as funny today as it was 70 years ago. Not a filthy word out of either mouth and they were able to keep audiences in stitches. Their comedy wasn't sophisticated by any stretch. If a whoopee cushion can't make you grin, there's no reason to watch any of the stuff these guys did. It was a simpler time, and people laughed at stuff that was funny without a plot. I guess it takes a simple mind to enjoy this stuff, so I qualify. Two man comedy teams don't compute, We're just too sophisticated... Aren't we fortunate?

True sentiment: Positive
Predicted sentiment: Negative

Generated summary:
This review expresses a negative opinion about the movie, highlighting aspects related to its story, characters, and overall experience.


In [21]:
!pip install gradio

import gradio as gr

def predict_and_summarize(text):
    """Classify one user-supplied review and build its templated summary.

    Args:
        text: Raw review text entered in the UI (may be empty or ``None``).

    Returns:
        Tuple ``(sentiment, summary)``. ``sentiment`` is "Positive" or
        "Negative" and ``summary`` comes from ``summarize_review``. When the
        input has nothing left after cleaning (blank, ``None``, or only
        punctuation), no prediction is made and ``("N/A", <prompt>)`` is
        returned instead.
    """
    # Skip cleaning for falsy input; otherwise use the same normalization that
    # was applied to the training data.
    clean = clean_text(text) if text else ""
    if not clean:
        # Embedding an empty string would still yield a confident-looking label.
        return "N/A", "Please enter some review text to analyze."

    pred = bert_clf.predict(st_model.encode([clean]))[0]
    sentiment = "Positive" if pred == 1 else "Negative"
    return sentiment, summarize_review(text, pred)

# Minimal Gradio UI: one free-text input, two text outputs (the sentiment and
# the summary, in the order predict_and_summarize returns them).
demo = gr.Interface(
    fn=predict_and_summarize,
    inputs="text",
    outputs=["text", "text"],
    title="🎬 Movie Review Sentiment Analyzer",
)

# Start the app server and render the interface.
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e350eb16717c0909f6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


