In [5]:
# Optimized Netflix Review Sentiment Classification

# Step 1: Install & Import Libraries
!pip install -q transformers datasets nltk spacy scikit-learn
!python -m spacy download en_core_web_sm

import pandas as pd
import numpy as np
import re
import nltk
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer, create_optimizer
import tensorflow as tf


# Download NLP resources
nltk.download('stopwords')
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:

# Step 2: Load Dataset
data = pd.read_csv('netflix_reviews.csv')
data['content'] = data['content'].fillna('').astype(str)


In [7]:
# Step 3: Clean Text
def clean_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha]
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return ' '.join([token for token in tokens if token not in stop_words])

data['processed_content'] = data['content'].apply(clean_text)


In [8]:

# Step 4: Label Encoding
data['label'] = data['score'].apply(lambda x: 1 if x >= 3 else 0)


In [9]:
# Step 5: Train-Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['processed_content'], data['label'], test_size=0.2, random_state=42
)


In [10]:

# Step 6: Tokenization for RoBERTa
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def encode(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

train_enc = encode(train_texts)
test_enc = encode(test_texts)

train_labels_tf = tf.convert_to_tensor(train_labels.values)
test_labels_tf = tf.convert_to_tensor(test_labels.values)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
# Step 7: RoBERTa Model
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

batch_size = 16
epochs = 5
steps = len(train_labels) // batch_size * epochs
warmup = int(0.1 * steps)

optimizer, _ = create_optimizer(init_lr=2e-5, num_train_steps=steps, num_warmup_steps=warmup)
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [12]:

# Step 8: Train RoBERTa
model.fit(
    [train_enc['input_ids'], train_enc['attention_mask']],
    train_labels_tf,
    validation_data=([test_enc['input_ids'], test_enc['attention_mask']], test_labels_tf),
    batch_size=batch_size,
    epochs=epochs
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7aa55d727550>

In [13]:
# Step 9: Evaluate RoBERTa
preds_bert = model.predict([test_enc['input_ids'], test_enc['attention_mask']])
preds_bert = tf.argmax(preds_bert.logits, axis=1).numpy()

print("\n=== RoBERTa Classification Report ===")
print(classification_report(test_labels, preds_bert))



=== RoBERTa Classification Report ===
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     11060
           1       0.87      0.83      0.85     11554

    accuracy                           0.85     22614
   macro avg       0.85      0.85      0.85     22614
weighted avg       0.85      0.85      0.85     22614



In [14]:
# Step 10: TF-IDF + Logistic Regression
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_test_tfidf = vectorizer.transform(test_texts)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, train_labels)
preds_lr = log_reg.predict(X_test_tfidf)

print("\n=== TF-IDF + Logistic Regression Report ===")
print(classification_report(test_labels, preds_lr))



=== TF-IDF + Logistic Regression Report ===
              precision    recall  f1-score   support

           0       0.81      0.86      0.84     11060
           1       0.86      0.81      0.84     11554

    accuracy                           0.84     22614
   macro avg       0.84      0.84      0.84     22614
weighted avg       0.84      0.84      0.84     22614



In [15]:
# Step 11: Comparison
acc_bert = accuracy_score(test_labels, preds_bert)
acc_lr = accuracy_score(test_labels, preds_lr)

print("\n📊 Accuracy Comparison:")
print(f"🔹 RoBERTa Accuracy: {acc_bert:.4f}")
print(f"🔹 TF-IDF + Logistic Regression Accuracy: {acc_lr:.4f}")



📊 Accuracy Comparison:
🔹 RoBERTa Accuracy: 0.8499
🔹 TF-IDF + Logistic Regression Accuracy: 0.8367
