In [1]:
## üõ†Ô∏è Corrected Code Block

# --- INSTALLING LIBRARIES ---

!pip install --upgrade transformers accelerate datasets scikit-learn pyarrow


# --- IMPORTING LIBRARIES ---
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, RobertaModel, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, OneHotEncoder  # Added OHE for Part 2/3
from sklearn.utils.class_weight import compute_class_weight
from packaging import version # Added for sklearn check
import sklearn # Added for sklearn check
import os

RNG = 42

# --- LOADING THE DATASET ---

FILE_PATH = '/kaggle/input/non-english-reviews/Non English reviews deleted All_Language_Data - Copy.csv'
MODEL_CHECKPOINT = "roberta-base"

print(f"Loading dataset from: {FILE_PATH}")

try:
    raw_df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print("\n‚ùå ERROR: File not found! Please verify the path.")
    print("Tip: Ensure this path is correct for your Kaggle/Colab input files.")
    raise

# --- SANITY CHECK ---
needed_cols = [
    'title', 'reviewText', 'audienceScore', 'tomatoMeter', 'runtimeMinutes',
    'genre', 'language_encoded', 'director_encoded', 'sentiment_label'
]

missing = [c for c in needed_cols if c not in raw_df.columns]
if missing:
    raise ValueError(f"‚ùå CRITICAL ERROR: Your CSV is missing these columns: {missing}")

print("--- Part 1: Setup and Data Loading Complete ---")
print(f"Dataset loaded with {raw_df.shape[0]} rows.")
print(f"Target Label Source (to be binarized/filtered):\\n{raw_df['sentiment_label'].value_counts().sort_index()}")

Collecting transformers
  Downloading transformers-4.57.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pyarrow
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime

2025-11-25 12:27:31.609197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764073651.790924      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764073651.839974      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading dataset from: /kaggle/input/non-english-reviews/Non English reviews deleted All_Language_Data - Copy.csv
--- Part 1: Setup and Data Loading Complete ---
Dataset loaded with 194801 rows.
Target Label Source (to be binarized/filtered):\nsentiment_label
0    97036
1    97765
Name: count, dtype: int64


In [2]:
# --- PART 2: PREPROCESSING & FEATURE ENGINEERING (BINARY - TOP 4) ---

from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight 
import numpy as np
import torch

if 'raw_df' not in globals():
    raise ValueError("‚ùå 'raw_df' is missing! Please run Part 1 to load the data first.")

df = raw_df.copy()

# --- CLEANING & BINARIZATION ---
print("Cleaning data (already binary, no filtering needed)...")
df = df.dropna(subset=['reviewText', 'sentiment_label']).copy() 
df['label'] = df['sentiment_label'].astype(int) 

class_names = ['Negative', 'Positive'] 
N_CLASSES = 2 
                 
# Text Prep
df['title'] = df['title'].fillna("")
df['genre'] = df['genre'].fillna("Unknown")
df['combined_text'] = df['title'] + " [SEP] " + df['reviewText']

# --- STRATIFIED SPLIT ---
print("Splitting data (Stratified)...")
train_idx, test_idx = train_test_split(
    df.index, test_size=0.2, random_state=RNG, stratify=df['label']
)
train_df = df.loc[train_idx].reset_index(drop=True)
test_df  = df.loc[test_idx].reset_index(drop=True)

y_train = train_df['label'].values.astype(int)

# ==========================================
# CRITICAL: CALCULATE CLASS WEIGHTS
# ==========================================
print("Calculating class weights...")
classes_in_y = np.unique(y_train)

class_weights_np = compute_class_weight(
    class_weight="balanced",
    classes=classes_in_y,
    y=y_train
)
class_weights = torch.tensor(class_weights_np, dtype=torch.float)

if len(class_weights) == 1 and N_CLASSES == 2:
    print("[WARNING] Only one class found in y_train. Forcing weights to [1.0, 1.0].")
    class_weights = torch.tensor([1.0, 1.0], dtype=torch.float)

print("\n‚öñÔ∏è Calculated Class Weights (Neg, Pos):")
print(class_weights)

# ==========================================
# FEATURE ENGINEERING (Fit on Train) - MODIFIED FOR TOP 4
# ==========================================
print("Starting Feature Engineering (Top 4: tomatoMeter & genre)...")

# 1. Numeric: tomatoMeter
NUMERIC_COLS = ['tomatoMeter'] # Only tomatoMeter
train_numeric = train_df[NUMERIC_COLS].copy()
train_medians = train_numeric.median(numeric_only=True)
train_numeric = train_numeric.fillna(train_medians)
scaler = StandardScaler()
scaler.fit(train_numeric.values)

# 2. Categorical: Genre (Multi-Label Binarizer)
print("-> Processing Genre (MultiLabel Binarizer)...")
train_genre_list = train_df['genre'].fillna("Unknown").str.split(', ')
mlb = MultiLabelBinarizer()
mlb.fit(train_genre_list)

# NOTE: The other categorical features (language, director) and numerical (audienceScore, runtimeMinutes) 
# from the original notebook are deliberately excluded here for the "Top 4" feature set.

print("--- Part 2: Preprocessing & Weights Complete ---")

Cleaning data (already binary, no filtering needed)...
Splitting data (Stratified)...
Calculating class weights...

‚öñÔ∏è Calculated Class Weights (Neg, Pos):
tensor([1.0038, 0.9963])
Starting Feature Engineering (Top 4: tomatoMeter & genre)...
-> Processing Genre (MultiLabel Binarizer)...
--- Part 2: Preprocessing & Weights Complete ---


In [3]:
# --- PART 3: DATASET CREATION & TOKENIZATION (TOP 4) ---

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import numpy as np

# Ensure prerequisites from Part 2 exist
required_vars = ['train_df', 'test_df', 'scaler', 'mlb', 'train_medians']
if not all(v in globals() for v in required_vars):
    raise ValueError("‚ùå Missing variables from Part 2. Please run the previous cell first.")

NUMERIC_COLS = ['tomatoMeter'] 
MODEL_CHECKPOINT = "roberta-base"

# Define helper function (MODIFIED)
def build_features(split_df):
    out = {}
    out['combined_text'] = split_df['combined_text'].tolist()
    out['label'] = split_df['label'].astype(int).tolist()

    # Numeric (1 feature)
    numeric = split_df[NUMERIC_COLS].copy().fillna(train_medians)
    out['numerical_features'] = scaler.transform(numeric.values).astype(np.float32)

    # Genre (multi-hot)
    genre_list = split_df['genre'].fillna("Unknown").str.split(', ')
    
    # Categorical features (Only Genre)
    out['categorical_features'] = mlb.transform(genre_list).astype(np.float32)
    
    return out

print("Building features dictionaries...")
train_feats = build_features(train_df)
test_feats  = build_features(test_df)

# Dimensions
num_numerical_features = len(NUMERIC_COLS) # Should be 1
num_categorical_features = train_feats['categorical_features'].shape[1]
print(f"Feature Dims: Numeric={num_numerical_features}, Categorical={num_categorical_features}")

# HF Dataset
print("Converting to Hugging Face Datasets...")
raw_datasets = DatasetDict({
    'train': Dataset.from_dict(train_feats),
    'test':  Dataset.from_dict(test_feats),
})

# Tokenization
print(f"Downloading Tokenizer ({MODEL_CHECKPOINT})...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT) 
except Exception as e:
    print("\n‚ùå ERROR: Could not download tokenizer. Check internet connection.")
    raise e

def tokenize_fn(examples):
    # Truncate/pad to 256 tokens for memory efficiency
    return tokenizer(examples["combined_text"], padding="max_length", truncation=True, max_length=256)

print("Tokenizing dataset (this may take a moment)...")
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["combined_text"])
tokenized_datasets.set_format("torch")

print("--- Part 3: Dataset Ready & Tokenized ---")

Building features dictionaries...
Feature Dims: Numeric=1, Categorical=32
Converting to Hugging Face Datasets...
Downloading Tokenizer (roberta-base)...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing dataset (this may take a moment)...


Map:   0%|          | 0/155840 [00:00<?, ? examples/s]

Map:   0%|          | 0/38961 [00:00<?, ? examples/s]

--- Part 3: Dataset Ready & Tokenized ---


In [4]:
# --- PART 4: MODEL SETUP & TRAINING (BINARY RoBERTa - TOP 4) ---

import torch
import torch.nn as nn
from transformers import Trainer, TrainingArguments, RobertaModel 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# NOTE: Variables N_CLASSES, MODEL_CHECKPOINT, class_weights, 
# num_numerical_features, and num_categorical_features are assumed to be 
# defined in Parts 1, 2, and 3.

# --- MODEL ARCHITECTURE ---
class MultimodalClassifier(nn.Module):
    def __init__(self, num_labels, num_numerical, num_categorical):
        super().__init__()
        # Initialize RobertaModel
        self.roberta = RobertaModel.from_pretrained(MODEL_CHECKPOINT) 
        self.dropout = nn.Dropout(0.2)
        # Hidden size for roberta-base is 768
        self.classifier = nn.Linear(
            self.roberta.config.hidden_size + num_numerical + num_categorical,
            num_labels
        )
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask, numerical_features, categorical_features, labels=None):
        roberta_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # RoBERTa uses the first token's output for classification
        pooled_output = self.dropout(roberta_output.last_hidden_state[:, 0])

        # Reshape the numerical_features if needed (Crucial for 1D auxiliary features)
        if numerical_features.dim() == 1:
             numerical_features = numerical_features.unsqueeze(1)
        elif numerical_features.dim() == 2 and numerical_features.shape[1] == 1:
             # Already correct for 1 feature
             pass

        combined = torch.cat([pooled_output, numerical_features, categorical_features], dim=1)
        logits = self.classifier(combined)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else (None, logits)

# --- CUSTOM TRAINER (Handles weighted loss) ---
class WeightedTrainer(Trainer):
    # üí° FIX: Added **kwargs to catch unexpected arguments like 'num_items_in_batch'
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs[1]

        classifier_layer = model.module.classifier if hasattr(model, "module") else model.classifier
        weights = class_weights.to(classifier_layer.weight.device)

        # Weighted Loss Calculation
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# --- DATA COLLATOR, METRICS (Binary specific) ---
class MultimodalDataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    def __call__(self, features):
        batch = self.tokenizer.pad(
            [{"input_ids": f["input_ids"], "attention_mask": f["attention_mask"]} for f in features],
            return_tensors="pt"
        )
        batch['labels'] = torch.tensor([f['label'] for f in features], dtype=torch.long)
        batch['numerical_features'] = torch.stack([f['numerical_features'] for f in features])
        batch['categorical_features'] = torch.stack([f['categorical_features'] for f in features])
        return batch

def compute_metrics(pred):
    labels = pred.label_ids
    predictions = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = predictions.argmax(-1)
    
    # Use 'binary' average for the binary task
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0) 
    acc = accuracy_score(labels, preds)
    
    return {'accuracy': acc, 'f1': f1, 'precision': prec, 'recall': rec}

# --- INIT MODEL ---
multimodal_model = MultimodalClassifier(
    num_labels=N_CLASSES,
    num_numerical=num_numerical_features,
    num_categorical=num_categorical_features
)

OUTPUT_DIR = "/kaggle/working/Model_Results_Binary_Weighted_RoBERTa_Top4" 

multimodal_training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=2e-5,
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,
    num_train_epochs=3, 
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy", 
    greater_is_better=True,
    save_total_limit=2,
    report_to="none",
    logging_steps=100,
    fp16=True 
)

multimodal_trainer = WeightedTrainer(
    model=multimodal_model,
    args=multimodal_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=MultimodalDataCollator(tokenizer),
    compute_metrics=compute_metrics,
)

print(f"--- Part 4: RoBERTa Base Weighted Training Setup Complete (Top 4 Features) ---")
print(f"Saving checkpoints to: {OUTPUT_DIR}")
print(f"Training Batch Size: {multimodal_training_args.per_device_train_batch_size}")

multimodal_trainer.train()
print("--- Training Complete ---") # Corrected print statement after .train()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


--- Part 4: RoBERTa Base Weighted Training Setup Complete (Top 4 Features) ---
Saving checkpoints to: /kaggle/working/Model_Results_Binary_Weighted_RoBERTa_Top4
Training Batch Size: 16




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2783,0.276375,0.89143,0.893751,0.878178,0.909886
2,0.1967,0.265321,0.901466,0.902388,0.8973,0.907533
3,0.148,0.293803,0.902338,0.903385,0.897075,0.909784




--- Training Complete ---


In [5]:
# --- PART 5: EVALUATION & SAVING (KAGGLE VERSION - BINARY - TOP 4) ---

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import shutil 

# NOTE: The line below assumes training (multimodal_trainer.train()) was executed in Part 4.
# If you run this without training, it will raise an error.

# =========================
# EVALUATE ‚Ä¢ REPORT ‚Ä¢ PLOTS ‚Ä¢ SAVE
# =========================
print("\n--- Final Evaluation on Test Set ---")

# Use the trainer and dataset from previous parts
trainer = multimodal_trainer 
tokenized_datasets = tokenized_datasets 

# 1. PLOT LOSS CURVES 
# NOTE: This requires log_history to be populated by the trainer.train() call.
try:
    log_history = trainer.state.log_history
    train_logs = [log for log in log_history if ('loss' in log and 'eval_loss' not in log)]
    eval_logs = [log for log in log_history if ('eval_loss' in log)]

    plt.figure(figsize=(12, 5))

    # Loss Plot
    plt.subplot(1, 2, 1)
    plt.plot([log.get('epoch', i) for i, log in enumerate(train_logs, 1)], [log['loss'] for log in train_logs], label='Training Loss')
    plt.plot([log.get('epoch', i) for i, log in enumerate(eval_logs, 1)], [log['eval_loss'] for log in eval_logs], label='Validation Loss')
    plt.xlabel('Epoch'); plt.ylabel('Loss'); plt.title('Training vs Validation Loss'); plt.legend()

    # Accuracy Plot
    plt.subplot(1, 2, 2)
    if len(eval_logs) > 0 and 'eval_accuracy' in eval_logs[0]:
        plt.plot([log.get('epoch', i) for i, log in enumerate(eval_logs, 1)], [log['eval_accuracy'] for log in eval_logs], label='Validation Accuracy')
        plt.xlabel('Epoch'); plt.ylabel('Accuracy'); plt.title('Validation Accuracy'); plt.legend()
    plt.tight_layout()
    plt.savefig("loss_and_accuracy_top4.png")
    plt.close()

except AttributeError:
    print("Skipping plots: Training history not available (trainer.train() might be commented out or failed).")


# 2. PREDICTIONS 
print("Generating predictions...")
preds_output = trainer.predict(tokenized_datasets["test"])

logits = preds_output.predictions[0] if isinstance(preds_output.predictions, tuple) else preds_output.predictions 

pred_labels = np.argmax(logits, axis=-1)
true_labels = np.array(tokenized_datasets["test"]["label"])

# 3. CLASSIFICATION REPORT 
print("\n--- Classification Report (BINARY RoBERTa - Top 4 Features) ---")
report_text = classification_report(true_labels, pred_labels, target_names=class_names, digits=4, zero_division=0)
print(report_text)

# 4. CONFUSION MATRIX 
cm = confusion_matrix(true_labels, pred_labels)
row_sums = cm.sum(axis=1, keepdims=True); row_sums[row_sums == 0] = 1 
cm_norm = cm.astype('float') / row_sums

plt.figure(figsize=(12, 5))

# Counts (Integers)
plt.subplot(1, 2, 1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix (Counts)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Normalized (Floats with 4 decimals)
plt.subplot(1, 2, 2)
sns.heatmap(cm_norm, annot=True, fmt='.4f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix (Normalized)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

plt.tight_layout()
plt.savefig("confusion_matrix_top4.png")
plt.close()

# 5. PER-CLASS BAR CHART
report_dict = classification_report(true_labels, pred_labels, target_names=class_names, output_dict=True, zero_division=0)
report_df = pd.DataFrame(report_dict).transpose()
report_df_classes = report_df.loc[[cn for cn in class_names if cn in report_df.index]]

ax = report_df_classes[['precision', 'recall', 'f1-score']].plot(kind='bar', figsize=(10, 6))
plt.title('Per-Class Metrics (BINARY RoBERTa - Top 4 Features)')
plt.xlabel('Classes'); plt.ylabel('Score')
plt.xticks(rotation=0); plt.grid(axis='y', linestyle='--'); plt.legend(loc='lower right')
plt.savefig("per_class_metrics_top4.png")
plt.close()

# 6. SAVE (KAGGLE SPECIFIC)
FINAL_MODEL_DIR = "Final_Model_Binary_RoBERTa_Base_Top4_Features" 
FINAL_MODEL_PATH = f"/kaggle/working/{FINAL_MODEL_DIR}"

print(f"\nüíæ Saving model to {FINAL_MODEL_PATH}...")
trainer.save_model(FINAL_MODEL_PATH)

# --- ZIP FOR DOWNLOAD ---
print("üì¶ Zipping model for easy download...")
shutil.make_archive(f"/kaggle/working/{FINAL_MODEL_DIR}", 'zip', FINAL_MODEL_PATH)

print(f"‚úÖ DONE! You can now download '{FINAL_MODEL_DIR}.zip' from the 'Output' tab.")


--- Final Evaluation on Test Set ---
Generating predictions...





--- Classification Report (BINARY RoBERTa - Top 4 Features) ---
              precision    recall  f1-score   support

    Negative     0.9078    0.8948    0.9013     19408
    Positive     0.8971    0.9098    0.9034     19553

    accuracy                         0.9023     38961
   macro avg     0.9024    0.9023    0.9023     38961
weighted avg     0.9024    0.9023    0.9023     38961


üíæ Saving model to /kaggle/working/Final_Model_Binary_RoBERTa_Base_Top4_Features...
üì¶ Zipping model for easy download...
‚úÖ DONE! You can now download 'Final_Model_Binary_RoBERTa_Base_Top4_Features.zip' from the 'Output' tab.
