<a href="https://colab.research.google.com/github/salehasharmeen/NLP-Task-MultiLingual-Sentiment-Analysis/blob/main/Multilingual_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Step 1: Install Required Libraries
!pip install transformers datasets accelerate evaluate streamlit pyngrok pandas torch -q

In [None]:
# Step 2: Import Libraries and Set Up Environment
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint
import numpy as np
import evaluate
import os
from pyngrok import ngrok

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Step 3: Loading and preparing dataset

print("Loading and preparing dataset...")

# Pipeline, in order:
#   1. take the first 10,000 rows of the 'amazon_polarity' train split from the
#      Hugging Face Hub (it already ships a binary 'label' column:
#      1=positive, 0=negative, so no label mapping is required);
#   2. rename the text column 'content' to 'review_body', the name the
#      tokenization step reads;
#   3. carve out a 20% test split with a fixed seed so the split is repeatable.
dataset = (
    load_dataset("amazon_polarity", split="train[:10000]")
    .rename_column("content", "review_body")
    .train_test_split(test_size=0.2, seed=42)
)

print("Dataset prepared. Train/Test split created.")
print(f"Training examples: {len(dataset['train'])}")
print(f"Testing examples: {len(dataset['test'])}")

Loading and preparing dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset prepared. Train/Test split created.
Training examples: 8000
Testing examples: 2000


In [None]:
# Step 4: Tokenization
print("Loading tokenizer and tokenizing data...")

# Checkpoint identifier; Step 5 reuses `model_name` to load the classifier,
# so the tokenizer and the model always come from the same checkpoint.
model_name = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the 'review_body' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Returns whatever the module-level `tokenizer` returns for that batch.
    """
    review_texts = examples['review_body']
    return tokenizer(
        review_texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Encode both splits in batches, then drop the raw text columns, which the
# model does not consume.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["title", "review_body"])
)

# Have the datasets hand back PyTorch tensors (applied in place).
tokenized_datasets.set_format("torch")

print("Tokenization complete.")

Loading tokenizer and tokenizing data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenization complete.


In [None]:
# Step 5: Load Model and Define Training Arguments
print("Loading model and setting up training...")

# Two output classes: Negative, Positive
num_labels = 2

# Load the checkpoint with a classification head sized for `num_labels`,
# then move it to the device selected in Step 2.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
model.to(device)

# Accuracy metric used by compute_metrics during evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    `eval_pred` is unpacked into (logits, labels). The predicted class of
    each row is the index of its largest logit along the last axis.

    Returns the result of `metric.compute` for those predictions.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(
        references=gold_labels,
        predictions=predicted_classes,
    )

# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluation / checkpointing: once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given, so the library default
    # decides which checkpoint counts as "best" — confirm that is intended.
    load_best_model_at_end=True,
    # --- integrations ---
    push_to_hub=False,   # keep the model local; do not upload to the Hub
    report_to="none",    # no experiment-tracking / logging integrations
)

# Wire the model, data, tokenizer and metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

Loading model and setting up training...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 6: Fine-tune the Model
print("Starting model training...")
trainer.train()
print("Model training complete.")

# Write the model first, then the tokenizer, into the directory that the
# Streamlit app (Step 8) loads from.
model_path = "./fine-tuned-model"
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(model_path)
print("Model and tokenizer saved to disk.")


Starting model training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3991,0.37026,0.848
2,0.2238,0.344287,0.8735
3,0.1285,0.473119,0.881


Model training complete.
Model and tokenizer saved to disk.


In [None]:
# Step 6b: Save the Model to Google Drive
#
# Optional persistence step: run once after Step 6 to copy the fine-tuned
# model and tokenizer to Google Drive.
# ==============================================================================
import os
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Target folder on Drive; created if it is not there yet
drive_path = os.path.join("/content/drive", "My Drive", "sentiment_model")
os.makedirs(drive_path, exist_ok=True)

# Model first, tokenizer second — same order as the local save in Step 6
trainer.save_model(drive_path)
tokenizer.save_pretrained(drive_path)

print(f"Model and tokenizer saved to Google Drive at: {drive_path}")


Mounted at /content/drive
Model and tokenizer saved to Google Drive at: /content/drive/My Drive/sentiment_model


In [None]:
# Step 7: Evaluate the Fine-Tuned Model
print("Starting model evaluation...")
evaluation_results = trainer.evaluate()
print("Model evaluation complete.")

# Report every entry of the returned results mapping, one per line.
print("\n=== Model Evaluation Results ===")
for metric_name in evaluation_results:
    print(f"{metric_name}: {evaluation_results[metric_name]}")

print("\nEvaluation complete. The model is ready for inference.")


Starting model evaluation...


Model evaluation complete.

=== Model Evaluation Results ===
eval_loss: 0.34428706765174866
eval_accuracy: 0.8735
eval_runtime: 13.6144
eval_samples_per_second: 146.904
eval_steps_per_second: 9.181
epoch: 3.0

Evaluation complete. The model is ready for inference.


In [None]:
# Step 8: Streamlit App
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to load model and tokenizer
@st.cache_resource
def load_model():
    """
    Loads the fine-tuned model and tokenizer from the saved directory.
    This function is cached to prevent reloading the model on every interaction.
    """
    model_path = "./fine-tuned-model"
    if not os.path.exists(model_path):
        st.error(f"Error: The model directory '{model_path}' was not found. Please make sure you have run the training step (Step 6) to fine-tune and save the model.")
        return None, None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        model.to(device)
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None, None

# Load the model and tokenizer
tokenizer, model = load_model()

# Set up the Streamlit app
st.set_page_config(page_title="Multilingual Sentiment Analysis", layout="wide")
st.title("🌎 Multilingual Sentiment Analysis")
st.markdown("Enter a product review in a language like English, Spanish, or French and get the sentiment prediction.")

# Text input for the user
user_input = st.text_area("Enter review text here:", height=200, placeholder="e.g., This product is amazing!")

if st.button("Analyze Sentiment"):
    if user_input and tokenizer and model:
        # Define labels for the output
        labels = ["Negative", "Positive"] # Updated labels for the new dataset

        # Tokenize the user input
        inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True).to(device)

        with st.spinner("Analyzing..."):
            with torch.no_grad():
                # Get the model's predictions
                outputs = model(**inputs)
                logits = outputs.logits
                probabilities = torch.nn.functional.softmax(logits, dim=1)
                predicted_class_id = torch.argmax(probabilities, dim=1).item()

        # Display the results
        st.subheader("Results")
        st.success(f"**Predicted Sentiment:** {labels[predicted_class_id]}")
        st.info(f"**Confidence:** {probabilities[0][predicted_class_id].item():.2f}")
    else:
        st.warning("Please enter some text to analyze and ensure the model is loaded correctly.")


Overwriting app.py


In [None]:
!pip install pyngrok



In [None]:
# Step 9: Launch the Streamlit App with Ngrok
import time
import subprocess
import os
from pyngrok import ngrok

# Install Streamlit to ensure it's available to the current Python environment
# We use `sys.executable` to install to the correct Python interpreter.
import sys
!{sys.executable} -m pip install streamlit

# Kill any existing ngrok processes to free up tunnels
!killall ngrok

# Get your Ngrok authentication token from https://dashboard.ngrok.com/auth/your-authtoken
# You must paste your token here or set it as a Colab secret.
# In a real project, you would store this securely.
!ngrok authtoken 31aOG5rc2eTiMll0szeQJIWrf36_254v194Qn4WdcdQoFdvbr

# Start the Streamlit app using subprocess.Popen and capture output
process = subprocess.Popen(["python", "-m", "streamlit", "run", "app.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print("Starting Streamlit app... Please wait a moment.")

# Give the app a moment to start up
time.sleep(10)

# Check for errors from the Streamlit process
if process.poll() is not None:
    stdout_output, stderr_output = process.communicate()
    print("Streamlit App has terminated with an error:")
    print("--- Standard Output ---")
    print(stdout_output.decode())
    print("--- Standard Error ---")
    print(stderr_output.decode())
else:
    # Start the ngrok tunnel
    public_url = ngrok.connect(addr="8501", proto="http")
    print("Streamlit App URL:", public_url)

    print("Streamlit app is running! Click the URL above to access it.")


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Starting Streamlit app... Please wait a moment.
Streamlit App URL: NgrokTunnel: "https://32ecc379643d.ngrok-free.app" -> "http://localhost:8501"
Streamlit app is running! Click the URL above to access it.
