In [None]:

!pip install transformers datasets torch scikit-learn pandas numpy


In [None]:
!pip install datasets transformers torch pandas

### Import Libraries

This cell imports all the necessary libraries for the project, including `pandas`, `numpy`, `torch`, `transformers`, `sklearn`, and `datasets`.

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import datasets
from datasets import Dataset
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

### Data Loading

This cell loads the dataset to be used for training and evaluation.

In [None]:
# Load the Best Social Media Dataset
def load_best_social_media_dataset():
    """
    Load the TweetEval sentiment dataset from the Hugging Face Hub.

    Returns:
        Whatever ``load_dataset("cardiffnlp/tweet_eval", "sentiment")``
        returns (a DatasetDict with train / validation / test splits).
    """
    # Option 1: use the pre-processed social media dataset as-is
    tweet_data = load_dataset("cardiffnlp/tweet_eval", "sentiment")

    # Option 2 (optional): combine with other social media datasets here
    # before returning
    return tweet_data


# Load exactly once, through the helper. The dataset used to be loaded
# directly and then loaded again via this function.
dataset = load_best_social_media_dataset()
print("Dataset loaded successfully!")
print(dataset)

README.md: 0.00B [00:00, ?B/s]

sentiment/train-00000-of-00001.parquet:   0%|          | 0.00/3.78M [00:00<?, ?B/s]

sentiment/test-00000-of-00001.parquet:   0%|          | 0.00/901k [00:00<?, ?B/s]

sentiment/validation-00000-of-00001.parq(…):   0%|          | 0.00/167k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45615 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12284 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset loaded successfully!
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [None]:
# The dataset structure:
#  train: training data
#  validation: validation data
#  test: test data

# Labels: 0=negative, 1=neutral, 2=positive
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# Convert each split to pandas for easier manipulation. `Dataset.to_pandas()`
# is the library's own conversion and avoids building the frame by iterating
# over the split one row at a time.
train_df = dataset['train'].to_pandas()
val_df = dataset['validation'].to_pandas()
test_df = dataset['test'].to_pandas()

print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)
print("Label distribution in training set:")
print(train_df['label'].value_counts())


Training set shape: (45615, 2)
Validation set shape: (2000, 2)
Test set shape: (12284, 2)
Label distribution in training set:
label
1    20673
2    17849
0     7093
Name: count, dtype: int64


In [None]:
# Preview the first rows of the training DataFrame.
train_df.head()

Unnamed: 0,text,label
0,"""QT @user In the original draft of the 7th boo...",2
1,"""Ben Smith / Smith (concussion) remains out of...",1
2,Sorry bout the stream last night I crashed out...,1
3,Chase Headley's RBI double in the 8th inning o...,1
4,@user Alciato: Bee will invest 150 million in ...,2


### Data Preprocessing

This cell preprocesses the text data by cleaning and standardizing it for model training.

In [None]:
# Data Preprocessing (Social Media Specific)
def preprocess_social_media_text(text):
    """
    Normalize one piece of social media text.

    The input is coerced to ``str`` and lower-cased, then a fixed sequence of
    regex substitutions is applied, and finally runs of whitespace are
    collapsed to single spaces.

    Args:
        text: The raw text (any object; it is passed through ``str``).

    Returns:
        The cleaned, lower-case, single-space-separated string.
    """
    import re

    # (pattern, replacement, flags) triples, applied strictly in this order.
    # Order matters: the punctuation rule runs last, so the '@' of the
    # '@user' placeholder is itself turned into a space.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # drop URLs
        (r'@\w+', '@user', 0),                           # mentions -> @user
        (r'#\w+', '', 0),                                # drop hashtags
        (r'\d+', 'number', 0),                           # digit runs -> 'number'
        (r'[^\w\s]', ' ', 0),                            # other symbols -> space
    )

    cleaned = str(text).lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # Collapse any leftover whitespace runs and trim the ends
    return ' '.join(cleaned.split())

# Clean every training text and keep the result next to the original column
train_df['cleaned_text'] = train_df['text'].map(preprocess_social_media_text)
print("Text preprocessing completed!")

Text preprocessing completed!


In [None]:
# Preview the first rows again, now including the `cleaned_text` column.
train_df.head()

Unnamed: 0,text,label,cleaned_text
0,"""QT @user In the original draft of the 7th boo...",2,qt user in the original draft of the numberth ...
1,"""Ben Smith / Smith (concussion) remains out of...",1,ben smith smith concussion remains out of the ...
2,Sorry bout the stream last night I crashed out...,1,sorry bout the stream last night i crashed out...
3,Chase Headley's RBI double in the 8th inning o...,1,chase headley s rbi double in the numberth inn...
4,@user Alciato: Bee will invest 150 million in ...,2,user alciato bee will invest number million in...


### Data Preparation and Validation

This cell prepares the data for training and validation by splitting it and checking label distribution.

In [None]:
# Data Preparation and Validation
# Check your label column (adjust 'label' to your actual column name)
label_column = train_df['label']
print("Unique labels in your dataset:", label_column.unique())
print("Label distribution:")
print(label_column.value_counts())

# Hold out part of the training frame for validation
split_options = {
    'test_size': 0.15,          # 15% for validation
    'stratify': label_column,   # Maintain label distribution
    'random_state': 42,
}
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['cleaned_text'].tolist(),
    label_column.tolist(),
    **split_options,
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Unique labels in your dataset: [2 1 0]
Label distribution:
label
1    20673
2    17849
0     7093
Name: count, dtype: int64
Training samples: 38772
Validation samples: 6843


### Model and Tokenizer Loading

This cell loads the pre-trained sentiment analysis model and its corresponding tokenizer.

In [None]:
# Using a model pre-trained on social media data
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Alternative: "distilbert-base-uncased" (faster) or "bert-base-uncased"

# Number of distinct labels in the training frame; passed as `num_labels`
num_classes = len(train_df['label'].unique())

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    ignore_mismatched_sizes=True,  # Handle potential size mismatches
)

print("Model loaded successfully!")
print(f"Model has {num_classes} classes")

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model loaded successfully!
Model has 3 classes


### Tokenization

This cell tokenizes the training and validation datasets using the loaded tokenizer.

In [None]:
# Tokenization with Optimal Parameters
def tokenize_function(examples):
    """
    Run the module-level ``tokenizer`` over ``examples``.

    Args:
        examples: The text input handed straight to the tokenizer (the
            notebook passes a list of cleaned strings).

    Returns:
        The tokenizer's output with PyTorch tensors, truncated to at most
        128 tokens and padded.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': 128,  # Good for social media (tweets, comments)
        'return_tensors': "pt",
    }
    return tokenizer(examples, **tokenizer_options)

# Tokenize training and validation data
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)


def _encodings_to_dataset(encodings, labels):
    """Bundle token ids, attention mask and labels into one `Dataset`."""
    columns = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    columns['labels'] = labels
    return Dataset.from_dict(columns)


# Create datasets
train_dataset = _encodings_to_dataset(train_encodings, train_labels)
val_dataset = _encodings_to_dataset(val_encodings, val_labels)

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

### Evaluation Metrics

This cell defines the evaluation metrics to be used during training (accuracy, F1-score, precision, and recall).

In [None]:
# Define Evaluation Metrics
def compute_metrics(eval_pred):
    """
    Turn an evaluation prediction into a metrics dictionary.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        A dict with the keys 'accuracy', 'f1', 'precision' and 'recall';
        the last three use weighted averaging.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)

    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        references, predicted_classes, average='weighted'
    )

    metrics = {'accuracy': accuracy_score(references, predicted_classes)}
    metrics['f1'] = weighted_f1
    metrics['precision'] = weighted_precision
    metrics['recall'] = weighted_recall
    return metrics

In [None]:
# Training Configuration (Optimized for Maximum Accuracy)
# TrainingArguments, Trainer and EarlyStoppingCallback come from the imports
# cell at the top of the notebook.
# NOTE(review): a later cell rebuilds `training_args` and `trainer` with less
# frequent evaluation; confirm whether this first configuration is still needed.

training_args = TrainingArguments(
    output_dir='./sentiment_model',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="steps",  # use this instead of evaluation_strategy
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,         # same interval as eval_steps, so every evaluated step is checkpointed
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    seed=42,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are created.
    fp16=torch.cuda.is_available(),
    # Explicitly disable logging integrations (W&B etc.). This is the
    # replacement the Trainer warning recommends for the deprecated
    # WANDB_DISABLED environment variable. Use "tensorboard" instead if you
    # want event files written to `logging_dir`.
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has not improved for 3 evaluations in a row
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### Disable WANDB (Optional)

This cell disables Weights & Biases logging, which avoids clutter when you don't plan to use it.

In [None]:
import os

# Switch Weights & Biases off through its environment variables.
# NOTE(review): the Trainer warning shown in this notebook reports that
# WANDB_DISABLED is deprecated and will be removed in v5.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "disabled",
})

### Training Configuration

This cell sets up the training arguments and initializes the Trainer.

In [None]:
training_args = TrainingArguments(
    output_dir='./sentiment_model',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=200,
    eval_strategy="steps",
    eval_steps=2000,       # less frequent
    save_strategy="steps",
    save_steps=2000,       # same interval as eval_steps, so every evaluated step is checkpointed
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    seed=42,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are created.
    fp16=torch.cuda.is_available(),
    # Explicitly disable logging integrations (W&B etc.). This is the
    # replacement the Trainer warning recommends for the deprecated
    # WANDB_DISABLED environment variable. Use "tensorboard" instead if you
    # want event files written to `logging_dir`.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # optional:
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
pip install --upgrade transformers


### Start Training

This cell starts the training process for the sentiment analysis model. This will take some time depending on the dataset size and hardware.

In [None]:
# Start Training (This will take time)
final_model_dir = './final_sentiment_model'

print("Starting training...")
trainer.train()

# Save the final model and its tokenizer side by side in one directory
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print("Training completed and model saved!")

Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
2000,0.6583,0.647237,0.71606,0.710607,0.721957,0.71606
4000,0.4941,0.681924,0.708607,0.710703,0.721897,0.708607
6000,0.3144,0.845931,0.725559,0.725031,0.724871,0.725559
8000,0.2077,1.187081,0.719714,0.718052,0.719634,0.719714
10000,0.1244,1.428756,0.729943,0.729963,0.730614,0.729943
12000,0.137,1.514283,0.722929,0.722427,0.722224,0.722929


Training completed and model saved!


### Load and Use the Saved Model

Once you have downloaded the `final_sentiment_model` directory, you can load the model and tokenizer in any Python environment where `transformers` and `torch` are installed.

In [None]:
import os

# The model was saved to the relative directory './final_sentiment_model', so
# resolve the weights file from the current working directory instead of
# hard-coding Colab's '/content' prefix.
file_path = os.path.abspath(os.path.join('final_sentiment_model', 'model.safetensors'))
file_size_bytes = os.path.getsize(file_path)

# Convert bytes to megabytes for easier reading
file_size_mb = file_size_bytes / (1024 * 1024)

print(f"The size of the file '{file_path}' is {file_size_bytes} bytes ({file_size_mb:.2f} MB).")

The size of the file '/content/final_sentiment_model/model.safetensors' is 498615900 bytes (475.52 MB).


### Make Predictions

Now you can use the loaded model to predict the sentiment of new text inputs.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Define the path to your saved model directory
saved_model_path = './final_sentiment_model' # Or the path where you saved it

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer, then the model, and place the model on the chosen device
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_path).to(device)

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [None]:
def predict_sentiment(text):
    """
    Predicts the sentiment of a given text using the loaded model.

    Args:
        text: The raw input text. It is cleaned with
            ``preprocess_social_media_text``, the same function that was
            applied to the training text.

    Returns:
        One of "negative", "neutral" or "positive".
    """
    # Reuse the training-time preprocessing instead of a copy of its regexes,
    # so inference cannot drift away from what the model was trained on.
    cleaned_text = preprocess_social_media_text(text)

    # Tokenize the input text and move the tensors to the model's device
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = inputs.to(device)

    # Get model predictions (no gradients are needed for inference)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)

    # Map prediction to sentiment label
    label_map = {0: "negative", 1: "neutral", 2: "positive"} # Make sure this matches your training labels
    return label_map[predictions.item()]

# Example usage: classify a few hand-written sentences
new_texts = [
    "This is a great day!",
    "I am not happy with this service.",
    "It's just okay.",
]

# `map` is lazy, so each text is still predicted right before it is printed
for text, sentiment in zip(new_texts, map(predict_sentiment, new_texts)):
    print(f"Text: '{text}' -> Sentiment: {sentiment}")

Text: 'This is a great day!' -> Sentiment: positive
Text: 'I am not happy with this service.' -> Sentiment: negative
Text: 'It's just okay.' -> Sentiment: positive


In [None]:
import os
import shutil

from google.colab import files

# Directory written by the training cell. Loading the model elsewhere needs
# the config and tokenizer files as well as the weights, so download the whole
# directory as one zip archive rather than only `model.safetensors`.
model_dir = os.path.abspath('final_sentiment_model')

try:
    if not os.path.isdir(model_dir):
        raise FileNotFoundError(model_dir)
    # Creates '<model_dir>.zip' next to the directory and returns its path
    file_path = shutil.make_archive(model_dir, 'zip', root_dir=model_dir)
    files.download(file_path)
except FileNotFoundError:
    print(f"File not found: {model_dir}")
except Exception as e:
    print(f"An error occurred during download: {e}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>