# **Data Loading and Handling**

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# First look at the raw CSV using pandas' default header handling
# (the first row of the file is taken as the column names).
data=pd.read_csv('/content/twitter_training.csv')

In [None]:
# Display the frame as loaded, to inspect what pandas inferred.
data

In [None]:
# Report the (rows, columns) shape and the inferred column names.
print("Shape:", data.shape)
print("Columns:", data.columns.tolist())

In [None]:
# Re-read the CSV with explicit column names.
# header=None keeps the first physical row as data. With header=0 plus `names`,
# pandas discards that row, which loses one sample if the file has no header line.
# NOTE(review): the file appears to ship without a header row - confirm. If it does
# have one, that row is removed by the Sentiment filter applied in the next cell.
data = pd.read_csv(
    "/content/twitter_training.csv",
    header=None,
    names=["TweetID", "location", "Sentiment", "Tweet content"],
)
print("Shape:", data.shape)
print("Columns:", data.columns.tolist())
data.head()


In [None]:
# Keep only the three modelling columns and the rows whose sentiment is one of
# the three target classes; any other sentiment value is dropped.
kept_columns = ["location", "Sentiment","Tweet content"]
target_sentiments = ["Positive", "Negative", "Neutral"]
data = data.loc[data["Sentiment"].isin(target_sentiments), kept_columns]

In [None]:
# Display the frame after column selection and sentiment filtering.
data

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Discard rows that have no tweet text; other columns may still contain nulls.
has_tweet_text = data['Tweet content'].notna()
data = data[has_tweet_text]



In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
data.duplicated().sum()

In [None]:
# Remove exact duplicate rows. Reassign instead of mutating in place so the cell
# is idempotent, and take an explicit copy so later column assignments
# (data['label'] = ..., data['clean_text'] = ...) operate on an independent frame
# rather than on a slice of the filtered one (avoids SettingWithCopyWarning).
data = data.drop_duplicates().copy()

In [None]:
# Re-check missing values per column after dropping empty tweets and duplicates.
data.isnull().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Normalise the label text (case and surrounding whitespace) before encoding.
data['Sentiment'] = data['Sentiment'].str.lower().str.strip()

# LabelEncoder numbers the classes in sorted order of their names.
le = LabelEncoder()
data['label'] = le.fit_transform(data['Sentiment'])

# The inference cells further down hard-code 0/1/2 -> negative/neutral/positive.
# Fail loudly here if the fitted encoder disagrees with that mapping.
expected_classes = ["negative", "neutral", "positive"]
assert list(le.classes_) == expected_classes, f"Unexpected label classes: {list(le.classes_)}"

# Plain str/int values keep the printed mapping readable (no numpy scalar reprs).
print("Label mapping:", {str(name): idx for idx, name in enumerate(le.classes_)})

In [None]:
import re

def clean_text(text):
    """Normalise a tweet into lowercase, letters-only, single-spaced text.

    The input is coerced with ``str()`` and lowercased, then the following are
    removed in order: URLs (``http...`` / ``www...``), ``@mentions``, ``#``
    characters (the hashtag word itself is kept), and every character that is
    not ``a-z`` or whitespace. Finally whitespace runs collapse to one space
    and the result is stripped.
    """
    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'@\w+', ''),
        (r'#', ''),
        (r'[^a-z\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Build the model input column by cleaning every tweet element-wise.
raw_tweets = data['Tweet content']
data['clean_text'] = raw_tweets.map(clean_text)

In [None]:
# Display the frame with the added label and clean_text columns.
data

# **Setup, Installations & Imports**

In [None]:
# !pip uninstall -y transformers


In [None]:
# !rm -rf /usr/local/lib/python3.11/dist-packages/transformers*


In [None]:
!pip install -U pip
!pip install -U transformers==4.46.3 datasets evaluate scikit-learn


In [None]:
# Sanity check after installation: print the transformers version that is
# actually importable and confirm that the classes used below can be imported.
import transformers
print("Transformers version:", transformers.__version__)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

print("Imports successful!")


In [None]:
import pandas as pd
import numpy as np
import re

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# 80/10/10 train/validation/test split, stratified on the encoded label so each
# split keeps the same class proportions. Fixed random_state for reproducibility.
train_df, temp_df = train_test_split(data, test_size=0.2, random_state=42, stratify=data["label"])

# Halve the 20% hold-out into validation and test.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"])

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

# Convert pandas -> Hugging Face Dataset.
# preserve_index=False stops the pandas index from being written as an extra
# "__index_level_0__" column, so no follow-up remove_columns() call is needed
# (that call raises if the column is absent, e.g. when the index is a plain range).
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pretrained tokenizer for the checkpoint named in `model_name`.
# The same name is used below to load the classification model.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``clean_text`` field of a batch of examples.

    Each sequence is truncated and padded to a fixed length of 128 tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["clean_text"], **tokenizer_options)

# Apply tokenization to all splits
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Record the class names from the fitted LabelEncoder in the model config, so the
# saved checkpoint carries readable labels instead of generic LABEL_0/1/2 names.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Load BERT with a classification head sized to the number of sentiment labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


# **Train and Evaluate**

In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
import os

# Set the WANDB_DISABLED flag before the Trainer is created, so training does not
# try to log to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

# Metric objects from the `evaluate` library; used by compute_metrics below.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits. Returns one dict that merges the
    results of the accuracy metric and the F1 metric, in that order.
    """
    raw_scores, gold_ids = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)

    results = {}
    results.update(accuracy_metric.compute(predictions=predicted_ids, references=gold_ids))
    results.update(f1_metric.compute(predictions=predicted_ids, references=gold_ids, average="weighted"))
    return results

import torch  # local import: only needed here for the CUDA availability check

# Smaller dataset subsets for faster training.
# The caps are clamped to the real split sizes so select() cannot index past the
# end of a split when fewer rows survive cleaning.
MAX_TRAIN_SAMPLES = 40000
MAX_VAL_SAMPLES = 2000
train_dataset_small = train_dataset.select(range(min(MAX_TRAIN_SAMPLES, len(train_dataset))))
val_dataset_small = val_dataset.select(range(min(MAX_VAL_SAMPLES, len(val_dataset))))

training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`;
    # it is available in the transformers version pinned by this notebook (4.46.3).
    eval_strategy="epoch",
    save_strategy="epoch",  # must match the eval strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=50,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only runtimes
    # instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_small,
    eval_dataset=val_dataset_small,
    # `processing_class` replaces the deprecated `tokenizer` argument (transformers >= 4.46).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


# **Test**

In [None]:
# Evaluate the trained model on the full held-out test split and print the
# resulting metrics dictionary.
metrics = trainer.evaluate(test_dataset)
print(metrics)


# **SAVING**

In [None]:
# Write the model and the tokenizer into the same directory so that the
# directory can be reloaded later with from_pretrained().
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")


In [None]:
# Archive the saved model directory recursively into a single zip file for download.
!zip -r sentiment_model.zip sentiment_model


In [None]:
# Colab-only: download the zipped model through the google.colab files helper.
# This import is only available when running inside Google Colab.
from google.colab import files
files.download("sentiment_model.zip")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the model and tokenizer from the saved directory, rebinding the
# `model` and `tokenizer` names that the inference cells below use.
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model")
tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")


# **Visualizing the predicted output**


In [None]:
import torch

def predict_sentiment(text):
    """Classify one piece of text and return ``(label, confidence)``.

    Args:
        text: Raw input text. It is passed through ``clean_text`` before
            tokenization, because the model was fine-tuned on the cleaned
            ``clean_text`` column and not on raw tweets.

    Returns:
        A tuple of the predicted label name ("negative", "neutral" or
        "positive") and the softmax probability of that label as a float.
    """
    cleaned = clean_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # The model may sit on a GPU (for example right after training); the inputs
    # must be on the same device as the model.
    inputs = inputs.to(model.device)

    model.eval()  # make sure dropout is off at inference time
    with torch.no_grad():  # no gradients needed for prediction
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_label = torch.argmax(probs, dim=-1).item()

    # Ids follow the sorted class order produced by the LabelEncoder.
    label_map = {0: "negative", 1: "neutral", 2: "positive"}
    return label_map[pred_label], probs[0][pred_label].item()

# Example: classify a single sentence and print the label with its probability.
text = "I love this product!"
label, confidence = predict_sentiment(text)
print(f"Predicted sentiment: {label} with confidence {confidence:.2f}")


In [None]:
import matplotlib.pyplot as plt

def visualize_sentiment(text):
    """Draw a bar chart of the three class probabilities predicted for ``text``.

    The text is cleaned with ``clean_text`` before tokenization, matching the
    data the model was fine-tuned on. The chart title shows the original text.
    """
    inputs = tokenizer(clean_text(text), return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = inputs.to(model.device)  # keep the inputs on the model's device

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # .cpu() is required before .numpy() when the model runs on a GPU.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

    labels = ["negative", "neutral", "positive"]

    plt.figure(figsize=(6,4))
    bars = plt.bar(labels, probs, color=['red', 'gray', 'green'])
    plt.ylim(0,1)
    plt.ylabel("Probability")
    plt.title(f"Sentiment probabilities for: '{text}'")

    for bar, prob in zip(bars, probs):
        # Tall bars get a white label inside the bar. Short bars get a dark label
        # just above the bar, because a white label at height - 0.1 would fall
        # below the axis or be invisible on the white background.
        label_inside = bar.get_height() >= 0.15
        plt.text(
            bar.get_x() + bar.get_width()/2,
            bar.get_height() - 0.1 if label_inside else bar.get_height() + 0.02,
            f"{prob:.2f}",
            ha='center',
            color='white' if label_inside else 'black',
            fontsize=12,
        )

    plt.show()

# Example: bar chart of the class probabilities for one sentence.
visualize_sentiment("I like this product!")


In [None]:
def visualize_sentiment_pie(text):
    """Draw a pie chart of the three class probabilities predicted for ``text``.

    The text is cleaned with ``clean_text`` before tokenization, matching the
    data the model was fine-tuned on. The chart title shows the original text.
    """
    inputs = tokenizer(clean_text(text), return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = inputs.to(model.device)  # keep the inputs on the model's device

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # .cpu() is required before .numpy() when the model runs on a GPU.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

    labels = ["negative", "neutral", "positive"]
    colors = ['red', 'gray', 'green']

    plt.figure(figsize=(6,6))
    plt.pie(probs, labels=labels, autopct='%1.1f%%', colors=colors, startangle=140)
    plt.title(f"Sentiment distribution for: '{text}'")
    plt.show()

# Example: pie chart of the class probabilities for a mixed-sentiment sentence.
visualize_sentiment_pie("This product is perfect, but not great for me.")


In [None]:
def predict_and_visualize(text):
    """Print the predicted sentiment for ``text``, then show its bar chart.

    Delegates to ``predict_sentiment`` for the label and confidence, and to
    ``visualize_sentiment`` for the plot. Returns nothing.
    """
    prediction = predict_sentiment(text)
    label, confidence = prediction
    summary = f"Predicted sentiment: {label} with confidence {confidence:.2f}"
    print(summary)
    visualize_sentiment(text)

# Example: print the prediction and show the bar chart for a negative sentence.
predict_and_visualize("I'm really unhappy with this service.")
