In [8]:
import pandas as pd
import numpy as np
import kagglehub
import re

from kagglehub import KaggleDatasetAdapter

from transformers import pipeline
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

In [2]:
file_path = "Twitter_Data.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "saurabhshahane/twitter-sentiment-dataset",
  file_path,
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


First 5 records:                                           clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


In [3]:
# negative(-1), neutral(0), and positive(+1)
df.head(5)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [5]:
df['clean_text'][2]

'what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax'

In [6]:
def text_preprocessing(sentiment_df):
    # Create a new column for the processed text
    sentiment_df['processed_text'] = ''

    for index, row in sentiment_df.iterrows():
        text = row['clean_text']
        if not isinstance(text, str):
            # If not a string, replace with empty string or default value
            print(text)
            text = ''
        else:
            text = text.lower()
        
        # Remove links
        text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)
        # Remove usernames
        text = re.sub(r'@[^\s]+', '', text)
        # Replace hashtags with just the word
        text = re.sub(r'#([^\s]+)', r'\1', text)
        # Fix multiple white spaces to a single space
        text = re.sub(r'[\s]+', ' ', text)
        # Remove words shorter than 4 characters and strip spaces
        text = re.sub(r'\W*\b\w{1,3}\b', '', text)
        text = text.strip()

        # Save the cleaned text in the DataFrame
        sentiment_df.at[index, 'processed_text'] = text


In [9]:
# Df cleaning
text_preprocessing(df)

nan
nan
nan
nan


In [None]:
df.head(10)

Unnamed: 0,clean_text,category,processed_text
0,when modi promised “minimum government maximum...,-1.0,when modi promised “minimum government maximum...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama will vote modi
2,what did just say vote for modi welcome bjp t...,1.0,what just vote modi welcome told rahul main ca...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar their names...
4,answer who among these the most powerful world...,1.0,answer among these most powerful world leader ...
5,kiya tho refresh maarkefir comment karo,0.0,kiya refresh maarkefir comment karo
6,surat women perform yagna seeks divine grace f...,0.0,surat women perform yagna seeks divine grace n...
7,this comes from cabinet which has scholars lik...,0.0,this comes from cabinet which scholars like mo...
8,with upcoming election india saga going import...,1.0,with upcoming election india saga going import...
9,gandhi was gay does modi,1.0,gandhi does modi


In [None]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=model_name)

In [None]:
tqdm.pandas()
df["pred"] = df["processed_text"].progress_apply(lambda x: sentiment_pipeline(x[:512])[0]['label'])  # taglia a 512 token


In [None]:
mapping = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive"
}
df["pred_label"] = df["pred"].map(mapping)

In [None]:
# Mappa le etichette originali del dataset (0=neg, 1=pos)
df["true_label"] = df["label"].map({0: "negative", 1: "positive"})

In [None]:
# 6️⃣ Valutazione
print("\n📊 Report di classificazione:")
print(classification_report(df["true_label"], df["pred_label"], zero_division=0))

In [None]:
# 7️⃣ (Facoltativo) Matrice di confusione
cm = confusion_matrix(df["true_label"], df["pred_label"], labels=["negative","neutral","positive"])
cm_df = pd.DataFrame(cm, index=["true_neg","true_neu","true_pos"], columns=["pred_neg","pred_neu","pred_pos"])
print("\nMatrice di confusione:")
print(cm_df)

In [None]:
# 8️⃣ Esplorazione qualitativa
print("\n🔍 Esempi di tweet classificati erroneamente:")
df[df["true_label"] != df["pred_label"]].sample(5, random_state=42)[["text","true_label","pred_label"]]