In [11]:

!pip install kagglehub nltk --quiet

import pandas as pd
import os
import string
import nltk
import kagglehub
import re
import time

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.multiclass import unique_labels

nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:

path = kagglehub.dataset_download("kazanova/sentiment140")
csv_src = f"{path}/training.1600000.processed.noemoticon.csv"
csv_dst = "/content/sentiment140.csv"
!cp "{csv_src}" "{csv_dst}"

print("✅ Dataset ready at:", csv_dst)


✅ Dataset ready at: /content/sentiment140.csv


In [13]:

start = time.time()
df = pd.read_csv(
    csv_dst,
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
    nrows=1600000
)
print(f"✅ Loaded CSV in {time.time() - start:.2f} seconds")

df.columns = ['sentiment_code', 'review']
df['sentiment'] = df['sentiment_code'].map({0: 'negative', 2: 'neutral', 4: 'positive'})
df = df[['review', 'sentiment']]
df['label'] = df['sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})

print("✅ Original class counts:\n", df['label'].value_counts())


✅ Loaded CSV in 1.94 seconds
✅ Original class counts:
 label
-1    800000
 1    800000
Name: count, dtype: int64


In [14]:

counts = df['label'].value_counts()
min_count = counts.min()
print(f"✅ Balancing all classes to: {min_count} samples each")

dfs = []
for lbl in counts.index:
    sampled = df[df['label'] == lbl].sample(min_count, random_state=42)
    dfs.append(sampled)

df_balanced = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)
print("✅ Balanced class counts:\n", df_balanced['label'].value_counts())


✅ Balancing all classes to: 800000 samples each
✅ Balanced class counts:
 label
-1    800000
 1    800000
Name: count, dtype: int64


In [15]:

stop_words = set(stopwords.words('english'))
trans_table = str.maketrans('', '', string.punctuation + string.digits)

def clean(text):
    text = re.sub(r'(.)\1{2,}', r'\1\1',
            re.sub(r'\s+', ' ',
            text.lower().translate(trans_table))).strip()
    return ' '.join(w for w in text.split() if w not in stop_words)

df_balanced['clean_review'] = df_balanced['review'].apply(clean)
print(df_balanced[['review', 'clean_review']].head())


                                              review  \
0  15 minutes until the listening  1 hour 15 minu...   
1                                    monday morning    
2  just got this twitter  and now im at my grandp...   
3  brushing my teeferz then heading my butt into ...   
4  new bears are looking great!....makes me want ...   

                                        clean_review  
0  minutes listening hour minutes classical japanese  
1                                     monday morning  
2                        got twitter im grandparents  
3  brushing teeferz heading butt work feel like z...  
4  new bears looking greatmakes want work mine so...  


In [16]:

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['clean_review'],
    df_balanced['label'],
    test_size=0.2, random_state=42, stratify=df_balanced['label']
)


In [17]:

vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [18]:

model = MultinomialNB()
model.fit(X_train_vec, y_train)


In [19]:

y_pred = model.predict(X_test_vec)

print(f"✅ Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

labels_present = sorted(unique_labels(y_test, y_pred))
names_present = ['negative' if x==-1 else 'neutral' if x==0 else 'positive' for x in labels_present]

print("\n✅ Classification Report:\n", classification_report(
    y_test, y_pred,
    labels=labels_present,
    target_names=names_present
))


✅ Accuracy: 77.14%

✅ Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.78      0.77    160000
    positive       0.77      0.77      0.77    160000

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [20]:

def predict_sentiment(text):
    clean_text = clean(text)
    vec = vectorizer.transform([clean_text])
    pred = model.predict(vec)[0]
    if pred == 1:
        return "Positive 😊"
    elif pred == 0:
        return "Neutral 😐"
    else:
        return "Negative 😞"

while True:
    user_text = input("💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):\n> ")
    if user_text.lower() == 'exit':
        print("👋 Exiting.")
        break
    result = predict_sentiment(user_text)
    print(f"✅ Predicted Sentiment: {result}\n")


💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):
> hello
✅ Predicted Sentiment: Positive 😊

💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):
> I baked a cake today
✅ Predicted Sentiment: Positive 😊

💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):
> my friend had failed his exams
✅ Predicted Sentiment: Negative 😞

💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):
> its raining too heavily 
✅ Predicted Sentiment: Negative 😞

💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):
> im going for a tour tomorrow
✅ Predicted Sentiment: Negative 😞

💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):
> quit
✅ Predicted Sentiment: Negative 😞

💬 Enter a sentence to analyze sentiment (or type 'exit' to quit):
> exit
👋 Exiting.
