In [1]:
!pip install kagglehub nltk



In [2]:
import os
import re
import shutil
import string
import time

import kagglehub
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Download the NLTK "stopwords" package and expose the corpus reader.
# NOTE(review): `stopwords` is not referenced in any cell visible here —
# presumably it is used by the text-cleaning step; confirm.
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
# Download the Sentiment140 dataset through kagglehub and stage its CSV at a
# fixed working path used by the rest of the notebook.
path = kagglehub.dataset_download("kazanova/sentiment140")
csv_src = os.path.join(path, "training.1600000.processed.noemoticon.csv")
csv_dst = "/content/sentiment140.csv"

# Copy in Python instead of shelling out to `cp`: a failed `!cp` does not
# raise, so the cell would still report success; shutil.copy raises on a
# missing source or unwritable destination and needs no shell quoting.
shutil.copy(csv_src, csv_dst)

csv_path = csv_dst
print("Dataset ready at:", csv_path)


Dataset ready at: /content/sentiment140.csv


In [28]:
# Load the raw CSV and derive the `review` / `sentiment` / `label` frame (`df`)
# that the following cells consume.

# Upper bound on rows to read; matches the row count in the CSV file name.
# Lower it for a quicker experimental run.
N_ROWS = 1_600_000

# Raw sentiment code -> readable name -> numeric training label.
SENTIMENT_BY_CODE = {0: 'negative', 2: 'neutral', 4: 'positive'}
LABEL_BY_SENTIMENT = {'positive': 1, 'neutral': 0, 'negative': -1}

# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

raw_df = pd.read_csv(
    csv_path,
    encoding='latin-1',
    header=None,
    usecols=[0, 5],  # column 0 is read as the sentiment code, column 5 as the text
    nrows=N_ROWS
)

print(f"Loaded CSV in {time.perf_counter() - start:.2f} seconds")

raw_df.columns = ['sentiment_code', 'review']
sentiment = raw_df['sentiment_code'].map(SENTIMENT_BY_CODE)

# Codes missing from SENTIMENT_BY_CODE map to NaN and would silently vanish
# from value_counts() and from the class balancing; make that visible.
n_unmapped = int(sentiment.isna().sum())
if n_unmapped:
    print(f"Warning: {n_unmapped} rows have an unrecognised sentiment code")

# Build the result as a fresh frame rather than assigning a new column onto a
# column slice, which can trigger pandas' SettingWithCopyWarning.
df = pd.DataFrame({
    'review': raw_df['review'],
    'sentiment': sentiment,
    'label': sentiment.map(LABEL_BY_SENTIMENT),
})

print(df['label'].value_counts())


Loaded CSV in 2.84 seconds
label
-1    800000
 1    800000
Name: count, dtype: int64


In [29]:
# Balance the dataset by downsampling every class to the size of the rarest
# one, then shuffle the rows.
counts = df['label'].value_counts()
print("Class counts:\n", counts)

min_count = counts.min()
print(f"Balancing all available classes to: {min_count} samples each")

# One seeded sample of `min_count` rows per label that actually occurs.
dfs = [
    df[df['label'] == lbl].sample(min_count, random_state=42)
    for lbl in counts.index
]

# Seed the shuffle too: the later train/test split is seeded but operates on
# this row order, so an unseeded shuffle made every run produce a different
# split despite the fixed random_state values elsewhere.
df_balanced = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced['label'].value_counts())


Class counts:
 label
-1    800000
 1    800000
Name: count, dtype: int64
✅ Balancing all available classes to: 800000 samples each
label
-1    800000
 1    800000
Name: count, dtype: int64


In [38]:
# Hold out 20% of the balanced data for evaluation, stratified on the label.
# NOTE(review): the `clean_review` column is not created in any cell visible
# here — confirm the text-cleaning cell has run before this one.
features = df_balanced['clean_review']
targets = df_balanced['label']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    stratify=targets,
    test_size=0.2,
    random_state=42,
)

print(y_train.value_counts())


label
 1    640000
-1    640000
Name: count, dtype: int64


In [39]:
# TF-IDF over unigrams and bigrams, capped at 15000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)

# Fit the vocabulary on the training split only; the test split is merely
# transformed with that fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [40]:
# Fit the logistic-regression classifier under its own name. `model` is
# rebound by the MultinomialNB cell that follows, which would otherwise leave
# this fit unreachable for any later comparison.
lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')

# Keep the `model` binding as well so code that expects it keeps working.
model = lr_model
lr_model.fit(X_train_vec, y_train)


In [41]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
# This rebinds `model`, so the evaluation and prediction cells below use it.
model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)

In [42]:
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import classification_report

# Score the held-out split with whichever estimator is bound to `model`.
y_pred = model.predict(X_test_vec)

print(len(y_test), len(y_pred))  # debug

# Report only on labels that occur in the truth or the predictions, so
# `target_names` always lines up with `labels`.
labels_present = sorted(unique_labels(y_test, y_pred))
label_names = {-1: 'negative', 0: 'neutral'}  # any other label is shown as 'positive'
names_present = [label_names.get(x, 'positive') for x in labels_present]

report = classification_report(
    y_test,
    y_pred,
    labels=labels_present,
    target_names=names_present,
)
print("\n✅ Classification Report:\n", report)



320000 320000

✅ Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.77      0.77    160000
    positive       0.77      0.77      0.77    160000

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [43]:
def predict_sentiment(text):
    """Classify one piece of text and return a human-readable sentiment string.

    The text is passed through ``clean``, vectorised with the fitted
    ``vectorizer`` and scored by ``model`` (all notebook-level names).

    Args:
        text: The raw sentence to classify.

    Returns:
        "Positive 😊" for label 1, "Neutral 😐" for label 0, and
        "Negative 😞" for any other label (normally -1).
    """
    # NOTE(review): `clean` is not defined in any cell visible here — confirm
    # the cell that defines it runs before this one.
    clean_text = clean(text)
    vec = vectorizer.transform([clean_text])
    pred = model.predict(vec)[0]

    if pred == 1:
        return "Positive 😊"
    # Neutral is encoded as 0 by the label mapping used for training; the
    # previous check against 2 could never match a neutral prediction.
    if pred == 0:
        return "Neutral 😐"
    return "Negative 😞"

# Interactive demo: classify sentences typed by the user until they type
# 'exit' (case-insensitive, surrounding whitespace ignored).
while True:
    try:
        user_text = input("Enter a sentence to analyze sentiment (or type 'exit' to quit):\n> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or closing stdin ends the demo without a traceback.
        break
    if not user_text:
        # Nothing to classify; prompt again instead of scoring an empty string.
        continue
    if user_text.lower() == 'exit':
        break
    result = predict_sentiment(user_text)
    print(f"Predicted Sentiment: {result}\n")


Enter a sentence to analyze sentiment (or type 'exit' to quit):
> hello
Predicted Sentiment: Positive 😊

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> exit exit
Predicted Sentiment: Negative 😞

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> fuck off
Predicted Sentiment: Negative 😞

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> pretty
Predicted Sentiment: Positive 😊

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> not good
Predicted Sentiment: Positive 😊

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> beautiful 
Predicted Sentiment: Positive 😊

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> sexy
Predicted Sentiment: Positive 😊

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> fuck you
Predicted Sentiment: Negative 😞

Enter a sentence to analyze sentiment (or type 'exit' to quit):
> it rained too heavy today
Predicted Sentiment: Negative 😞

Enter a sentence 