In [2]:
# 1: Import Libraries

import pandas as pd
import numpy as np
import re
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [3]:
# 2. Load Dataset
train_df = pd.read_csv(r'C:\Users\Rajendra\Desktop\Sentiment_Analyzer\data\Twitter(Training,Validation_csvFiles)\twitter_training.csv', header=None)
valid_df = pd.read_csv(r'C:\Users\Rajendra\Desktop\Sentiment_Analyzer\data\Twitter(Training,Validation_csvFiles)\twitter_validation.csv', header=None)
train_df.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [7]:
# 3. Inspect the colums for further use
#Assign proper column names
cols = ['tweet_id', 'entity', 'sentiment', 'text']
train_df.columns = cols
valid_df.columns = cols

#Combine both datasets
df = pd.concat([train_df, valid_df], ignore_index=True)

#Convert sentiment to lowercase to ensure consistency
df['sentiment'] = df['sentiment'].str.lower()

#Filter only valid sentiment classes
df = df[df['sentiment'].isin(['positive', 'neutral', 'negative'])]

# Drop missing values
df = df[['text', 'sentiment']].dropna()

#Show results
print("Dataset Loaded and Cleaned")
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print("\nSample Tweets:\n")
print(df.sample(5))

Dataset Loaded and Cleaned
Columns: ['text', 'sentiment']
Shape: (61949, 2)

Sample Tweets:

                                                    text sentiment
52575  10 wonderful years of Red Dead Redemption .  ....  positive
37924  3 Best iPhone Mobile Game Industry Updates : ‘...   neutral
28970  Watch TazzWolff with me on YouTube! @ Tazzolff...   neutral
64270                                        to Salute!!  positive
21811  If an amazing stream of wins and failures from...  positive


In [9]:
# 4. Text Cleaning 
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["cleaned_text"] = df["text"].astype(str).apply(clean_text)

df[["text", "cleaned_text"]].head()


Unnamed: 0,text,cleaned_text
0,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
3,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder yo...


In [12]:
# 5: Encode Sentiment Labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["sentiment"])
print("Label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

Label mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}


In [13]:
# 6. Split Data
X = df["cleaned_text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# 7. TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [15]:
# 8. Train the model using lgical regression
model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

In [16]:
# 9. Evaluate teh model
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.79      0.81      0.80      4518
     neutral       0.73      0.69      0.71      3635
    positive       0.77      0.78      0.77      4237

    accuracy                           0.77     12390
   macro avg       0.76      0.76      0.76     12390
weighted avg       0.77      0.77      0.77     12390



In [17]:
#10. Save the model & assets
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print("Model, vectorizer, and label encoder saved.")

Model, vectorizer, and label encoder saved.
