In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...


True

In [9]:
df = pd.read_csv("test.csv", encoding="latin1")
df = df[['text', 'sentiment']].astype(str)
df.head()

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


In [10]:
df.describe()

Unnamed: 0,text,sentiment
count,4815.0,4815
unique,3535.0,4
top,,neutral
freq,1281.0,1430


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       4815 non-null   object
 1   sentiment  4815 non-null   object
dtypes: object(2)
memory usage: 75.4+ KB


In [12]:
df.isnull().sum()

text         0
sentiment    0
dtype: int64

In [13]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)   
    text = re.sub(r"@\w+|#\w+", "", text)        
    text = re.sub(r"[^a-z\s]", "", text)        
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(preprocess_text)

In [14]:
vectorizer = TfidfVectorizer(max_features=6000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['sentiment']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6915887850467289
              precision    recall  f1-score   support

         nan       1.00      1.00      1.00       256
    negative       0.63      0.43      0.51       200
     neutral       0.51      0.72      0.60       286
    positive       0.72      0.53      0.61       221

    accuracy                           0.69       963
   macro avg       0.71      0.67      0.68       963
weighted avg       0.71      0.69      0.69       963



In [18]:
pickle.dump(model, open("sentiment_model.pkl", "wb"))
pickle.dump(vectorizer, open("tfidf_vectorizer.pkl", "wb"))