In [1]:
import pandas as pd 
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SanjaiA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SanjaiA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SanjaiA\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin1', header=None)
data.head()


Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
data.head()


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
df = data.groupby('target', group_keys=False).apply(lambda x: x.sample(frac=0.25, random_state=42)).reset_index(drop=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   target  400000 non-null  int64 
 1   id      400000 non-null  int64 
 2   date    400000 non-null  object
 3   flag    400000 non-null  object
 4   user    400000 non-null  object
 5   text    400000 non-null  object
dtypes: int64(2), object(4)
memory usage: 18.3+ MB


In [5]:
df.isnull().sum()


target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [6]:
df['target'] = df['target'].replace({0: 0, 4: 1})  # Binary sentiment
df['target'].value_counts()


target
0    200000
1    200000
Name: count, dtype: int64

In [7]:
stopwords_list = stopwords.words('english')
print(stopwords_list)  # Preview first 20 stopwords


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [8]:
def process_text(text):
    lem = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    text_cleaned = re.sub('[^a-zA-Z]', ' ', text)  # Remove non-letters
    words = text_cleaned.lower().split()
    lemmatized_words = [lem.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(lemmatized_words)


In [9]:
df['clean_text'] = df['text'].apply(process_text)
df[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,@xnausikaax oh no! where did u order from? tha...,xnausikaax oh u order horrible
1,A great hard training weekend is over. a coup...,great hard training weekend couple day rest le...
2,"Right, off to work Only 5 hours to go until I...",right work hour go free xd
3,I am craving for japanese food,craving japanese food
4,Jean Michel Jarre concert tomorrow gotta work...,jean michel jarre concert tomorrow gotta work ...


In [10]:
X = df['clean_text']
y = df['target']


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 320000, Testing samples: 80000


In [12]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [13]:
print(X_train_tfidf.shape)


(320000, 5000)


In [14]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


In [15]:
y_pred = model.predict(X_test_tfidf)


In [16]:
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy Score: 0.76495

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76     40050
           1       0.75      0.78      0.77     39950

    accuracy                           0.76     80000
   macro avg       0.77      0.76      0.76     80000
weighted avg       0.77      0.76      0.76     80000


Confusion Matrix:
 [[29849 10201]
 [ 8603 31347]]


In [36]:
import pickle

# Save model
with open('logistic_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
