In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Fetch the NLTK stop-word corpus (needed later by clean_text)
nltk.download('stopwords')

# Names given to the six CSV columns, in file order
COLUMN_NAMES = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Numeric sentiment codes -> readable labels (0 = Negative, 2 = Neutral, 4 = Positive)
SENTIMENT_LABELS = {0: 'Negative', 2: 'Neutral', 4: 'Positive'}

# Read the raw tweets; header=None means the first row is treated as data,
# so the column names are attached afterwards
df = pd.read_csv('twitter.csv', encoding='latin-1', header=None)
df.columns = COLUMN_NAMES

# Swap the numeric codes for their labels; a code missing from
# SENTIMENT_LABELS ends up as NaN
df['sentiment'] = df['sentiment'].map(SENTIMENT_LABELS)

# Text cleaning function
# Compiled once: matches URLs, @mentions, #hashtags, and any run of characters
# other than ASCII letters, digits, or the space character.
_NOISE_PATTERN = re.compile(r"http\S+|@\w+|#\w+|[^A-Za-z0-9 ]+")

# Filled on first use so the NLTK corpus is read once instead of once per row.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise a single tweet for vectorisation.

    Steps, in order:
      1. Convert the input to ``str`` (so non-string values such as ``None``
         are cleaned as their string form rather than raising).
      2. Delete URLs (``http...``), ``@mentions``, ``#hashtags`` and every
         character that is not an ASCII letter, digit, or space.
      3. Lower-case the result.
      4. Split on whitespace and drop every token found in ``stop_words``.

    Parameters
    ----------
    text : object
        The raw tweet; anything accepted by ``str()``.
    stop_words : container of str, optional
        Lower-case tokens to remove. Any object supporting ``in`` works.
        When omitted, the NLTK English stop-word list is used; it is loaded
        on the first call and reused afterwards.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).

    Notes
    -----
    Matched characters are deleted, not replaced by a space, so tokens
    separated only by punctuation, tabs, or newlines are merged
    (``"good.bad"`` becomes ``"goodbad"``). This is kept as-is so that
    features stay identical to those of previously trained models.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    stripped = _NOISE_PATTERN.sub("", str(text)).lower()
    return " ".join(token for token in stripped.split() if token not in stop_words)

# Settings a reader may want to tune
TEST_FRACTION = 0.2
SPLIT_SEED = 42
VOCAB_LIMIT = 10000
LR_MAX_ITER = 1000

# Clean every tweet and store the result alongside the raw text
df['clean_text'] = df['text'].apply(clean_text)

# Model input (cleaned text) and target (sentiment label)
X, y = df['clean_text'], df['sentiment']

# Hold out a test set, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    stratify=y,
    random_state=SPLIT_SEED,
)

# TF-IDF features: the vectorizer is fitted on the training split only,
# then reused unchanged on the test split
tfidf = TfidfVectorizer(max_features=VOCAB_LIMIT)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Fit the logistic-regression classifier (fit returns the estimator itself)
model_lr = LogisticRegression(max_iter=LR_MAX_ITER).fit(X_train_vec, y_train)

# Score the held-out split: per-class metrics first, then the confusion matrix
pred_lr = model_lr.predict(X_test_vec)
for summary in (
    classification_report(y_test, pred_lr),
    confusion_matrix(y_test, pred_lr),
):
    print(summary)

# Persist the classifier and the fitted vectorizer as separate files
joblib.dump(model_lr, 'logreg_model.pkl')
joblib.dump(tfidf, 'tfidf.pkl')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

    Negative       0.79      0.76      0.77    160000
    Positive       0.77      0.80      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000

[[121223  38777]
 [ 31885 128115]]


['tfidf.pkl']