In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import pickle

# Ensure stopwords are available
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Load the data
df = pd.read_csv('PROJECT_FINAL DRAFT.csv')

# Drop rows with missing review
df = df.dropna(subset=['review'])

# Remove duplicates
df = df.drop_duplicates(subset=['review'])

# Create Sentiment column
def get_sentiment(star):
    if star <= 2:
        return 0  # Negative
    elif star == 3:
        return 1  # Neutral
    else:
        return 2  # Positive

df['Sentiment'] = df['star'].apply(get_sentiment)

# Check class distribution
print(df['Sentiment'].value_counts())

# Preprocess function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text)
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split() if word not in stop_words])
    return text

# Apply to review column
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform cleaned reviews
X = tfidf.fit_transform(df['cleaned_review']).toarray()

# Target variable
y = df['Sentiment']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Initialize and tune Logistic Regression model
param_grid = {'C': [0.1, 1, 10], 'max_iter': [1000]}
grid = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=5)
grid.fit(X_train, y_train)
lr_model = grid.best_estimator_

# Predict
y_pred_lr = lr_model.predict(X_test)

# Evaluation
print("Logistic Regression Results:")
label_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
y_test_str = [label_map[label] for label in y_test]
y_pred_str = [label_map[label] for label in y_pred_lr]
print(classification_report(y_test_str, y_pred_str))
print("Accuracy:", accuracy_score(y_test_str, y_pred_str))

# Save Logistic Regression model
with open('lr_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

# Save TF-IDF vectorizer
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

Sentiment
2    735
0    607
1    179
Name: count, dtype: int64
Logistic Regression Results:
              precision    recall  f1-score   support

    Negative       0.82      0.80      0.81       122
     Neutral       0.32      0.22      0.26        36
    Positive       0.84      0.92      0.88       147

    accuracy                           0.79       305
   macro avg       0.66      0.65      0.65       305
weighted avg       0.77      0.79      0.78       305

Accuracy: 0.7901639344262295
