In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Fetch the NLTK stop-word corpus and keep the English list as a set so the
# membership test inside clean_text is O(1).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load only the two columns the pipeline uses. `usecols` makes the parser skip
# every other column instead of materialising the whole file and discarding
# most of it afterwards. The explicit column selection keeps the original
# column order; rows missing either value are dropped.
df = pd.read_csv('Reviews.csv', usecols=['Text', 'Score'])[['Text', 'Score']].dropna()

# Map scores to sentiment labels: <= 2 -> negative, == 3 -> neutral,
# anything else -> positive. np.select evaluates the conditions column-wise
# (first matching condition wins) instead of calling a lambda per row.
df['sentiment'] = np.select(
    [df['Score'] <= 2, df['Score'] == 3],
    ['negative', 'neutral'],
    default='positive',
)

# Clean text function: Use regex and handle negation efficiently
def clean_text(text):
    """Normalise raw review text into a space-separated string of tokens.

    Steps: lower-case the text, fuse ``not <word>`` into ``not_<word>`` so
    the negation is not lost when the stop word "not" is removed, replace
    every character other than ASCII letters and underscores with a space,
    and finally drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: Review text as a ``str``.

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    text = text.lower()
    # Fuse only when the following word starts with letters. With ``\w+`` a
    # numeric follower ("not 100%") became "not_100", and the digit stripping
    # below then left a meaningless dangling "not_" token in the output.
    text = re.sub(r'\bnot\s+([a-z]+)', r'not_\1', text)
    text = re.sub(r'[^a-zA-Z_]', ' ', text)  # remove punctuation/numbers
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every review once; the cleaned column is what the model is trained on.
df['cleaned_text'] = df['Text'].apply(clean_text)

# Balance the dataset by downsampling every class to the size of the rarest.
min_len = df['sentiment'].value_counts().min()

# Draw min_len rows per class with a fixed seed so runs are reproducible.
df_pos, df_neg, df_neu = (
    df[df['sentiment'] == label].sample(min_len, random_state=42)
    for label in ('positive', 'negative', 'neutral')
)

# Combine the balanced classes and shuffle the rows.
df_balanced = pd.concat([df_pos, df_neg, df_neu]).sample(frac=1, random_state=42)

# Split into train and test. Stratifying on the label keeps the class
# balance created above intact in both partitions.
X = df_balanced['cleaned_text']
y = df_balanced['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize using TF-IDF over unigrams and bigrams with no max-feature cap.
# The vocabulary is fitted on the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Logistic Regression; max_iter is raised to 1000, other settings are
# left at their defaults.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Evaluate on the held-out split. The confusion matrix is computed once and
# reused for both the printout and the plot; passing labels=model.classes_
# pins its row/column order to the labels shown on the plot axes.
y_pred = model.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", cm)

# Visualize Confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()

# Efficient prediction function for new text
def analyze_sentiment(text):
    """Return the predicted sentiment label for a single piece of raw text.

    The text is passed through ``clean_text``, turned into a one-row feature
    matrix by the module-level ``vectorizer`` and classified by the
    module-level ``model``; the first (only) prediction is returned.
    """
    features = vectorizer.transform([clean_text(text)])
    return model.predict(features)[0]

# Interactive loop: classify reviews typed by the user until they type 'exit'.
while True:
    try:
        # Strip once so that "  exit " is recognised and blank input is caught.
        user_input = input("\nEnter a review (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: leave the loop
        # cleanly instead of ending with a traceback.
        print("\nExiting sentiment analysis...")
        break
    if user_input.lower() == 'exit':
        print("Exiting sentiment analysis...")
        break
    if not user_input:
        print("Please enter a valid review. Text cannot be empty.")
        continue
    result = analyze_sentiment(user_input)
    print(f"Predicted Sentiment: {result}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df_balanced = df.groupby('sentiment', group_keys=False).apply(
