In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix

# Make sure the NLTK stopword corpus is available locally, then keep the
# English list as a set so membership checks during cleaning are cheap
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Load dataset: only the review body and the star rating are needed;
# rows missing either value are discarded
df = pd.read_csv('Reviews.csv')[['Text', 'Score']]
df = df.dropna()

# Map scores to sentiment
def score_to_sentiment(score):
    """Translate a numeric review score into a sentiment label.

    Scores of 2 or lower map to 'negative', a score of exactly 3 maps to
    'neutral', and every other value maps to 'positive'.
    """
    if score <= 2:
        return 'negative'
    return 'neutral' if score == 3 else 'positive'

# Derive the three-way sentiment label for every review from its score
sentiment_labels = df['Score'].apply(score_to_sentiment)
df['sentiment'] = sentiment_labels

# Clean text function
def clean_text(text):
    """Normalize a raw review string for vectorization.

    Steps, in order:
      1. lowercase the text;
      2. fuse "not <word>" into the single token "not_<word>" so the
         negation survives stopword removal;
      3. replace every character that is not a letter or underscore with
         a space;
      4. split on whitespace and drop tokens found in the module-level
         ``stop_words`` set.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    negation_fused = re.sub(r'\bnot\s+(\w+)', r'not_\1', lowered)
    letters_only = re.sub(r'[^a-zA-Z_]', ' ', negation_fused)
    return ' '.join(
        token for token in letters_only.split() if token not in stop_words
    )

# Run the cleaning routine over every review body
df['cleaned_text'] = df['Text'].apply(clean_text)

# Balance the dataset: down-sample each class to the size of the rarest one.
# Every draw is without replacement and uses the same fixed seed.
pos, neg, neu = (
    df[df.sentiment == label] for label in ('positive', 'negative', 'neutral')
)
min_len = min(map(len, (pos, neg, neu)))
df_balanced = pd.concat(
    [
        resample(subset, replace=False, n_samples=min_len, random_state=42)
        for subset in (pos, neg, neu)
    ]
)

# Shuffle the balanced dataset (concatenation leaves rows grouped by class)
df_balanced = df_balanced.sample(frac=1, random_state=42)

# Split data: cleaned text is the feature, sentiment label the target;
# 20% of the balanced rows are held out for evaluation
X, y = df_balanced['cleaned_text'], df_balanced['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Vectorize using TF-IDF over unigrams and bigrams, capped at 10000 features.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Logistic Regression on the TF-IDF training matrix
# (fit returns the estimator itself, so the call can be chained)
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Evaluation on the held-out split
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)

# Function for predicting new text
def analyze_sentiment(text):
    """Predict the sentiment label for a single piece of raw text.

    The text goes through the same ``clean_text`` preprocessing used for
    the training data, is transformed by the already-fitted module-level
    ``vectorizer``, and is classified by the module-level ``model``.

    Returns the first (and only) element of the model's prediction array.
    """
    features = vectorizer.transform([clean_text(text)])
    return model.predict(features)[0]

# Predict on user input
# Keeps prompting until the user types 'exit' (any letter case, surrounding
# whitespace ignored) or the input stream is closed.
while True:
    try:
        user_input = input("\nEnter a review (or type 'exit' to quit): ")
    except EOFError:
        # input() raises EOFError when stdin is exhausted or closed
        # (e.g. piped input); end the session instead of crashing.
        break
    # Strip before comparing so inputs such as "exit " or " Exit" still quit
    # rather than being classified as a review.
    if user_input.strip().lower() == 'exit':
        break
    result = analyze_sentiment(user_input)
    print(f"Predicted Sentiment: {result}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:
               precision    recall  f1-score   support

    negative       0.76      0.76      0.76      8565
     neutral       0.68      0.68      0.68      8594
    positive       0.82      0.81      0.81      8425

    accuracy                           0.75     25584
   macro avg       0.75      0.75      0.75     25584
weighted avg       0.75      0.75      0.75     25584

Confusion Matrix:
 [[6534 1594  437]
 [1647 5833 1114]
 [ 414 1162 6849]]



Enter a review (or type 'exit' to quit):  it is worst


Predicted Sentiment: negative



Enter a review (or type 'exit' to quit):  it is very very good


Predicted Sentiment: positive



Enter a review (or type 'exit' to quit):  it is somewhat ok ok


Predicted Sentiment: neutral



Enter a review (or type 'exit' to quit):  not good


Predicted Sentiment: negative



Enter a review (or type 'exit' to quit):  too bad


Predicted Sentiment: negative



Enter a review (or type 'exit' to quit):  somewhat good and bad


Predicted Sentiment: neutral



Enter a review (or type 'exit' to quit):  its very nice


Predicted Sentiment: positive



Enter a review (or type 'exit' to quit):  worst product


Predicted Sentiment: negative
