In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load Dataset
df = pd.read_csv("/Users/saitejaalladi/Downloads/Reviews.csv", nrows =50000)  # Download from Kaggle
df = df[['Text', 'Score']]  # Selecting relevant columns

# Convert scores to sentiment labels
df['Sentiment'] = df['Score'].apply(lambda x: 'Positive' if x >= 4 else ('Negative' if x <= 2 else 'Neutral'))

# Preprocessing Function
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

df['Cleaned_Text'] = df['Text'].apply(preprocess_text)

# Convert Text to Vectors
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Cleaned_Text'])
y = df['Sentiment'].map({'Positive': 1, 'Negative': 0, 'Neutral': 2})

# Train Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate Model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Test on New Review
new_review = "I love this product, it's amazing!"
processed_review = preprocess_text(new_review)
vectorized_review = vectorizer.transform([processed_review])
prediction = model.predict(vectorized_review)[0]
print(f"Predicted Sentiment: {['Negative', 'Positive', 'Neutral'][prediction]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saitejaalladi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saitejaalladi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saitejaalladi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              precision    recall  f1-score   support

           0       0.74      0.61      0.67      1545
           1       0.87      0.97      0.92      7617
           2       0.51      0.10      0.17       838

    accuracy                           0.84     10000
   macro avg       0.71      0.56      0.59     10000
weighted avg       0.82      0.84      0.82     10000

Predicted Sentiment: Positive


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
