In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import re
import io
import requests
import zipfile

# Make sure the NLTK English stop-word corpus is available. The download is
# attempted only when the lookup fails, so re-running the cell does not fetch
# the corpus again.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Fetch the UCI "Sentiment Labelled Sentences" archive and load its Yelp file.
# The file is parsed as tab-separated text without a header row; the two
# columns are named 'review' and 'sentiment'.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment labelled sentences.zip"
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error; otherwise an error page would be handed to
# zipfile below and surface as a confusing BadZipFile.
response.raise_for_status()
# The archive is unpacked from memory, so nothing is written to disk.
with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
    with zf.open("sentiment labelled sentences/yelp_labelled.txt") as f:
        df = pd.read_csv(f, sep='\t', header=None, names=['review', 'sentiment'])


# Preprocessing helpers

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Lazily filled cache of the English stop words (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Deferred until first use so the corpus is only read after the
        # download check earlier in the notebook has run.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a review for vectorization.

    Punctuation is removed, the text is lower-cased, and English stop words
    are dropped. The remaining words are joined with single spaces.

    Args:
        text: The raw review as a string.

    Returns:
        The cleaned review as a single string; empty if no words remain.
    """
    # NOTE(review): punctuation is stripped before the stop-word filter, so
    # any stop-list entry that itself contains punctuation (e.g. an
    # apostrophe) can never match its stripped form. Confirm this is intended.
    text = _PUNCTUATION_RE.sub('', text).lower()
    # The stop-word set is cached; rebuilding it for every review re-read the
    # corpus once per row.
    stop_words = _english_stop_words()
    return ' '.join(w for w in text.split() if w not in stop_words)

# Clean every review once and keep the result alongside the raw text
df['processed_review'] = df['review'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the logistic regression classifier on the vectorized training reviews
model = LogisticRegression(random_state=42).fit(X_train_tfidf, y_train)

# Predict the held-out reviews and report accuracy plus per-class metrics
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Function to predict sentiment for new reviews
def predict_sentiment(review):
    """Classify a single raw review as "Positive" or "Negative".

    The review is cleaned with ``preprocess_text``, vectorized with the
    module-level ``tfidf_vectorizer`` and scored by the module-level
    ``model``. A predicted label of 1 maps to "Positive"; any other label
    maps to "Negative".
    """
    cleaned = preprocess_text(review)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

# Example usage (interactive)
# while True:
#     user_input = input("Enter a review (or type 'exit'): ")
#     if user_input.lower() == 'exit':
#         break
#     sentiment = predict_sentiment(user_input)
#     print(f"Sentiment: {sentiment}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.785
              precision    recall  f1-score   support

           0       0.73      0.86      0.79        96
           1       0.85      0.71      0.77       104

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.78       200
weighted avg       0.79      0.79      0.78       200

