In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import re

In [52]:
# Load dataset
# df = pd.read_csv("../data/reviews-short.csv")
df = pd.read_csv("../data/reviews.csv")

In [53]:
# Preprocess reviews
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', str(text))
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df['processed_reviews'] = df['Review'].apply(preprocess_text)

In [55]:
# Convert labels into sentiment categories
def label_to_sentiment(label):
    if label <= 2:
        return 'negative'
    elif label == 3:
        return 'neutral'
    else:
        return 'positive'

df['sentiment'] = df['Label'].apply(label_to_sentiment)

In [56]:
# Feature Extraction
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['processed_reviews']).toarray()
y = df['sentiment']

In [57]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
from joblib import dump, load


# To load the model later
# model = load('logistic_regression_model.joblib')
# Model Training
model = LogisticRegression()
model.fit(X_train, y_train)
# Assuming 'model' is your trained logistic regression model
dump(model, 'sheep_two.joblib')

['sheep_two.joblib']

In [59]:
# Evaluation
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.71      0.39      0.51      1095
     neutral       0.35      0.08      0.13      1168
    positive       0.94      0.99      0.97     25801

    accuracy                           0.93     28064
   macro avg       0.67      0.49      0.53     28064
weighted avg       0.91      0.93      0.91     28064



In [60]:
def predict_sentiment(review):
    # Preprocess the review
    processed_review = preprocess_text(review)
    
    # Convert to features
    features = tfidf.transform([processed_review]).toarray()
    
    # Predict sentiment
    sentiment = model.predict(features)[0]
    return sentiment