In [4]:
# LOGISTIC REGRESSION

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")

df = df[['Review Text', 'Rating']]

df.dropna(inplace=True)

# Text preprocessing
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower() 
    text = ' '.join([word for word in word_tokenize(text) if word.isalnum()])
    text = ' '.join([stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words])
    return text

df['Cleaned Text'] = df['Review Text'].apply(preprocess_text)

# labeling sentiments
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3 else 'negative' if x < 3 else 'neutral')

# Feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Cleaned Text'])
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model building
model = LogisticRegression()
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/riyacherlakola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/riyacherlakola/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              precision    recall  f1-score   support

    negative       0.58      0.44      0.50       457
     neutral       0.49      0.22      0.30       588
    positive       0.87      0.97      0.92      3484

    accuracy                           0.82      4529
   macro avg       0.64      0.54      0.57      4529
weighted avg       0.79      0.82      0.79      4529



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# MAJORITY BASELINE


import pandas as pd
from sklearn.metrics import classification_report

df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")

# Map ratings to sentiment labels
df['Sentiment'] = df['Rating'].apply(lambda x: 'negative' if x < 3 else 'neutral' if x == 3 else 'positive')

majority_sentiment = df['Sentiment'].mode()[0]

# Predict the majority sentiment label for all instances
predictions = [majority_sentiment] * len(df['Sentiment'])
print(classification_report(df['Sentiment'], predictions))


              precision    recall  f1-score   support

    negative       0.00      0.00      0.00      2407
     neutral       0.00      0.00      0.00      2871
    positive       0.78      1.00      0.87     18208

    accuracy                           0.78     23486
   macro avg       0.26      0.33      0.29     23486
weighted avg       0.60      0.78      0.68     23486



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Better preforming model (logistc regression on a 2nd dataset)
# Trip Advisor Hotels

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

df = pd.read_csv("tripadvisor_hotel_reviews.csv")

df = df[['Review', 'Rating']]

df.dropna(inplace=True)

# Text preprocessing
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower() 
    text = ' '.join([word for word in word_tokenize(text) if word.isalnum()])
    text = ' '.join([stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words])
    return text

df['Cleaned Text'] = df['Review'].apply(preprocess_text)

# labeling sentiments
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3 else 'negative' if x < 3 else 'neutral')

# Feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Cleaned Text'])
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model building
model = LogisticRegression()
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/riyacherlakola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/riyacherlakola/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              precision    recall  f1-score   support

    negative       0.79      0.76      0.77       625
     neutral       0.52      0.19      0.27       432
    positive       0.89      0.98      0.93      3042

    accuracy                           0.86      4099
   macro avg       0.73      0.64      0.66      4099
weighted avg       0.83      0.86      0.84      4099



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
