In [30]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")

df = df[['Review Text', 'Rating']]

df.dropna(inplace=True)

# Text preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower() 
    text = ' '.join([word for word in word_tokenize(text) if word.isalnum()])
    text = ' '.join([stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words])
    return text

df['Cleaned Text'] = df['Review Text'].apply(preprocess_text)

# labeling sentiments
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3 else 'negative')




In [31]:
#Logistic Regression

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Cleaned Text'])
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.82      0.59      0.69      1045
    positive       0.89      0.96      0.92      3484

    accuracy                           0.88      4529
   macro avg       0.85      0.78      0.81      4529
weighted avg       0.87      0.88      0.87      4529



In [32]:
# SVM
X = df['Cleaned Text']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_vectorized, y_train)


y_pred = svm_classifier.predict(X_test_vectorized)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.78      0.64      0.71      1045
    positive       0.90      0.95      0.92      3484

    accuracy                           0.88      4529
   macro avg       0.84      0.79      0.81      4529
weighted avg       0.87      0.88      0.87      4529



In [33]:
# Better preforming model (SVM on a 2nd dataset)
# Trip Advisor Hotels

df = pd.read_csv("tripadvisor_hotel_reviews.csv")

df = df[['Review', 'Rating']]

df.dropna(inplace=True)

# Text preprocessing
# nltk.download('stopwords')
# nltk.download('punkt')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower() 
    text = ' '.join([word for word in word_tokenize(text) if word.isalnum()])
    text = ' '.join([stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words])
    return text

df['Cleaned Text'] = df['Review'].apply(preprocess_text)

# labeling sentiments
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3 else 'negative')


# SVM
X = df['Cleaned Text']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_vectorized, y_train)


y_pred = svm_classifier.predict(X_test_vectorized)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.85      0.75      0.80      1057
    positive       0.92      0.95      0.93      3042

    accuracy                           0.90      4099
   macro avg       0.88      0.85      0.87      4099
weighted avg       0.90      0.90      0.90      4099

