In [50]:
### Created with help of https://realpython.com/logistic-regression-python/, 
### https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python,
### and ChatGPT


import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib

df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")

df = df[['Review Text', 'Rating']]

df.dropna(inplace=True)

# Text preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower() 
    text = ' '.join([word for word in word_tokenize(text) if word.isalnum()])
    text = ' '.join([stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words])
    return text

df['Cleaned Text'] = df['Review Text'].apply(preprocess_text)

# labeling sentiments
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3 else 'negative')




In [31]:
#Logistic Regression

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Cleaned Text'])
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.82      0.59      0.69      1045
    positive       0.89      0.96      0.92      3484

    accuracy                           0.88      4529
   macro avg       0.85      0.78      0.81      4529
weighted avg       0.87      0.88      0.87      4529



In [51]:
# SVM
X = df['Cleaned Text']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_vectorized, y_train)

joblib.dump(svm_classifier, "svm_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")



y_pred = svm_classifier.predict(X_test_vectorized)


print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.78      0.64      0.71      1045
    positive       0.90      0.95      0.92      3484

    accuracy                           0.88      4529
   macro avg       0.84      0.79      0.81      4529
weighted avg       0.87      0.88      0.87      4529



In [75]:
# Using the model on a new dataset (corpus)

import pandas as pd
import joblib

# Load the dataset
df = pd.read_csv("amazon_clothing_review.csv")
df['Review'].fillna('', inplace=True)

# Load the saved SVM model and TF-IDF vectorizer
svm_model = joblib.load("svm_model.pkl")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")

X_vectorized = tfidf_vectorizer.transform(df['Review'])

predictions = svm_model.predict(X_vectorized)

df['Predictions'] = predictions
df['Cons_sentiment'] = df['Cons_rating'].apply(lambda x: 'negative' if x <= 3 else 'positive')

# create a cross-tabulation table
cross_tab = pd.crosstab(df['Predictions'], df['Cons_sentiment'], margins=True, margins_name="Total")

print(cross_tab)


Cons_sentiment  negative  positive  Total
Predictions                              
negative            3301      1405   4706
positive            9197     35435  44632
Total              12498     36840  49338
