In [1]:
#imports

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

In [2]:
# Load the reviews CSV, keeping only rows that have a value in both the
# 'Review Text' and 'Recommended IND' columns (the two columns used below).
data_path = 'Womens_Clothing_E-Commerce_Reviews.csv'
data = (
    pd.read_csv(data_path)
    .dropna(subset=['Review Text', 'Recommended IND'])
)

In [3]:

def feature_extraction(text):
    """Build a keyword-presence feature vector for one review.

    The text is lowercased and then tested for each keyword with a plain
    substring check, so a keyword also matches inside longer words
    (e.g. 'look' is found in 'looks').

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of int
        Nine 0/1 indicators, always in this order: cheap, itchy, why,
        return, look, beautiful, go-to, not worth, quality.
    """
    lowered = text.lower()

    # (feature name, keyword) pairs; the order here defines the column order
    # of the returned vector.
    keyword_by_feature = (
        ('contains_cheap', 'cheap'),
        ('contains_itchy', 'itchy'),
        ('contains_why', 'why'),
        ('contains_return', 'return'),
        ('contains_look', 'look'),
        ('contains_beautiful', 'beautiful'),
        ('contains_go-to', 'go-to'),
        ('contains_not_worth', 'not worth'),
        ('contains_quality', 'quality'),
    )

    # Presence (1) or absence (0) of each keyword, not a count.
    return [1 if keyword in lowered else 0 for _name, keyword in keyword_by_feature]

In [4]:
# Turn every review into its keyword-indicator vector (one row per review)
features = np.array(data['Review Text'].apply(feature_extraction).tolist())

# Targets come straight from the 'Recommended IND' column
labels = data['Recommended IND'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply those same
# statistics to both splits so no test information leaks into training
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVM with C=0.01
model = SVC(C=0.01, kernel='linear').fit(X_train_scaled, y_train)

# Score the fitted model on both splits
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

print(f"Training Accuracy: {accuracy_score(y_train, y_pred_train)}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")

Training Accuracy: 0.8251987632508834
Test Accuracy: 0.8302053433429013


In [5]:
# TODO: report the F1 score alongside accuracy_score
# TODO: read up on precision and recall (the two components of F1)