In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import math
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB


In [17]:
# Reading the data
dataset = pd.read_json('../data/renttherunway_final_data.json.gz', lines=True)
dataset = dataset.dropna()
dataset = dataset.drop(dataset[dataset['rented for'] == "party: cocktail"].index)

In [18]:
# Converting the data into list of dictionaries
data = pd.DataFrame(dataset).to_dict('record')

for d in data:
    d['weight'] = int(d['weight'].split('lbs')[0])
    if(int(d['height'].split(' ')[1].split('"')[0])<10):
        height2 = int(d['height'].split(' ')[1].split('"')[0])*10
    else:
        height2 = int(d['height'].split(' ')[1].split('"')[0])
    d['height'] = int(d['height'][0][0])*100+height2
    
# Converting categroies type to one hot
catogeries = ['rented for','body type']
for cat in catogeries:
    categories_list = defaultdict(int)
    for d in data:
        categories_list[d[cat]] += 1
        
    categories_id = defaultdict(int)

    i = 0
    for cID in  categories_list:
        categories_id[cID] = i
        i+=1
    for d in data:
        f = [0]*len(categories_list)
        f[categories_id[d[cat]]] = 1
        d[cat] = f[:len(categories_list)-1]

  


In [19]:
# Extracting features from the data
def feature(d):
    f = [1, d['rating'], len(d['review_text']), len(d['review_summary']), d['weight'], d['height'], int(d['bust size'][0:2])] + d['rented for'] + d['body type']
    return f

In [20]:
X = [feature(d) for d in data]
y = [d['fit'] for d in data]

In [21]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [22]:
# # Generating train features
# X_train = [feature(d) for d in train_data]
# y_train = [d['fit'] for d in train_data]

In [23]:
X_train = X[0:int(0.8*len(data))]
X_test = X[int(0.8*len(data)):]

y_train = y[0:int(0.8*len(data))]
y_test = y[int(0.8*len(data)):]

In [24]:
sampling_strategy = "not majority"
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [33]:
# model = LogisticRegression(random_state=0, multi_class='multinomial', class_weight='balanced')

# model = RandomForestClassifier(random_state=42, class_weight='balanced')

model = GradientBoostingClassifier(random_state=42)

# model = KNeighborsClassifier(n_neighbors=3)

# model = MultinomialNB()

clf = model.fit(X_train, y_train)

In [34]:
y_pred_train = clf.predict(X_train)

# Measuring performace of train data
train_accuracy = accuracy_score(y_train, y_pred_train)
train_f1_score = f1_score(y_train, y_pred_train, average='weighted')
train_precision_score = precision_score(y_train, y_pred_train, average='weighted')
train_recall_score = recall_score(y_train, y_pred_train, average='weighted')

print('Train Accuracy:', train_accuracy)
print('Train F1 Score:', train_f1_score)
print('Train Precision Score:', train_precision_score)
print('Train Recall Score:', train_recall_score)

Train Accuracy: 0.7388816778248395
Train F1 Score: 0.6430591986819424
Train Precision Score: 0.6734881731610273
Train Recall Score: 0.7388816778248395


In [35]:
# Generating test features
# X_test = [feature(d) for d in test_data]
# y_test = [d['fit'] for d in test_data]
y_pred_test = clf.predict(X_test)

# Measuring performace of test data
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')
test_precision_score = precision_score(y_test, y_pred_test, average='weighted')
test_recall_score = recall_score(y_test, y_pred_test, average='weighted')

print('Test Accuracy:', test_accuracy)
print('Test F1 Score:', test_f1_score)
print('Test Precision Score:', test_precision_score)
print('Test Recall Score:', test_recall_score)

Test Accuracy: 0.7385913376144282
Test F1 Score: 0.6425221751402064
Test Precision Score: 0.6636045259300719
Test Recall Score: 0.7385913376144282
