In [None]:
!pip install torch
!pip install fastai==1.0.61

In [2]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os
import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from typing import Dict
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [5]:
RENTTHERUNWAY_1_FILE_PATH = Path('./renttherunway_data_1.json')
rtr_df_1 = pd.read_json(RENTTHERUNWAY_1_FILE_PATH, lines=True)
RENTTHERUNWAY_2_FILE_PATH = Path('./renttherunway_data_2.json')
rtr_df_2 = pd.read_json(RENTTHERUNWAY_2_FILE_PATH, lines=True)
MOD_CLOTH_FILE_PATH = Path('./modcloth_data.json')
#mc_df = pd.read_json(MOD_CLOTH_FILE_PATH, lines=True)

rtr_df_1 = pd.concat([rtr_df_1,rtr_df_2],ignore_index=True)


## Clean Text

In [None]:
rtr_df_1['review_text'] = rtr_df_1['review_text'].str.replace("[^a-zA-Z]", " ")
nltk.download('stopwords')
stop_words = stopwords.words('english')
tokenized_doc = rtr_df_1['review_text'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
detokenized_doc = [] 
for i in range(len(rtr_df_1)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 

rtr_df_1['review_text'] = detokenized_doc

In [7]:
labels = {"small":0,"fit":1,"large":2}
rtr_df_1['label'] = rtr_df_1['fit'].apply(lambda x: labels[x])

### Keep only most relevant features

In [8]:
rtr_df_1 = rtr_df_1.drop(columns=['user_id','bust size','item_id','weight','rating','rented for','body type','height','age','review_date'])

In [9]:
rtr_df_1_trn, rtr_df_1_test = train_test_split(rtr_df_1, stratify = rtr_df_1['fit'], test_size = 0.2, random_state = 12)
rtr_df_1_trn, rtr_df_1_val = train_test_split(rtr_df_1_trn, stratify = rtr_df_1_trn['fit'], test_size = 0.2, random_state = 11)

In [10]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df=rtr_df_1_trn, valid_df=rtr_df_1_val, test_df=rtr_df_1_test, path=".", text_cols=["review_text","review_summary"], label_cols=["label"])

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = ".", train_df = rtr_df_1_trn, valid_df = rtr_df_1_val, test_df=rtr_df_1_test, vocab=data_lm.train_ds.vocab, bs=32, text_cols=["review_text","review_summary"], label_cols=["label"])

## Language Model

In [11]:
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3,pretrained=True, metrics=[accuracy])

Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd.tgz


In [12]:
learn.fit_one_cycle(1, 1.45e-2)

epoch,train_loss,valid_loss,accuracy,time
0,4.190208,4.080743,0.261185,04:33


In [13]:
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.922355,3.892767,0.281607,05:58
1,3.791088,3.777313,0.293458,05:58
2,3.699809,3.709336,0.298465,05:59
3,3.614547,3.668128,0.303666,05:58
4,3.521658,3.650615,0.305876,05:59
5,3.426055,3.650783,0.306454,05:59
6,3.339106,3.66698,0.305502,05:59
7,3.253825,3.682922,0.304536,05:58
8,3.178952,3.703623,0.303086,05:59
9,3.146857,3.711982,0.302656,05:59


In [14]:
learn.save_encoder('finetuned')

## Classifier

In [None]:
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.7, metrics=[accuracy])
learn.load_encoder('finetuned')

In [19]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.621506,0.566433,0.78638,02:49


In [18]:
rtr_df_1.head()


Unnamed: 0,fit,review_text,review_summary,category,size,label
0,fit,An adorable romper Belt zipper little hard nav...,So many compliments!,romper,14,1
1,fit,I rented dress photo shoot The theme Hollywood...,I felt so glamourous!!!,gown,12,1
2,fit,This hugged right places It perfect dress even...,It was a great time to celebrate the (almost) ...,sheath,4,1
3,fit,I rented company black tie awards banquet I li...,Dress arrived on time and in perfect condition.,dress,8,1
4,fit,I always petite upper body extremely athletic ...,Was in love with this dress !!!,gown,12,1


In [21]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.487946,0.452208,0.835622,03:23


In [22]:
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.476607,0.442015,0.840783,05:11


In [23]:
learn.unfreeze()
learn.fit_one_cycle(12, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.458848,0.440374,0.841692,06:49
1,0.463778,0.437292,0.841919,06:57
2,0.435088,0.437415,0.844126,05:56
3,0.444873,0.433527,0.844321,06:09
4,0.424185,0.432606,0.845652,06:38
5,0.432352,0.434413,0.846463,06:14
6,0.420299,0.439691,0.845587,06:53
7,0.375893,0.43929,0.845977,06:30
8,0.393565,0.442589,0.845749,07:07
9,0.374063,0.447802,0.843802,06:44


In [24]:
learn.save('classifier')

In [25]:
preds, y, losses = learn.get_preds(ds_type=DatasetType.Test ,with_loss=True)
#interp = ClassificationInterpretation(learn, preds, rtr_df_1_test['label'], losses)
#interp.plot_confusion_matrix()

In [26]:
from sklearn.metrics import confusion_matrix
y_pred = np.argmax(preds.numpy(),axis=1)

confusion_matrix(rtr_df_1_test['label'], y_pred)

array([[ 2790,  2254,   112],
       [  795, 26946,   671],
       [  127,  2114,  2700]])

In [35]:
train_preds, train_y, train_losses = learn.get_preds(ds_type='train', with_loss=True)

In [48]:
trn_values = rtr_df_1_trn.category.unique()
test_values = rtr_df_1_test.category.unique()
category_map = {}
for i,value in enumerate(trn_values):
  category_map[value] = i
for i,value in enumerate(test_values):
  category_map[value] = i

In [None]:
rtr_df_1_trn['category_map'] = rtr_df_1_trn['category'].apply(lambda x: category_map[x])
rtr_df_1_test['category_map'] = rtr_df_1_test['category'].apply(lambda x: category_map[x])

In [55]:
train_sizes = np.array(rtr_df_1_trn['size']).reshape((-1,1))
train_cats = np.array(rtr_df_1_trn['category_map']).reshape((-1,1))
X_train = np.concatenate((train_preds, train_sizes, train_cats),axis=1)

In [56]:
y_train = rtr_df_1_trn['label']

In [57]:
test_sizes = np.array(rtr_df_1_test['size']).reshape((-1,1))
test_cats = np.array(rtr_df_1_test['category_map']).reshape((-1,1))
X_test = np.concatenate((preds, test_sizes, test_cats),axis=1)

In [58]:
y_test = rtr_df_1_test['label']

In [62]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=800)
classifier.fit(X_train, y_train)
final_predictions = classifier.predict(X_test)

In [64]:
confusion_matrix(y_test, final_predictions)

array([[ 2910,  2123,   123],
       [  941, 26639,   832],
       [  132,  2000,  2809]])