In [None]:
!pip install torch
!pip install fastai==1.0.61

In [3]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os
import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from typing import Dict
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [4]:
# Locations of the raw data files, relative to the notebook's working directory.
RENTTHERUNWAY_1_FILE_PATH = Path('./renttherunway_data_1.json')
RENTTHERUNWAY_2_FILE_PATH = Path('./renttherunway_data_2.json')
MOD_CLOTH_FILE_PATH = Path('./modcloth_data.json')

# The files are read as JSON Lines (one record per line), hence lines=True.
rtr_df_1 = pd.read_json(RENTTHERUNWAY_1_FILE_PATH, lines=True)
rtr_df_2 = pd.read_json(RENTTHERUNWAY_2_FILE_PATH, lines=True)
mc_df = pd.read_json(MOD_CLOTH_FILE_PATH, lines=True)

# Stack both Rent the Runway parts into one frame with a fresh 0..n-1 index.
rtr_df_1 = pd.concat(
    [rtr_df_1, rtr_df_2],
    ignore_index=True,
)


## Clean Text

In [5]:
# Replace every non-letter character with a space. regex=True is passed
# explicitly: newer pandas releases treat the pattern as a literal string by
# default, which would silently turn this step into a no-op.
rtr_df_1['review_text'] = rtr_df_1['review_text'].str.replace("[^a-zA-Z]", " ", regex=True)

nltk.download('stopwords')
stop_words = stopwords.words('english')
# Membership tests against a set are O(1); scanning the list for every token
# of every review is needlessly slow.
stop_word_set = set(stop_words)

# Whitespace-tokenize each review and drop stop words.
# The comparison is case-sensitive, exactly as before.
tokenized_doc = rtr_df_1['review_text'].apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)

# Re-join the remaining tokens into one string per review. Iterating over the
# values (instead of tokenized_doc[i]) does not depend on the frame having a
# 0..n-1 index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

rtr_df_1['review_text'] = detokenized_doc

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Integer class id for each value of the 'fit' column.
labels = {"small": 0, "fit": 1, "large": 2}
# labels.__getitem__ raises KeyError on any value outside the mapping.
rtr_df_1['label'] = rtr_df_1['fit'].map(labels.__getitem__)

### Keep only the most relevant features

In [7]:
# Columns removed before building the text models.
unused_columns = [
    'user_id',
    'bust size',
    'item_id',
    'weight',
    'rating',
    'rented for',
    'body type',
    'height',
    'age',
    'review_date',
]
rtr_df_1 = rtr_df_1.drop(columns=unused_columns)

In [8]:
# Hold out 20% of all rows as the test set, then 20% of the remaining rows as
# the validation set. Both splits are stratified on the 'fit' column and use a
# fixed random_state.
rtr_df_1_trn, rtr_df_1_test = train_test_split(
    rtr_df_1,
    test_size=0.2,
    stratify=rtr_df_1['fit'],
    random_state=12,
)
rtr_df_1_trn, rtr_df_1_val = train_test_split(
    rtr_df_1_trn,
    test_size=0.2,
    stratify=rtr_df_1_trn['fit'],
    random_state=11,
)

In [13]:
# Columns shared by both data bunches.
text_columns = ["review_text", "review_summary"]
label_columns = ["label"]

# Language model data
data_lm = TextLMDataBunch.from_df(
    path=".",
    train_df=rtr_df_1_trn,
    valid_df=rtr_df_1_val,
    test_df=rtr_df_1_test,
    text_cols=text_columns,
    label_cols=label_columns,
)

# Classifier model data (reuses the vocabulary built for the language model)
data_clas = TextClasDataBunch.from_df(
    path=".",
    train_df=rtr_df_1_trn,
    valid_df=rtr_df_1_val,
    test_df=rtr_df_1_test,
    vocab=data_lm.train_ds.vocab,
    bs=32,
    text_cols=text_columns,
    label_cols=label_columns,
)

## Language Model

In [10]:
# AWD-LSTM language model learner on the LM data bunch, starting from
# pretrained weights and reporting accuracy.
learn = language_model_learner(
    data_lm,
    AWD_LSTM,
    pretrained=True,
    drop_mult=0.3,
    metrics=[accuracy],
)

In [None]:
# One one-cycle epoch before the model is unfrozen in the next cell.
learn.fit_one_cycle(cyc_len=1, max_lr=1.45e-2)

In [41]:
# Unfreeze the learner and fine-tune it for ten epochs.
learn.unfreeze()
learn.fit_one_cycle(cyc_len=10, max_lr=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.907842,3.893356,0.281477,05:56
1,3.814726,3.776103,0.293238,05:56
2,3.711,3.706917,0.299819,05:56
3,3.630761,3.670982,0.303275,05:57
4,3.520856,3.651979,0.305562,05:56
5,3.433097,3.651414,0.306217,05:56
6,3.346932,3.664851,0.305406,05:56
7,3.240492,3.682674,0.304216,05:57
8,3.183269,3.702075,0.303103,05:57
9,3.138128,3.711712,0.302361,05:56


In [42]:
# Persist the fine-tuned encoder; the classifier cell loads it under the same name.
learn.save_encoder(name='finetuned')

## Classifier

In [14]:
# Classifier on top of AWD-LSTM; `learn` is rebound from the language model
# learner to the classifier learner here.
learn = text_classifier_learner(
    data_clas,
    AWD_LSTM,
    drop_mult=0.7,
    metrics=[accuracy],
)
# Load the encoder saved as 'finetuned' by the language-model section.
learn.load_encoder(name='finetuned')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (123228 items)
x: TextList
xxbos i rented pleasantly surprised see fit xxmaj best taller person hips though pockets add volume convenient xxmaj the dress was romantic and comfortable , understated but special . xxmaj great for work .,xxbos i loved dress got tons compliments i wore size regular touch short side i dresses usually long i wear low heal dress xxmaj the hem front little weird i ended pinning lining longer rest dress xxmaj but overall dress great xxmaj comfortable and flattering for a xxup nye wedding !,xxbos xxmaj very short front must wear high waisted skirt shorts xxmaj pulling option since v low xxmaj pretty color / style for a dinner date,xxbos xxmaj my first choice event big last minute xxup rtr saved day sent replacement xxmaj unfortunately bunched much waist bit tight hips personal comfort xxmaj overall bad fit fabric looked felt cheap doubt dress look cute festive right body mine certainly xxmaj thanks xxup rtr gre

In [15]:
# Stage 1: a single one-cycle epoch right after loading the encoder.
learn.fit_one_cycle(cyc_len=1, max_lr=1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.658127,0.579381,0.781348,02:29


In [16]:
# Stage 2: freeze_to(-2), then one epoch with a learning-rate slice running
# from peak_lr / 2.6**4 up to peak_lr.
learn.freeze_to(-2)
peak_lr = 1e-2
learn.fit_one_cycle(cyc_len=1, max_lr=slice(peak_lr / (2.6**4), peak_lr))

epoch,train_loss,valid_loss,accuracy,time
0,0.517696,0.454409,0.833869,03:08


In [17]:
# Stage 3: freeze_to(-3), then one epoch with a learning-rate slice running
# from peak_lr / 2.6**4 up to peak_lr.
learn.freeze_to(-3)
peak_lr = 5e-3
learn.fit_one_cycle(cyc_len=1, max_lr=slice(peak_lr / (2.6**4), peak_lr))

epoch,train_loss,valid_loss,accuracy,time
0,0.486628,0.443537,0.840199,04:46


In [18]:
# Stage 4: unfreeze the learner and train for twelve epochs with a
# learning-rate slice running from peak_lr / 2.6**4 up to peak_lr.
learn.unfreeze()
peak_lr = 1e-3
learn.fit_one_cycle(cyc_len=12, max_lr=slice(peak_lr / (2.6**4), peak_lr))

epoch,train_loss,valid_loss,accuracy,time
0,0.461388,0.441694,0.8414,06:26
1,0.466172,0.439644,0.841659,06:32
2,0.467331,0.506581,0.841887,06:30
3,0.429711,0.437919,0.843704,07:00
4,0.457737,0.434511,0.844808,06:02
5,0.44349,0.436334,0.843607,06:24
6,0.398444,0.437674,0.84497,06:31


Buffered data was truncated after reaching the output size limit.

In [19]:
# Save the trained classifier learner under the name 'classifier'.
learn.save('classifier')

In [None]:
#learn.data.add_test(rtr_df_1_test.drop(columns='label'),label=rtr_df_1_test['label'])

In [None]:
# Predict on the test set. ordered=True asks fastai to return the predictions
# in the original row order of the test data; text data loaders may batch
# samples sorted by length, and the confusion matrix below compares `preds`
# row-by-row against rtr_df_1_test['label'].
# NOTE(review): `y` comes from the test data bunch, not from
# rtr_df_1_test['label'] — confirm it is meaningful before using it.
preds, y, losses = learn.get_preds(ds_type=DatasetType.Test, with_loss=True, ordered=True)
#interp = ClassificationInterpretation(learn, preds, rtr_df_1_test['label'], losses)
#interp.plot_confusion_matrix()

In [43]:
# NOTE(review): `y_pred` is first assigned in a later cell (the confusion-matrix
# cell), so on a fresh kernel run top-to-bottom this cell raises NameError.
# The saved output below comes from an earlier, out-of-order execution.
y_pred

tensor(1.0000)

In [46]:
from sklearn.metrics import confusion_matrix

# Predicted class id = index of the largest value in each row of `preds`.
y_pred = preds.numpy().argmax(axis=1)

# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=rtr_df_1_test['label'], y_pred=y_pred)

array([[ 2842,  2199,   115],
       [  820, 26937,   655],
       [  127,  2155,  2659]])

In [47]:
# Number of predictions produced for the test set.
y_pred.shape

(38509,)

In [48]:
# (rows, columns) of the held-out test frame.
rtr_df_1_test.shape

(38509, 6)