In [1]:
import sys, getopt, re

def main(argv):
    inputfile = ''
    try:
        opts, args = getopt.getopt(argv,"i:")
        # print(opts,args)
    except getopt.GetoptError:
        print('DriversAnalysis.py -i <BRAND>')
        sys.exit(2)
    
    for opt, arg in opts:
        if opt == "-i":
            inputfile = arg
    
    if len(inputfile)==0:
        print('usage: DriversAnalysis.py -i <BRAND>')
        sys.exit(2)
    print('Input BRAND is ', inputfile)
    return inputfile

In [2]:
BRAND      = 'HAIRCARE'

In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

---

In [4]:
from fastai.tabular import *  # Quick accesss to tabular functionality

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Pre-Processing

## Data Loading (from saved verbatim)

In [6]:
dirname = './data/excels'

df=1; del df
for filename in os.listdir(dirname):
    df_tmp = pd.read_excel(os.path.join(dirname, filename), header=None, skiprows=1)
    df_tmp['topic'] = filename.split('Aspect_')[-1].replace('_2.xls','')
    
    try:
        df = pd.concat([df,df_tmp])
    except:
        df = df_tmp.copy()

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,topic
0,"I could see tons of dandruff, but the itching ...",Dove Derma Care Scalp Dryness & Itch Relief An...,4.0,My scalp was so itchy I was convinced I must h...,1,Positive,11/10/2018,TARGET.COM,https://www.target.com/p/dove-derma-care-scalp...,Weather
1,The flakes are gone but the itching and sores ...,Head & Shoulders 2 Piece Clinical Solutions Sh...,5.0,Best I’ve ever used. I used to have severe dan...,1,Positive,10/26/2018,AMZ,https://www.amazon.com/Head-Shoulders-Clinical...,Weather
2,In warm weather my fine hair does seem more dr...,Dove Dermacare Scalp Anti-Dandruff Shampoo Dry...,5.0,Thank you for your entire line of Dermacare ha...,1,Positive,9/23/2018,-,https://www.walmart.com/ip/Dove-Dermacare-Scal...,Weather
3,This product always cures my itch during dry w...,Head & Shoulders Volume Boost Hair Shampoo - 1...,4.0,This product always cures my itch during dry w...,1,Positive,9/12/2018,TARGET.COM,https://www.target.com/p/head-shoulders-volume...,Weather
4,Great shampoo for the winter!,Head & Shoulders Volume Boost Hair Shampoo - 1...,5.0,Great shampoo for the winter! l use it around ...,1,Positive,9/5/2018,TARGET.COM,https://www.target.com/p/head-shoulders-volume...,Weather


## Data Cleaning

Extract the text snippets and associated topic; remove duplicates and too short verbatum 

In [7]:
# only keep snippets
df = df[[3, 'topic',2,5]].rename(columns = {3:'verbatum', 2:'rating', 5:'sentiment'})

# enforce format
df['sentiment'] = df['sentiment'].astype(str).\
    apply(lambda x: 1. if x=='Positive' else -1. if x=='Negative' else float('nan')).astype(float)

df.drop_duplicates(inplace=True)

In [8]:
def float_(x):
    try:
        return float(x)
    except:
        return float('nan')
df['rating'] = df['rating'].apply(float_)

df.dropna(inplace=True)

In [22]:
df_ = pd.pivot_table(df, index='verbatum', values='sentiment',columns='topic').reset_index().\
    merge(df[['verbatum','rating']].drop_duplicates()).drop('verbatum', axis=1).fillna(0.)

df_.head()

Unnamed: 0,2in1,Cleanliness,Color Treated Hair,Customer Service,Customer Wishlist,Design,Discontinued Products,Ease of Use,Economical,Environmental,...,Spray Application,Styling,Texture,Thickness,Time or Frequency of Use,Treatment Time,Viscosity,Volume & Fullness,Weather,rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0


In [24]:
df_['rating'] = df_['rating'].apply(lambda x: [0.,0.,float('nan'),1.,2.][int(x)-1])
df_.dropna(inplace=True)

In [25]:
df_.reset_index(inplace=True)
df_['rating'] = df_['rating'].astype(int)

In [26]:
df_.describe()

Unnamed: 0,index,2in1,Cleanliness,Color Treated Hair,Customer Service,Customer Wishlist,Design,Discontinued Products,Ease of Use,Economical,...,Spray Application,Styling,Texture,Thickness,Time or Frequency of Use,Treatment Time,Viscosity,Volume & Fullness,Weather,rating
count,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,...,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0
mean,4604.482551,0.028413,0.146781,0.015916,0.000118,-0.009668,0.006484,0.000118,0.017685,0.012379,...,0.000589,0.013322,0.028413,0.02464,0.085121,0.009314,0.018156,0.030064,0.054822,1.558241
std,2679.072315,0.192463,0.369236,0.153123,0.018807,0.124381,0.102235,0.039151,0.163727,0.114763,...,0.024274,0.123567,0.177815,0.184869,0.301422,0.102016,0.166532,0.204689,0.235788,0.61615
min,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,2280.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,4592.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,6923.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,9251.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


# Drive's Analysis

In [13]:
path = './'

path = Path('./')

Train/test split

In [27]:
seq_trn, seq_tst, y_train, y_test = train_test_split(np.array(range(len(df_)))[:,np.newaxis], df_['rating'],
                                                    stratify=df_['rating'], 
                                                    test_size=0.1)

df_train = df_.iloc[seq_trn.squeeze()]
df_test  = df_.iloc[seq_tst.squeeze()]

define column types

In [28]:
cont_names = list(df_train.columns)[:-1]
dep_var = 'rating'

In [29]:
test = TabularList.from_df(df.iloc[seq_tst.squeeze()], path=path, cont_names=cont_names)

In [30]:
data = (TabularList.from_df(df_, path=path, cat_names=None, cont_names=cont_names)
                           .split_by_idx(seq_trn.squeeze())
                           .label_from_df(cols=dep_var)
                           .databunch())

In [31]:
learner = tabular_learner(data, layers=[200,100], metrics=accuracy)
learner.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,1.081420,1.046426,0.629503
2,0.971609,0.820720,0.651120
3,0.845638,0.749916,0.654264
4,0.743574,0.868140,0.639722
5,0.672209,0.841190,0.635399
6,0.604562,0.917675,0.632124
7,0.545921,0.984176,0.630159
8,0.487582,1.057778,0.626490
9,0.435795,1.131687,0.615878
10,0.400869,1.147508,0.620202


In [32]:
learner.unfreeze()
learner.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,0.310372,1.180184,0.625835
2,0.297333,1.352730,0.628455
3,0.315317,1.468799,0.627014
4,0.353856,1.679512,0.592035
5,0.353298,1.475937,0.622298
6,0.346036,1.533724,0.598847
7,0.327085,1.645455,0.619023
8,0.300318,1.664119,0.601991
9,0.275676,1.664442,0.613913
10,0.254052,1.679109,0.609983


In [33]:
learner.predict(df_.iloc[3]), df_.iloc[3].values[-1]

((Category 2, tensor(2), tensor([2.1392e-05, 3.0170e-03, 9.9696e-01])), 1.0)

In [35]:
learner.predict(df_.iloc[9]), df_.iloc[9].values[-1]

((Category 2, tensor(2), tensor([3.4644e-04, 2.3504e-01, 7.6461e-01])), 1.0)

In [36]:
learner.save('driversanalysis_{:}'.format(BRAND))