In [1]:
import sys, getopt, re

def main(argv):
    inputfile = ''
    try:
        opts, args = getopt.getopt(argv,"i:")
        # print(opts,args)
    except getopt.GetoptError:
        print('TopicClassifier.py -i <BRAND>')
        sys.exit(2)
    
    for opt, arg in opts:
        if opt == "-i":
            inputfile = arg
    
    if len(inputfile)==0:
        print('usage: TopicClassifier.py -i <BRAND>')
        sys.exit(2)
    print('Input BRAND is ', inputfile)
    return inputfile

In [2]:
BRAND      = 'HEAD & SHOULDERS'

In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

---

In [4]:
from fastai.text import * # Quick access to NLP functionality

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Pre-Processing

## Data Loading (from saved verbatim)

In [6]:
df = pd.read_csv('./data/sample_data.csv', dtype={'verbatum':object})

df.head()

Unnamed: 0,snippet,product,rating,verbatum,sentiment_binary,sentiment,date,retailer,website,topic
0,This shampoo sprays crazy good and it has save...,Head And Shoulders Smooth & Silky Dandruff Sha...,5.0,This shampoo sprays crazy good and it has save...,1,Positive,5/2/2018,-,https://www.walmart.com/ip/Head-and-Shoulders-...,Spray Application
1,The thing I always remember about Head & Shoul...,Head And Shoulders Green Apple Anti-Dandruff S...,5.0,My teenaged daughter has been using Head & Sho...,1,Positive,9/29/2017,AMZ,https://www.amazon.com/Head-Shoulders-Anti-Dan...,Spray Application
2,"My long, thick, wavy, hair is frequently abuse...",Dove Dermacare Scalp Anti-Dandruff Shampoo Inv...,4.0,"To my surprise, Dove DermaCare Scalp Invigorat...",1,Positive,1/27/2017,WALMART,https://www.walmart.com/ip/Dove-Dermacare-Scal...,Spray Application
3,"From their body sprays, to their deodorant, so...",Dove Dermacare Scalp Anti-Dandruff Shampoo Dry...,5.0,I let my boyfriend try this product. He loves ...,1,Positive,1/27/2017,-,https://www.walmart.com/ip/Dove-Dermacare-Scal...,Spray Application
4,"Its conveniently packaged, but if this bottle ...","Head And Shoulders Dry Scalp Care, Almond Oil,...",4.0,"Its conveniently packaged, but if this bottle ...",1,Positive,1/14/2016,AMZ,https://www.amazon.com/Head-Shoulders-Almond-D...,Spray Application


## Data Cleaning

Extract the text snippets and associated topic; remove duplicates and too short verbatum 

In [7]:
# only keep snippets
df = df[['snippet','topic']].rename(columns = {'snippet':'text', 'topic':'label'})

# enforce format
df['text'] = df['text'].astype(str)

# remove short comments (keep 2nd percentile and higher lenghts)
thresh = df['text'].apply(len).quantile(0.02)

df = df.loc[df['text'].apply(lambda x: len(x)>=thresh)].\
    drop_duplicates()

In [8]:
# keep only topics with enough samples
df_counts = df.groupby('label').size()
df = df.loc[df.label.isin(df_counts[df_counts>df_counts.max()/100].index.tolist())]

In [9]:
# reset index
df = df.reset_index().drop('index', axis=1)

## Pre-Tokenization

In [10]:
# # Setup stop words
# from spacy.lang.en.stop_words import STOP_WORDS

# for word in STOP_WORDS:
#     for w in (word, word[0].capitalize(), word.upper()):
#         lex = nlp.vocab[w]
#         lex.is_stop = True

In [11]:
# from nltk.corpus import stopwords
import spacy
import string

# stop = set(stopwords.words('english'))
exclude = set(string.punctuation)-set(['!','?'])

nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

def preprocess(doc, tags=['NOUN', 'ADJ','VERB','ADV']):
    # make entire text lower case
    doc = doc.lower()    
    
    # replace "n't" with " not" & remove unwanted characters, numbers and symbols
    doc = doc.replace("n\'t", " not").replace("[^a-zA-Z#]", " ")
       
    # remove stop words
    # doc = " ".join([i for i in doc.split() if i not in stop])
    
    # remove short words (length < 3)
    # doc = " ".join([r for r in doc.split() if len(r)>2])
    
    # remove punctuation
    # doc = ''.join(ch for ch in doc if ch not in exclude)
    
    # lemmatization
    # doc = " ".join([token.lemma_ for token in nlp(doc) if token.pos_ in tags])
    return doc

In [12]:
# removes stop words, short words, and punctuation; lowercase and lemmatize all
print(' Pre-Tokenization')
df['text']        = df['text'].apply(preprocess)

 Pre-Tokenization


# Topic Classification

In [13]:
path = './'

path = Path('./')

Train/test split

In [14]:
seq_trn, seq_tst, y_train, y_test = train_test_split(np.array(range(len(df)))[:,np.newaxis], df['label'],
                                                    stratify=df['label'], 
                                                    test_size=0.25)

df_train = df.iloc[seq_trn.squeeze()]
df_test  = df.iloc[seq_tst.squeeze()]

Parameter for classifier model

In [15]:
moms = (0.8,0.7)

Load the language model:

In [16]:
with open('./models/vocab_{:}.pkl'.format(BRAND.replace(' ','')), 'rb') as handle:
    vocab = pickle.load(handle)

Create a classifier:

In [17]:
data_clas = TextClasDataBunch.from_df(path, train_df=df_train, valid_df=df_test, 
                                      label_cols='label', text_cols='text', \
                                      vocab=vocab, bs=64)

Compute weights for balancing classes

In [18]:
train_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(df_train['label']),
                                                 df_train['label'])

In [21]:
learner = text_classifier_learner(data_clas, AWD_LSTM)
learner.load_encoder('encoder_{:}'.format(BRAND.replace(' ','')))

# use class weights
loss_weights = torch.FloatTensor(train_weights).cuda()
learner.crit = partial(F.cross_entropy, weight=loss_weights)

learner.freeze()
learner.fit_one_cycle(4, moms=moms)

epoch,train_loss,valid_loss,accuracy
1,2.779095,2.36762,0.347422
2,2.731513,2.327848,0.349788
3,2.695337,2.286179,0.359008
4,2.693579,2.264864,0.36423


In [22]:
learner.unfreeze()
learner.fit_one_cycle(8, slice(1e-5,1e-3), moms=moms)

epoch,train_loss,valid_loss,accuracy
1,2.614967,2.237946,0.369125
2,2.576445,2.20269,0.381854
3,2.565689,2.152503,0.392298
4,2.518616,2.088439,0.407719
5,2.475763,2.023911,0.428198
6,2.424518,2.020624,0.427627
7,2.427917,1.991204,0.436276
8,2.423493,1.992771,0.43187


In [23]:
learner.freeze()
learner.fit_one_cycle(4, moms=moms)

epoch,train_loss,valid_loss,accuracy
1,2.410148,1.975993,0.442232
2,2.454553,1.963489,0.441253
3,2.387548,1.946502,0.44623
4,2.384453,1.932074,0.448678


learner.predict(preprocess("the product left me with itch scalp"))

learner.predict(preprocess("the product did not perform well"))

learner.predict(preprocess("the rain gave me frizzy hair"))

learner.predict(preprocess("the cold temperatures affected my hair"))

probs, y_correct = learner.get_preds(ds_type = DatasetType.Valid)

preds = np.argmax(probs.data.numpy(),axis=1)

from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_correct.data.numpy(), preds)

print(classification_report(y_correct.data.numpy(), preds))

np.set_printoptions(precision=2)

cm

In [24]:
learner.save('topicclassifier_{:}'.format(BRAND.replace(' ','')))

In [35]:
learner.export(fname = 'models/topicclassifier_{:}.pkl'.format(BRAND.replace(' ','')))