In [1]:
import sys, getopt, re

def main(argv):
    """Parse the command-line arguments and return the BRAND passed via ``-i``.

    Parameters
    ----------
    argv : list of str
        Argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns
    -------
    str
        The value given to ``-i``; if the flag is repeated, the last one wins.

    Exits with status 2 (after printing a usage line) when the options cannot
    be parsed or when no non-empty ``-i`` value was supplied.
    """
    try:
        parsed_opts, _ = getopt.getopt(argv, "i:")
    except getopt.GetoptError:
        print('LanguageModeling.py -i <BRAND>')
        sys.exit(2)

    # Collect every -i value; the most recent occurrence takes precedence.
    brand_values = [value for flag, value in parsed_opts if flag == "-i"]
    brand = brand_values[-1] if brand_values else ''

    if not brand:
        print('usage: LanguageModeling.py -i <BRAND>')
        sys.exit(2)

    print('Input BRAND is ', brand)
    return brand

In [2]:
# Brand tag, hard-coded here; it is interpolated into the file names of every
# artifact saved later in this notebook (checkpoint, encoder, vocab, embeddings).
BRAND      = 'HAIRCARE'

In [3]:
import os
# Set the CUDA device-selection variables before fastai is imported below.
# NOTE(review): presumably this pins the run to the GPU at PCI-bus index 2 — confirm
# that index exists on the target machine.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

---

In [4]:
from fastai.text import * # Quick access to NLP functionality
from fastai.callbacks import EarlyStoppingCallback, SaveModelCallback

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Pre-Processing

## Data Loading (from saved verbatim)

In [6]:
# Load the saved review sample; the 'verbatum' column is read with object dtype.
df = pd.read_csv('./data/sample_data.csv', dtype={'verbatum':object})

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,snippet,product,rating,verbatum,sentiment_binary,sentiment,date,retailer,website,topic
0,This shampoo sprays crazy good and it has save...,Head And Shoulders Smooth & Silky Dandruff Sha...,5.0,This shampoo sprays crazy good and it has save...,1,Positive,5/2/2018,-,https://www.walmart.com/ip/Head-and-Shoulders-...,Spray Application
1,The thing I always remember about Head & Shoul...,Head And Shoulders Green Apple Anti-Dandruff S...,5.0,My teenaged daughter has been using Head & Sho...,1,Positive,9/29/2017,AMZ,https://www.amazon.com/Head-Shoulders-Anti-Dan...,Spray Application
2,"My long, thick, wavy, hair is frequently abuse...",Dove Dermacare Scalp Anti-Dandruff Shampoo Inv...,4.0,"To my surprise, Dove DermaCare Scalp Invigorat...",1,Positive,1/27/2017,WALMART,https://www.walmart.com/ip/Dove-Dermacare-Scal...,Spray Application
3,"From their body sprays, to their deodorant, so...",Dove Dermacare Scalp Anti-Dandruff Shampoo Dry...,5.0,I let my boyfriend try this product. He loves ...,1,Positive,1/27/2017,-,https://www.walmart.com/ip/Dove-Dermacare-Scal...,Spray Application
4,"Its conveniently packaged, but if this bottle ...","Head And Shoulders Dry Scalp Care, Almond Oil,...",4.0,"Its conveniently packaged, but if this bottle ...",1,Positive,1/14/2016,AMZ,https://www.amazon.com/Head-Shoulders-Almond-D...,Spray Application


## Data Cleaning

Extract the full review text (the `verbatum` column), then remove duplicates and reviews that are too short.

In [7]:
# Keep only the full review text, renamed to 'text'.
# Missing reviews are dropped *before* the cast to str: astype(str) would
# otherwise turn NaN into the literal string 'nan', which would then be kept
# as a 3-character "review" and distort the length quantile below.
df = (
    df[['verbatum']]
    .rename(columns={'verbatum': 'text'})
    .dropna(subset=['text'])
    .astype({'text': str})
)

# Remove short comments: keep lengths at or above the 2nd percentile.
# Lengths are computed once and reused for both the threshold and the filter.
text_len = df['text'].str.len()
thresh = text_len.quantile(0.02)

# Filter first, then drop exact duplicate reviews.
df = df.loc[text_len >= thresh].drop_duplicates()

In [8]:
# Preview the cleaned frame, which now holds only the 'text' column.
df.head()

Unnamed: 0,text
0,This shampoo sprays crazy good and it has save...
1,My teenaged daughter has been using Head & Sho...
2,"To my surprise, Dove DermaCare Scalp Invigorat..."
3,I let my boyfriend try this product. He loves ...
4,"Its conveniently packaged, but if this bottle ..."


## Pre-Tokenization

In [9]:
# from nltk.corpus import stopwords
import spacy
import string

# stop = set(stopwords.words('english'))
# All string.punctuation characters except '!' and '?'.
# In the visible notebook this set is only referenced by a commented-out step in preprocess().
exclude = set(string.punctuation)-set(['!','?'])

# spaCy pipeline 'en_core_web_lg' loaded with the 'parser' and 'ner' components disabled.
# In the visible notebook it is only referenced by the commented-out lemmatization step in preprocess().
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

def preprocess(doc, tags=('NOUN', 'ADJ', 'VERB', 'ADV'), strip_non_alpha=False):
    """Normalize one review before it is handed to the tokenizer.

    Active steps:
      1. lower-case the text;
      2. rewrite the contraction suffix "n't" as " not";
      3. (opt-in) replace every character outside ``[a-zA-Z#]`` with a space.

    Parameters
    ----------
    doc : str
        Raw review text.
    tags : sequence of str
        Part-of-speech tags for the (currently disabled) lemmatization filter.
        Unused by the active steps. The default is a tuple so that no mutable
        object is shared between calls.
    strip_non_alpha : bool, default False
        When True, apply the character filter as a real regular expression.
        The previous code passed the pattern to ``str.replace``, which treats
        it as a literal string; after lower-casing that literal can never
        occur, so the step silently did nothing. The default keeps that
        effective behavior (digits and punctuation are preserved); pass True
        to actually strip them.

    Returns
    -------
    str
        The normalized text.
    """
    # make entire text lower case
    doc = doc.lower()

    # replace "n't" with " not"
    doc = doc.replace("n't", " not")

    # optionally remove unwanted characters, numbers and symbols
    if strip_non_alpha:
        doc = re.sub(r"[^a-zA-Z#]", " ", doc)

    # remove stop words
    # doc = " ".join([i for i in doc.split() if i not in stop])

    # remove short words (length < 3)
    # doc = " ".join([r for r in doc.split() if len(r)>2])

    # remove punctuation
    # doc = ''.join(ch for ch in doc if ch not in exclude)

    # lemmatization
    # doc = " ".join([token.lemma_ for token in nlp(doc)])# if token.pos_ in tags])
    return doc

In [10]:
# Apply preprocess() to every review: it lower-cases the text and rewrites "n't" as " not".
# (The stop-word, short-word, punctuation and lemmatization steps are commented out in preprocess().)
print(' Pre-Tokenization')
df['text']        = df['text'].apply(preprocess)

 Pre-Tokenization


# Language Modeling

In [11]:
# Working directory, passed as the `path` argument when the DataBunch is built.
# (The earlier plain-string assignment was dead: it was overwritten immediately.)
path = Path('./')

Train/test split

In [12]:
# Random hold-out mask: each row has a 10% chance of being held out.
# A fixed seed makes the split reproducible across Restart & Run All.
seq = np.random.RandomState(42).rand(len(df)) < .10

# The ~90% of rows NOT selected by the mask form the training set; the ~10%
# that are selected form the validation set. (Previously the two were swapped,
# so the model was fitted on ~10% of the data and validated on ~90%.)
df_train = df[~seq]
df_test  = df[ seq]

Parameter for both language and classifier model

In [13]:
# Pair of values passed as `moms=` to learner.fit_one_cycle in the training cells below.
moms = (0.8,0.7)

Create a `DataBunch` for each of the language model and the classifier:

In [None]:
# Build the language-model DataBunch from the train/validation frames.
# (The alternative `tokenizer=Tokenizer(BaseTokenizer)` argument is left disabled.)
data_lm = TextLMDataBunch.from_df(
    path,
    train_df=df_train,
    valid_df=df_test,
    text_cols='text',
)

# Callback factories, both monitoring 'accuracy':
#   - early stopping with min_delta=0.01 and patience=10;
#   - a checkpoint written on every improvement, under a BRAND-specific name.
stop_early_fn = partial(
    EarlyStoppingCallback,
    monitor='accuracy',
    min_delta=0.01,
    patience=10,
)
save_best_fn = partial(
    SaveModelCallback,
    monitor='accuracy',
    every='improvement',
    name='lm_{:}.mdl'.format(BRAND),
)
callback_fns = [stop_early_fn, save_best_fn]

# We'll fine-tune the language model. [fast.ai](http://www.fast.ai/) has a
# pre-trained English model available for download; we just have to specify it like this:
learner = language_model_learner(
    data_lm,
    pretrained_model=URLs.WT103_1,
    callback_fns=callback_fns,
)

# Unfreeze the learner and run one fit_one_cycle call.
learner.unfreeze()
learner.fit_one_cycle(8, slice(1e-2), moms=moms)

# Save our language model's encoder:
encoder_name = 'encoder_{:}'.format(BRAND)
learner.save_encoder(encoder_name)

# Save vocab
vocab = data_lm.train_ds.vocab
vocab_file = './models/vocab_{:}.pkl'.format(BRAND)
with open(vocab_file, 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Repeats the same unfreeze + fit_one_cycle call as the training cell above, on the same learner.
# NOTE(review): the encoder and vocab files are written by the previous cell, i.e. before this
# extra pass, so they are not refreshed by it — confirm this is intended.
learner.unfreeze()
learner.fit_one_cycle(8, slice(1e-2), moms=moms)

Save model

In [16]:
# Persist the learner under a BRAND-specific name.
learner.save('languagemodel_{:}'.format(BRAND))

Save encoder matrix in array format

In [17]:
# Put the underlying model in evaluation mode and list its top-level children.
m = learner.model
m.eval()
layers = list(m.children())

# The first child exposes the embedding layer as `.encoder`; copy its weight
# matrix to the CPU and convert it to a NumPy array.
first_child = layers[0]
embedding_weight = first_child.encoder.weight
emb_mtx = embedding_weight.cpu().data.numpy()

# Pickle the matrix next to the other BRAND-specific artifacts.
embedding_file = './models/embedding_mtx_{:}.pkl'.format(BRAND)
with open(embedding_file, 'wb') as handle:
    pickle.dump(emb_mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Sanity check: ask the learner to continue a seed phrase with n_words=20 and print the result.
print(learner.predict("the product leave -pron- with itch scalp", n_words=20))

the product leave -pron- with itch scalp . just like that . i 'm happy with the product so i do feel like it works on the
