In [1]:
import os
# Enumerate GPUs in PCI bus order and make only device "2" visible to CUDA.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="2",
)

---

In [2]:
from fastai.text import * # Quick access to NLP functionality
from fastai.callbacks import EarlyStoppingCallback, SaveModelCallback

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [18]:
# Identifier interpolated into the checkpoint and encoder file names further down.
BRAND = "Health_and_Personal_Care"

# Pre-Processing

## Data Loading (from saved verbatim)

In [9]:
# Read the saved reviews (JSON Lines: one record per line) and preview the first rows.
df = pd.read_json(
    "./data/reviews_data.json",
    lines=True,
)
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,159985130X,"[1, 1]",5,This is a great little gadget to have around. ...,"01 5, 2011",ALC5GH8CAMAI7,AnnN,Handy little gadget,1294185600
1,159985130X,"[1, 1]",4,I would recommend this for a travel magnifier ...,"02 18, 2012",AHKSURW85PJUE,"AZ buyer ""AZ buyer""",Small & may need to encourage battery,1329523200
2,159985130X,"[75, 77]",4,What I liked was the quality of the lens and t...,"06 8, 2010",A38RMU1Y5TDP9,"Bob Tobias ""Robert Tobias""",Very good but not great,1275955200
3,159985130X,"[56, 60]",4,Love the Great point light pocket magnifier! ...,"02 8, 2008",A1XZUG7DFXXOS4,Cat lover,great addition to your purse,1202428800
4,159985130X,"[1, 1]",5,This is very nice. You pull out on the magnifi...,"08 16, 2011",A1MS3M7M7AM13X,Cricketoes,Very nice and convenient.,1313452800


## Data Cleaning

Extract the review text snippets; remove duplicates and verbatims that are too short (below the 2nd percentile of length).

In [10]:
# Keep only the review body, renamed to 'text' and coerced to str.
df = (
    df[['reviewText']]
    .rename(columns={'reviewText': 'text'})
    .astype({'text': str})
)

# Length cut-off: the 2nd percentile of review lengths, measured in characters.
thresh = df['text'].str.len().quantile(0.02)

# Discard reviews shorter than the cut-off, then drop exact duplicates.
df = df.loc[df['text'].str.len() >= thresh].drop_duplicates()

In [11]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,text
0,This is a great little gadget to have around. ...
1,I would recommend this for a travel magnifier ...
2,What I liked was the quality of the lens and t...
3,Love the Great point light pocket magnifier! ...
4,This is very nice. You pull out on the magnifi...


## Pre-Tokenization

In [12]:
# from nltk.corpus import stopwords
import spacy
import string

# stop = set(stopwords.words('english'))
# All ASCII punctuation characters except '!' and '?'.
exclude = set(string.punctuation).difference('!?')

# spaCy English pipeline loaded with its parser and NER components disabled.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

def preprocess(doc, tags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Expand "n't" contractions in a piece of text before tokenization.

    Every occurrence of "n't" is rewritten as " not" (e.g. "don't" becomes
    "do not"). Both the straight apostrophe (') and the typographic one
    (U+2019) are recognised. The stem in front of the contraction is left
    untouched, so "can't" becomes "ca not".

    No other normalisation (lower-casing, stop-word / short-word / punctuation
    removal, lemmatization) is performed here.

    Args:
        doc:  the text to process (a ``str``).
        tags: part-of-speech tags; currently unused and kept only so that
              callers passing it keep working. The default is a tuple rather
              than a list so it cannot be mutated between calls.

    Returns:
        The text with the contractions expanded.
    """
    for apostrophe in ("'", "\u2019"):
        doc = doc.replace("n" + apostrophe + "t", " not")
    return doc

In [13]:
# Run preprocess() over every review. It currently only expands "n't" to " not"
# (no stop-word removal, lower-casing or lemmatization takes place).
print(' Pre-Tokenization')
df['text'] = df['text'].map(preprocess)

 Pre-Tokenization


# Language Modeling

In [14]:
# Base directory handed to the DataBunch constructor below.
# (The earlier plain-string assignment was overwritten immediately, so only the Path is kept.)
path = Path('./')

Train/test split

In [15]:
# Hold out roughly 10% of the reviews for validation and train on the remaining ~90%.
# (Previously the 10% mask selected the *training* rows, leaving ~90% for validation.)
# A dedicated seeded generator makes the split reproducible across kernel restarts
# without touching NumPy's global random state.
split_rng = np.random.RandomState(42)
seq = split_rng.rand(len(df)) < .10   # True -> row belongs to the held-out set
df_train = df[~seq]
df_test  = df[ seq]

Parameters shared by the language model and the classifier

In [16]:
# Momentum pair forwarded as `moms` to fit_one_cycle.
moms = (0.8, 0.7)

Create a `DataBunch` for each of the language model and the classifier:

In [None]:
# Language-model DataBunch built from the train / held-out frames ('text' column).
data_lm = TextLMDataBunch.from_df(
    path,
    train_df=df_train,
    valid_df=df_test,
    text_cols='text',
)

# Callback factories attached to the learner; both monitor 'accuracy':
#   * early stopping (min_delta=0.01, patience=10)
#   * model checkpointing on every improvement, under a BRAND-specific name
# NOTE(review): patience=10 is larger than the 8 epochs run below, so early stopping
# looks unable to trigger within a single fit - confirm this is intended.
callback_fns = [
    partial(EarlyStoppingCallback, monitor='accuracy', min_delta=0.01, patience=10),
    partial(
        SaveModelCallback,
        monitor='accuracy',
        every='improvement',
        name='lm_{:}.mdl'.format(BRAND),
    ),
]

# We'll fine-tune the language model. [fast.ai](http://www.fast.ai/) has a
# pre-trained English model available that we can download; we just have to
# specify it like this:
learner = language_model_learner(
    data_lm,
    pretrained_model=URLs.WT103_1,
    callback_fns=callback_fns,
)

# Unfreeze the learner, then train for 8 epochs with the one-cycle policy.
learner.unfreeze()
learner.fit_one_cycle(8, slice(1e-2), moms=moms)

# Save our language model's encoder under a BRAND-specific name.
learner.save_encoder('encoder_{:}'.format(BRAND))

# Keep a handle on the training vocabulary (held in memory only; not written to disk here).
vocab = data_lm.train_ds.vocab

In [None]:
# Second training pass: unfreeze again and run another 8-epoch one-cycle schedule
# with the same learning-rate slice and momentums as the first pass.
# NOTE(review): the encoder is saved in the previous cell, i.e. before this pass, so the
# weights updated here are not in that saved encoder unless it is saved again - confirm.
learner.unfreeze()
learner.fit_one_cycle(8, slice(1e-2), moms=moms)

Save model

In [None]:
# learner.save('languagemodel_{:}'.format(BRAND))

Save encoder matrix in array format

In [None]:
# m = learner.model
# m.eval()
# layers = list(m.children())

# emb_mtx = layers[0].encoder.weight.cpu().data.numpy()

# with open('../models/embedding_mtx_{:}.pickle'.format(BRAND), 'wb') as handle:
#     pickle.dump(emb_mtx, handle, protocol=pickle.HIGHEST_PROTOCOL) 

In [None]:
# Ask the fine-tuned language model to continue the prompt with 20 more words.
# The call is the cell's last bare expression so that Jupyter renders whatever it
# returns; wrapped in `if True:` the returned value was silently discarded.
learner.predict("the product leave -pron- with itch scalp", n_words=20)