In [1]:
import sys, getopt, re

def main(argv):
    """Parse the command-line arguments and return the requested brand.

    Args:
        argv: argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        The value passed to the ``-i`` option (the last one if it is repeated).

    Exits with status 2, after printing a usage line, when the options cannot
    be parsed or when no non-empty ``-i`` value was supplied.
    """
    try:
        parsed_opts, _ = getopt.getopt(argv, "i:")
    except getopt.GetoptError:
        print('TopicClassifier_Inference.py -i <BRAND>')
        sys.exit(2)

    # Collect every -i value; the last occurrence wins, '' when none was given.
    brand_values = [value for flag, value in parsed_opts if flag == "-i"]
    inputfile = brand_values[-1] if brand_values else ''

    if not inputfile:
        print('usage: TopicClassifier_Inference.py -i <BRAND>')
        sys.exit(2)

    print('Input BRAND is ', inputfile)
    return inputfile

In [2]:
# Brand to run inference for. It is compared against the upper-cased BRAND
# column when the data is loaded, and (with spaces removed) it is part of the
# model and output file names used further down.
BRAND      = 'HEAD & SHOULDERS'

In [3]:
import os

# GPU selection, set before the fastai import in the following cell:
# enumerate devices in PCI bus order and expose only device "2" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

---

In [4]:
from fastai.text import * # Quick access to NLP functionality

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import pickle

# Pre-Processing

## Data Loading (from saved verbatim)

In [6]:
dirname = './data/pqc/'

# Read every file in `dirname`, keep only the rows of the selected BRAND and
# collect the per-file frames; they are concatenated once after the loop.
# (The previous version seeded `df` through a bare `except:`, which would also
# have silently thrown away the accumulated rows on any real concat error.)
brand_frames = []
for filename in sorted(os.listdir(dirname)):  # sorted -> reproducible row order
    df_tmp = pd.read_csv(os.path.join(dirname, filename), index_col=0,
        dtype={'CONTACT_VERBATIM':'str', 'CONTACT_VERBATIM_NOTES':'str'})

    # convert brand name to uppercase so it can be compared with BRAND
    df_tmp.BRAND = df_tmp.BRAND.str.upper()

    brand_frames.append(df_tmp.loc[df_tmp.BRAND==BRAND])

if not brand_frames:
    # Fail with an explicit message instead of a NameError on `df` below.
    raise FileNotFoundError('no input files found in {}'.format(dirname))

df = pd.concat(brand_frames)

# Blank out notes of 4 characters or fewer. After astype(str) a missing note
# reads 'nan', so this also clears those (presumably the intent -- note that
# genuine very short notes are cleared as well).
df['CONTACT_VERBATIM_NOTES'] = df['CONTACT_VERBATIM_NOTES'].astype(str).apply(lambda x: x if len(x)>4 else '')

# Verbatim followed directly by its notes (no separator, as before). A missing
# verbatim is treated as '' for the concatenation only; the CONTACT_VERBATIM
# column itself is left untouched.
df['FULL_CONTACT_VERBATIM'] = df['CONTACT_VERBATIM'].fillna('') + df['CONTACT_VERBATIM_NOTES']

df.head()

Unnamed: 0,CONTACT_DATE,CONTACT_VERBATIM,CONTACT_VERBATIM_NOTES,PRODUCT_NAME,PRODUCT_PATH,COMMENT_CATEGORY,COMMENT_SUBCATEGORY,COMMENT_DESCRIPTION,SECTOR,SUB_SECTOR,CATEGORY,BRAND,FULL_CONTACT_VERBATIM
63.0,2018-05-08 00:00:00,"Hi, I recently purchased the big bottles of yo...",,Head & Shoulders Shampoo,Head & Shoulders/Shampoo,Product Quality Complaint,Dissatisfied with performance,"Results; did not condition, moisturize as expe...",Beauty Care,Hair Care,Hair Care,HEAD & SHOULDERS,"Hi, I recently purchased the big bottles of yo..."
64.0,2018-05-16 00:00:00,I have been using head and shoulders classic c...,,Head & Shoulders Shampoo FreshClean Classic Clean,Head & Shoulders/Shampoo/Fresh & Clean/Classic...,Product Quality Complaint,Sensory attributes,"Smell, scent, aroma, odor",Beauty Care,Hair Care,Hair Care,HEAD & SHOULDERS,I have been using head and shoulders classic c...
66.0,2018-05-30 00:00:00,"I've used Head & Shoulders for years, and for ...",question whether anything in the product is co...,Head & Shoulders Shampoo Base GreenApple AntiD...,Head & Shoulders/Shampoo/Base/Green Apple/Anti...,Product Quality Complaint,Sensory attributes,"Smell, scent, aroma, odor",Beauty Care,Hair Care,Hair Care,HEAD & SHOULDERS,"I've used Head & Shoulders for years, and for ..."
67.0,2018-05-30 00:00:00,"I've used Head & Shoulders for years, and for ...",question whether anything in the product is co...,Head & Shoulders Shampoo Base Apple ADShamCond...,Head & Shoulders/Shampoo/Base/Apple/AntiDandru...,Product Quality Complaint,Sensory attributes,"Smell, scent, aroma, odor",Beauty Care,Hair Care,Hair Care,HEAD & SHOULDERS,"I've used Head & Shoulders for years, and for ..."
224.0,2018-05-17 00:00:00,Did you change the making of or fragrance of y...,ant to stop using it but I honestly can’t stan...,Head & Shoulders Shampoo,Head & Shoulders/Shampoo,Product Quality Complaint,Sensory attributes,"Smell, scent, aroma, odor",Beauty Care,Hair Care,Hair Care,HEAD & SHOULDERS,Did you change the making of or fragrance of y...


## Data Cleaning

Extract the text snippets and associated topic; remove duplicates and verbatims that are too short.

In [7]:
# Preserve the complete, unfiltered frame; the predictions are joined back
# onto it at the end of the notebook.
df_orig = df.copy()

# Work on the combined verbatim only, exposed under the column name 'text',
# with every entry forced to a string.
df = (
    df[['FULL_CONTACT_VERBATIM']]
    .rename(columns={'FULL_CONTACT_VERBATIM': 'text'})
    .astype({'text': str})
)

# Length cut-off: the 2nd percentile of the snippet lengths.
text_lengths = df['text'].apply(len)
thresh = text_lengths.quantile(0.02)

# Keep snippets at or above the cut-off, then drop exact duplicate texts.
df = df.loc[text_lengths >= thresh].drop_duplicates()

In [8]:
# Preview the cleaned snippets. The index is NOT reset here: rows keep their
# original labels, which the final join back onto df_orig relies on.
df.head()

Unnamed: 0,text
63.0,"Hi, I recently purchased the big bottles of yo..."
64.0,I have been using head and shoulders classic c...
66.0,"I've used Head & Shoulders for years, and for ..."
224.0,Did you change the making of or fragrance of y...
234.0,I purchase some Head & Shoulder Active Sports ...


## Pre-Tokenization

In [9]:
# from nltk.corpus import stopwords
import spacy
import string

# stop = set(stopwords.words('english'))
# Every character of string.punctuation except '!' and '?'.
# NOTE(review): not referenced by any active code visible in this notebook.
exclude = set(string.punctuation)-set(['!','?'])

# spaCy pipeline 'en_core_web_lg' loaded with the 'parser' and 'ner'
# components disabled.
# NOTE(review): not referenced by any active code visible in this notebook, so
# this load looks unnecessary for inference -- confirm before removing.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

def preprocess(doc, tags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lightly normalize a text snippet before it is given to the classifiers.

    Exactly two transformations are applied: the text is lower-cased and every
    "n't" is replaced with " not" (so "don't" becomes "do not"). Stop-word
    removal, short-word removal, punctuation removal and lemmatization are
    not performed.

    Args:
        doc: the text to normalize (str).
        tags: part-of-speech tags reserved for a lemmatization step that is
            not active; currently unused and kept only so existing calls keep
            working. The default is an immutable tuple so it can never be
            mutated across calls.

    Returns:
        The normalized string.
    """
    # NOTE(review): the previous version additionally called
    # doc.replace("[^a-zA-Z#]", " "). str.replace matches its argument
    # literally, never as a regular expression, so that call never removed
    # digits or symbols. It is dropped here rather than converted into a real
    # re.sub, because doing so would change the text fed to the saved models
    # (presumably trained on text normalized exactly like this -- revisit
    # together with retraining).
    return doc.lower().replace("n't", " not")

In [10]:
# Apply preprocess() to every snippet. As currently written it only lower-cases
# the text and expands "n't" to " not"; stop-word, short-word and punctuation
# removal and lemmatization are not performed.
print(' Pre-Tokenization')
df['text']        = df['text'].apply(preprocess)

 Pre-Tokenization


# Topic Classification - Inference

In [11]:
# Base directory (the current working directory) as a Path object; it is
# handed to load_learner further down.
path = Path('./')

Load the classifier:

In [13]:
# Brand-specific topic classifier export; the file name is built from BRAND
# with its spaces removed.
topic_model_file = 'models/topicclassifier_{:}.pkl'.format(BRAND.replace(' ',''))
clf_learner = load_learner(path, fname=topic_model_file)

Class inference:

In [14]:
def _predict_topic(text):
    """Return, as a string, the first element of the topic learner's prediction for `text`."""
    # preprocess() is applied again here, exactly as in the original lambda.
    prediction = clf_learner.predict(preprocess(text))
    return str(prediction[0])

df['topic'] = df['text'].apply(_predict_topic)

Load the sentiment classifier:

In [15]:
# Brand-specific sentiment model export; the file name is built from BRAND
# with its spaces removed.
sentiment_model_file = 'models/sentiment_{:}.pkl'.format(BRAND.replace(' ',''))
sent_learner = load_learner(path, fname=sentiment_model_file)

Sentiment inference:

In [16]:
def _predict_sentiment(text):
    """Return, as a string, the first element of the sentiment learner's prediction for `text`."""
    # preprocess() is applied again here, exactly as in the original lambda.
    prediction = sent_learner.predict(preprocess(text))
    return str(prediction[0])

df['sentiment'] = df['text'].apply(_predict_sentiment)

Save results

In [17]:
# Attach the predictions to the untouched source rows and write them to a
# brand-specific CSV. The join is index-aligned and keeps every row of
# df_orig, so rows that were filtered out during cleaning end up with empty
# topic/sentiment values.
# NOTE(review): this assumes the index labels identify rows uniquely across
# the input files -- confirm, since repeated labels would multiply rows here.
results_path = './data/pqc_{:}.csv'.format(BRAND.replace(' ',''))
predictions = df[['topic','sentiment']]
df_orig.join(predictions).to_csv(results_path)

Note: consider splitting the text into sentences at every major punctuation mark (. ! ?).