# Load Packages

I use a lot of different Python packages here. Let's just load them all right at the top to get it over with.

In [None]:
import pandas as pd
import numpy as np
import pickle

import json
import time

import csv

from nltk.tokenize import RegexpTokenizer
#from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

from nltk.corpus import stopwords
# Module-level tokenizer built on the regex \w+, i.e. tokens are runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

from collections import Counter

import numpy as np
import lda
import lda.datasets
import numpy as np
import textmining

import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split



# Collect the data

The first thing I do is load two Amazon.com review data sets. I then filter them to find the products I want and match those products to their reviews. Additionally, I remove products with no 'price' available and reviews with no text content.

Create a Pandas Dataframe containing all the product information

In [None]:
# The product table is assembled by the DataFrame_Helper module, whose source
# is not part of this notebook.
import DataFrame_Helper as DFh

df = DFh.Create_product_DataFrame()

Filter this data to just include Headphones. Create a pandas dataframe that only contains reviews for headphones

In [None]:
# Apply the headphone filter from DataFrame_Helper to the product table.
dfHeadphonesRR = DFh.Filter_to_Headphones(df)

Save files along the way so we don't have to redo anything if it crashes!

In [None]:
 # Checkpoint the filtered frame to disk so later steps can resume after a crash.
 # NOTE(review): this statement carries a stray leading space; IPython tolerates
 # it inside a cell, but a plain Python script rejects it as an unexpected indent.
 dfHeadphonesRR.to_csv("headphonesRR_test.csv") 

# Preparing the data

Now that I have cleaned the data, I need to prepare it for use by the LDA algorithm. This involves removing stop words and limiting the vocabulary to only the top 10,000 words. Additionally, I will inject the sentiment codewords and prepare the LDA training corpus and vocabulary.

In [None]:
# DataPreparation is the author's own module; it contains the bulk of the heavy lifting.
import DataPreparation as DP

# Unpack the four objects returned by the loader.
(dfHeadphones,
 englishstops,
 pos,
 neg) = DP.load_relevant_files()

In [None]:
# First preparation pass: takes the review frame, the stop words and both
# sentiment inputs, and returns the edited text used by the later cells.
editedT = DP.prepare_data_1(dfHeadphones, englishstops, pos, neg)

In [None]:
# Second preparation pass: yields the LDA training input X and its vocabulary.
X, vocab = DP.prepare_data_2(editedT)

In [None]:
# Checkpoint the vocabulary as CSV so the preparation step need not be re-run.
# NOTE(review): "wb" is the mode the Python 2 csv module expects; on Python 3
# csv.writer needs a text-mode file opened with newline="" instead.
# NOTE(review): writerows() treats each element of vocab as one row. If vocab is
# a flat sequence of strings, every word is written one character per column;
# confirm the shape returned by DP.prepare_data_2.
with open("lda_vocab_test.csv", "wb") as f2:
    writer = csv.writer(f2)
    writer.writerows(vocab)

# Checkpoint X alongside the vocabulary in NumPy's .npy format.
np.save('lda_X_test.npy', X) # A big file, but you'll be happy if your code crashes later on!

# Train the LDA

In [None]:
# Reload the LDA input from the checkpoint written during data preparation.
X = np.load('lda_X_test.npy')

In [None]:
# Configure a 50-topic LDA run (500 iterations, fixed random_state) and fit it on X.
model50 = lda.LDA(
    n_topics=50,
    n_iter=500,
    random_state=1
)
model50.fit(X)

# LDA Visualization

Using the beautiful pyLDAvis I can have a look at the LDA output. This helps me determine what each topic is actually trying to represent.  

In [None]:
import pyLDAvis
import pyLDAvis_Helper as pyH

# Helper outputs that feed the 'doc_lengths' and 'term_frequency' entries below.
lengths = pyH.Create_lengths(editedT)
vocabclicks = pyH.Create_vocabclicks(X)

# Inputs for the visualisation, gathered from the fitted model50 and the
# helper results; one entry per line for readability.
datastar = {
    'topic_term_dists': model50.topic_word_,
    'doc_topic_dists': model50.doc_topic_,
    'doc_lengths': lengths,
    'vocab': vocab,
    'term_frequency': vocabclicks,
    'sort_topics': False,
}

In [None]:
# Persist the visualisation inputs to data50.pkl with the highest pickle protocol.
# open() is used because the file() builtin exists only on Python 2, and the
# context manager closes the handle even if pickling raises.
with open('data50.pkl', 'wb') as f:
    pickle.dump(datastar, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Same inputs as the pickled dictionary, plus an explicit 'mds' choice ('mmds')
# for the call to pyLDAvis.prepare.
datastar = {
    'topic_term_dists': model50.topic_word_,
    'doc_topic_dists': model50.doc_topic_,
    'doc_lengths': lengths,
    'vocab': vocab,
    'term_frequency': vocabclicks,
    'sort_topics': False,
    'mds': 'mmds',
}

In [None]:
# Expand the datastar dictionary into keyword arguments for pyLDAvis.prepare.
vis_datastar = pyLDAvis.prepare(**datastar)
# Render the prepared visualisation in the notebook output.
pyLDAvis.display(vis_datastar)

At this point things become a bit subjective. I go through each topic and try to determine what the word distribution is trying to describe. Setting lambda to 0.6-0.7 usually helps by giving a good balance between word frequency and word uniqueness in a topic.

Not shown here, but to aid in the process (specifically for the topics that were a bit difficult to interpret) I used Ward hierarchical clustering. I ordered the clustering steps by how much the clustering distance changed between step n and step n+1 and looked for the biggest jumps.

For example: if the clustering distance is 0.31 at step 25, 0.32 at step 26, and 0.51 at step 27, we can see there's a huge jump from 26 to 27. This indicates a potentially good place to stop clustering.

At the end of the day, this is a bit of a subjective process. The following code is based on the assumptions I made during this step.

# Gather LDA topics and create final dataframe

In [None]:
# NOTE(review): model30 is not defined anywhere in the visible notebook; the
# only model trained above is model50. Confirm which fitted model is intended
# (and what topic count Collect_Topics expects) before running this cell.
doc_topic=model30.doc_topic_
# Pass the model's doc_topic_ attribute to the helper that assembles FinalDF.
FinalDF=DFh.Collect_Topics(doc_topic)

I now need to turn my 'review' dataframe into a 'product' dataframe

In [None]:
# Convert the review-based frame into a product-based frame via the helper module.
proDF = DFh.ReviewDF_to_ProductDF(FinalDF)

Filter to headphones less than $40

In [None]:
# Keep only rows whose 'Price' is below 40.01, i.e. priced at $40.00 or less.
dfHeadp0 = proDF[proDF['Price'] < 40.01]

# Regression Training Procedure

In [None]:
# Regression inputs: one float column per topic score listed below.
# 'Good Brand' is intentionally not part of the feature set.
# .values is used instead of DataFrame.as_matrix(), which was deprecated and
# later removed from pandas; .values returns the same NumPy array.
RXarray = dfHeadp0[[
    'Good Reviews', 'Good CableCord', 'Good Handsfree', 'Good Levels',
    'Good Comfort', 'Good Durability',
    'Good Sound', 'Good Case',
    'Good Mic', 'Good Value', 'Bad Value', 'Bad Durability',
    'Bad CableCord',
    'Bad Reviews', 'Bad Service', 'Bad Handsfree',
    'Bad Comfort', 'Bad Levels',
]].values.astype(float)

# Regression target: the 'Stars' column as floats.
RYarray = dfHeadp0['Stars'].values.astype(float)

# 50/50 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    RXarray, RYarray, test_size=0.5, random_state=9)

In [None]:
# Hyper-parameters for the gradient-boosted regressor.
# NOTE(review): no random_state is passed and subsample < 1, so fitted results
# may differ between runs; confirm whether that is intended.
params = {'n_estimators': 349, 'max_depth': 3, 'min_samples_split': 2,
          'learning_rate': 0.0498, 'loss': 'huber', 'subsample': 0.35}

clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)


# #############################################################################
# Plot training deviance
# Display labels for the features; 'Good Brand' is not part of the feature set.
XNames = np.array([
    'Good Reviews', 'Good CableCord', 'Good Handsfree', 'Good Levels',
    'Good Comfort', 'Good Durability',
    'Good Sound', 'Good Case',
    'Good Mic', 'Good Value', 'Bad Value', 'Bad Durability',
    'Bad CableCord',
    'Bad Reviews', 'Bad Service', 'Bad Handsfree',
    'Bad Comfort', 'Bad Levels',
])

# compute test set deviance: one loss value per boosting stage
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)

plt.figure(figsize=(18, 6))
plt.subplot(1, 3, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

# #############################################################################
# Plot feature importance
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
# Bar centres get their own name so the sentiment list `pos` loaded during data
# preparation is not overwritten when this cell runs.
bar_positions = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 3, 2)
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
plt.yticks(bar_positions, XNames[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')

# Residuals (actual minus predicted) for the whole test set in one vectorised
# call, instead of invoking predict() once per row.
error = y_test - clf.predict(X_test)
plt.subplot(1, 3, 3)
plt.title('Error=[Y-Ypred]')
plt.xlabel('Number of Stars')
plt.hist(error, 40)

plt.show()

The regression parameters were optimized using a randomized grid search. This can be completed using the following code.

In [None]:
# Serialise the fitted regressor to regressor_test.pkl. The context manager
# guarantees the file is flushed and closed once the dump finishes or fails.
with open("regressor_test.pkl", 'wb') as regressor_file:
    pickle.dump(clf, regressor_file)

In [None]:
import Optimizer_Helper as OpH

# Candidate configurations for the randomized search.
# NOTE(review): presumably 500 candidates over 4 parameters, each given as a
# [low, high] range followed by its type tag; confirm against Optimizer_Helper.
PosVec = OpH.create_position_vectors(
    500, 4,
    [200, 1000], 'int',
    [2, 5], 'int',
    [0.01, 0.2], 'float',
    [0.1, 1], 'float'
)

In [None]:
# Evaluate every candidate configuration, collecting the two scores returned
# by OpH.Z_Rule1 for each one.
MSEv = []
MEv = []
# The loop variable is named `position` so it does not overwrite the sentiment
# list `pos` loaded during data preparation.
for idx, position in enumerate(PosVec):
    # Progress indicator every 10 candidates. The parenthesised print form is
    # valid on both Python 2 and Python 3.
    if idx % 10 == 0:
        print(idx)
    ME, MSE = OpH.Z_Rule1(position)
    MSEv.append(MSE)
    MEv.append(ME)

print('finished')

Print the best 5 configurations

In [None]:
# One row per candidate: its MSE followed by the first four columns of PosVec.
MSEP = np.array([MSEv, PosVec[:, 0], PosVec[:, 1], PosVec[:, 2], PosVec[:, 3]]).T
# Order the rows by ascending MSE so the best configurations come first.
sortedMSEP = MSEP[np.array(MSEv).argsort()]
# Parenthesised print form runs on both Python 2 and Python 3.
print(sortedMSEP[0:5])

The backend workflow is now complete! I pickle the LDA and the Regressor for further use in the frontend which runs on AWS