# Sentiment Analysis Pipeline

### Part 1: Subcategory Sentiment Analysis

Import required libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse

import json
import pickle
%matplotlib inline

Import the IBM Watson libraries and add the API credentials. Note: real credentials are not included here because this is a public repository.

In [None]:
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, KeywordsOptions, EntitiesOptions, EmotionOptions, SentimentOptions

# Watson NLU credentials, collected in the `ibm` dict ('u' = username,
# 'p' = password) that getSentiment reads.
# Prefer the IBM_WATSON_USERNAME / IBM_WATSON_PASSWORD environment variables so
# that real secrets never have to be written into the notebook; the literals
# below are only fallbacks and are the values this notebook has always shipped.
import os

ibm_pass = os.environ.get("IBM_WATSON_PASSWORD", "pwQydFvZpz5S")
ibm_user = os.environ.get("IBM_WATSON_USERNAME", "0b421165-dcf4-4050-8a15-30a87e0bd498")
ibm = {'u': ibm_user, 'p': ibm_pass}

`getSentiment` is a function to make API requests and parse the responses.

`getNLU` is a function to compute the number of credits used by a request: the character count divided by 10,000, rounded up.

In [None]:
def getSentiment(text):
    """Return the Watson NLU document-level sentiment score for ``text``.

    Args:
        text: The text to analyse.

    Returns:
        The value of ``response["sentiment"]["document"]["score"]`` when the
        request succeeds, the string ``"Error"`` when the Watson client raises
        ``WatsonApiException``, or ``np.nan`` when ``text`` is 20 characters or
        shorter (in which case no request is made).
    """
    # Short texts are not sent to the API at all; report a missing value.
    if len(text) <= 20:
        return np.nan

    # Imported here, and only on the API path, so the except clause below can
    # actually resolve the name (it was never imported at file level) while the
    # short-text path stays independent of the Watson SDK.
    from watson_developer_cloud import WatsonApiException

    natural_language_understanding = NaturalLanguageUnderstandingV1(
        username=ibm['u'],
        password=ibm['p'],
        version="2017-02-27")
    try:
        response = natural_language_understanding.analyze(
            text=text,
            features=Features(sentiment=SentimentOptions(document=True)))
        report = response["sentiment"]["document"]["score"]
    except WatsonApiException:
        # Keep the batch going: a failed request is recorded, not raised.
        report = "Error"
    return report

def getNLU(x):
    """Return ``x`` divided by 10,000 after rounding ``x`` up to a multiple of 10,000.

    Args:
        x: A character count.

    Returns:
        The number of 10,000-character units needed to cover ``x``, as the
        result of a true division (a float for integer input).
    """
    remainder = x % 10000
    if remainder == 0:
        # Already an exact multiple: nothing to round.
        padded = x
    else:
        # Bump up to the next multiple of 10,000.
        padded = x + 10000 - remainder
    return padded / 10000

Testing to see if the `getSentiment` function works.

In [None]:
# Smoke test: score one sample review. The text is longer than 20 characters,
# so getSentiment sends it to the API instead of taking its short-text branch.
combined = getSentiment("This camera worked qute well, I am really happy with its image quality and eas-of-use.")
# Bare expression so the notebook displays the returned value.
combined

Loading our data

In [None]:
# Load the pickled data set used by the cells below.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that comes from a trusted source.
df = pd.read_pickle("Electronics_meta.pickle")
# Display the first row as a quick look at the loaded data.
df.head(1)

Processing the data to group it by subcategory and date, i.e. we want to see which reviews were written for products in a particular subcategory on each day.

In [None]:
# Keep only the grouping keys and the review text.
df = df[['sub_category_1', 'reviewTime', 'reviewText']]
# One row per (reviewTime, sub_category_1): all review texts of the group
# are concatenated into a single string, separated by ". ".
df = df.groupby(['reviewTime', 'sub_category_1']).agg(
    lambda reviews: ". ".join(reviews.tolist())
)
# Character length of each concatenated text.
df['charlen'] = [len(joined_text) for joined_text in df.reviewText]
# Drop groups whose combined text is 20 characters or shorter.
df = df[df.charlen > 20]
df.head(1)

In [None]:
# senti_cat is a second name for df (no copy is made), so the NLU column
# added below appears on df as well.
senti_cat = df
# Number of 10,000-character units for each row, from its character count.
senti_cat['NLU'] = [getNLU(n_chars) for n_chars in senti_cat.charlen]
# Order rows by NLU units and then by length, both ascending.
senti_cat = senti_cat.sort_values(by=['NLU', 'charlen'], ascending=[True, True])
senti_cat.head(5)

Run every row of our data through `getSentiment` and store the results in a new column. The scores are saved to .pickle files for future reference. We store many small pickles because the Sentiment Analysis requests were made in small batches, so that API credits were not wasted in case of server errors.

In [None]:
# Re-import the module itself: the first cell ran `from datetime import datetime`,
# which binds the name to the class, while `datetime.datetime.now()` below
# needs the module.
import datetime

# Score rows 0..39999 in batches of 1,000 and write each batch's scores to its
# own pickle, so a failure part-way through only costs one batch.
for x in range(0,40000,1000):
    # Explicit copy: the new column is added to an independent frame rather
    # than to a slice of senti_cat (avoids chained-assignment warnings).
    batchN = senti_cat.iloc[x:x+1000, :].copy()
    t = datetime.datetime.now()
    print ("Start: "+ t.strftime('%H:%M:%S'))
    batchN['sentScore'] = [getSentiment(review) for review in batchN.reviewText]
    filename = "catDate_sentScores_"+str(int(x/1000))+"k_"+str(int((x+1000)/1000))+"k.pickle"
    # Context manager closes the file even if pickling raises.
    with open(filename,"wb") as pickling_on:
        pickle.dump(batchN[['sentScore']], pickling_on)
    print(filename + " complete")

# Remaining rows (position 40000 onwards) are scored as one final batch.
batchN = senti_cat.iloc[40000:, :].copy()
t = datetime.datetime.now()
print ("Start: "+ t.strftime('%H:%M:%S'))
batchN['sentScore'] = [getSentiment(review) for review in batchN.reviewText]
# NOTE(review): `x` still holds the last loop value (39000), so this file is
# named "..._39k_end.pickle" although it contains rows from 40000 on. The name
# is kept unchanged because code that loads these pickles may depend on it.
filename = "catDate_sentScores_"+str(int(x/1000))+"k_end.pickle"
with open(filename,"wb") as pickling_on:
    pickle.dump(batchN[['sentScore']], pickling_on)
print(filename + " complete")

### Part 2: Product Sentiment Analysis