# Text to Sentiment value converter

## Read file

In [1]:
import os
import numpy as np
import pandas as pd

inputFile = 'reg_reviews_03.csv'        # enter original file name that include review text column
outputFile = 'reg_reviews_NLP_03.csv'   # enter oupput file name that will add new sentiment features

In [2]:
# Add empty column columns
df = pd.read_csv(inputFile)
header_list_new = ['numSentence', 'numWords', 'totSentiment', 'avgSentiment', 'Sfreq0','Sfreq1','Sfreq2','Sfreq3','Sfreq4','Sfreq5']
for i, name in enumerate(header_list_new):
    df[name] = 0

In [3]:
df.head()

Unnamed: 0,cool,date,friends,funny,has_photo,localtion,photos,rating,restaurant_id,review,...,numSentence,numWords,totSentiment,avgSentiment,Sfreq0,Sfreq1,Sfreq2,Sfreq3,Sfreq4,Sfreq5
0,0,11/9/2014,3,1,True,"Los Angeles, CA",8,4,243,I visited this lovely place alone on a Sunday ...,...,0,0,0,0,0,0,0,0,0,0
1,2,4/4/2012,155,4,True,"New York, NY",1177,2,243,Wow! this place has good reviews. um... Maybe...,...,0,0,0,0,0,0,0,0,0,0
2,0,8/17/2016,227,0,False,"Irvine, CA",28,5,243,Super yummy empanadas! The ladies that work th...,...,0,0,0,0,0,0,0,0,0,0
3,1,5/8/2015,18,0,True,"Costa Mesa, CA",39,1,243,I'm writing this from the tile of my bathroom ...,...,0,0,0,0,0,0,0,0,0,0
4,9,1/29/2012,1032,5,True,"Orange, CA",1062,4,243,Woah! I will say I was expecting a lot because...,...,0,0,0,0,0,0,0,0,0,0


## Requirements to run Stanford Core NLP

#### Running Stanford Core NLP server
-Download NLP https://stanfordnlp.github.io/CoreNLP/index.html#license, and unzip

-Install Java

-Run Stanford Core NLP Server by typing below on a command prompt (Anaconda prompt) from the unzipped directory

cd Documents\Python Scripts\stanford-corenlp-full-2018-01-31

java -mx11g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 50000


#### Package required
Install a wrapper, e.g. stanfordcorenlp https://pypi.python.org/pypi/stanfordcorenlp

(base) C:\Users\ML\Documents\Python Scripts>pip install stanfordcorenlp-3.8.0.1-py2.py3-none-any.whl

Reference: https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/

## Sentiment analysis function

In [4]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

# Function; Output = # sentence, # words, avg.sentimentValue, sentimentHist
def stanford_sentiment(text_str):
    res = nlp.annotate(text_str,
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 40000,
                   })
    numSentence = len(res["sentences"])
    numWords = len(text_str.split())
    
    # data arrangement
    arraySentVal = np.zeros(numSentence)

    for i, s in enumerate(res["sentences"]):
        arraySentVal[i] = int(s["sentimentValue"])

    # sum of sentiment values for all sentences in a text/review
    totSentiment = sum(arraySentVal)

    # avg. of sentiment values for all sentences in a text/review
    avgSentiment = np.mean(arraySentVal)

    # frequency of sentimentValue in a text/review; {1 : Negative, 2 : Neutral, 3 : Positive}
    bins = [0,1,2,3,4,5,6]
    freq = np.histogram(arraySentVal, bins)[0]    # getting freq. only w/o bins

    return(numSentence, numWords, totSentiment, avgSentiment, freq)   

## Text to Sentiment Score conversion

In [5]:
%%time

# sentiment score calculation
# input = review text w/ removing '\n' characters in text (which does not affect the sentiment analysis much)
# output = number of sentences and words, 
#          sum of all sentiment score from each sentence in a review
#          avg of sentiment scores
#          hist.frequency values of sentiment score (0 to 5); {1 : Negative, 2 : Neutral, 3 : Positive}

dfLength = len(df)

for i in range(dfLength):
    try:
        numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[i].replace('\n'," "))
        df.loc[i,'numSentence'] = numSentence
        df.loc[i,'numWords'] = numWords
        df.loc[i,'totSentiment'] = totSentiment
        df.loc[i,'avgSentiment'] = avgSentiment
        df.loc[i,'Sfreq0'] = freq[0]
        df.loc[i,'Sfreq1'] = freq[1]
        df.loc[i,'Sfreq2'] = freq[2]
        df.loc[i,'Sfreq3'] = freq[3]
        df.loc[i,'Sfreq4'] = freq[4]
        df.loc[i,'Sfreq5'] = freq[5]
    except:
        print("error where i =", i)

error where i = 357
error where i = 1894
error where i = 3276
error where i = 8231
error where i = 8336
error where i = 8915
error where i = 10828
error where i = 54412
error where i = 58625
error where i = 66946
error where i = 67180
error where i = 67399
error where i = 74000
Wall time: 7h 54min 59s


In [6]:
# df.review[1229]

In [7]:
# df.review[7118]

## Write output into a csv file

In [8]:
df.to_csv(outputFile, encoding='utf-8', index=False )