# Text Analytics HW2
Billy Yuan, Nikita Lakhotia, Stuti Madaan, Tyler Nicholas, Wenduo Wang

### NOTE

If you wish to run this notebook, please make sure the following files are in the same folder as this notebook:

* [Word weights for positive and negative words](https://github.com/hitesh915/sentimentstrength/blob/master/wordwithStrength.txt)

* [Yelp data](https://github.com/billy-yuan/MSBA/blob/master/Fall%202016/Text%20Analytics/HW2/yelp.csv)

Credit to the following websites for their articles on sentiment analysis:
* [Mining Twitter Data with Python Part 6 - Sentiment Analysis Basics by Marco Bonzanini](https://marcobonzanini.com/2015/05/17/mining-twitter-data-with-python-part-6-sentiment-analysis-basics/)
* [Hitesh Parmer's repo on text mining, which contained a text file with the weights of positive and negative words](https://github.com/hitesh915/sentimentstrength)

#### Importing packages

In [79]:
from collections import Counter
import math, time, re, functools
import random as rd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk as nltk
from nltk import pos_tag, word_tokenize, classify, bigrams, trigrams
from nltk.corpus import stopwords as stpwds
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.metrics import ConfusionMatrix
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Initialize functions

In [3]:
# initialize a lemmatizer just in case it will be used
lmtz = WordNetLemmatizer().lemmatize
regex_tok = RegexpTokenizer(r"[\w]+")

In [227]:
def readData(portion, random_state=time.time()):
    '''Read in a certain portion of data in a random manner'''
    rd.seed(random_state)
    skip = rd.sample(xrange(1, 19999), int(math.ceil(19999*(1-portion))))
    data = pd.read_csv("yelp.csv", skiprows=skip)
    data["target"]=data.stars.map(lambda v: 1 if v>3 else 0)
    return data

In [4]:
def generateTrainTest(data, portion, random_state=time.time()):
    '''Split train and test data set'''
    rd.seed(random_state)
    train_index = rd.sample(xrange(len(data)), int(math.ceil(len(data)*portion)))
    test_index = list(set(xrange(len(data)))-set(train_index))
    train_data = data.ix[train_index]
    test_data = data.ix[test_index]
    return train_data, test_data

In [5]:
def generateFormula(data):
    '''A helper function to generate formula for regression'''
    formula = "target~0"
    for var in data.columns.values.tolist():
        if data[var].dtype in ["int64", "float64"] and var not in ["stars", "target", "wc", "Review", "prediction"]:
                
            formula += "+"+var
            
        else:
            continue
    return formula

In [6]:
def splitXY(data):
    '''Split independent and dependent variables, and return X as DataFrame Y as Series'''
    Y, X = dmatrices(generateFormula(data), data=data, return_type="dataframe")
    return X, np.ravel(Y)

In [7]:
def logistic_model(X, y):
    '''A wrapper to generate and fit a logistic regression model'''
    model = LogisticRegression(random_state=128)
    model.fit(X, y)
    return model


In [8]:
# Create list of positive and negative words with their sentiment scores
# Text file and helper function can be found at
# https://github.com/hitesh915/sentimentstrength/blob/master/wordwithStrength.txt

sentimentData = 'wordwithStrength.txt' 

def sentiment_dict(sentimentData=sentimentData):
    ''' (file) -> dictionary
    This method should take your sentiment file
    and create a dictionary in the form {word: value}
    '''
    afinnfile = open(sentimentData)
    scores = {} # initialize an empty dictionary
    for line in afinnfile:
        term, score = line.split("\t") # The file is tab-delimited. "\t" means "tab character"
        scores[term] = float(score) # Convert the score to an integer.

    return scores # Print every (term, score) pair in the dictionary

sentiment_values = sentiment_dict()
senti_tuples = [(key,sentiment_values[key]) for key in sentiment_values]
senti_tuples = sorted(senti_tuples, key=lambda tup: tup[1], reverse=False)

In [9]:
def printAccuracy(prediction, target):
    '''Calculate and format accuracy of prediction against target'''
    print "Accuracy: {:>6.4f}".format((prediction == target).mean())
    print "------------------------------------------"

In [10]:
def review2wc(text, lem=False):
    '''Decompose a review into tokens, removing stopwords and optionally do lemmatization'''
    wc = {}
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = stpwds.words("english")
    if lem:
        lmtzi = lmtz
        tokens = map(lmtz, tokens)
    while "" in tokens:
        tokens.remove("")
        
    for token in tokens:
        if token not in stopwords:
            try:
                wc[token] =+ 1
            except KeyError:
                wc[token] = 1
    return wc

In [11]:
def generateTrainTest(data, portion, random_state=time.time()):
    '''Split train and test data set'''
    rd.seed(random_state)
    train_index = rd.sample(xrange(len(data)), int(math.ceil(len(data)*portion)))
    test_index = list(set(xrange(len(data)))-set(train_index))
    train_data = data.ix[train_index]
    test_data = data.ix[test_index]
    return train_data, test_data

In [12]:
def term_prob(corpus, subset):
    '''Given a corpus and a subset, calculate the probability of each word
    from the corpus appearing in the subset'''
    prob_dict = {}
    N = sum([i for (_, i) in list(corpus.items())])
    for key in corpus:
        if key not in subset:
            prob_dict[key] = 1.0 / N
        else:
            prob_dict[key] = subset[key] + 1.0 / N
    return prob_dict

def log_prob(term_prob_high, term_prob_low):
    '''Given 2 subsets, calculate log relative probability o
    a word appearing in subset 1 against in subset 2'''
    term_log_prob = {}
    log = math.log
    for key in term_prob_high:
        term_log_prob[key] = log(term_prob_high[key]/term_prob_low[key])
    return term_log_prob

def token_count(wc):
    '''Given a list of dictionaries in the form of "word:count",
    aggregate word:count in to a single dictionary'''
    tc = {}
    for dic in wc.tolist():
        if len(dic) == 0: continue
        for token, count in dic.items():
            try:
                tc[token] += count
            except KeyError:
                tc[token] = 1
                
    return tc

def totalscore(wc, prior, benchmark):
    '''Given a dictionary in the form of "word:count", 
    and reference dictionary in the form of "word:log relative probability",
    calculate the sum of count*log relative probability,
    and at the end add a prior.'''
    prob = 0
    for word, count in wc.items():
        try:
            prob += count * benchmark[word]
        except KeyError:
            prob += 0
    prob += math.log(prior/(1-prior+0.00001))
    return prob

In [13]:
class NBClassifier(object):
    '''A Naive Bayes classifier object with methods to fit on training data and 
    predict on test data'''
    
    def __init__(self):
        self.X = None
        self.y = None
        self.term_log_prob = None
        self.prior = None
    
    def fit(self, data, x_label, y_label):
        '''The core of this method is to keep a dictionary of "word:log relative probability"'''
        self.X = data[x_label]
        self.y = data[y_label]
        self.x_label = x_label
        self.y_label = y_label
        token_count_total = token_count(data[x_label])
        token_count_high = token_count(data[data[y_label]==1][x_label])
        token_count_low = token_count(data[data[y_label]==0][x_label])
        term_prob_high = term_prob(token_count_total, token_count_high)
        term_prob_low = term_prob(token_count_total, token_count_low)
        self.term_log_prob = log_prob(term_prob_high, term_prob_low)
        self.prior = len(data[data[y_label]==1])*1.0/len(data)
        
    def predict(self, test, threshold=None):
        '''Prediction can be tuned by adjusting threshold.
        If threshold is set to None, then return actual score.'''
        totalscore_partial = functools.partial(totalscore, 
                                               prior= self.prior,
                                               benchmark=self.term_log_prob)
        score = test[self.x_label].map(totalscore_partial)
        if threshold == None:
            return score
        else:
            prediction = score.map(lambda x: 1 if x>threshold else 0)
            return prediction
        


In [14]:
stopwords = stpwds.words('english')
for word in stopwords[:len(stopwords)]:
    stopwords.append(word.title())

In [31]:
sentimentData = 'wordwithStrength.txt' # enter path of sentiment weights


def sentiment_dict(sentimentData=sentimentData):
    ''' (file) -> dictionary
    This method should take your sentiment file
    and create a dictionary in the form {word: value}
    '''
    afinnfile = open(sentimentData)
    scores = {} # initialize an empty dictionary
    for line in afinnfile:
        term, score = line.split("\t") # The file is tab-delimited. "\t" means "tab character"
        scores[term] = float(score) # Convert the score to an integer.

    return scores # Print every (term, score) pair in the dictionary

sentiment_values = sentiment_dict()
    
senti_tuples = [(key,sentiment_values[key]) for key in sentiment_values]
senti_tuples = sorted(senti_tuples, key=lambda tup: tup[1], reverse=False)

class semanticOrientation(object):
    '''
    Calculates Semantic Orientations of bigrams from a list of reviews and performs binary classification.
    
    Note: Before running semanticOrientation(), you need to store the sentiment weights in a list of tuples
    called senti_tuples.
    
    '''
    def __init__(self):
        self.so_scores = None
        self.predict_results = None
        self.so_counts = None
        self.valid_bigrams = None
        self.word_counts = None # co-occurence matrix
        self.tokenized = None
        self.all_words = None
        self.com_dict = None
        self.bigrams_list_sep = None
        self.bigrams_list_filtered = None
        self.num_positive = 0
        self.num_negative = 0
        self.accuracy = 0
        self.yhat = 0
        self.negative_words = None
        self.test_review_stop = None
        

    
    def create_bigrams(self, reviews_, n): # n is how many words in front of and behind bigram you want
        
        stopwords = stpwds.words('english')
        for word in stopwords[:len(stopwords)]:
            stopwords.append(word.title())
            
        valid_pos_bigrams = [('JJ','NN'),('JJ','NNS'),('RB','JJ'),('RBR','JJ'),('RBS','JJ'),('JJ','JJ'),
                    ('NN','JJ'),('NNS','JJ'),('RB','VB')]
        
        test_review_token = [regex_tok.tokenize(reviews[i].decode('utf-8')) for i in xrange(len(reviews_))]
        test_review_stop = []
        
        for review in test_review_token:
            rev_store_1 = []
            for word in review:
                if word not in stopwords:
                    rev_store_1.append(word)
                else:
                    continue
            test_review_stop.append(rev_store_1)
                    
        
        
        bigrams_list_sep = [list(bigrams(item)) for item in test_review_stop]
        
        self.bigrams_list_sep = bigrams_list_sep
        self.tokenized = test_review_stop 
        
        all_words = []

        for rev in test_review_stop:
            all_words += rev
            
        self.all_words = all_words
        
        # Create list of positive and negative words
        negative_words = [senti_tuples[i][0] for i in xrange(len(senti_tuples)) if senti_tuples[i][1] < .1]
        positive_words = [senti_tuples[i][0] for i in xrange(len(senti_tuples)) if senti_tuples[i][1] > -0.1]
        self.negative_words = negative_words
        ## Word counts - positive and negative
        num_positive = len([word for word in all_words if word in positive_words])
        num_negative = len([word for word in all_words if word in negative_words])
        
        self.num_positive = num_positive
        self.num_negative = num_negative
        
        # Create list of stopwords
        stopwords = stpwds.words('english')
        

        for word in stopwords[:len(stopwords)]:
            stopwords.append(word.title())

        for word in stopwords[:len(positive_words)]:
            positive_words.append(word.title())

        for word in stopwords[:len(negative_words)]:
            negative_words.append(word.title())                  

        # Filter list of bigrams of each review using Turney's rules
        bigrams_list_filtered = []

        for review in bigrams_list_sep:
            review_store = []
            for bigram in review:
                tup_store = []
                for tup in pos_tag(bigram):
                    tup_store.append(tup[1])
                if (tup_store[0][:2],tup_store[1][:2]) in valid_pos_bigrams:
                    review_store.append(bigram)
            bigrams_list_filtered.append(review_store)
            
        self.bigrams_list_filtered = bigrams_list_filtered
        bigrams_list_filtered_all = [bigram for review in bigrams_list_filtered for bigram in review]
        
#         # list in which every item is a bigram. Will be bigrams for all articles. Only used to calculate number of
#         # positive and negative words in corpus; these are our "priors" and will be used to calculate semantic orientation
#         bigrams_list_all = [] 

#         for i in xrange(len(bigrams_list_filtered)):
#             bigrams_list_all += bigrams_list_sep[i]

        ## Create co-occurrence dictionary for every word

        com_dict_test = {}

        for review in bigrams_list_sep:
            for i in range(len(review)):
                com_dict_test[review[i]] = {}

        for count in range(1,n+1):
            for review in bigrams_list_sep:
                for i in range(len(review)-count):
                    j = i+count
                    if review[i] in com_dict_test and review[j] in com_dict_test[review[i]]:
                        com_dict_test[review[i]][review[j][1]] += 1
                    elif review[i] in com_dict_test and review[j] not in com_dict_test[review[i]]:
                        com_dict_test[review[i]][review[j][1]] = 1
                    else: 
                        com_dict_test[review[i]] = {}
                        com_dict_test[review[i]][review[j][1]] = 1

            for review in bigrams_list_sep:
                for i in range(count,len(review)):
                    j = i-count
                    if review[j][1] not in com_dict_test[review[i]]:
                        com_dict_test[review[i]][review[j][1]] = 1
                    else:
                        com_dict_test[review[i]][review[j][1]] += 1 


        com_dict_2 = {}
        

        for key in com_dict_test:
            if key in bigrams_list_filtered_all:
                com_dict_2[key] = com_dict_test[key]    
        self.com_dict = com_dict_2       
        # Calculate semantic orientations
        
        semantic_o_2 = {} # key is bigram. value is list (number of positive nearby, number of negative nearby, number of neutral)

        for key in com_dict_2:
            semantic_o_2[key] = [0,0,0]
            for value in com_dict_2[key]:
                if value in positive_words:
                    semantic_o_2[key][0] += 1
                elif value in negative_words:
                    semantic_o_2[key][1] += 1
                else:
                    semantic_o_2[key][2] += 1
        self.so_counts = semantic_o_2
        
    # Make predictions. Target should be list or series of y-values (1 or 0 for review)
    def predict(self, target):
        x = 0
        so_predict = [] # predicted values using SO
        
        for index in range(len(self.bigrams_list_sep)):
            rate_so_review = [bigram for bigram in self.bigrams_list_sep[index] if bigram in self.com_dict]

            positive_bg = []
            negative_bg = []
            so_scores = []
            
            
            
            for bigram in rate_so_review:
                if bigram in self.com_dict:
                    positive_bg.append(self.so_counts[bigram][0])
                else:
                    continue
            for bigram in rate_so_review:
                if bigram in rate_so_review:
                    negative_bg.append(self.so_counts[bigram][1])  
                else:
                    continue
# 
            # make tuples (bigram, # positive, # negative, SO score)
            for i in range(len(rate_so_review)):
                if positive_bg[i] > 0 or negative_bg[i] > 0:
                    numerator = float(positive_bg[i]) * self.num_negative
                    denominator = float(negative_bg[i]) * self.num_positive
                    so = math.log((numerator+.001)/ (denominator + .001))
                    tup_store = (rate_so_review[i], positive_bg[i], negative_bg[i], so)

                    so_scores.append(tup_store)


        #     Calculate SO score
            mean_store = []
            
            for i in range(len(so_scores)):
                mean_store.append(so_scores[i][3])

            if len(mean_store) != 0:
                mean_so = sum(mean_store)/len(mean_store)
            else:
                mean_so = 0

            if mean_so > 0:
                so_predict.append(1)
            else:
                so_predict.append(0)
        self.yhat = so_predict
        
#         self.predict_results = dict(Counter(so_predict))
        self.accuracy = sum(target[i] == so_predict[i] for i in range(len(target))) / float(len(target))
        
    
    def scores(self,n,desc=True):
        
        semantic_scores = []
    
        for key in self.so_counts:
            numerator = float(self.so_counts[key][0]) * self.num_negative
            denominator = float(self.so_counts[key][1] * self.num_positive)
            semantic_scores.append((key, math.log(numerator + .01 / (denominator + .001))))
        self.so_scores = semantic_scores
        
        return sorted(semantic_scores, key=lambda tup: tup[1], reverse=desc)[:n]
#         return self.so_scores

# x = semanticOrientation()
# x.create_bigrams(reviews,3)


### Task A
_Ignore the text (reviews) and run a classification model with the numeric data (you can use
standard methods like logistic regression, k-nearest neighbors or anything else). What is the best
accuracy of your model with numeric data?_

##### Brief discussion:

Ignoring the text, we used a logistic regression and our best accuracy of the test set was 68.6%. We split the data set as 70% training and 30% test.

The logistic regression model prediction accuracy is close to the baseline accuracy of predicting all 1 (i.e. high) in both the training and test sets.

##### Analyzing the coefficients

```
Beta Coefficients 

Vietnamese: 0.700
Expensive: 0.634
votes_cool: 0.594
Mediterranean: 0.506
Mexican: 0.466
Indian: 0.381
Greek: 0.356
French: 0.354
Thai: 0.344
Italian: 0.233
Japanese: 0.214
American: 0.192
Others: 0.130
VeryExpensive: 0.123
Chinese: 0.049
Moderate: -0.001
Cheap: -0.234
votes_funny: -0.243
votes_useful: -0.265
```

The coefficients also tell an interesting story about the most important features. The top 3 coefficients were Vietnamese, Expensive, and votes_cool. 

The high coefficient of Vietnamese cuisine suggests that holding all other features constant, Vietnamese cuisine is a strong predictor of a highly rated restaurant. However, we weren't sure why the coefficient was so high; we hypothesized that a disproportionate percentage of reviews for Vietnamese restaurants were 4 or 5 stars. However, the percentage of reviews that were 4 or 5 stars for Vietnamese restaurants wasn't higher than that for other cuisines. 

"Expensive" was the next highest coefficient, suggesting that regardless of cuisine, expensive restaurants tend to have higher ratings. However, restaurants that were "Very Expensive", "Moderate" and "Cheap" had a significantly lower coefficient. It is difficult to determine why because the lines separating "Moderate" and "Expensive" are blurred. Only 297 of the 3,999 restaurants were "Expensive" while 1,528 were "Moderate." Among the "Expensive" restaurants, 223 were rated 4 or 5 stars, while 799 of the "Moderate" restaurants were 3 stars or below.

Finally, among the 3 types of votes, a review that was "cool" was the strongest predictor of a highly rated restaurant.

In [239]:
# readData will automatically create the target column with >3 star restaurants labeled by 1, else 0
data = readData(0.2, random_state=8)
print "Percentage of >3 star restaurants in data: {:>6.4f}".format(data.target.mean())
print "------------------------------------------"
train_1, test_1 = generateTrainTest(data, 0.7, random_state=8)
# splitXY will automatically select all numeric columns, except stars, with target as y
X_train, y_train = splitXY(train_1)
model_1 = logistic_model(X_train, y_train)
X_test, y_test = splitXY(test_1)
print "Percentage of >3 star restaurants in test: {:>6.4f}".format(y_test.mean())
print "------------------------------------------"
prediction = model_1.predict(X_test)
printAccuracy(prediction, y_test)

coefficients = []

len(data.columns.values)-2
len([feature for item in model_1.coef_ for feature in item])

features = X_train.columns.values
coef = model_1.coef_[0]

coef_list = []

for index in range(len(features)):
    coef_list.append((features[index], coef[index]))
coef_list = sorted(coef_list, key=lambda tup: tup[1], reverse=True)

print "Beta Coefficients", "\n"
for tup in coef_list:
    print "{}: {:.3f}".format(tup[0], tup[1])
    


Percentage of >3 star restaurants in data: 0.6737
------------------------------------------
Percentage of >3 star restaurants in test: 0.6622
------------------------------------------
Accuracy: 0.6706
------------------------------------------
Beta Coefficients 

Vietnamese: 0.700
Expensive: 0.634
votes_cool: 0.594
Mediterranean: 0.506
Mexican: 0.466
Indian: 0.381
Greek: 0.356
French: 0.354
Thai: 0.344
Italian: 0.233
Japanese: 0.214
American: 0.192
Others: 0.130
VeryExpensive: 0.123
Chinese: 0.049
Moderate: -0.001
Cheap: -0.234
votes_funny: -0.243
votes_useful: -0.265


### Task B
_Perform a supervised classification on a subset of the corpus using the reviews only. You can
write your code in Python or R. What accuracy do you get from this text mining exercise?_

##### Approach Outline
1. Break down Review into {word: count} dictionary, but don't create word count matrix;
2. Train a Naive Bayes classifier (customized above) based on word: count and target label.

##### Result
The Multinomial Naive Bayes classifier had a test accuracy of 68.2%, compared with the baseline of 66.2% in the test set.

##### Brief discussion
The prediction accuracy of Naive Bayes classification is dubiously higher than of the previous logistic regression. The improvement varies when the sample is redrawn randomly from 0.01 to >.20. Though highly volatile, the increase is positive in all cases that have been tried, which means predicting high/low rating based on Review is more effective than based on the given numeric attributes. However the reviews themselves are highly variable.

In [240]:
# Decompose the review into a {word: count} dictionary
data["wc"] = data.Review.map(review2wc)
# Split the train and test sets
train_2, test_2 = generateTrainTest(data, 0.7, random_state=8) # Worry not, same random_state same rows
print "Percentage of >3 star restaurants in test: {:>6.4f}".format(test_2.target.mean())
print "------------------------------------------"

Percentage of >3 star restaurants in test: 0.6622
------------------------------------------


In [241]:
# Create an instance of the customized Naive Bayes classifier
classifier = NBClassifier()
# Training undergoing
classifier.fit(train_2, "wc", "target")
prediction = classifier.predict(test_2, threshold=3.5) # Threshold set by heuristics
printAccuracy(prediction, test_2.target)

Accuracy: 0.6822
------------------------------------------


### Task C
_Combine the numeric data and the text classification model (in task B) to create a “hybrid”
model. It is your task to figure out how to do this. Now run this hybrid classification model and compare
the results with those in A and B. Does the numeric data add to the predictive power relative to text?_

##### Approach Outline
1. Use NB classifier to return a *total_score* of each review, which is basically the log probability of its words
2. Create a new column with the *total_score*, and include this column in the logistic regression model.

##### Result
The test accuracy of our hybrid model was 68.2%.

##### Brief discussion
Given that we have introduced more information, we expected that the result should have been more accurate than that from the Naive Bayes model. However, because we didn't create interaction terms, any advantage from adding information from the review may have been lost. In addition, possible co-linearity between the log probability from the Naive Bayes and other numeric attributes may have been high, and may have minimized the value that we expected the log-probability score from our Naive Bayes model to add.

In [242]:
data["total_score"] = classifier.predict(data, threshold=3.5)
train_3, test_3 = generateTrainTest(data, 0.7, random_state=8)
X_train, y_train = splitXY(train_3)
model_2 = logistic_model(X_train, y_train)
X_test, y_test = splitXY(test_3)
prediction = model_2.predict(X_test)
printAccuracy(prediction, y_test)

Accuracy: 0.6822
------------------------------------------


### Task D
_Use unsupervised sentiment analysis on the reviews (with SentiStrength or any other tool) and
use the sentiment scores to predict high/low rating. Compare and contrast the results of tasks B and D.
What can you conclude from your analysis?_

##### Approach Outline
1. Run each review through Sentistrength and export the results as a .txt file
2. Read the .txt file in Python and calculate the prediction looking at the sign of the difference between positive and negative scores; a positive difference means 4 or 5 stars, and a negative difference means 3 or fewer stars.

##### Results
Using the entire data set, our model using Sentistrength had an accuracy of 58.47%.

##### Brief Discussion
Among our models using text and sentiment analysis (i.e. Parts C through E), the model using Sentistrength produced the lowest accuracy. Sentistrength is rather naive in its approach because unparsed reviews containing both positive and negative words would cancel each other out. On the contrary, our Naive Bayes model was slightly less naive than the Sentistrength model, and calculated posteriors based on the frequency of a word given a particular class, which turns out to be more accurate than Sentistrength's method, which just sums up sentiment scores with no other filters. 

To make the model less naive, we could also have tried parsing a review into phrases, and running each phrase through Sentistrength. Our next model in Part E uses PMI calculations to create a Semantic Orientation score based on words in proximity, which is a less naive way of doing this unsupervised learning task.


In [243]:
results = pd.read_table("Results_SentiStrength.txt", header = 0)
data_full = pd.read_csv("yelp.csv")
data_full["target"]=data_full.stars.map(lambda v: 1 if v>3 else 0)

results['Sentiment'] = results['Positive']+results['Negative']

def Prediction(Score):
    if int(Score) >0:
        return 1
    else:
        return 0

results['Prediction'] = results['Sentiment'].apply(Prediction)
senti_strength_predict = results['Prediction'].tolist()
actual_data = data_full['target'].tolist()

count = 0
for i in range(len(senti_strength_predict)):
    if senti_strength_predict[i] == actual_data[i]:
        count +=1
correct = float(count)/len(senti_strength_predict)
print "Accuracy: {:.4f}".format(correct)


Accuracy: 0.5847


### Task E

_Implement the PMI approach to sentiment analysis (in either Python or R), and run the
classification model with the sentiment scores. How do your results compare with those in Task D?_

##### Approach Outline
1. Filter bigrams according to modified POS rules from Turney's article
2. Calculate Semantic Orientation of each bigram using the number of positive and negative words within 3 words (excluding stopwords) of the bigram.
3. For each review, take the average of the Semantic Orientation scores of each bigram. If the average Semantic Orientation is greater than 1, then the review is positive. Otherwise, it's negative.

We used the Turney article as our basis and chose to use Semantic Orientation, which is just the difference in PMI's between a bigram near a positive word and a bigram near a negative word. Our Semantic Orientation calculation was:
$$ SO_{bigram} = log(\frac {n_{+,bigram} \times N_{negative} + .01}{n_{-, bigram} \times N_{positive} + .01}) $$

To start, we filtered the bigrams by using simplified part-of-speech rules derived from the Turney article. The only simplification we made was that we did not care what the POS of the word after the bigram was. 


$N_{negative}$ and $N_{positive}$ are the total number of positive and negative words in the entire corpus of reviews; they serve as our "priors." We checked whether a  word in the corpus was in a list of positive and negative words (the link to the document can be found at the top of the notebook). The counts of words in the corpus that were in the "master" list of positive and negative words served as our values for $N_{negative}$ and $N_{positive}$. In the first 1000 reviews:
* $ N_{negative} = 10,483 $

* $ N_{positive} = 16,874 $

$n_{+,bigram}$ and $n_{-,bigram}$ are the counts of positive and negative words within a given number of words from the bigram (excluding stopwords). We chose to look at words within 3 words of the bigram. For example, in the bigram ('fresh warm'), $n_{+,bigram}$ is 2: 'always' and 'helps' are positive words, and $n_{-,bigrams}$ is 0. 
```
(u'always', u'helps'): {u'always': 1,
                        u'excellent': 1,
                        u'food': 1,
                        u'hot': 1,
                        u'service': 1,
                        u'waitress': 1}
```

To classify a review, we took the average of the Semantic Orientations of every bigram in the review.

$$ SO_{i} =  \frac{\sum_{i=1}^{R} SO_{i j}}{R}$$
where $SO_{i}$ is the Semantic Orientation of review $i$, $R$ is the number of bigrams (filtered using Turney's POS rules) in the review and $j$ is the index of each bigram in the review.

$$Class_{i} = \begin{cases} high, &\mbox{if }SO_{i} > 0 \\ low, & \mbox{if }SO_{i} < 0 \end {cases}$$





##### Results

Testing the first 1000 reviews, our model had an accuracy score of 60.7%, which is better than the accuracy using the Senti-Strength model.

##### Brief Discussion

Depending on the sample of reviews we chose, the accuracy of our model using PMI / Semantic Orientation ranged between 60-70%. We can inspect the top 10 bigrams with the highest and lowest Semantic Orientation.

```
Top 10 Lowest Semantic Orientations (First 1000 Reviews)

 ((u'little', u'corner'), -15.576037341956656),
 ((u'less', u'richness'), -15.576037341956656),
 ((u'felt', u'sick'), -15.576037341956656),
 ((u'sweet', u'Phoenix'), -15.352893794941304),
 ((u'entirely', u'sweet'), -15.352893794941304),
 ((u'sad', u'salad'), -15.352893794941304),
 ((u'sick', u'days'), -15.352893794941304),
 ((u'appetite', u'things'), -15.352893794941304),
 ((u'flat', u'bread'), -15.352893794941304),
 ((u'horrible', u'see'), -15.352893794941304)

Top 10 Highest Semantic Orientations (First 1000 Reviews)

 ((u'happy', u'hour'), 12.934340800634144),
 ((u'really', u'good'), 12.290790561213711),
 ((u'Mexican', u'food'), 12.25800073839094),
 ((u'Indian', u'food'), 12.224099186715321),
 ((u'first', u'time'), 11.947845810087092),
 ((u'food', u'great'), 11.901325794452529),
 ((u'pretty', u'good'), 11.901325794452077),
 ((u'light', u'rail'), 11.747175114625065),
 ((u'service', u'great'), 11.690016700785623),
 ((u'great', u'service'), 11.690016700785494)
```

The words with the highest Semantic Orientations all seem to make sense. However, some of the bigrams with the lowest Semantic Orientations are a bit harder to interpret. 

For example, why does "flat bread" have such a low semantic orientation? We can inspect the words that were close to this bigram in our sample of 1000 reviews. The keys are the words near the bigram and the values are how many times a word appeared within 3 non-stopwords to a bigram.

```
{u'baked': 1, u'dark': 1, u'flat': 1, u'house': 1, u'smoke': 1, u'wood': 1}
```
It turns out that the weights in the text file for all of the words except for "house"  are negative; in this case, $n_{-,bigram}$ is 5 and $n_{+,bigram}$ is 0, which explains why the Semantic Orientation is negative.

To further improve the model, we could inspect the sentiment weights of the words that shouldn't be negative and change them. In this case, it's unclear why "smoke" and "baked" should have negative sentiment weights. Ideally, whether a word is positive or negative should be based on the context. In this case, the context is restaurants.


In [36]:
data_raw = pd.read_csv("yelp.csv")[:1000]
    
data_raw['target'] = data_raw['stars'].map(lambda t: 1 if t > 3 else 0)
reviews = data_raw['Review']
target_e = data_raw['target']

part_e_model = semanticOrientation()
part_e_model.create_bigrams(reviews,3) # find words 3 words behind and 3 words ahead of bigram (excluding stop words)
part_e_model.predict(target_e)
print "Accuracy - Part E: {:4f}".format(part_e_model.accuracy)


Accuracy - Part E: 0.607000


### Task F

_What are the top 5 “attributes” of a restaurant that are associated with (i) high and (ii) low
ratings? That is, when people rate a restaurant high or low, are they more likely to mention service,
ambiance, etc.?_

##### Approach Outline
1. For restaurants with High rating, tokenize the reviews.
2. Get the POS tag for unigrams in each of the reviews.
3. From the above POS corpus, filter words corresponding to POS="NN" i.e. Extracting all the Nouns from the Reviews
4. Calculate the freqeuncy of words and diplay it in descending order to find the most occuring Nouns specific to High Ratings.
5. Repeat the same for Low Rating restaurants.

##### Results
```
Words with Highest Relative Probabilities for High Ratings (285,443)

    (u'food', 10075 - 3.5%),
    (u'place', 9335 - 3.3%),
    (u'time', 4261 - 1.5%),
    (u'service', 4011 - 1.4%),
    (u'menu', 3319 - 1.2%),
 
Words with Highest Relative Probabilities for Low Ratings (164,184)

    (u'food', 6335 - 3.8%),
    (u'place', 4709 - 2.9%),
    (u'service', 2462 - 1.5%),
    (u'time', 2443 - 1.5%),
    (u'restaurant', 1914 - 1.2%)
```
##### Brief Discussion
For both high and low rated reviews, attributes for "food", "place", "service" and "time" were important to customers. We expected that the rankings or proportions of attributes would be different between reviews of high and low rated restaurants, but it turns out that regardless of a restaurant's rating, people mention the same attributes.

"Menu" and "restaurant" are the only words that are different in the top-5 rankings of attributes for high and low rated restaurants. One potential hypothesis is that people who give a low rating are bad-mouthing the restaurant more, and not talking about the menu as much. However, both "menu" and "restaurant" are in the top-10 list of most commonly used attributs for both high and low rated restaurants.

In [110]:
data_all = pd.read_csv("yelp.csv")
    
data_all['target'] = data_all['stars'].map(lambda t: 1 if t > 3 else 0)
reviews = data_all['Review']

pos=[]
unigram=[]

high_rated= data_all[data_all['target']==1] ## reviews with high rating
low_rated= data_all[data_all['target']==0] ## reviews with low rating

In [111]:
import itertools

pattern = r'\s?(\w+)\s?'

def freq_attribute(data):
    pos=[]
    unigram=[]
    lst_final=list(data.ix[:,'Review']) ## extracting reviews only
    for i in range(len(lst_final)): 
        new_list = lst_final[i] ## 
        text = nltk.regexp_tokenize(new_list.decode('utf-8'), pattern) ## tokenizing the review 
        unigram.append(text) 
    unigrams2=unigram
    pos2=[]
    for i in unigrams2:
        pos2.append(nltk.pos_tag(i)) ## getting pos for each word
    merged = list(itertools.chain(*pos2))
    a = [item[0] for item in merged if item[1] == 'NN'] ## filtering for nouns i.e. POS="NN"
    return Counter(a).most_common() ## fetching frequency of Noun and displaying in descending order

In [None]:
high_words = freq_attribute(high_rated)

In [118]:
for tup in high_words[:10]:
    print tup

(u'food', 10075)
(u'place', 9335)
(u'time', 4261)
(u'service', 4011)
(u's', 4004)
(u'menu', 3319)
(u'restaurant', 3289)
(u'order', 2437)
(u'lunch', 2426)
(u't', 2225)


In [115]:
low_words = freq_attribute(low_rated)

In [119]:
for tup in low_words[:10]:
    print tup

(u'food', 6335)
(u'place', 4709)
(u'service', 2462)
(u'time', 2443)
(u's', 2137)
(u't', 1978)
(u'restaurant', 1914)
(u'order', 1798)
(u'menu', 1719)
(u'lunch', 1295)
