In [99]:
import pandas as pd
import numpy as np
from tools.parsers import largemoviereviews as largeMovieReviewsParser
from tools.parsers import generalinquirer as generalInquirerParser
from tools.parsers import negation as negationParser 
import time
import string, math
from nltk.stem.snowball import SnowballStemmer

from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

In [100]:
stemmer = SnowballStemmer("english")
parserMovieReviews = largeMovieReviewsParser.LargeMovieReviews()
parserInquirer = generalInquirerParser.GeneralInquirer()
parserNegation = negationParser.Negation()
negations = parserNegation.readFileCsv(parserNegation.defaultFileNameProcessed)

In [101]:
sentencesData = parserMovieReviews.readFileCsv(parserMovieReviews.defaultFileNameProcessed)

In [102]:
sentimentDictionaries = parserInquirer.readFileCsv(parserInquirer.combinedFileLoc)
sentimentDictionaries['entry'] = sentimentDictionaries['entry'].map(lambda cell: stemmer.stem(cell))
sentimentDictionaries.drop_duplicates(subset = 'entry', inplace = True)
sentimentDictionaries.shape

(2736, 25)

In [103]:
negations['phraseStemmed'] = negations['phrase'].map(lambda cell: stemmer.stem(cell).lower())

In [104]:
sentencesClean = sentencesData[np.invert(sentencesData['id'].isnull())]

In [105]:
sentencesClean = sentencesClean[np.invert(sentencesClean['text'].isnull())] 

In [106]:
sentencesClean.shape

(532291, 5)

In [107]:
sentencesClean[sentencesClean['type'] == 1].shape

(258804, 5)

In [108]:
def binSearchDfValue(df, val, start, end, isDebug = False):
    while start <= end:
        if isDebug:
            print("Start: {};    End: {}".format(start, end))
        middle = (start + end) / 2
        if isDebug:
            print("middle: {}".format(df[middle]))
        if df[middle] == val:
            return True
        else:
            if df[middle] > val:
                end = middle - 1
            else:
                start = middle + 1
    
    return False

def countOccurenceColumn(row, lookFor, lookForLen, columnName):
    words = row['textArr']
    res = [word for word in words if binSearchDfValue(lookFor, word, 0, lookForLen)]
    row[columnName] = len(res)
    return row

def CalculateColumn(column, df, val = None, typeWord = None):
    global sentimentDictionaries
    
    compareVal = column
    if column == None:
        columnName = "All"+typeWord
    else:
        if val != None:
            compareVal = val
        columnName = column+'Count'
        if val != None:
            columnName = columnName + val
        if typeWord != None:
            columnName = columnName + typeWord
        
    if typeWord == None:
        lookFor = sentimentDictionaries[sentimentDictionaries[column] == compareVal]
    else:
        if column == None:
            lookFor = sentimentDictionaries[(sentimentDictionaries['type'] == typeWord)]
        else:
            lookFor = sentimentDictionaries[(sentimentDictionaries[column] == compareVal) & (sentimentDictionaries['type'] == typeWord)]
    
    lookForVals = lookFor['entry'].values
    lookForLen = len(lookForVals) - 1
    print("LookFor shape: {}".format(lookFor.shape))
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOccurenceColumn(row, lookForVals, lookForLen, columnName), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    return df

In [109]:
import cPickle as pickle

In [13]:
with open('./large-movie-reviews.dump','rb') as fp:
    sentencesClean = pickle.load(fp)

In [110]:
sentencesTest = sentencesClean[sentencesClean['set'] == 0]

In [111]:
def cleanUpWord(w):
    return w.translate(None, string.punctuation).strip()

def processWords(row):
    global stemmer

    words = [ word for word in [cleanUpWord(w) for w in row['text'].split('|')] if len(word) > 0]    
    row['textStemmed'] = '|'.join([stemmer.stem(unicode(w, errors='ignore')) for w in words]).lower()
    row['textStemmed'] = '|' + row['textStemmed'] + '|'
    row['textArr'] = words
    return row


tStart = time.time()
sentencesTest = sentencesTest.apply(lambda row: processWords(row), axis = 1)
tEnd = time.time()
print("Timed: {}".format(str(tEnd - tStart)))

Timed: 534.390000105


In [112]:
def directClassification(row, pos_column, neg_column, suffix):
    if row[pos_column] < row[neg_column]:
        row['apriori-class-'+suffix] = -1
    elif row[pos_column] > row[neg_column]:
        row['apriori-class-'+suffix] = 1
    else:
        row['apriori-class-'+suffix] = 0
    
    return row

In [113]:
def calculateSentiment(sentences, column_predicted):
    numPositive = sentences[sentences[column_predicted] == 1].shape[0]
    numNegative = sentences[sentences[column_predicted] == -1].shape[0]
    numNeutral = sentences[sentences[column_predicted] == 0].shape[0]
#    print("Pos = {}; Neg = {}; Neutral = {}".format(numPositive, numNegative, numNeutral))
    if numPositive > numNegative:
        return 1
    elif numPositive < numNegative:
        return -1
    else:
        return 0    

In [114]:
def calculateSentimentReview(review_id, df, col):
    review_data = review_id.split('#')
    review_set_id = int(review_data[2])
    review_rating = int(review_data[1])
    review_df_id = int(review_data[0])
    sentences = df[(df['id'] == review_df_id) & (df['set'] == review_set_id) & (df['rating'] == review_rating) ]
    return calculateSentiment(sentences, col)

In [115]:
def generate_row_id(row):
    row['real-id'] = "{}#{}#{}".format(int(row['id']), int(row['rating']), int(row['set']))
#    print("{}-{}".format(row['id'], row['set']))
    return row

In [116]:
# generate expected results
sentencesTest = sentencesTest.apply(lambda row: generate_row_id(row), axis = 1)
unique_reviews = sentencesTest['real-id'].unique()
expected_classes = pd.DataFrame(np.zeros((len(unique_reviews), 2)), columns = ['id','expected'])
expected_classes['id'] = unique_reviews

In [117]:
expected_classes['expected'] = expected_classes['id'].map(lambda cell: calculateSentimentReview(cell, sentencesTest, 'type'))

In [118]:
def getRating(review_id):
    review_data = review_id.split('#')
    return int(review_data[1])

In [119]:
def calculateAccuracy(df, predicted, expected):
    return df[df[predicted] == df[expected]].shape[0] / float(df.shape[0])

In [156]:
expected_classes['rating'] = expected_classes['id'].map(lambda cell: getRating(cell))
expected_classes['rating'].value_counts()

1     5022
10    4999
8     2850
4     2635
3     2541
9     2344
7     2307
2     2302
Name: rating, dtype: int64

In [151]:
def even_values_cuts(df, ratio_partition, col):
    perm_df = df.reindex(np.random.permutation(df.index))
    res_df = pd.DataFrame({}, columns = df.columns)
    unique_parts = perm_df[col].unique()
    for unique in unique_parts:
        selected_rows = perm_df[perm_df[col] == unique]
        num_parts = int(selected_rows.shape[0] * ratio_partition)
        res_df = pd.concat([res_df, selected_rows[:num_parts]])
    
    return res_df.reindex(np.random.permutation(res_df.index))

In [155]:
test_reviews = even_values_cuts(expected_classes, 0.1, 'rating')
test_reviews['rating'].value_counts()

1     502
10    499
8     285
4     263
3     254
9     234
7     230
2     230
Name: rating, dtype: int64

In [190]:
def equal_cuts(df, size_df, col):
    perm_df = df.reindex(np.random.permutation(df.index))
    res_df = pd.DataFrame({}, columns = df.columns)
    unique_parts = perm_df[col].unique()
    num_parts = int(size_df / len(unique_parts)) 
    for unique in unique_parts:
        selected_rows = perm_df[perm_df[col] == unique]
        res_df = pd.concat([res_df, selected_rows[:num_parts]])
    
    return res_df.reindex(np.random.permutation(res_df.index))

In [191]:
test_reviews_even = equal_cuts(expected_classes, 2500, 'rating')
test_reviews_even['rating'].value_counts()

10    312
9     312
8     312
7     312
4     312
3     312
2     312
1     312
Name: rating, dtype: int64

In [121]:
expected_classes_test = pd.concat([expected_classes[:1500], expected_classes[-1500:]])

In [192]:
sentencesTestSubset = sentencesTest[sentencesTest['real-id'].isin(test_reviews_even['id'])]

In [193]:
def countNegations(row):
    words = row['text']
    negations['found'] = negations.apply(lambda row: 1 if '|'+row['phrase']+'|' in words else 0, axis = 1)
    row['negations'] = negations['found'].sum()
    return row

tStart = time.time()
sentencesTestSubset = sentencesTestSubset.apply(lambda row: countNegations(row), axis = 1)
tEnd = time.time()
print("Timed: {}".format(str(tEnd-tStart)))

Timed: 34.6390001774


In [194]:
sentencesTestSubset = CalculateColumn('positiv', sentencesTestSubset)

LookFor shape: (965, 25)
Timed(positivCount): 7.6130001545


In [195]:
sentencesTestSubset = CalculateColumn('positiv', sentencesTestSubset)
sentencesTestSubset = CalculateColumn('negativ', sentencesTestSubset)

LookFor shape: (965, 25)
Timed(positivCount): 7.38000011444
LookFor shape: (1421, 25)
Timed(negativCount): 7.76800012589


In [196]:
def countOccurenceColumnBeforeAfter(row, lookFor, columnName):
    global negations
    for _, neg_row in negations.iterrows():
        neg = '|'+neg_row['phrase']+'|'
        neg_stemmed = '|' + neg_row['phraseStemmed'] + '|'
        posNeg = row['text'].find(neg)
        if posNeg == -1 :
            return row
        posNeg = row['textStemmed'].find(neg_stemmed)

        afterPos = posNeg + len(neg_stemmed)
        for _, rowLook in lookFor.iterrows():
            posBefore = row['textStemmed'].find(rowLook['entry'], 0, posNeg)
            posAfter = row['textStemmed'].find(rowLook['entry'], afterPos)

            if posBefore != -1:
                row[columnName+'Before'] = row[columnName+'Before'] + 1 
            if posAfter != -1:
                row[columnName+'After'] = row[columnName+'After'] + 1

    return row


def WordsAroundNegations(column, df, val = None, count = False):
    global sentimentDictionaries

    columnName = column
    compareVal = val
    if val is None:
        compareVal = column
    else:
        columnName = column + val
    
    if count:
        columnName = columnName + "Count"
    
    lookFor = sentimentDictionaries[sentimentDictionaries[column] == compareVal]
    
    print("LookFor shape: {}".format(lookFor.shape))
    if count:
        df[columnName+"Before"]  = 0
        df[columnName+"After"]  = 0
    else:
        df[columnName+"Before"]  = False
        df[columnName+"After"]  = False
    tStart = time.time()
    if count:
        df = df.apply(lambda row: countOccurenceColumnBeforeAfter(row, lookFor, columnName), axis = 1)
    else:
        df[columnName+"Before"]  = df[columnName+"CountBefore"] > 0
        df[columnName+"After"]  = df[columnName+"CountAfter"] > 0

    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    return df

In [197]:
sentencesTestSubset = WordsAroundNegations('hostile', sentencesTestSubset, None, True)
sentencesTestSubset = WordsAroundNegations('yes', sentencesTestSubset, None, True)
sentencesTestSubset = WordsAroundNegations('no', sentencesTestSubset, None, True)
sentencesTestSubset = WordsAroundNegations('priorpolarity', sentencesTestSubset, 'negative', True)
sentencesTestSubset = WordsAroundNegations('priorpolarity', sentencesTestSubset, 'positive', True)
sentencesTestSubset = WordsAroundNegations('active', sentencesTestSubset, None, True)
sentencesTestSubset = WordsAroundNegations('negate', sentencesTestSubset, None, True)
sentencesTestSubset = WordsAroundNegations('passive', sentencesTestSubset, None, True)

LookFor shape: (439, 25)
Timed(hostileCount): 354.480000019
LookFor shape: (5, 25)
Timed(yesCount): 37.9730000496
LookFor shape: (3, 25)
Timed(noCount): 35.7060000896
LookFor shape: (1552, 25)
Timed(priorpolaritynegativeCount): 1483.93799996
LookFor shape: (1040, 25)
Timed(priorpolaritypositiveCount): 875.638999939
LookFor shape: (554, 25)
Timed(activeCount): 629.459000111
LookFor shape: (119, 25)
Timed(negateCount): 154.56400013
LookFor shape: (319, 25)
Timed(passiveCount): 377.963000059


In [198]:
sentencesTestSubset = CalculateColumn('positiv', sentencesTestSubset, None, 'weaksubj')
sentencesTestSubset = CalculateColumn('negativ', sentencesTestSubset, None, 'weaksubj')

LookFor shape: (403, 25)
Timed(positivCountweaksubj): 25.9900000095
LookFor shape: (500, 25)
Timed(negativCountweaksubj): 29.5629999638


In [199]:
sentencesTestSubset = WordsAroundNegations('hostile', sentencesTestSubset)
sentencesTestSubset = WordsAroundNegations('yes', sentencesTestSubset)
sentencesTestSubset = WordsAroundNegations('no', sentencesTestSubset)
sentencesTestSubset = WordsAroundNegations('priorpolarity', sentencesTestSubset, 'negative')
sentencesTestSubset = WordsAroundNegations('priorpolarity', sentencesTestSubset, 'positive')
sentencesTestSubset = WordsAroundNegations('negate', sentencesTestSubset)
sentencesTestSubset = WordsAroundNegations('passive', sentencesTestSubset)

LookFor shape: (439, 25)
Timed(hostile): 0.00699996948242
LookFor shape: (5, 25)
Timed(yes): 0.000999927520752
LookFor shape: (3, 25)
Timed(no): 0.0
LookFor shape: (1552, 25)
Timed(priorpolaritynegative): 0.000999927520752
LookFor shape: (1040, 25)
Timed(priorpolaritypositive): 0.000999927520752
LookFor shape: (119, 25)
Timed(negate): 0.0
LookFor shape: (319, 25)
Timed(passive): 0.00100016593933


In [200]:
sentencesTestSubset = CalculateColumn('priorpolarity', sentencesTestSubset, 'negative', 'strongsubj')
sentencesTestSubset = CalculateColumn('priorpolarity', sentencesTestSubset, 'positive', 'strongsubj')

LookFor shape: (972, 25)
Timed(priorpolarityCountnegativestrongsubj): 32.378000021
LookFor shape: (584, 25)
Timed(priorpolarityCountpositivestrongsubj): 33.3130002022


In [201]:
sentencesTestSubset = CalculateColumn('priorpolarity', sentencesTestSubset, 'negative', 'weaksubj')
sentencesTestSubset = CalculateColumn('priorpolarity', sentencesTestSubset, 'positive', 'weaksubj')

LookFor shape: (580, 25)
Timed(priorpolarityCountnegativeweaksubj): 31.5899999142
LookFor shape: (456, 25)
Timed(priorpolarityCountpositiveweaksubj): 34.2759997845


In [202]:
sentencesTestSubset = CalculateColumn('priorpolarity', sentencesTestSubset, 'negative')
sentencesTestSubset = CalculateColumn('priorpolarity', sentencesTestSubset, 'positive')

LookFor shape: (1552, 25)
Timed(priorpolarityCountnegative): 37.1159999371
LookFor shape: (1040, 25)
Timed(priorpolarityCountpositive): 32.9509999752


In [203]:
sentencesTestSubset['morePositiveThanNegativeStrong'] = False
dt = sentencesTestSubset[sentencesTestSubset['priorpolarityCountpositivestrongsubj'] > sentencesTestSubset['priorpolarityCountnegativestrongsubj'] ]
sentencesTestSubset.loc[dt.index,'morePositiveThanNegativeStrong'] = True

sentencesTestSubset['morePositiveThanNegativeWeak'] = False
dt = sentencesTestSubset[sentencesTestSubset['priorpolarityCountpositiveweaksubj'] > sentencesTestSubset['priorpolarityCountnegativeweaksubj'] ]
sentencesTestSubset.loc[dt.index,'morePositiveThanNegativeWeak'] = True

sentencesTestSubset['morePositiveThanNegative'] = False
dt = sentencesTestSubset[sentencesTestSubset['priorpolarityCountpositive'] > sentencesTestSubset['priorpolarityCountnegative'] ]
sentencesTestSubset.loc[dt.index,'morePositiveThanNegative'] = True

In [204]:
finalData = sentencesTestSubset[['negations', 'positivCount', 'negativCount', 'priorpolaritynegativeAfter',
'hostileBefore', 'yesBefore', 'yesAfter', 'noBefore', 'negateBefore', 'priorpolaritynegativeCountBefore',
'priorpolaritypositiveCountBefore', 'activeCountAfter', 'passiveCountAfter', 'hostileCountAfter', 
'passiveAfter', 'morePositiveThanNegativeStrong', 'morePositiveThanNegativeWeak', 'morePositiveThanNegative',
'priorpolaritynegativeCountAfter', 'priorpolaritypositiveAfter', 'negateAfter','priorpolaritypositiveCountAfter']]

In [205]:
sentencesTestSubset['trained-classifier-predicted'] = trained_clasifier.predict(finalData.values)

In [210]:
#test_reviews_even.loc[(test_reviews_even['rating'] > 3) & (test_reviews_even['rating'] < 7), 'expected'] = 0
test_reviews_even.loc[test_reviews_even['rating'] > 5, 'expected'] = 1
test_reviews_even.loc[test_reviews_even['rating'] < 5, 'expected'] = -1

In [207]:
test_reviews_even['predicted-model'] = test_reviews_even['id'].map(lambda cell: calculateSentimentReview(cell, sentencesTestSubset, 'trained-classifier-predicted'))
print("Done")

Done


In [188]:
test_reviews    

Unnamed: 0,id,expected,rating,predicted-model,predicted-direct
22002,7302#10#0,1,10,1,0
8128,6066#4#0,0,4,1,1
20783,6205#9#0,1,9,1,1
13097,10538#10#0,1,10,1,1
2912,1371#1#0,-1,1,1,1
11198,882#2#0,-1,2,1,1
8773,6647#1#0,-1,1,1,1
5749,3925#2#0,-1,2,0,0
558,10502#1#0,-1,1,1,1
13138,10575#10#0,1,10,1,1


In [208]:
sentencesTestSubset = sentencesTestSubset.apply(lambda row: directClassification(row, 'priorpolarityCountpositive', 'priorpolarityCountnegative', 'direct' ), axis = 1)
test_reviews_even['predicted-direct'] = test_reviews_even['id'].map(lambda cell: calculateSentimentReview(cell, sentencesTestSubset, 'apriori-class-direct'))
print("Done")

Done


In [211]:
print("Direct model: {}".format(calculateAccuracy(test_reviews_even, 'predicted-direct', 'expected')))
print("Trained model: {}".format(calculateAccuracy(test_reviews_even, 'predicted-model', 'expected')))

Direct model: 0.528846153846
Trained model: 0.493189102564


In [145]:
expected_classes_test['rating'].value_counts()

1     643
10    584
8     360
3     300
9     293
4     283
2     274
7     263
Name: rating, dtype: int64