In [59]:
import pandas as pd
import numpy as np
from tools.parsers import largemoviereviews as largeMovieReviewsParser
from tools.parsers import generalinquirer as generalInquirerParser
from tools.parsers import negation as negationParser 
import time
import string, math
from nltk.stem.snowball import SnowballStemmer

from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

In [65]:
# Shared objects used by the cells below: an English Snowball stemmer and one
# instance of each project-local parser imported from tools.parsers.
stemmer = SnowballStemmer("english")
parserMovieReviews = largeMovieReviewsParser.LargeMovieReviews()
parserInquirer = generalInquirerParser.GeneralInquirer()
parserNegation = negationParser.Negation()
# Negation phrases, read from the parser's default processed file.
negations = parserNegation.readFileCsv(parserNegation.defaultFileNameProcessed)

In [10]:
# Read the processed movie-review data (presumably one row per sentence — TODO confirm).
sentencesData = parserMovieReviews.readFileCsv(parserMovieReviews.defaultFileNameProcessed)

In [56]:
# Load the combined lexicon, replace every entry by its stem, then keep only
# the first row for each distinct stem.
sentimentDictionaries = parserInquirer.readFileCsv(parserInquirer.combinedFileLoc)
sentimentDictionaries['entry'] = sentimentDictionaries['entry'].map(stemmer.stem)
sentimentDictionaries.drop_duplicates(subset = 'entry', inplace = True)
sentimentDictionaries.shape

(2736, 25)

In [63]:
# Store a stemmed, lower-cased copy of every negation phrase in 'phraseStemmed'.
# NOTE(review): each phrase is handed to the stemmer as one token; confirm this
# is intended if the list contains multi-word phrases.
negations['phraseStemmed'] = negations['phrase'].map(lambda cell: stemmer.stem(cell).lower())

In [12]:
# Keep only the rows that carry an id.
sentencesClean = sentencesData[sentencesData['id'].notnull()]

In [13]:
# Keep only the rows that carry text.
sentencesClean = sentencesClean[sentencesClean['text'].notnull()]

In [None]:
# Shape of the frame after dropping rows with a missing id or text.
sentencesClean.shape

In [None]:
# Shape of the subset whose 'type' equals 1.
sentencesClean[sentencesClean['type'] == 1].shape

In [4]:
def countOccurenceColumn(row, lookFor, columnName, normalize):
    """Count how many lexicon entries occur as whole tokens in ``row['text']``.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``'text'``, a string of tokens separated by ``'|'``.
    lookFor : pandas.DataFrame
        Lexicon subset; only its ``'entry'`` column is read. It is no longer
        modified (previously a scratch ``'found'`` column was written into it
        on every call).
    columnName : str
        Key under which the count is stored on ``row``.
    normalize : bool
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    The same ``row`` with ``row[columnName]`` set to the number of distinct
    entries found (each entry counts at most once).
    """
    # Pad with the delimiter so the first and last tokens can match '|entry|'.
    words = '|' + row['text'] + '|'
    # Iterating the column directly also copes with an empty lookup frame.
    row[columnName] = sum(1 for entry in lookFor['entry'] if '|' + entry + '|' in words)
    return row

def CalculateColumn(column, df, val = None, typeWord = None, normalize = False):
    """Add a per-row count of matching lexicon entries to ``df``.

    The lexicon rows to look for are selected from the module-level
    ``sentimentDictionaries`` frame:

    * ``column`` given: rows where ``sentimentDictionaries[column]`` equals
      ``val`` (or equals ``column`` itself when ``val`` is None);
    * ``typeWord`` given: additionally (or, when ``column`` is None, only)
      rows whose ``'type'`` equals ``typeWord``.

    The result column is named ``column + 'Count' [+ val] [+ typeWord]``, or
    ``'All' + typeWord`` when ``column`` is None.

    Parameters
    ----------
    column : str or None
    df : pandas.DataFrame
        Frame with a ``'text'`` column; it is not modified.
    val : str or None
    typeWord : str or None
    normalize : bool
        Forwarded to ``countOccurenceColumn``.

    Returns
    -------
    pandas.DataFrame
        New frame with the count column added.

    Raises
    ------
    ValueError
        If both ``column`` and ``typeWord`` are None (no selection possible).
    """
    if column is None and typeWord is None:
        raise ValueError("CalculateColumn needs a column, a typeWord, or both")

    if column is None:
        columnName = "All" + typeWord
        selection = sentimentDictionaries['type'] == typeWord
    else:
        compareVal = column if val is None else val
        columnName = column + 'Count'
        if val is not None:
            columnName = columnName + val
        selection = sentimentDictionaries[column] == compareVal
        if typeWord is not None:
            columnName = columnName + typeWord
            selection = selection & (sentimentDictionaries['type'] == typeWord)

    lookFor = sentimentDictionaries[selection].reset_index()

    print("LookFor shape: {}".format(lookFor.shape))
    # assign() returns a new frame, so the caller's frame (often a slice of
    # another frame) is left untouched.
    df = df.assign(**{columnName: 0})
    tStart = time.time()
    df = df.apply(lambda row: countOccurenceColumn(row, lookFor, columnName, normalize), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))

    return df

In [None]:
# Adds the 'priorpolarityCountnegative' and 'priorpolarityCountpositive' columns.
sentencesClean = CalculateColumn('priorpolarity', sentencesClean, 'negative')
sentencesClean = CalculateColumn('priorpolarity', sentencesClean, 'positive')

In [9]:
import cPickle as pickle

In [6]:
# Persist sentencesClean to a local pickle file so a later cell can reload it.
# Requires sentencesClean to exist in the kernel (run the cells above first).
with open('./large-movie-reviews.dump','wb') as fp:
    pickle.dump(sentencesClean,fp)

NameError: name 'sentencesClean' is not defined

In [None]:
# Preview only the first rows instead of rendering the whole frame.
sentencesClean.head()

In [25]:
# Reload the frame written by the dump cell.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# dumps produced by this notebook.
with open('./large-movie-reviews.dump','rb') as fp:
    sentencesClean = pickle.load(fp)

In [26]:
# Rows with set == 0 form the test subset. Take an explicit copy because later
# cells assign new columns to sentencesTest; assigning into a bare slice of
# sentencesClean triggers pandas' SettingWithCopyWarning.
sentencesTest = sentencesClean[sentencesClean['set'] == 0].copy()

In [66]:
def cleanUpWord(w):
    """Return ``w`` with ASCII punctuation removed and outer whitespace stripped.

    The previous ``w.translate(None, string.punctuation)`` form only works on
    Python 2 byte strings and raises ``TypeError`` for unicode input (and on
    Python 3). Filtering character by character gives the same result for byte
    strings and works for every string type.
    """
    return ''.join(ch for ch in w if ch not in string.punctuation).strip()

def processWords(row):
    """Attach a stemmed, lower-cased, pipe-delimited copy of ``row['text']``.

    The text is split on ``'|'``, each token is cleaned with ``cleanUpWord``
    and tokens that end up empty are dropped. The survivors are stemmed with
    the module-level ``stemmer`` and stored in ``row['textStemmed']`` with a
    leading and a trailing ``'|'``.
    """
    cleaned = [cleanUpWord(token) for token in row['text'].split('|')]
    kept = [token for token in cleaned if len(token) > 0]
    stems = [stemmer.stem(unicode(token, errors='ignore')) for token in kept]
    row['textStemmed'] = '|' + '|'.join(stems).lower() + '|'
    return row


# Stem every test sentence and report the wall-clock time taken.
tStart = time.time()
sentencesTest = sentencesTest.apply(processWords, axis = 1)
tEnd = time.time()
elapsedStemming = tEnd - tStart
print("Timed: {}".format(str(elapsedStemming)))

Timed: 267.912999868


In [70]:
# Lower-case the stemmed text. processWords already lower-cases what it writes,
# so this appears to be redundant once that cell has been re-run.
sentencesTest['textStemmed'] = sentencesTest['textStemmed'].map(lambda cell : cell.lower())

In [40]:
def directClassification(row, pos_column, neg_column, suffix):
    """Label a row by comparing its positive and negative counts.

    Writes -1 (fewer positives than negatives), 1 (more positives) or 0
    (otherwise) to ``row['apriori-class-' + suffix]`` and returns the row.
    """
    positives = row[pos_column]
    negatives = row[neg_column]

    if positives < negatives:
        label = -1
    elif positives > negatives:
        label = 1
    else:
        label = 0

    row['apriori-class-'+suffix] = label
    return row

In [44]:
def calculateSentiment(sentences, column_predicted):
    """Majority vote over the labels in ``sentences[column_predicted]``.

    Returns 1 when rows labelled 1 outnumber rows labelled -1, -1 in the
    opposite case and 0 on a tie. Rows labelled 0 do not influence the result.
    """
    labels = sentences[column_predicted]
    positiveVotes = int((labels == 1).sum())
    negativeVotes = int((labels == -1).sum())

    if positiveVotes == negativeVotes:
        return 0
    return 1 if positiveVotes > negativeVotes else -1

In [45]:
def calculateSentimentReview(review_id, df, col):
    """Majority-vote label for one review.

    ``review_id`` has the form ``'<id>#<rating>#<set>'`` (the format written by
    ``generate_row_id``). The rows of ``df`` whose 'id', 'rating' and 'set'
    match are passed to ``calculateSentiment`` together with ``col``.
    """
    parts = review_id.split('#')
    wanted_set = int(parts[2])
    wanted_rating = int(parts[1])
    wanted_id = int(parts[0])

    same_review = (df['id'] == wanted_id) & (df['set'] == wanted_set) & (df['rating'] == wanted_rating)
    review_sentences = df[same_review]
    return calculateSentiment(review_sentences, col)

In [46]:
def generate_row_id(row):
    """Store the composite key '<id>#<rating>#<set>' in ``row['real-id']``.

    Each of the three fields is converted with ``int`` before formatting.
    """
    fields = ('id', 'rating', 'set')
    row['real-id'] = '#'.join(str(int(row[field])) for field in fields)
    return row

In [None]:
# Smoke test for a single review. The function needs the frame and the label
# column as well, and the id must have the '<id>#<rating>#<set>' form.
calculateSentimentReview('10000#4#0', sentencesTest, 'type')

In [47]:
# generate expected results
sentencesTest = sentencesTest.apply(lambda row: generate_row_id(row), axis = 1)
unique_reviews = sentencesTest['real-id'].unique()
expected_classes = pd.DataFrame(np.zeros((len(unique_reviews), 3)), columns = ['id','expected', 'predicted'])
expected_classes['id'] = unique_reviews

In [48]:
# Expected class per review: majority vote over the 'type' values of its rows.
# Each lookup filters the whole sentencesTest frame, so this scales with
# (number of reviews) x (number of sentences).
expected_classes['expected'] = expected_classes['id'].map(lambda cell: calculateSentimentReview(cell, sentencesTest, 'type'))

In [49]:
# Preview only the first rows instead of rendering the whole frame.
sentencesTest.head(10)

Unnamed: 0,id,rating,set,text,type,priorpolarityCountnegative,priorpolarityCountpositive,apriori-class-direct,real-id
270863,0,2,0,once|again|mr|costner|has|dragged|out|a|movie|...,-1,0,1,1,0#2#0
270864,0,2,0,aside|from|the|terrific|sea|rescue|sequences|o...,-1,0,4,1,0#2#0
270865,0,2,0,most|of|us|have|ghosts|in|the|closet|and|costn...,-1,0,1,1,0#2#0
270866,0,2,0,the|character|we|should|really|care|about|is|a...,-1,1,1,0,0#2#0
270867,0,2,0,the|problem|is|he|comes|off|as|kid|who|thinks|...,-1,1,2,1,0#2#0
270868,0,2,0,his|only|obstacle|appears|to|be|winning|over|c...,-1,1,0,-1,0#2#0
270869,0,2,0,finally|when|we|are|well|past|the|half|way|poi...,-1,0,1,1,0#2#0
270870,0,2,0,we|are|told|why|kutcher|is|driven|to|be|the|be...,-1,0,1,1,0#2#0
270871,0,2,0,no|magic|here|it|was|all|i|could|do|to|keep|fr...,-1,0,0,0,0#2#0
270872,10000,4,0,this|is|an|example|of|why|the|majority|of|acti...,-1,0,0,0,10000#4#0


In [50]:
# Label every sentence by comparing its prior-polarity counts; the result goes
# to the 'apriori-class-direct' column.
directArgs = ('priorpolarityCountpositive', 'priorpolarityCountnegative', 'direct')
sentencesTest = sentencesTest.apply(directClassification, axis = 1, args = directArgs)

In [51]:
# Predicted class per review: majority vote over the per-sentence
# 'apriori-class-direct' labels.
expected_classes['predicted'] = expected_classes['id'].map(lambda cell: calculateSentimentReview(cell, sentencesTest, 'apriori-class-direct'))
print("Done")

Done


In [52]:
# Fraction of reviews whose predicted class differs from the expected class.
mismatched = expected_classes['predicted'] != expected_classes['expected']
int(mismatched.sum()) / float(len(expected_classes))

0.418

In [71]:
def countNegations(row):
    """Count how many negation phrases occur as whole tokens in ``row['text']``.

    Reads the 'phrase' column of the module-level ``negations`` frame and
    stores the number of distinct phrases found in ``row['negations']``.

    Fixes over the previous version:
    * the text is padded with '|' before matching, so a negation that is the
      first or last token of the sentence is found too (the same padding
      ``countOccurenceColumn`` applies);
    * the global ``negations`` frame is no longer given a scratch 'found'
      column on every call;
    * the inner lambda no longer shadows ``row``.
    """
    words = '|' + row['text'] + '|'
    row['negations'] = sum(1 for phrase in negations['phrase'] if '|' + phrase + '|' in words)
    return row

# Count negations in every test sentence and report the wall-clock time taken.
tStart = time.time()
sentencesTest = sentencesTest.apply(countNegations, axis = 1)
tEnd = time.time()
elapsedNegations = tEnd-tStart
print("Timed: {}".format(str(elapsedNegations)))

Timed: 379.041000128


In [75]:
# Adds 'positivCount' and 'negativCount': with no val given, CalculateColumn
# selects lexicon rows whose column value equals the column name itself.
sentencesTest = CalculateColumn('positiv', sentencesTest)
sentencesTest = CalculateColumn('negativ', sentencesTest)

LookFor shape: (965, 26)


KeyboardInterrupt: 

In [None]:
def countOccurenceColumnBeforeAfter(row, lookFor, columnName):
    """Count lexicon entries that appear before / after each negation in a row.

    For every phrase in the module-level ``negations`` frame that occurs in
    ``row['text']``, every entry of ``lookFor['entry']`` found in
    ``row['textStemmed']`` before the negation adds one to
    ``row[columnName + 'Before']``, and every entry found after it adds one to
    ``row[columnName + 'After']``.

    Parameters
    ----------
    row : pandas.Series
        Needs 'text', 'textStemmed' (pipe-delimited, with leading and trailing
        '|') and the two counter fields already initialised.
    lookFor : pandas.DataFrame
        Lexicon subset; only its 'entry' column is read.
    columnName : str
        Prefix of the two counter fields.

    Fixes over the previous version:
    * a negation missing from the sentence no longer aborts the whole scan
      (``return row`` inside the loop skipped all remaining negations);
    * a negation whose stemmed form is not found in 'textStemmed' is skipped
      instead of searching with a position of -1;
    * the raw text is padded with '|' so negations at the sentence edges match;
    * entries are matched as whole '|'-delimited tokens rather than substrings,
      consistent with ``countOccurenceColumn``.
    """
    paddedText = '|' + row['text'] + '|'
    stemmedText = row['textStemmed']
    entries = ['|' + entry + '|' for entry in lookFor['entry']]

    foundBefore = 0
    foundAfter = 0
    for _, neg_row in negations.iterrows():
        if '|' + neg_row['phrase'] + '|' not in paddedText:
            continue

        neg_stemmed = '|' + neg_row['phraseStemmed'] + '|'
        posNeg = stemmedText.find(neg_stemmed)
        if posNeg == -1:
            continue

        # The delimiters around the negation are shared with its neighbours:
        # the "before" window ends on the negation's leading '|', and the
        # "after" window starts on its trailing '|'.
        beforeEnd = posNeg + 1
        afterStart = posNeg + len(neg_stemmed) - 1
        for entry in entries:
            if stemmedText.find(entry, 0, beforeEnd) != -1:
                foundBefore += 1
            if stemmedText.find(entry, afterStart) != -1:
                foundAfter += 1

    # Only touch the counters when something was found, as before.
    if foundBefore:
        row[columnName+'Before'] = row[columnName+'Before'] + foundBefore
    if foundAfter:
        row[columnName+'After'] = row[columnName+'After'] + foundAfter

    return row


def WordsAroundNegations(column, df, val = None, count = False):
    """Add before/after-negation features for one lexicon category.

    Lexicon rows are selected from the module-level ``sentimentDictionaries``
    where ``sentimentDictionaries[column]`` equals ``val`` (or ``column``
    itself when ``val`` is None). The feature prefix is ``column`` (plus
    ``val`` when given).

    * ``count=True``: adds ``<prefix>CountBefore`` / ``<prefix>CountAfter``,
      filled by ``countOccurenceColumnBeforeAfter``.
    * ``count=False``: adds boolean ``<prefix>Before`` / ``<prefix>After``,
      derived from the count columns, which must already exist (call with
      ``count=True`` first).

    Parameters
    ----------
    column : str
    df : pandas.DataFrame
        Input frame; it is not modified.
    val : str or None
    count : bool

    Returns
    -------
    pandas.DataFrame
        New frame with the feature columns added.

    Raises
    ------
    KeyError
        With ``count=False`` when the corresponding count columns are missing.
    """
    columnName = column
    compareVal = val
    if val is None:
        compareVal = column
    else:
        columnName = column + val

    if count:
        columnName = columnName + "Count"

    lookFor = sentimentDictionaries[sentimentDictionaries[column] == compareVal]

    print("LookFor shape: {}".format(lookFor.shape))
    # Work on a copy so the caller's frame (possibly a slice) is left untouched.
    df = df.copy()
    tStart = time.time()
    if count:
        df[columnName+"Before"]  = 0
        df[columnName+"After"]  = 0
        df = df.apply(lambda row: countOccurenceColumnBeforeAfter(row, lookFor, columnName), axis = 1)
    else:
        for side in ("Before", "After"):
            countColumn = columnName + "Count" + side
            if countColumn not in df.columns:
                raise KeyError("{} is missing; call WordsAroundNegations with count=True first".format(countColumn))
            # The comparison belongs outside the indexing brackets; the old
            # code compared the column *name* with 0 and indexed by the result.
            df[columnName+side] = df[countColumn] > 0

    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))

    return df

In [None]:
# (column, val, count) for every negation-context feature, in execution order.
# Each boolean (count=False) entry comes after the matching count=True entry.
negationFeatureSpecs = [
    ('hostile', None, True),
    ('hostile', None, False),
    ('yes', None, True),
    ('no', None, True),
    ('yes', None, False),
    ('no', None, False),
    ('priorpolarity', 'negative', True),
    ('priorpolarity', 'positive', True),
    ('priorpolarity', 'negative', False),
    ('priorpolarity', 'positive', False),
    ('active', None, True),
    ('negate', None, True),
    ('negate', None, False),
    ('passive', None, True),
    ('passive', None, False),
]
for specColumn, specVal, specCount in negationFeatureSpecs:
    sentencesTest = WordsAroundNegations(specColumn, sentencesTest, specVal, specCount)

In [None]:
# Load the previously trained model from disk.
# NOTE(review): joblib.load unpickles the file, so only load trusted models.
# joblib comes from sklearn.externals here; newer scikit-learn releases
# reportedly no longer ship that module — confirm the pinned version.
trained_clasifier = joblib.load('./models/sentiment/version_1/version_1.pkl') 

In [None]:
# Feature columns passed on to the classifier, in the original order.
# A comma was missing after 'priorpolaritynegativeCountBefore', so Python fused
# it with the next literal into one non-existent column name.
# NOTE(review): the three 'morePositiveThanNegative*' columns are not created
# by any cell visible here — confirm they are computed elsewhere.
finalData = sentencesTest[[
    'negations', 'positivCount', 'negativCount', 'priorpolaritynegativeAfter',
    'hostileBefore', 'yesBefore', 'yesAfter', 'noBefore', 'negateBefore',
    'priorpolaritynegativeCountBefore',
    'priorpolaritypositiveCountBefore', 'activeCountAfter', 'passiveCountAfter', 'hostileCountAfter',
    'passiveAfter', 'morePositiveThanNegativeStrong', 'morePositiveThanNegativeWeak', 'morePositiveThanNegative',
    'priorpolaritynegativeCountAfter', 'priorpolaritypositiveAfter', 'negateAfter', 'priorpolaritypositiveCountAfter',
]]