In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import time
import string

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

In [3]:
from tools.parsers import generalinquirer as generalInquirerParser
from tools.parsers import largemoviereviews as largeMovieReviewsParser
from tools.parsers import generalinquirer as generalInquirerParser
from tools.parsers import negation as negationParser 

from tools.sentimentanalysis import preparation
from nltk.util import ngrams
from nltk import pos_tag

In [4]:
maxWords = 25
onlyMpqa = True
doSave = False

In [5]:
stemmer = SnowballStemmer("english")
prepData = preparation.Preparation()
parserInquirer = generalInquirerParser.GeneralInquirer()
parserNegation = negationParser.Negation()
parserMovieReviews = largeMovieReviewsParser.LargeMovieReviews()

In [6]:
sentimentDictionary  = parserInquirer.readFileCsv(parserInquirer.combinedFileLoc)
sentencesMpqa = parserInquirer.readFileCsv(prepData.defaultFileNameSentimentSentencesNormalized)
negations = parserNegation.readFileCsv(parserNegation.defaultFileNameProcessed)
sentencesImdb = parserMovieReviews.readFileCsv(parserMovieReviews.defaultFileNameProcessed)
expectedResultsImdb = parserMovieReviews.readFileCsv(parserMovieReviews.defaultFileNameProcessedOverall)

In [7]:
non_english_text = ['im_401b_e73i32c22_031705-2', 'IZ-060316-01-Trans-1', '20000815_AFP_ARB.0084.IBM-HA-NEW', 'NapierDianne']
polarizedSentencesMpqa = sentencesMpqa[sentencesMpqa['sentiment-type'] == 0]
polarizedSentencesMpqa = polarizedSentencesMpqa[np.invert(polarizedSentencesMpqa['docName'].isin(non_english_text))]
polarizedSentencesMpqa.loc[polarizedSentencesMpqa.index, 'textLower'] = polarizedSentencesMpqa.loc[polarizedSentencesMpqa.index, 'text'].map( lambda cell: cell.lower())

sentimentDictionary['entryRaw'] = sentimentDictionary['entry']
sentimentDictionary['entry'] = sentimentDictionary['entry'].map(lambda cell: stemmer.stem(cell))
sentimentDictionary.drop_duplicates(subset = 'entry', inplace = True)

negations['phraseStemmed'] = negations['phrase'].map(lambda cell: stemmer.stem(cell).lower())

sentencesImdb = sentencesImdb[np.invert(sentencesImdb['id'].isnull())]
sentencesImdb = sentencesImdb[np.invert(sentencesImdb['text'].isnull())] 

In [186]:
polarizedSentencesMpqa[polarizedSentencesMpqa['docName'] == '01.09.50-14694']

Unnamed: 0,index,docName,dirName,idx,startByte,endByte,sentLen,annotsCount,text,sentiment-measured,...,ngram_3_pleasurpriorpolaritypositive,ngram_2_weakpriorpolaritynegative,ngram_3_weakpriorpolaritynegative,ngram_2_activepriorpolaritypositive,ngram_3_activepriorpolaritypositive,morePositiveThanNegativeStrong,morePositiveThanNegativeWeak,morePositiveThanNegative,morePositiveThanNeutral,moreNegativeThanNeutral
68,140,01.09.50-14694,20010818,3,2,51,49,2,|mdc|leader|tsvangirai|at|it|again|in|south|af...,1,...,0,0,0,0,0,False,False,False,False,False
69,141,01.09.50-14694,20010818,4,1488,1547,59,4,|if|sanctions|are|imposed|on|zimbabwe|it|will|...,1,...,0,0,0,0,0,True,False,True,True,False
72,144,01.09.50-14694,20010818,7,3267,3372,105,4,|but|anyone|who|wants|to|speak|the|language|of...,1,...,0,0,0,0,0,False,False,False,False,False
74,146,01.09.50-14694,20010818,9,1639,1781,142,4,|until|such|time|as|mugabe|actually|subverts|t...,1,...,0,0,0,0,0,True,False,True,True,False
77,150,01.09.50-14694,20010818,13,3373,3449,76,3,|zimbabwe|s|performance|in|mozambique|somalia|...,1,...,0,0,0,0,0,False,False,False,False,False
78,152,01.09.50-14694,20010818,15,1082,1210,128,2,|the|us|congress|is|set|to|consider|the|zimbab...,1,...,0,0,0,1,2,False,False,False,False,False
79,153,01.09.50-14694,20010818,16,80,232,152,4,|opposition|youths|are|agitating|to|get|guns|t...,1,...,0,0,0,0,0,False,False,False,False,False
80,154,01.09.50-14694,20010818,17,1217,1311,94,2,|mr|tsvangirai|called|for|international|action...,1,...,0,0,0,0,0,False,True,True,True,False
82,156,01.09.50-14694,20010818,19,3162,3260,98,4,|but|analysts|expressed|concern|that|the|mdc|l...,1,...,0,0,0,0,0,False,False,False,False,True
83,157,01.09.50-14694,20010818,20,3023,3155,132,5,|the|mdc|leader|also|alleged|that|the|governme...,1,...,0,0,0,0,0,False,False,False,False,False


In [8]:
def infoDf(df, col):
    print("Negative: {}".format(df[df[col] < 0].shape))
    print("Neutral: {}".format(df[df[col] == 0].shape))
    print("Positive: {}".format(df[df[col] > 0].shape))

In [9]:
print("MPQA")
infoDf(polarizedSentencesMpqa, 'sentiment-intensity')
print
print("IMDB")
infoDf(sentencesImdb, 'type')

MPQA
Negative: (3131, 13)
Neutral: (1472, 13)
Positive: (2978, 13)

IMDB
Negative: (273487, 5)
Neutral: (0, 5)
Positive: (258804, 5)


In [10]:
def remove_articles(row):
    tokens = row['textArrStemmed']
    articles = ['a', 'an', 'the']
    tokens = [w for w in tokens if w not in articles]
    row['tokens'] = tokens
    return row

In [11]:
def get_punctuation(row):
    last_character = row['text'][-1:]
    if last_character in '?!.':
        row['punctuation'] = ord(last_character)
    else:
        if last_character in '"\'':
            penultimate_character = row['text'][-2:-1]
            if penultimate_character in '?!.':
                row['punctuation'] = ord(penultimate_character)
            else:
                row['punctuation'] = None
        else:
            row['punctuation'] = ord('x')

    return row

In [12]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: get_punctuation(row), axis = 1)
tEnd = time.time()
print("Timed MPQA data: {}".format(str(tEnd - tStart)))
# prepare movie reviews data
if not onlyMpqa:
    tStart = time.time()
    sentencesImdb = sentencesImdb.apply(lambda row: get_punctuation(row), axis = 1) 
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

Timed MPQA data: 4.27499985695


In [13]:
polarizedSentencesMpqa = polarizedSentencesMpqa[polarizedSentencesMpqa['punctuation'].notnull()]

In [14]:
polarizedSentencesMpqa.shape

(7865, 14)

In [15]:
def cleanUpWord(w):
    return w.translate(None, string.punctuation).strip()

In [16]:
def processWordsMpqa(row):
    global stemmer

    words = [ word for word in [cleanUpWord(w) for w in word_tokenize(row['textLower'])] if len(word) > 0]
    row['text'] = '|' + '|'.join(words) + '|'
    row['wordCount'] = len(words)
    row['textArr'] = words
    
    wordsStemmed = [stemmer.stem(unicode(w, errors='ignore')) for w in words]
    row['textStemmed'] = '|'.join(wordsStemmed)
    row['textStemmed'] = '|' + row['textStemmed'] + '|'
    row['textArrStemmed'] = wordsStemmed
    return row

def processWordsMovies(row):
    global stemmer

    words = [ word for word in [cleanUpWord(w) for w in row['text'].split('|')] if len(word) > 0]    
    wordsStemmed = [stemmer.stem(unicode(w, errors='ignore')) for w in words]
    row['textStemmed'] = '|'.join(wordsStemmed).lower()
    row['textStemmed'] = '|' + row['textStemmed'] + '|'
    row['wordCount'] = len(words)
    row['textArr'] = words
    row['textArrStemmed'] = wordsStemmed
    row['text'] = '|' + row['text'] + '|'
    return row

In [17]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: processWordsMpqa(row), axis = 1)
tEnd = time.time()
print("Timed MPQA data: {}".format(str(tEnd - tStart)))

Timed MPQA data: 19.3239998817


In [18]:
polarizedSentencesMpqa = polarizedSentencesMpqa[polarizedSentencesMpqa['wordCount'] < maxWords]

In [19]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: remove_articles(row), axis = 1)
tEnd = time.time()
print("Timed MPQA data: {}".format(str(tEnd - tStart)))
# prepare movie reviews data
if not onlyMpqa:
    tStart = time.time()
    sentencesImdb = sentencesImdb.apply(lambda row: remove_articles(row), axis = 1) 
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

Timed MPQA data: 2.53399991989


In [20]:
def generate_row_id(row):
    row['real-id'] = "{}#{}#{}".format(int(row['id']), int(row['rating']), int(row['set']))
    return row

In [21]:
# prepare movie reviews data
if not onlyMpqa:
    sentencesImdb = sentencesImdb.apply(lambda row: generate_row_id(row), axis = 1) 
    expectedResultsImdb = expectedResultsImdb.apply(lambda row: generate_row_id(row), axis = 1)

In [22]:
def equal_cuts(df, size_df, col):
    perm_df = df.reindex(np.random.permutation(df.index))
    res_df = pd.DataFrame({}, columns = df.columns)
    unique_parts = perm_df[col].unique()
    num_parts = int(size_df / len(unique_parts)) 
    for unique in unique_parts:
        selected_rows = perm_df[perm_df[col] == unique]
        res_df = pd.concat([res_df, selected_rows[:num_parts]])
    
    return res_df.reindex(np.random.permutation(res_df.index))

In [23]:
if not onlyMpqa:
    test_reviews_even = equal_cuts(expectedResultsImdb, 2500, 'rating')
    sentencesTestSubsetImdb = sentencesImdb[sentencesImdb['real-id'].isin(test_reviews_even['real-id'])]

In [24]:
if not onlyMpqa:
    test_reviews_even['expected'] = test_reviews_even['type']

In [25]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: processWordsMovies(row), axis = 1)
    tEnd = time.time()
    print("Timed Movies data: {}".format(str(tEnd - tStart)))

In [26]:
def countNegations(row):
    words = row['textArrStemmed']
    negations['found'] = negations.apply(lambda row: 1 if row['phraseStemmed'] in words else 0, axis = 1)
    row['negations'] = negations['found'].sum()
    return row

In [27]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: countNegations(row), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

Timed MPQA: 5.5


In [28]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: countNegations(row), axis = 1)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [29]:
polarizedSentencesMpqa['negations'].value_counts()

0    4089
1     522
2       1
Name: negations, dtype: int64

In [30]:
if not onlyMpqa:
    sentencesTestSubsetImdb['negations'].value_counts()

In [31]:
def binSearchDfValue(df, val, start, end, isDebug = False):
    while start <= end:
        if isDebug:
            print("Start: {};    End: {}".format(start, end))
        middle = (start + end) / 2
        if isDebug:
            print("middle: {}".format(df[middle]))
        if df[middle] == val:
            return True
        else:
            if df[middle] > val:
                end = middle - 1
            else:
                start = middle + 1
    
    return False

def countOccurenceInColumn(row, lookFor, lookForLen, columnName):
    words = row['textArr']
    res = [word for word in words if binSearchDfValue(lookFor, word, 0, lookForLen)]
    row[columnName] = len(res)
    return row

def CalculateColumnCounts(column, df, val = None, typeWord = None):
    compareVal = column
    if column == None:
        columnName = "All"+typeWord
    else:
        if val != None:
            compareVal = val
        columnName = column+'Count'
        if val != None:
            columnName = columnName + val
        if typeWord != None:
            columnName = columnName + typeWord
        
    if typeWord == None:
        lookFor = sentimentDictionary[sentimentDictionary[column] == compareVal]
    else:
        if column == None:
            lookFor = sentimentDictionary[(sentimentDictionary['type'] == typeWord)]
        else:
            lookFor = sentimentDictionary[(sentimentDictionary[column] == compareVal) & (sentimentDictionary['type'] == typeWord)]
    
    lookForVals = lookFor['entry'].values
    lookForLen = len(lookForVals) - 1
    print("LookFor shape: {}".format(lookFor.shape))
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOccurenceInColumn(row, lookForVals, lookForLen, columnName), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    return df

In [32]:
def calculateReiredColumnCounts(df):
    df = CalculateColumnCounts('priorpolarity', df, 'negative')
    df = CalculateColumnCounts('priorpolarity', df, 'both')
    df = CalculateColumnCounts('priorpolarity', df, 'neutral')
    df = CalculateColumnCounts('priorpolarity', df, 'positive')
    df = CalculateColumnCounts('priorpolarity', df, 'negative', 'strongsubj')
    df = CalculateColumnCounts('priorpolarity', df, 'positive', 'strongsubj')
    df = CalculateColumnCounts('priorpolarity', df, 'neutral', 'strongsubj')
    df = CalculateColumnCounts('priorpolarity', df, 'both', 'strongsubj')
    df = CalculateColumnCounts('priorpolarity', df, 'negative', 'weaksubj')
    df = CalculateColumnCounts('priorpolarity', df, 'positive', 'weaksubj')
    df = CalculateColumnCounts('priorpolarity', df, 'both', 'weaksubj')
    df = CalculateColumnCounts('priorpolarity', df, 'neutral', 'weaksubj')
    df = CalculateColumnCounts(None, df, None, 'weaksubj')
    df = CalculateColumnCounts(None, df, None, 'strongsubj')
    df = CalculateColumnCounts('hostile', df, None, 'weaksubj')
    df = CalculateColumnCounts('strong', df, None, 'weaksubj')
    df = CalculateColumnCounts('hostile', df, None, 'strongsubj')
    df = CalculateColumnCounts('strong', df, None, 'strongsubj')
    df = CalculateColumnCounts('hostile', df)
    df = CalculateColumnCounts('strong', df)
    df = CalculateColumnCounts('active', df, None, 'weaksubj')
    df = CalculateColumnCounts('passive', df, None, 'weaksubj')
    df = CalculateColumnCounts('active', df, None, 'strongsubj')
    df = CalculateColumnCounts('passive', df, None, 'strongsubj')
    df = CalculateColumnCounts('active', df)
    df = CalculateColumnCounts('passive', df)
    df = CalculateColumnCounts('positiv', df, None, 'weaksubj')
    df = CalculateColumnCounts('negativ', df, None, 'weaksubj')
    df = CalculateColumnCounts('positiv', df, None, 'strongsubj')
    df = CalculateColumnCounts('negativ', df, None, 'strongsubj')
    df = CalculateColumnCounts('positiv', df)
    df = CalculateColumnCounts('negativ', df)
    df = CalculateColumnCounts('yes', df)
    df = CalculateColumnCounts('no', df)

    return df

In [33]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = calculateReiredColumnCounts(sentencesTestSubsetImdb)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [34]:
tStart = time.time()
polarizedSentencesMpqa = calculateReiredColumnCounts(polarizedSentencesMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

LookFor shape: (1552, 26)
Timed(priorpolarityCountnegative): 1.13199996948
LookFor shape: (7, 26)
Timed(priorpolarityCountboth): 0.711999893188
LookFor shape: (137, 26)
Timed(priorpolarityCountneutral): 0.938999891281
LookFor shape: (1040, 26)
Timed(priorpolarityCountpositive): 1.06100010872
LookFor shape: (972, 26)
Timed(priorpolarityCountnegativestrongsubj): 1.09500002861
LookFor shape: (584, 26)
Timed(priorpolarityCountpositivestrongsubj): 1.01400017738
LookFor shape: (45, 26)
Timed(priorpolarityCountneutralstrongsubj): 0.878999948502
LookFor shape: (6, 26)
Timed(priorpolarityCountbothstrongsubj): 0.701999902725
LookFor shape: (580, 26)
Timed(priorpolarityCountnegativeweaksubj): 1.02200007439
LookFor shape: (456, 26)
Timed(priorpolarityCountpositiveweaksubj): 0.982000112534
LookFor shape: (1, 26)
Timed(priorpolarityCountbothweaksubj): 0.674000024796
LookFor shape: (92, 26)
Timed(priorpolarityCountneutralweaksubj): 0.871000051498
LookFor shape: (1129, 26)
Timed(Allweaksubj): 1.082999

In [35]:
polarizedSentencesMpqa.columns

Index([u'index', u'docName', u'dirName', u'idx', u'startByte', u'endByte',
       u'sentLen', u'annotsCount', u'text', u'sentiment-measured',
       u'sentiment-type', u'sentiment-intensity', u'textLower', u'punctuation',
       u'wordCount', u'textArr', u'textStemmed', u'textArrStemmed', u'tokens',
       u'negations', u'priorpolarityCountnegative', u'priorpolarityCountboth',
       u'priorpolarityCountneutral', u'priorpolarityCountpositive',
       u'priorpolarityCountnegativestrongsubj',
       u'priorpolarityCountpositivestrongsubj',
       u'priorpolarityCountneutralstrongsubj',
       u'priorpolarityCountbothstrongsubj',
       u'priorpolarityCountnegativeweaksubj',
       u'priorpolarityCountpositiveweaksubj',
       u'priorpolarityCountbothweaksubj', u'priorpolarityCountneutralweaksubj',
       u'Allweaksubj', u'Allstrongsubj', u'hostileCountweaksubj',
       u'strongCountweaksubj', u'hostileCountstrongsubj',
       u'strongCountstrongsubj', u'hostileCount', u'strongCount',
   

In [36]:
def checkOccurenceColumn(row, lookFor, columnName):
    global negations
    for _, neg_row in negations.iterrows():
        neg = '|'+neg_row['phrase']+'|'
        neg_stemmed = '|' + neg_row['phraseStemmed'] + '|'
        posNeg = row['text'].find(neg)
        if posNeg == -1 :
            return row
        posNeg = row['textStemmed'].find(neg_stemmed)

        afterPos = posNeg + len(neg_stemmed)
        for _, rowLook in lookFor.iterrows():
            posBefore = row['textStemmed'].find(rowLook['entry'], 0, posNeg)
            posAfter = row['textStemmed'].find(rowLook['entry'], afterPos)

            if posBefore != -1:
                row[columnName+'Before'] = True
            if posAfter != -1:
                row[columnName+'After'] = True
            if row[columnName+'Before'] and row[columnName+'After']:
                return row

    return row


def countOccurenceColumnBeforeAfter(row, lookFor, columnName):
    global negations
    for _, neg_row in negations.iterrows():
        neg = '|'+neg_row['phrase']+'|'
        neg_stemmed = '|' + neg_row['phraseStemmed'] + '|'
        posNeg = row['text'].find(neg)
        if posNeg == -1 :
            return row
        posNeg = row['textStemmed'].find(neg_stemmed)

        afterPos = posNeg + len(neg_stemmed)
        for _, rowLook in lookFor.iterrows():
            posBefore = row['textStemmed'].find(rowLook['entry'], 0, posNeg)
            posAfter = row['textStemmed'].find(rowLook['entry'], afterPos)

            if posBefore != -1:
                row[columnName+'Before'] = row[columnName+'Before'] + 1 
            if posAfter != -1:
                row[columnName+'After'] = row[columnName+'After'] + 1

    return row


def WordsAroundNegations(column, df, val = None, count = False):
    columnName = column
    compareVal = val
    if val is None:
        compareVal = column
    else:
        columnName = column + val
    
    if count:
        columnName = columnName + "Count"
    
    lookFor = sentimentDictionary[sentimentDictionary[column] == compareVal]
    
    print("LookFor shape: {}".format(lookFor.shape))
    if count:
        df[columnName+"Before"]  = 0
        df[columnName+"After"]  = 0
    else:
        df[columnName+"Before"]  = False
        df[columnName+"After"]  = False
    tStart = time.time()
    if count:
        df = df.apply(lambda row: countOccurenceColumnBeforeAfter(row, lookFor, columnName), axis = 1)
    else:
        df = df.apply(lambda row: checkOccurenceColumn(row, lookFor, columnName), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    return df

In [258]:
def calculateRequiredWordsAroundNegations(df):
    df = WordsAroundNegations('priorpolarity', df, 'negative')
    df = WordsAroundNegations('priorpolarity', df, 'positive')
    df = WordsAroundNegations('priorpolarity', df, 'neutral')
    df = WordsAroundNegations('priorpolarity', df, 'both')
    df = WordsAroundNegations('active', df)
    df = WordsAroundNegations('passive', df)
    df = WordsAroundNegations('hostile', df)
    df = WordsAroundNegations('yes', df)
    df = WordsAroundNegations('no', df)
    df = WordsAroundNegations('negate', df)
    df = WordsAroundNegations('active', df, None, True)
    df = WordsAroundNegations('passive', df, None, True)
    df = WordsAroundNegations('hostile', df, None, True)
    df = WordsAroundNegations('yes', df, None, True)
    df = WordsAroundNegations('no', df, None, True)
    df = WordsAroundNegations('negate', df, None, True)
    df = WordsAroundNegations('priorpolarity', df, 'negative', True)
    df = WordsAroundNegations('priorpolarity', df, 'positive', True)
    return df

In [38]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = calculateRequiredWordsAroundNegations(sentencesTestSubsetImdb)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [39]:
tStart = time.time()
polarizedSentencesMpqa = calculateReiredWordsAroundNegations(polarizedSentencesMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

LookFor shape: (1552, 26)
Timed(priorpolaritynegative): 62.7290000916
LookFor shape: (1040, 26)
Timed(priorpolaritypositive): 42.2899999619
LookFor shape: (137, 26)
Timed(priorpolarityneutral): 7.15700006485
LookFor shape: (7, 26)
Timed(priorpolarityboth): 1.59700012207
LookFor shape: (554, 26)
Timed(active): 26.635999918
LookFor shape: (319, 26)
Timed(passive): 15.9340000153
LookFor shape: (439, 26)
Timed(hostile): 20.8169999123
LookFor shape: (5, 26)
Timed(yes): 1.71499991417
LookFor shape: (3, 26)
Timed(no): 1.38400006294
LookFor shape: (119, 26)
Timed(negate): 6.85399985313
LookFor shape: (554, 26)
Timed(activeCount): 29.2200000286
LookFor shape: (319, 26)
Timed(passiveCount): 19.6879999638
LookFor shape: (439, 26)
Timed(hostileCount): 20.2920000553
LookFor shape: (5, 26)
Timed(yesCount): 3.8220000267
LookFor shape: (3, 26)
Timed(noCount): 1.58299994469
LookFor shape: (119, 26)
Timed(negateCount): 6.82400012016
LookFor shape: (1552, 26)
Timed(priorpolaritynegativeCount): 69.8260002

In [40]:
sentimentDictionary['stemmed'].shape[0]

2736L

In [41]:
def create_n_grams(row, size_ngram):
    row[str(size_ngram)+"_gram"] = list(ngrams(row['tokens'], size_ngram))
    return row

def countOneBeforeOtherNgranRow(row, col, size_ngram, beforeSeries, afterSeries):
    # first, try to find a word from "after series"
    n_gram_col = str(size_ngram)+'_gram' 
    for n_gram in row[n_gram_col]:
        for idx in range(1, size_ngram):
            w = n_gram[idx]
            found = binSearchDfValue(afterSeries, w, 0, afterSeries.shape[0] - 1)
            if found:
                for idx_sub in range(0, idx):
                    w2 = n_gram[idx_sub]
                    found_sub = binSearchDfValue(beforeSeries, w2, 0, beforeSeries.shape[0] - 1)
                    if found_sub:
                        row[col] = row[col] + 1
    return row
    
def countOneBeforeOtherNgramDf(df, size_ngram, beforeClassCol, afterClassCol, beforeClassVal = None, afterClassVal = None, beforeSerieDefault = None):
    columnName = 'ngram_'+str(size_ngram)+"_"+ beforeClassCol + afterClassCol
    compareValBefore = beforeClassVal
    if beforeClassVal is None:
        compareValBefore = beforeClassCol
    else:
        columnName = columnName + beforeClassVal
    
    if beforeSerieDefault is None:
        lookForBefore = sentimentDictionary[sentimentDictionary[beforeClassCol] == compareValBefore].reset_index(drop=True)
        lookForBefore = lookForBefore['entry']
    else:
        lookForBefore = beforeSerieDefault

    compareValAfter = afterClassVal
    if afterClassVal is None:
        compareValAfter = afterClassCol
    else:
        columnName = columnName + afterClassVal
    
    lookForAfter = sentimentDictionary[sentimentDictionary[afterClassCol] == compareValAfter].reset_index(drop=True)
    lookForAfter = lookForAfter['entry']
    
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOneBeforeOtherNgranRow(row, columnName, size_ngram, lookForBefore, lookForAfter), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    return df

In [42]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row:create_n_grams(row, 2), axis = 1)
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row:create_n_grams(row, 3), axis = 1)
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row:create_n_grams(row, 4), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams(row, 2), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams(row, 3), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams(row, 4), axis = 1)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

Timed MPQA: 9.53299999237


In [43]:
def calculateBeforeAfterNgrams(df):
    df = countOneBeforeOtherNgramDf(df, 2, 'negativ', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 3, 'negativ', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 4, 'negativ', 'priorpolarity', None, 'positive')
    
    df = countOneBeforeOtherNgramDf(df, 2, 'negativ', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 3, 'negativ', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 4, 'negativ', 'priorpolarity', None, 'negative')

    df = countOneBeforeOtherNgramDf(df, 2, 'negativ', 'priorpolarity', None, 'neutral')
    df = countOneBeforeOtherNgramDf(df, 3, 'negativ', 'priorpolarity', None, 'neutral')
    df = countOneBeforeOtherNgramDf(df, 4, 'negativ', 'priorpolarity', None, 'neutral')

    df = countOneBeforeOtherNgramDf(df, 2, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 3, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 4, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    
    df = countOneBeforeOtherNgramDf(df, 2, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 3, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 4, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    
    df = countOneBeforeOtherNgramDf(df, 2, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 3, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 4, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    
    df = countOneBeforeOtherNgramDf(df, 2, 'hostile', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 3, 'hostile', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 4, 'hostile', 'priorpolarity', None, 'negative')

    df = countOneBeforeOtherNgramDf(df, 2, 'persist', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 3, 'persist', 'priorpolarity', None, 'positive')
    
    df = countOneBeforeOtherNgramDf(df, 2, 'pleasur', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 3, 'pleasur', 'priorpolarity', None, 'positive')
    
    df = countOneBeforeOtherNgramDf(df, 2, 'weak', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 3, 'weak', 'priorpolarity', None, 'negative')

    df = countOneBeforeOtherNgramDf(df, 2, 'active', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 3, 'active', 'priorpolarity', None, 'positive')
    return df

In [44]:
sentimentDictionary.columns

Index([u'positiv', u'negativ', u'active', u'passive', u'affil', u'hostile',
       u'strong', u'power', u'weak', u'submit', u'yes', u'no', u'negate',
       u'intrj', u'pleasur', u'pain', u'feel', u'need', u'persist', u'entry',
       u'othtags', u'type', u'pos', u'stemmed', u'priorpolarity', u'entryRaw'],
      dtype='object')

In [45]:
tStart = time.time()
polarizedSentencesMpqa = calculateBeforeAfterNgrams(polarizedSentencesMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = calculateBeforeAfterNgrams(sentencesTestSubsetImdb)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

Timed(ngram_2_negativpriorpolaritypositive): 9.42599987984
Timed(ngram_3_negativpriorpolaritypositive): 16.888999939
Timed(ngram_4_negativpriorpolaritypositive): 23.9049999714
Timed(ngram_2_negativpriorpolaritynegative): 8.89700007439
Timed(ngram_3_negativpriorpolaritynegative): 17.2459998131
Timed(ngram_4_negativpriorpolaritynegative): 21.7569999695
Timed(ngram_2_negativpriorpolarityneutral): 5.95399999619
Timed(ngram_3_negativpriorpolarityneutral): 10.6440000534
Timed(ngram_4_negativpriorpolarityneutral): 14.0650000572
Timed(ngram_2_negationspriorpolaritypositive): 7.46000003815
Timed(ngram_3_negationspriorpolaritypositive): 13.4879999161
Timed(ngram_4_negationspriorpolaritypositive): 18.4649999142
Timed(ngram_2_negationspriorpolaritynegative): 7.77300000191
Timed(ngram_3_negationspriorpolaritynegative): 13.8459999561
Timed(ngram_4_negationspriorpolaritynegative): 19.6760001183
Timed(ngram_2_negationspriorpolarityneutral): 5.97199988365
Timed(ngram_3_negationspriorpolarityneutral): 9

In [46]:
polarizedSentencesMpqa

Unnamed: 0,index,docName,dirName,idx,startByte,endByte,sentLen,annotsCount,text,sentiment-measured,...,ngram_3_hostilepriorpolaritynegative,ngram_4_hostilepriorpolaritynegative,ngram_2_persistpriorpolaritypositive,ngram_3_persistpriorpolaritypositive,ngram_2_pleasurpriorpolaritypositive,ngram_3_pleasurpriorpolaritypositive,ngram_2_weakpriorpolaritynegative,ngram_3_weakpriorpolaritynegative,ngram_2_activepriorpolaritypositive,ngram_3_activepriorpolaritypositive
0,0,13.40.05-15087,20010620,0,109,226,117,2,|the|kimberley|provincial|hospital|said|it|wou...,1,...,0,0,0,0,0,0,0,0,0,0
1,7,13.40.05-15087,20010620,7,793,884,91,2,|the|woman|was|admitted|to|the|hospital|on|sat...,1,...,2,3,0,0,0,0,0,0,0,0
2,11,13.40.05-15087,20010620,11,656,749,93,3,|he|said|it|was|his|opinion|that|the|patient|a...,1,...,0,0,0,0,0,0,0,0,0,0
3,13,13.40.05-15087,20010620,13,588,655,67,3,|saeed|said|indications|were|that|those|tests|...,1,...,1,1,0,0,0,0,0,0,0,0
5,26,23.46.20-17835,20010627,11,1466,1597,131,2,|private|organizations|are|also|being|encourag...,1,...,0,0,0,0,0,0,0,0,0,2
7,31,00.48.42-17806,20010630,3,2648,2698,50,2,|we|decided|to|make|some|bold|decisions|he|said|,1,...,0,0,0,0,0,0,0,0,1,2
9,34,00.48.42-17806,20010630,6,4017,4133,116,4,|but|the|costs|of|failing|to|support|yugoslavi...,1,...,1,2,0,0,0,0,0,0,0,0
11,37,00.48.42-17806,20010630,9,2507,2641,134,3,|we|are|now|fully|back|into|the|international|...,1,...,0,0,0,0,0,0,0,0,0,0
12,38,00.48.42-17806,20010630,10,3931,4016,85,2,|there|is|no|quick|fix|to|the|yugoslav|problem...,1,...,0,0,0,0,0,0,0,0,0,0
22,50,00.48.42-17806,20010630,22,2421,2506,85,2,|aside|from|the|financial|assistance|said|labu...,1,...,0,0,0,0,0,0,0,0,0,0


In [56]:
import cPickle as pickle
if doSave:
#    with open('./clean-sentiment-mpqa.dump','wb') as fp:
#        pickle.dump(polarizedSentencesMpqa,fp)
#    with open('./clean-sentiment-imdb-sents.dump','wb') as fp:
#        pickle.dump(sentencesTestSubsetImdb,fp)
    with open('./clean-sentiment-sentences.dump','wb') as fp:
        pickle.dump(selectedPolarizedSentencesMpqa,fp)    

In [47]:
sentiment_positive = 1
sentiment_negative = -1
polarizedSentencesMpqa.loc[polarizedSentencesMpqa['sentiment-intensity'] < 0, 'sentiment-intensity'] = sentiment_negative
polarizedSentencesMpqa.loc[polarizedSentencesMpqa['sentiment-intensity'] > 0, 'sentiment-intensity'] = sentiment_positive

In [166]:
def get_equal_sentiment_parts(df, ratio):
    neg_df = df[df['sentiment-intensity'] < 0].shape[0]
    pos_df = df[df['sentiment-intensity'] > 0].shape[0]
    neutral_df = df[df['sentiment-intensity'] == 0].shape[0]
    min_size = min(neg_df, pos_df, neutral_df)
    
    new_df = pd.concat([df[df['sentiment-intensity'] < 0][:min_size],df[df['sentiment-intensity'] > 0][:int(ratio * min_size)], df[df['sentiment-intensity'] == 0][:min_size]])
    return new_df.reindex(np.random.permutation(new_df.index))

def get_equal_neutral_training_sentiment_parts(df, ratio):
    new_df = df.reindex(np.random.permutation(df.index))
    
    neg_df = df[df['sentiment-intensity'] < 0].shape[0]
    pos_df = df[df['sentiment-intensity'] > 0].shape[0]
    neutral_df = df[df['sentiment-intensity'] == 0].shape[0]
    min_size = min(neg_df, pos_df, neutral_df)
    if min_size * 2 > neutral_df:
        min_size = int(neutral_df / 2)
    
    new_df = pd.concat([new_df[new_df['sentiment-intensity'] < 0][:int(min_size *  ratio)],new_df[new_df['sentiment-intensity'] > 0][:int(ratio * min_size)],
                            new_df[new_df['sentiment-intensity'] == 0][:int(2 * min_size * ratio)]])
    
    new_df['sentiment-neutral'] = 0
    new_df.loc[new_df['sentiment-intensity'] == 0, 'sentiment-neutral'] = 1
    return new_df.reindex(np.random.permutation(new_df.index))

In [49]:
if not onlyMpqa:
    test_reviews_even.loc[test_reviews_even['expected'] < 0, 'expected'] = sentiment_negative
    test_reviews_even.loc[test_reviews_even['expected'] > 0, 'expected'] = sentiment_positive

In [50]:
def show_all_value_counts(df):
    skip = ['index', 'docName', 'dirName', 'idx', 'startByte', 'endByte', 'sentLen', 'annotsCount', 'text', 'sentiment-measured',
       'sentiment-type', 'sentiment-intensity', 'textLower', 'textList', 'wordCount', 'textArr', 'textStemmed', 'textArrStemmed',
        'id', 'rating', 'set', 'type', 'real-id']
    
    for col in df.columns:
        if col in skip:
            continue
         
        print
        print("Column: {}".format(col))
        print(df[col].value_counts())

In [51]:
# show_all_value_counts(sentencesTestSubsetImdb)

In [52]:
# show_all_value_counts(selectedPolarizedSentencesMpqa)

In [53]:
def calculateRequiredMoreThanOther(df):
    df['morePositiveThanNegativeStrong'] = False
    dt = df[df['priorpolarityCountpositivestrongsubj'] > df['priorpolarityCountnegativestrongsubj'] ]
    df.loc[dt.index,'morePositiveThanNegativeStrong'] = True

    df['morePositiveThanNegativeWeak'] = False
    dt = df[df['priorpolarityCountpositiveweaksubj'] > df['priorpolarityCountnegativeweaksubj'] ]
    df.loc[dt.index,'morePositiveThanNegativeWeak'] = True

    df['morePositiveThanNegative'] = False
    dt = df[df['priorpolarityCountpositive'] > df['priorpolarityCountnegative'] ]
    df.loc[dt.index,'morePositiveThanNegative'] = True

    df['morePositiveThanNeutral'] = False
    dt = df[df['priorpolarityCountpositive'] > df['priorpolarityCountneutral'] ]
    df.loc[dt.index,'morePositiveThanNeutral'] = True

    df['moreNegativeThanNeutral'] = False
    dt = df[df['priorpolarityCountnegative'] > df['priorpolarityCountneutral'] ]
    df.loc[dt.index,'moreNegativeThanNeutral'] = True
    return df

In [54]:
polarizedSentencesMpqa = calculateRequiredMoreThanOther(polarizedSentencesMpqa)
if not onlyMpqa:
    sentencesTestSubsetImdb = calculateRequiredMoreThanOther(sentencesTestSubsetImdb)
print




In [212]:
selectedPolarizedSentencesMpqa = get_equal_sentiment_parts(polarizedSentencesMpqa[polarizedSentencesMpqa['annotsCount'] > 1], 1.0)
#with open('./clean-sentiment-sentences.dump','rb') as fp:
#    selectedPolarizedSentencesMpqa = pickle.load(fp)    
selectedPolarizedSentencesMpqa.shape

(1932, 127)

In [58]:
neutralSelectedSentencesMpqa = get_equal_neutral_training_sentiment_parts(polarizedSentencesMpqa, 1)
neutralSelectedSentencesMpqa.shape

(1712, 128)

In [59]:
def assign_pos_tags(row):
    row['textPos'] = pos_tag(row['textArr'])
    return row

In [220]:
tStart = time.time()
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row: assign_pos_tags(row), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

Timed MPQA: 3186.56299996


In [152]:
tags = [gram[1] for gram_list in selectedPolarizedSentencesMpqa['textPos'].values for gram in gram_list]
tags_pd = pd.DataFrame(tags, columns=["tag"])
tags_pd['tag'].value_counts()

NN      6400
IN      3740
DT      3593
JJ      2843
NNS     2122
RB      1500
VB      1403
VBD     1226
PRP     1122
VBZ     1072
TO       930
VBN      873
CC       821
VBP      674
VBG      617
MD       563
PRP$     362
CD       323
WDT      150
WP       121
EX        90
JJR       88
RP        87
WRB       81
RBR       52
JJS       46
PDT       35
FW        14
RBS       14
NNP        9
WP$        6
UH         2
$          1
Name: tag, dtype: int64

In [169]:
def create_n_grams_pos(row, size_ngram):
    row["pos_"+str(size_ngram)+"_gram"] = list(ngrams(row['textPos'], size_ngram))
    return row

def countOneBeforeOtherNgranPosRow(row, col, size_ngram, beforeSeries, afterSeries, beforeSeriesPos = None, afterSeriesPos = None):
    # first, try to find a word from "after series"
    n_gram_col = 'pos_'+str(size_ngram)+'_gram' 
    for n_gram in row[n_gram_col]:
        for idx in range(1, size_ngram):
            w = n_gram[idx]
            if afterSeriesPos is not None and w[1] not in afterSeriesPos:
                continue

            found = binSearchDfValue(afterSeries, w[0], 0, afterSeries.shape[0] - 1)
            if found:
                for idx_sub in range(0, idx):
                    w2 = n_gram[idx_sub]
                    if beforeSeriesPos is not None and w2[1] not in beforeSeriesPos:
                        continue
                    found_sub = binSearchDfValue(beforeSeries, w2[0], 0, beforeSeries.shape[0] - 1)
                    if found_sub:
                        row[col] = row[col] + 1
    return row
    
def countOneBeforeOtherNgramPosDf(df, size_ngram, beforeClassCol, afterClassCol,
                                  beforeClassPos = None, afterClassPos = None,
                                beforeClassVal = None, afterClassVal = None, beforeSerieDefault = None):
    columnName = 'pos_ngram_'+str(size_ngram)+"_" + '.'.join(beforeClassPos) + '.'.join(afterClassPos)
    if beforeClassCol is not None:
        columnName += beforeClassCol
    if afterClassCol is not None:
        columnName += afterClassCol
    
    
    compareValBefore = beforeClassVal
    if beforeClassVal is None:
        compareValBefore = beforeClassCol
    else:
        columnName = columnName + beforeClassVal
    
    if beforeSerieDefault is None:
        if beforeClassCol is None:
                lookForBefore = sentimentDictionary.reset_index(drop = True)
                lookForBefore = lookForBefore['entryRaw']
        else:
            lookForBefore = sentimentDictionary[sentimentDictionary[beforeClassCol] == compareValBefore].reset_index(drop=True)
            lookForBefore = lookForBefore['entryRaw']
    else:
        lookForBefore = beforeSerieDefault

    compareValAfter = afterClassVal
    if afterClassVal is None:
        compareValAfter = afterClassCol
    else:
        columnName = columnName + afterClassVal
    
    if afterClassCol is None:
        lookForAfter = sentimentDictionary.reset_index(drop = True)
        lookForAfter = lookForAfter['entryRaw']
    else:
        lookForAfter = sentimentDictionary[sentimentDictionary[afterClassCol] == compareValAfter].reset_index(drop=True)
        lookForAfter = lookForAfter['entryRaw']
    
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOneBeforeOtherNgranPosRow(row, columnName, size_ngram, lookForBefore, lookForAfter, beforeClassPos, afterClassPos), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    return df

In [170]:
tStart = time.time()
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 4), axis = 1)
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 5), axis = 1)
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 6), axis = 1)
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 7), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 2), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 3), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 4), axis = 1)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

KeyError: ('textPos', u'occurred at index 2174')

In [162]:
def calculateBeforeAfterNgramsPos(df):
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    noun_tags = ['NN', 'NNP', 'NNS']
    adjective_tags = ['JJ', 'JJR', 'JJS']
    adjective_adverb_tags = adjective_tags + ['RB', 'RBR', 'RBS']
    print(adjective_adverb_tags)
#    df = countOneBeforeOtherNgramPosDf(df, 2, 'negations', 'priorpolarity', 'ADJ', 'NOUN', None, 'positive', negations['phraseStemmed'])
#    df = countOneBeforeOtherNgramPosDf(df, 3, 'negations', 'priorpolarity', 'ADJ', 'NOUN', None, 'positive', negations['phraseStemmed'])
#    df = countOneBeforeOtherNgramPosDf(df, 4, 'negations', 'priorpolarity', 'ADJ', 'NOUN', None, 'positive', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramPosDf(df, 4, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
    df = countOneBeforeOtherNgramPosDf(df, 5, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
    df = countOneBeforeOtherNgramPosDf(df, 6, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
    df = countOneBeforeOtherNgramPosDf(df, 7, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
    return df

    df = countOneBeforeOtherNgramDf(df, 2, 'negativ', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 3, 'negativ', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 4, 'negativ', 'priorpolarity', None, 'negative')

    df = countOneBeforeOtherNgramDf(df, 2, 'negativ', 'priorpolarity', None, 'neutral')
    df = countOneBeforeOtherNgramDf(df, 3, 'negativ', 'priorpolarity', None, 'neutral')
    df = countOneBeforeOtherNgramDf(df, 4, 'negativ', 'priorpolarity', None, 'neutral')

    df = countOneBeforeOtherNgramDf(df, 2, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 3, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 4, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    
    df = countOneBeforeOtherNgramDf(df, 2, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 3, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 4, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    
    df = countOneBeforeOtherNgramDf(df, 2, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 3, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    df = countOneBeforeOtherNgramDf(df, 4, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    
    df = countOneBeforeOtherNgramDf(df, 2, 'hostile', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 3, 'hostile', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 4, 'hostile', 'priorpolarity', None, 'negative')

    df = countOneBeforeOtherNgramDf(df, 2, 'persist', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 3, 'persist', 'priorpolarity', None, 'positive')
    
    df = countOneBeforeOtherNgramDf(df, 2, 'pleasur', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 3, 'pleasur', 'priorpolarity', None, 'positive')
    
    df = countOneBeforeOtherNgramDf(df, 2, 'weak', 'priorpolarity', None, 'negative')
    df = countOneBeforeOtherNgramDf(df, 3, 'weak', 'priorpolarity', None, 'negative')

    df = countOneBeforeOtherNgramDf(df, 2, 'active', 'priorpolarity', None, 'positive')
    df = countOneBeforeOtherNgramDf(df, 3, 'active', 'priorpolarity', None, 'positive')
    return df

In [163]:
tStart = time.time()
test = calculateBeforeAfterNgramsPos(selectedPolarizedSentencesMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
Timed(pos_ngram_4_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive): 2.97499990463
Timed(pos_ngram_5_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive): 3.32000017166
Timed(pos_ngram_6_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive): 4.1099998951
Timed(pos_ngram_7_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive): 4.24000000954
Timed MPQA: 14.6659998894


In [165]:
test['pos_ngram_7_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive'].value_counts()

0     1850
6       16
4       15
1       15
2       14
5       10
3        7
9        3
15       1
14       1
Name: pos_ngram_7_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive, dtype: int64

In [130]:
if not onlyMpqa:
    sentencesTestSubsetImdb[sentencesTestSubsetImdb['moreNegativeThanNeutral'] == True].shape

In [289]:
selected_columns = [
        'negations',
#        'wordCount', 
        'positivCount',
        'negativCount',
        'priorpolaritynegativeAfter',
        'priorpolaritypositiveBefore',
        'activeBefore', 'passiveBefore',
        'priorpolaritynegativeBefore',
        'hostileBefore', 'yesBefore', 'yesAfter', 'noBefore', 'negateBefore',
#        'activeCountBefore','passiveCountBefore', 'hostileCountBefore', 'yesCountBefore',
#        'noCountBefore', 'negateCountBefore', 'negateCountAfter', 
        #'activeAfter',  'passiveAfter',
        'priorpolaritynegativeCountBefore', 'priorpolaritypositiveCountBefore',
#        'activeCountAfter', 'passiveCountAfter', 'hostileCountAfter', 
        'morePositiveThanNegativeStrong', 'morePositiveThanNegativeWeak', 'morePositiveThanNegative',
#       'yesCountAfter', 'noCountAfter', 
#        'priorpolaritynegativeCountAfter',
        'priorpolaritypositiveAfter',
        'priorpolarityCountpositive',
        'priorpolarityCountpositivestrongsubj',
        'negateAfter',
        'strongCount',
        'punctuation',
'ngram_2_negativpriorpolaritynegative', 'ngram_3_negativpriorpolaritynegative',
'ngram_4_negativpriorpolaritynegative', 'ngram_2_negativpriorpolarityneutral',
'ngram_3_negativpriorpolarityneutral', 'ngram_4_negativpriorpolarityneutral',
'ngram_2_negativpriorpolaritypositive','ngram_3_negativpriorpolaritypositive',
'ngram_4_negativpriorpolaritypositive', 'ngram_2_negationspriorpolaritypositive',
'ngram_3_negationspriorpolaritypositive', 'ngram_4_negationspriorpolaritypositive',
#'ngram_2_negationspriorpolaritynegative', 'ngram_3_negationspriorpolaritynegative',
'ngram_4_negationspriorpolaritynegative', 'ngram_2_negationspriorpolarityneutral',
#'ngram_3_negationspriorpolarityneutral', 'ngram_4_negationspriorpolarityneutral',
'ngram_3_hostilepriorpolaritynegative', 'ngram_3_activepriorpolaritypositive',
 'ngram_4_hostilepriorpolaritynegative',
'ngram_2_persistpriorpolaritypositive', 'ngram_3_persistpriorpolaritypositive', 
'ngram_2_pleasurpriorpolaritypositive', 'ngram_3_pleasurpriorpolaritypositive',
'ngram_2_weakpriorpolaritynegative', 'ngram_3_weakpriorpolaritynegative',
        'priorpolaritypositiveCountAfter',
#        'hostileAfter', 'noAfter',
        'morePositiveThanNeutral', 'moreNegativeThanNeutral',
#        'priorpolarityCountboth',
        'priorpolarityCountneutral', 'priorpolarityCountneutralstrongsubj',
        'priorpolarityCountbothstrongsubj', 'priorpolarityCountbothweaksubj',
        'priorpolarityCountneutralweaksubj',
        'priorpolarityneutralAfter',  'priorpolarityneutralBefore',
        'priorpolaritybothBefore', 'priorpolaritybothAfter'
    ]

selected_columns_train = selected_columns + ['sentiment-intensity']

n_cuts = 5
print("Number of features: {}".format(len(selected_columns)))

Number of features: 59


In [290]:
selected_columns_neutral = [
        'negations',
#        'wordCount', 
        'positivCount',
        'negativCount',
#        'priorpolaritynegativeAfter',
        'priorpolaritypositiveBefore',
        'activeBefore', 'passiveBefore',
#        'priorpolaritynegativeBefore',
#        'hostileBefore', 'yesBefore', 'yesAfter', 'noBefore', 'negateBefore',
 #       'activeCountBefore','passiveCountBefore', 'hostileCountBefore', 'yesCountBefore',
#        'noCountBefore', 'negateCountBefore', 'negateCountAfter', 
        #'activeAfter',  'passiveAfter',
        'priorpolaritynegativeCountBefore', 'priorpolaritypositiveCountBefore',
#        'activeCountAfter', 'passiveCountAfter', 'hostileCountAfter', 
        'morePositiveThanNegativeStrong', 'morePositiveThanNegativeWeak', 'morePositiveThanNegative',
#       'yesCountAfter', 'noCountAfter', 
        'priorpolaritynegativeCountAfter',
#        'priorpolaritypositiveAfter',
#        'negateAfter',
#        'strongCount',
        'priorpolaritypositiveCountAfter',
#        'hostileAfter', 'noAfter',
        'morePositiveThanNeutral', 'moreNegativeThanNeutral',
        'priorpolarityCountboth',
        'priorpolarityCountneutral', 'priorpolarityCountneutralstrongsubj',
#        'priorpolarityCountbothstrongsubj', 'priorpolarityCountbothweaksubj',
        'priorpolarityCountneutralweaksubj',
        'priorpolarityneutralAfter',  'priorpolarityneutralBefore'
#        'priorpolaritybothBefore', 'priorpolaritybothAfter'
    ]

selected_columns_train_neutral = selected_columns_neutral + ['sentiment-neutral']
n_cuts_neutrals = 6
print("Number of features: {}".format(len(selected_columns_neutral)))

Number of features: 21


In [291]:
def analyzeErrors(df, predicted, expected):
    print("Accuracy: {}".format(calculateAccuracy(df, predicted, expected)))
    print("Accuracy in positive sentiment: {}".format(calculateAccuracy(df[df[expected] == sentiment_positive], predicted, expected)))
    print("Accuracy in negative sentiment: {}".format(calculateAccuracy(df[df[expected] == sentiment_negative], predicted, expected)))
    print("Accuracy in neutral sentiment: {}".format(calculateAccuracy(df[df[expected] == 0], predicted, expected))) 
    cm = confusion_matrix(df[expected], df[predicted])
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm)
    print(cm_normalized)


def calculateAccuracy(df, predicted, expected):
    if df.shape[0] == 0:
        return 0
    else:
        return df[df[predicted] == df[expected]].shape[0] / float(df.shape[0])

In [292]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [293]:
def create_n_stratified_parts(df, n_cuts, project_columns):
    copy_df = df[project_columns].copy().reset_index()
    
    copy_df = copy_df.reindex(np.random.permutation(copy_df.index))
    positive = copy_df[copy_df['sentiment-intensity'] == sentiment_positive].reset_index(drop = True)
    neutral = copy_df[copy_df['sentiment-intensity'] == 0].reset_index(drop = True)
    negative = copy_df[copy_df['sentiment-intensity'] == sentiment_negative].reset_index(drop = True)
    
    positive_parts = []
    negative_parts = []
    neutral_parts = []
    positive_step = positive.shape[0] / n_cuts
    negative_step = negative.shape[0] / n_cuts
    neutral_step = neutral.shape[0] / n_cuts
    for part in range(n_cuts):
        if part == n_cuts - 1:
            positive_parts.append(positive[part * positive_step:])
            negative_parts.append(negative[part * negative_step:])
            neutral_parts.append(neutral[part * neutral_step:])
        else:
            positive_parts.append(positive[part * positive_step : (part+1) * positive_step])
            negative_parts.append(negative[part * negative_step : (part+1) * negative_step])
            neutral_parts.append(neutral[part * neutral_step : (part+1) * neutral_step])
            
    
    return positive_parts, negative_parts, neutral_parts

In [294]:
def create_n_stratified_neutral_parts(df, n_cuts, project_columns):
    copy_df = df[project_columns].copy().reset_index()
    
    copy_df = copy_df.reindex(np.random.permutation(copy_df.index))
    neutral = copy_df[copy_df['sentiment-neutral'] == 1].reset_index(drop = True)
    other = copy_df[copy_df['sentiment-neutral'] == 0].reset_index(drop = True)
    
    other_parts = []
    neutral_parts = []
    other_step = other.shape[0] / n_cuts
    neutral_step = neutral.shape[0] / n_cuts
    for part in range(n_cuts):
        if part == n_cuts - 1:
            other_parts.append(other[part * other_step:])
            neutral_parts.append(neutral[part * neutral_step:])
        else:
            other_parts.append(other[part * other_step : (part+1) * other_step])
            neutral_parts.append(neutral[part * neutral_step : (part+1) * neutral_step])
            
    return other_parts, neutral_parts

In [295]:
positive_mixed_parts, negative_mixed_parts, neutral_mixed_parts = create_n_stratified_parts(selectedPolarizedSentencesMpqa, n_cuts, selected_columns_train)

In [296]:
other_parts, neutral_parts = create_n_stratified_neutral_parts(neutralSelectedSentencesMpqa, n_cuts_neutrals, selected_columns_train_neutral)

In [302]:
best_test = None
best_train = None
best_predicted = None
best_accuracy = 0
best_clasifier = None

worst_accurracy = 1
worst_test = None
worst_predicted = None
classifier = None

accuracy = []
num_repeats = 2
consts = [4, 5, 6, 7, 8, 10, 12, 15]
#consts = [8]

for idx_const in range(len(consts)):
    print("######### Next Constant")
    for part in range(n_cuts):
        test = pd.DataFrame({})
        train = pd.DataFrame({})
        # create test and training set
        for idx in range(n_cuts):
            if idx == part:
                test = pd.concat([positive_mixed_parts[idx], negative_mixed_parts[idx], neutral_mixed_parts[idx]])
            else:
                train = pd.concat([train, positive_mixed_parts[idx], negative_mixed_parts[idx], neutral_mixed_parts[idx]])

        train.reset_index(inplace = True, drop = True)
        train = train.reindex(np.random.permutation(train.index))
        # train & evaluate
        train_results = train['sentiment-intensity'].values
        train.drop('sentiment-intensity', axis = 1, inplace = True)
        train_index = train['index'].values
        train.drop('index', axis = 1, inplace = True)    
        test_results = test['sentiment-intensity'].values
        test.drop('sentiment-intensity', axis = 1, inplace = True)
        test_index = test['index'].values
        test.drop('index', axis = 1, inplace = True)    
        acc = []
        for repeat_test in range(num_repeats):
            clasifier = RandomForestClassifier(n_estimators = 100, max_features = 'log2',
                                                max_depth = consts[idx_const], n_jobs = -1)
#            clasifier = ExtraTreesClassifier(n_estimators = consts[idx_const], max_features = 'sqrt',
#                                                max_depth = 7, n_jobs = -1)

#            clasifier = AdaBoostClassifier( ExtraTreesClassifier(n_estimators = consts[idx_const], max_features = 'sqrt',
#                                                max_depth = 7, n_jobs = -1),
#                                                n_estimators=consts[idx_const],
#                                                learning_rate=1.8)

#            clasifier = AdaBoostClassifier( DecisionTreeClassifier(max_depth=6),
#                                                n_estimators=consts[idx_const],
#                                                learning_rate=1.8)
            clasifier.fit(train.values, train_results)

            predicted = clasifier.predict(test.values)
            current_score = clasifier.score(train.values, train_results)
            list_predicted = list(predicted)
            current_accuracy = accuracy_score(test_results, predicted)
            acc.append(current_accuracy)
            print("idx = {}, round = {}, cut = {}, accuracy = {}, score = {}".format(idx_const, repeat_test, part, current_accuracy, current_score))
            
            if worst_accurracy > current_accuracy:
                worst_accurracy = current_accuracy
                worst_test = test.copy()
                worst_test['index'] = test_index
                worst_test['sentiment-intensity'] = test_results
                worst_test['predicted'] = predicted
                print("@@@ Worst Configuration: idx = {}, round = {}, cut = {}, accuracy = {}".format(idx_const, repeat_test, part, current_accuracy))
            
            if best_accuracy < current_accuracy:
                best_test = test.copy()
                best_test['index'] = test_index
                best_test['sentiment-intensity'] = test_results
                best_test['predicted'] = predicted
                best_train = train.copy()
                best_train['index'] = train_index
                best_train['sentiment-intensity'] = train_results
                best_predicted = predicted.copy()
                best_clasifier = clasifier
                best_accuracy = current_accuracy
                print("@@@ Best Configuration: idx = {}, round = {}, cut = {}, accuracy = {}".format(idx_const, repeat_test, part, current_accuracy))

        accuracy.append(np.mean(acc))


    print("{}) Mean Accuracy: {}".format(consts[idx_const], np.mean(accuracy)))

######### Next Constant
idx = 0, round = 0, cut = 0, accuracy = 0.453125, score = 0.499354005168
@@@ Worst Configuration: idx = 0, round = 0, cut = 0, accuracy = 0.453125
@@@ Best Configuration: idx = 0, round = 0, cut = 0, accuracy = 0.453125
idx = 0, round = 1, cut = 0, accuracy = 0.447916666667, score = 0.50322997416
@@@ Worst Configuration: idx = 0, round = 1, cut = 0, accuracy = 0.447916666667
idx = 0, round = 0, cut = 1, accuracy = 0.46875, score = 0.507105943152
@@@ Best Configuration: idx = 0, round = 0, cut = 1, accuracy = 0.46875
idx = 0, round = 1, cut = 1, accuracy = 0.466145833333, score = 0.516149870801
idx = 0, round = 0, cut = 2, accuracy = 0.46875, score = 0.509043927649
idx = 0, round = 1, cut = 2, accuracy = 0.463541666667, score = 0.50645994832
idx = 0, round = 0, cut = 3, accuracy = 0.4765625, score = 0.510335917313
@@@ Best Configuration: idx = 0, round = 0, cut = 3, accuracy = 0.4765625
idx = 0, round = 1, cut = 3, accuracy = 0.486979166667, score = 0.50516795865

In [303]:
#best_test['predicted'] = best_predicted
analyzeErrors(best_test, 'predicted', 'sentiment-intensity')

Accuracy: 0.517676767677
Accuracy in positive sentiment: 0.386363636364
Accuracy in negative sentiment: 0.568181818182
Accuracy in neutral sentiment: 0.598484848485
[[75 39 18]
 [24 79 29]
 [25 56 51]]
[[ 0.56818182  0.29545455  0.13636364]
 [ 0.18181818  0.59848485  0.21969697]
 [ 0.18939394  0.42424242  0.38636364]]


In [304]:
#best_test['predicted'] = best_predicted
analyzeErrors(worst_test, 'predicted', 'sentiment-intensity')

Accuracy: 0.434895833333
Accuracy in positive sentiment: 0.3203125
Accuracy in negative sentiment: 0.4453125
Accuracy in neutral sentiment: 0.5390625
[[57 57 14]
 [27 69 32]
 [26 61 41]]
[[ 0.4453125  0.4453125  0.109375 ]
 [ 0.2109375  0.5390625  0.25     ]
 [ 0.203125   0.4765625  0.3203125]]


In [219]:
best_test = None
best_train = None
best_predicted = None
best_accuracy = 0
best_clasifier = None

accuracy = []
num_repeats = 2
consts = [4, 5, 6, 7, 8, 9]
#consts = [8]

for idx_const in range(len(consts)):
    print("######### Next Constant")
    for part in range(n_cuts_neutrals):
        test = pd.DataFrame({})
        train = pd.DataFrame({})
        # create test and training set
        for idx in range(n_cuts_neutrals):
            if idx != part:
                test = pd.concat([other_parts[idx], neutral_parts[idx]])
            else:
                train = pd.concat([train, other_parts[idx], neutral_parts[idx]])

        train.reset_index(inplace = True, drop = True)
        train = train.reindex(np.random.permutation(train.index))
        # train & evaluate
        train_results = train['sentiment-neutral'].values
        train.drop('sentiment-neutral', axis = 1, inplace = True)
        train_index = train['index'].values
        train.drop('index', axis = 1, inplace = True)    
        test_results = test['sentiment-neutral'].values
        test.drop('sentiment-neutral', axis = 1, inplace = True)
        test_index = test['index'].values
        test.drop('index', axis = 1, inplace = True)    
        acc = []
        for repeat_test in range(num_repeats):
            clasifier = RandomForestClassifier(n_estimators = 100, max_features = 'log2',
                                                max_depth = consts[idx_const], n_jobs = -1)

#            clasifier = AdaBoostClassifier( ExtraTreesClassifier(n_estimators = consts[idx_const], max_features = 'sqrt',
#                                                max_depth = 7, n_jobs = -1),
#                                                n_estimators=consts[idx_const],
#                                                learning_rate=1.8)

#            clasifier = AdaBoostClassifier( DecisionTreeClassifier(max_depth=6),
#                                                n_estimators=consts[idx_const],
#                                                learning_rate=1.8)
            clasifier.fit(train.values, train_results)
            current_score = clasifier.score(train.values, train_results)
            predicted = clasifier.predict(test.values)
            list_predicted = list(predicted)
            current_f1_score = f1_score(test_results, predicted)
            current_accuracy = accuracy_score(test_results, predicted)
            acc.append(current_accuracy)
            print("idx = {}, round = {}, cut = {}, accuracy = {}, train score = {}, f1 score = {}"
                          .format(idx_const, repeat_test, part, current_accuracy, current_score, current_f1_score))

            if best_accuracy < current_score:
                best_test = test.copy()
                best_test['index'] = test_index
                best_test['sentiment-neutral'] = test_results
                best_test['predicted'] = predicted
                best_train = train.copy()
                best_train['index'] = train_index
                best_train['sentiment-neutral'] = train_results
                best_predicted = predicted.copy()
                best_clasifier = clasifier
                best_accuracy = current_score
                print("@@@ Best Configuration: idx = {}, round = {}, cut = {}, accuracy = {}".format(idx_const, repeat_test, part, current_accuracy))

        accuracy.append(np.mean(acc))


    print("{}) Mean Accuracy: {}".format(consts[idx_const], np.mean(accuracy)))

######### Next Constant
idx = 0, round = 0, cut = 0, accuracy = 0.561643835616, train score = 0.640845070423, f1 score = 0.60736196319
@@@ Best Configuration: idx = 0, round = 0, cut = 0, accuracy = 0.561643835616
idx = 0, round = 1, cut = 0, accuracy = 0.582191780822, train score = 0.62676056338, f1 score = 0.598684210526
idx = 0, round = 0, cut = 1, accuracy = 0.544520547945, train score = 0.661971830986, f1 score = 0.569579288026
@@@ Best Configuration: idx = 0, round = 0, cut = 1, accuracy = 0.544520547945
idx = 0, round = 1, cut = 1, accuracy = 0.565068493151, train score = 0.661971830986, f1 score = 0.609230769231
idx = 0, round = 0, cut = 2, accuracy = 0.479452054795, train score = 0.665492957746, f1 score = 0.558139534884
@@@ Best Configuration: idx = 0, round = 0, cut = 2, accuracy = 0.479452054795
idx = 0, round = 1, cut = 2, accuracy = 0.479452054795, train score = 0.669014084507, f1 score = 0.560693641618
@@@ Best Configuration: idx = 0, round = 1, cut = 2, accuracy = 0.479

In [None]:
best_test['predicted'] = best_predicted
analyzeErrors(best_test, 'predicted', 'sentiment-neutral')

In [None]:
wrong_sentences = neutralSelectedSentencesMpqa.loc[best_test.loc[best_test['predicted'] != best_test['sentiment-neutral'], 'index'], neutralSelectedSentencesMpqa.columns]

In [None]:
# verification phase
def directClassification(row, pos_column, neg_column, suffix):
    if row[pos_column] < row[neg_column]:
        row['apriori-class-'+suffix] = -1
    elif row[pos_column] > row[neg_column]:
        row['apriori-class-'+suffix] = 1
    else:
        row['apriori-class-'+suffix] = 0
    
    return row

In [None]:
if doSave:
    joblib.dump(best_clasifier, './models/sentiment/version_3/version_3.pkl')

In [None]:
def calculateSentiment(sentences, column_predicted):
    numPositive = sentences[sentences[column_predicted] == sentiment_positive].shape[0]
    numNegative = sentences[sentences[column_predicted] == sentiment_negative].shape[0]
    numNeutral = sentences[sentences[column_predicted] == 0].shape[0]
#    print("Pos = {}; Neg = {}; Neutral = {}".format(numPositive, numNegative, numNeutral))
    if numPositive > numNegative:
        return sentiment_positive
    elif numPositive < numNegative:
        return sentiment_negative
    else:
        return 0    

def calculateSentimentReview(review_id, df, col):
    review_data = review_id.split('#')
    review_set_id = int(review_data[2])
    review_rating = int(review_data[1])
    review_df_id = int(review_data[0])
    sentences = df[(df['id'] == review_df_id) & (df['set'] == review_set_id) & (df['rating'] == review_rating) ]
    return calculateSentiment(sentences, col)

In [None]:
test_reviews_even['expected'] = test_reviews_even['type']
verificationData = sentencesTestSubsetImdb[selected_columns]

In [None]:
sentencesTestSubsetImdb['trained-classifier-predicted'] = best_clasifier.predict(verificationData.values)

In [None]:
test_reviews_even['predicted-model'] = test_reviews_even['real-id'].map(lambda cell: calculateSentimentReview(cell, sentencesTestSubsetImdb, 'trained-classifier-predicted'))
print("Done")

In [None]:
sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: directClassification(row, 'priorpolarityCountpositive', 'priorpolarityCountnegative', 'direct' ), axis = 1)
test_reviews_even['predicted-direct'] = test_reviews_even['real-id'].map(lambda cell: calculateSentimentReview(cell, sentencesTestSubsetImdb, 'apriori-class-direct'))
print("Done")
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row: directClassification(row, 'priorpolarityCountpositive', 'priorpolarityCountnegative', 'direct' ), axis = 1)

In [None]:
print("Direct model: {}".format(calculateAccuracy(test_reviews_even, 'predicted-direct', 'expected')))
print("Trained model: {}".format(calculateAccuracy(test_reviews_even, 'predicted-model', 'expected')))
print("")
print("Direct model (sentences): {}".format(calculateAccuracy(selectedPolarizedSentencesMpqa, 'apriori-class-direct', 'sentiment-intensity')))

In [None]:
print("Direct model")
analyzeErrors(test_reviews_even, 'predicted-direct', 'expected')
print
print("Trained model")
analyzeErrors(test_reviews_even, 'predicted-model', 'expected')


In [None]:
# Neural network
from tools.thirdparty.nnbook import network2

In [None]:
def preprocessResultsNN(row):
    if row['expected'] > 0:
        row['pos'] = 1
        row['neuron'] = 3
    elif row['expected'] < 0:
        row['neg'] = 1
        row['neuron'] = 1
    else:
        row['neutral'] = 1
        row['neuron'] = 2
    
    return row

def preprocessDataFrameForNN(df, not_test = True):
    df_nn = df.drop('index', axis = 1).reset_index(drop = True)
    expected_dt = pd.DataFrame(np.zeros((df_nn.shape[0], 5)), columns = ['neg', 'neutral', 'pos', 'neuron', 'expected'])
    expected_dt['expected'] = df_nn['sentiment-intensity']
    expected_dt = expected_dt.apply(lambda row: preprocessResultsNN(row), axis = 1)
    df_nn.drop('sentiment-intensity', axis = 1, inplace = True)
    if 'predicted' in df_nn.columns:
        df_nn.drop('predicted', axis = 1, inplace = True)

    inp_size = len(df_nn.columns)
    out_size = 3
    
    for col in df_nn.columns:
        if df_nn[col].dtype == 'bool':
            df_nn[col] = df_nn[col].astype(int)

    res = expected_dt[['neg', 'neutral', 'pos']].values
#    if not_test == False:
#        res = expected_dt['neuron'].values
#        out_size = 1
            
    training_data = [(np.ndarray((inp_size,1), buffer=np.array(x), dtype=int),
                    np.ndarray((out_size,1), buffer=np.array(y), dtype=int))
                    for (x,y) in zip(df_nn.values, res)]

    return training_data

In [None]:
pos_test = best_train[best_train['sentiment-intensity'] == sentiment_positive][1:5]
neg_test = best_train[best_train['sentiment-intensity'] == sentiment_negative][1:5]
neutral_test = best_train[best_train['sentiment-intensity'] == 0][1:5]
train_test_data = pd.concat([pos_test, neg_test, neutral_test])
train_test_data.shape

In [None]:
training_data = preprocessDataFrameForNN(best_train)
test_data = preprocessDataFrameForNN(best_test, False)

In [None]:
training_data[0][0].shape

In [None]:
experiment_data = preprocessDataFrameForNN(train_test_data)

In [None]:
hidden_neurons = 30
nn = network2.Network([len(training_data[0][0]), hidden_neurons, hidden_neurons, 3],
                                cost=network2.CrossEntropyCost)
#nn.large_weight_initializer()

In [None]:
epochs = 100
lmbda = 0.00002
learningRate = 0.00002
batchSize = 3

nn.SGD(
#        experiment_data
        training_data
       , epochs,batchSize, learningRate,
        lmbda = lmbda, evaluation_data = test_data, 
           monitor_training_accuracy = True
       , monitor_evaluation_accuracy = True,
           monitor_evaluation_cost = True, monitor_training_cost = True
      )

In [None]:
test_data