In [2]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import time
import string

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

In [5]:
from tools.parsers.corpora_sentiment import generalinquirer as generalInquirerParser
from tools.parsers.corpora_sentiment import largemoviereviews as largeMovieReviewsParser
from tools.parsers.corpora_sentiment import generalinquirer as generalInquirerParser
from tools.parsers.corpora_sentiment import negation as negationParser 

from tools.sentimentanalysis import preparation
from nltk.util import ngrams
from nltk import pos_tag, pos_tag_sents

In [29]:
maxWords = 30
onlyMpqa = True
doSave = False

In [7]:
stemmer = SnowballStemmer("english")
prepData = preparation.Preparation()
parserInquirer = generalInquirerParser.GeneralInquirer()
parserNegation = negationParser.Negation()
parserMovieReviews = largeMovieReviewsParser.LargeMovieReviews()

In [8]:
sentencesMpqa = parserInquirer.readFileCsv(prepData.defaultFileNameSentimentSentencesNormalized)
non_english_text = ['im_401b_e73i32c22_031705-2', 'IZ-060316-01-Trans-1', '20000815_AFP_ARB.0084.IBM-HA-NEW', 'NapierDianne']
polarizedSentencesMpqa = sentencesMpqa[sentencesMpqa['sentiment-type'] == 0]
polarizedSentencesMpqa = polarizedSentencesMpqa[np.invert(polarizedSentencesMpqa['docName'].isin(non_english_text))]
polarizedSentencesMpqa.loc[polarizedSentencesMpqa.index, 'textRaw'] = polarizedSentencesMpqa.loc[polarizedSentencesMpqa.index, 'text']
polarizedSentencesMpqa.loc[polarizedSentencesMpqa.index, 'textLower'] = polarizedSentencesMpqa.loc[polarizedSentencesMpqa.index, 'text'].map( lambda cell: cell.lower())

In [9]:
sentimentDictionary  = parserInquirer.readFileCsv(parserInquirer.combinedFileLoc)
negations = parserNegation.readFileCsv(parserNegation.defaultFileNameProcessed)
sentencesImdb = parserMovieReviews.readFileCsv(parserMovieReviews.defaultFileNameProcessed)
expectedResultsImdb = parserMovieReviews.readFileCsv(parserMovieReviews.defaultFileNameProcessedOverall)

In [10]:
sentimentDictionary['entryRaw'] = sentimentDictionary['entry']
sentimentDictionary['entry'] = sentimentDictionary['entry'].map(lambda cell: stemmer.stem(cell))

negations['phraseStemmed'] = negations['phrase'].map(lambda cell: stemmer.stem(cell).lower())

sentencesImdb = sentencesImdb[np.invert(sentencesImdb['id'].isnull())]
sentencesImdb = sentencesImdb[np.invert(sentencesImdb['text'].isnull())] 
sentencesImdb['textRaw'] = sentencesImdb['text'] 

In [11]:
def duplicate_attributes_directory(directory):
    checkCategories = ['positiv', 'negativ', 'active', 'passive', 'affil', 'hostile', 'strong',
                       'power', 'weak', 'submit', 'yes', 'no',
                       'negate', 'intrj', 'pleasur', 'pain', 'feel', 'need', 'persist']
    # copy categories among duplicate words in dictionary
    idx = 0
    while idx < directory.shape[0] - 1:
        found = {}
        for key in checkCategories:
            found[key] = directory[key][idx] == key

        idx2 = idx + 1
        while idx2 < directory.shape[0]:
            if directory['entry'][idx] == directory['entry'][idx2]:
                for key in checkCategories:
                    if directory[key][idx2] == key:
                        found[key] = True
                idx2 += 1
            else:
                idx2 -= 1
                break

        if idx2 == idx:
            idx += 1
        else:
            idx2 += 1
            for key in checkCategories:
                if found[key] != False:
                    directory[key][idx:idx2] = key

            idx = idx2 + 1
    return directory

In [12]:
pd.set_option('display.max_columns', 500)
sentimentDictionary = duplicate_attributes_directory(sentimentDictionary)

In [13]:
sentimentDictionary.drop_duplicates(subset = 'entry', inplace = True)
sentimentDictionary.shape

(2736, 26)

In [14]:
def infoDf(df, col):
    print("Negative: {}".format(df[df[col] < 0].shape))
    print("Neutral: {}".format(df[df[col] == 0].shape))
    print("Positive: {}".format(df[df[col] > 0].shape))

In [15]:
print("MPQA")
infoDf(polarizedSentencesMpqa, 'sentiment-intensity')
print
print("IMDB")
infoDf(sentencesImdb, 'type')

MPQA
Negative: (3131, 14)
Neutral: (2058, 14)
Positive: (2392, 14)

IMDB
Negative: (273487, 6)
Neutral: (0, 6)
Positive: (258804, 6)


In [16]:
def remove_articles(row):
    tokens = row['textArrStemmed']
    articles = ['a', 'an', 'the']
    tokens = [w for w in tokens if w not in articles]
    row['tokens'] = tokens
    return row

In [18]:
def get_punctuation(row):
    last_character = row['text'][-1:]
    if last_character in '?!.':
        row['punctuation'] = ord(last_character)
    else:
        if last_character in '"\'':
            penultimate_character = row['text'][-2:-1]
            if penultimate_character in '?!.':
                row['punctuation'] = ord(penultimate_character)
            else:
                row['punctuation'] = None
        else:
            row['punctuation'] = ord('x')

    return row

In [19]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: get_punctuation(row), axis = 1)
tEnd = time.time()
print("Timed MPQA data: {}".format(str(tEnd - tStart)))

Timed MPQA data: 6.6850001812


In [20]:
# prepare movie reviews data
if not onlyMpqa:
    tStart = time.time()
    sentencesImdb = sentencesImdb.apply(lambda row: get_punctuation(row), axis = 1) 
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

Timed IMDB: 488.314999819


In [21]:
polarizedSentencesMpqa = polarizedSentencesMpqa[polarizedSentencesMpqa['punctuation'].notnull()]

In [22]:
polarizedSentencesMpqa.shape

(7865, 15)

In [23]:
def cleanUpWord(w):
    return w.translate(None, string.punctuation).strip()

In [24]:
def processWordsMpqa(row):
    global stemmer

    words = [ word for word in [cleanUpWord(w) for w in word_tokenize(row['textLower'])] if len(word) > 0]
    row['text'] = '|' + '|'.join(words) + '|'
    row['wordCount'] = len(words)
    row['textArr'] = words
    
    wordsStemmed = [stemmer.stem(unicode(w, errors='ignore')) for w in words]
    row['textStemmed'] = '|'.join(wordsStemmed)
    row['textStemmed'] = '|' + row['textStemmed'] + '|'
    row['textArrStemmed'] = wordsStemmed
    return row

def processWordsMovies(row):
    global stemmer

    words = [ word for word in [cleanUpWord(w) for w in row['text'].split('|')] if len(word) > 0]    
    wordsStemmed = [stemmer.stem(unicode(w, errors='ignore')) for w in words]
    row['textStemmed'] = '|'.join(wordsStemmed).lower()
    row['textStemmed'] = '|' + row['textStemmed'] + '|'
    row['wordCount'] = len(words)
    row['textArr'] = words
    row['textArrStemmed'] = wordsStemmed
    row['text'] = '|' + row['text'] + '|'
    return row

In [25]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: processWordsMpqa(row), axis = 1)
tEnd = time.time()
print("Timed MPQA data: {}".format(str(tEnd - tStart)))

Timed MPQA data: 31.9789998531


In [26]:
polarizedSentencesMpqa = polarizedSentencesMpqa[polarizedSentencesMpqa['wordCount'] < maxWords]

In [27]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: remove_articles(row), axis = 1)
tEnd = time.time()
print("Timed MPQA data: {}".format(str(tEnd - tStart)))

Timed MPQA data: 5.90099978447


In [28]:
def generate_row_id(row):
    row['real-id'] = "{}#{}#{}".format(int(row['id']), int(row['rating']), int(row['set']))
    return row

In [30]:
# prepare movie reviews data
if not onlyMpqa:
    sentencesImdb = sentencesImdb.apply(lambda row: generate_row_id(row), axis = 1) 
    expectedResultsImdb = expectedResultsImdb.apply(lambda row: generate_row_id(row), axis = 1)

In [31]:
def equal_cuts(df, size_df, col):
    perm_df = df.reindex(np.random.permutation(df.index))
    res_df = pd.DataFrame({}, columns = df.columns)
    unique_parts = perm_df[col].unique()
    num_parts = int(size_df / len(unique_parts)) 
    for unique in unique_parts:
        selected_rows = perm_df[perm_df[col] == unique]
        res_df = pd.concat([res_df, selected_rows[:num_parts]])
    
    return res_df.reindex(np.random.permutation(res_df.index))

In [32]:
if not onlyMpqa:
    test_reviews_even = equal_cuts(expectedResultsImdb, 2000, 'rating')
    sentencesTestSubsetImdb = sentencesImdb[sentencesImdb['real-id'].isin(test_reviews_even['real-id'])]

    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: processWordsMovies(row), axis = 1)
    tEnd = time.time()
    print("Timed Process Movies data: {}".format(str(tEnd - tStart)))
    
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: remove_articles(row), axis = 1) 
    tEnd = time.time()
    print("Timed Remove Articles IMDB: {}".format(str(tEnd-tStart)))
    test_reviews_even['expected'] = test_reviews_even['type']

In [33]:
def countNegations(row):
    words = row['textArrStemmed']
    negations['found'] = negations.apply(lambda row: 1 if row['phraseStemmed'] in words else 0, axis = 1)
    row['negations'] = negations['found'].sum()
    return row

In [34]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row: countNegations(row), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

Timed MPQA: 12.0680000782


In [35]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: countNegations(row), axis = 1)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [36]:
polarizedSentencesMpqa['negations'].value_counts()

0    5081
1     677
2       3
Name: negations, dtype: int64

In [37]:
if not onlyMpqa:
    sentencesTestSubsetImdb['negations'].value_counts()

In [38]:
featureColumnsMpqa = {'punctuation' : 1, 'negations' : 1}
featureColumnsImdb = {'punctuation' : 1, 'negations' : 1}

In [39]:
def binSearchDfValue(df, val, start, end, isDebug = False):
    while start <= end:
        if isDebug:
            print("Start: {};    End: {}".format(start, end))
        middle = (start + end) / 2
        if isDebug:
            print("middle: {}".format(df[middle]))
        if df[middle] == val:
            return True
        else:
            if df[middle] > val:
                end = middle - 1
            else:
                start = middle + 1
    
    return False

def countOccurenceInColumn(row, lookFor, lookForLen, columnName):
    words = row['textArr']
    res = [word for word in words if binSearchDfValue(lookFor, word, 0, lookForLen)]
    row[columnName] = len(res)
    return row

def CalculateColumnCounts(colArr, column, df, val = None, typeWord = None):
    compareVal = column
    if column == None:
        columnName = "All"+typeWord
    else:
        if val != None:
            compareVal = val
        columnName = column+'Count'
        if val != None:
            columnName = columnName + val
        if typeWord != None:
            columnName = columnName + typeWord
        
    if typeWord == None:
        lookFor = sentimentDictionary[sentimentDictionary[column] == compareVal]
    else:
        if column == None:
            lookFor = sentimentDictionary[(sentimentDictionary['type'] == typeWord)]
        else:
            lookFor = sentimentDictionary[(sentimentDictionary[column] == compareVal) & (sentimentDictionary['type'] == typeWord)]
    
    lookForVals = lookFor['entry'].values
    lookForLen = len(lookForVals) - 1
    print("LookFor shape: {}".format(lookFor.shape))
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOccurenceInColumn(row, lookForVals, lookForLen, columnName), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    colArr[columnName] = 1
    
    return df, colArr

In [40]:
def calculateReiredColumnCounts(df, colArr):
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'negative')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'both')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'neutral')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'positive')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'negative', 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'positive', 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'neutral', 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'both', 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'negative', 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'positive', 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'both', 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'priorpolarity', df, 'neutral', 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, None, df, None, 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, None, df, None, 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'hostile', df, None, 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'strong', df, None, 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'hostile', df, None, 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'strong', df, None, 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'hostile', df)
    df, colArr = CalculateColumnCounts(colArr, 'strong', df)
    df, colArr = CalculateColumnCounts(colArr, 'active', df, None, 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'passive', df, None, 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'active', df, None, 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'passive', df, None, 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'active', df)
    df, colArr = CalculateColumnCounts(colArr, 'passive', df)
    df, colArr = CalculateColumnCounts(colArr, 'positiv', df, None, 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'negativ', df, None, 'weaksubj')
    df, colArr = CalculateColumnCounts(colArr, 'positiv', df, None, 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'negativ', df, None, 'strongsubj')
    df, colArr = CalculateColumnCounts(colArr, 'positiv', df)
    df, colArr = CalculateColumnCounts(colArr, 'negativ', df)
    df, colArr = CalculateColumnCounts(colArr, 'yes', df)
    df, colArr = CalculateColumnCounts(colArr, 'no', df)

    return df, colArr

In [41]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb, featureColumnsImdb = calculateReiredColumnCounts(sentencesTestSubsetImdb, featureColumnsImdb)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [42]:
tStart = time.time()
polarizedSentencesMpqa, featureColumnsMpqa = calculateReiredColumnCounts(polarizedSentencesMpqa, featureColumnsMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

LookFor shape: (1552, 26)
Timed(priorpolarityCountnegative): 1.89800000191
LookFor shape: (7, 26)
Timed(priorpolarityCountboth): 1.26099991798
LookFor shape: (137, 26)
Timed(priorpolarityCountneutral): 1.84599995613
LookFor shape: (1040, 26)
Timed(priorpolarityCountpositive): 1.87900018692
LookFor shape: (972, 26)
Timed(priorpolarityCountnegativestrongsubj): 2.03900003433
LookFor shape: (584, 26)
Timed(priorpolarityCountpositivestrongsubj): 2.00100016594
LookFor shape: (45, 26)
Timed(priorpolarityCountneutralstrongsubj): 1.53900003433
LookFor shape: (6, 26)
Timed(priorpolarityCountbothstrongsubj): 1.2650001049
LookFor shape: (580, 26)
Timed(priorpolarityCountnegativeweaksubj): 1.7539999485
LookFor shape: (456, 26)
Timed(priorpolarityCountpositiveweaksubj): 1.78300023079
LookFor shape: (1, 26)
Timed(priorpolarityCountbothweaksubj): 1.06900000572
LookFor shape: (92, 26)
Timed(priorpolarityCountneutralweaksubj): 1.56299996376
LookFor shape: (1129, 26)
Timed(Allweaksubj): 1.95799994469
Loo

In [43]:
featureColumnsMpqa.keys()

['negativCountweaksubj',
 'priorpolarityCountpositive',
 'negativCount',
 'activeCount',
 'Allstrongsubj',
 'positivCountstrongsubj',
 'activeCountweaksubj',
 'positivCount',
 'passiveCount',
 'noCount',
 'negativCountstrongsubj',
 'punctuation',
 'priorpolarityCountneutralstrongsubj',
 'priorpolarityCountnegativeweaksubj',
 'priorpolarityCountneutral',
 'priorpolarityCountpositivestrongsubj',
 'hostileCountstrongsubj',
 'strongCountweaksubj',
 'priorpolarityCountbothweaksubj',
 'priorpolarityCountbothstrongsubj',
 'priorpolarityCountnegative',
 'positivCountweaksubj',
 'passiveCountweaksubj',
 'priorpolarityCountneutralweaksubj',
 'priorpolarityCountboth',
 'Allweaksubj',
 'negations',
 'priorpolarityCountpositiveweaksubj',
 'priorpolarityCountnegativestrongsubj',
 'strongCountstrongsubj',
 'strongCount',
 'passiveCountstrongsubj',
 'yesCount',
 'hostileCount',
 'hostileCountweaksubj',
 'activeCountstrongsubj']

In [44]:
def countOccurenceColumnBeforeAfter(row, lookFor, columnName):
    global negations
    for _, neg_row in negations.iterrows():
        neg = '|'+neg_row['phrase']+'|'
        neg_stemmed = '|' + neg_row['phraseStemmed'] + '|'
        posNeg = row['text'].find(neg)
        if posNeg == -1 :
            return row
        posNeg = row['textStemmed'].find(neg_stemmed)

        afterPos = posNeg + len(neg_stemmed)
        for _, rowLook in lookFor.iterrows():
            posBefore = row['textStemmed'].find(rowLook['entry'], 0, posNeg)
            posAfter = row['textStemmed'].find(rowLook['entry'], afterPos)

            if posBefore != -1:
                row[columnName+'Before'] = row[columnName+'Before'] + 1 
            if posAfter != -1:
                row[columnName+'After'] = row[columnName+'After'] + 1

    return row


def WordsAroundNegations(colArr, column, df, val = None, count = False):
    columnName = column
    compareVal = val
    if val is None:
        compareVal = column
    else:
        columnName = column + val
    
    if count:
        columnName = columnName + "Count"
    
    lookFor = sentimentDictionary[sentimentDictionary[column] == compareVal]
    
    print("LookFor shape: {}".format(lookFor.shape))
    if count:
        df[columnName+"Before"]  = 0
        df[columnName+"After"]  = 0
    else:
        df[columnName+"Before"]  = False
        df[columnName+"After"]  = False
        foundCount = columnName + "CountBefore" in df.columns

    tStart = time.time()
    if count:
        df = df.apply(lambda row: countOccurenceColumnBeforeAfter(row, lookFor, columnName), axis = 1)
    else:
        df[columnName+'Before'] = df[columnName + "CountBefore"] > 0
        df[columnName+'After'] = df[columnName + "CountAfter"] > 0
        
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))

    colArr[columnName+"Before"] = 1
    colArr[columnName+"After"] = 1
    
    return df, colArr

In [45]:
def calculateRequiredWordsAroundNegations(df, colArr):   
    df, colArr = WordsAroundNegations(colArr, 'active', df, None, True)
    df, colArr = WordsAroundNegations(colArr, 'passive', df, None, True)
    df, colArr = WordsAroundNegations(colArr, 'hostile', df, None, True)
    df, colArr = WordsAroundNegations(colArr, 'yes', df, None, True)
    df, colArr = WordsAroundNegations(colArr, 'no', df, None, True)
    df, colArr = WordsAroundNegations(colArr, 'negate', df, None, True)
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'negative', True)
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'positive', True)
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'both', True)
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'neutral', True)
    
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'negative')
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'positive')
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'neutral')
    df, colArr = WordsAroundNegations(colArr, 'priorpolarity', df, 'both')
    df, colArr = WordsAroundNegations(colArr, 'active', df)
    df, colArr = WordsAroundNegations(colArr, 'passive', df)
    df, colArr = WordsAroundNegations(colArr, 'hostile', df)
    df, colArr = WordsAroundNegations(colArr, 'yes', df)
    df, colArr = WordsAroundNegations(colArr, 'no', df)
    df, colArr = WordsAroundNegations(colArr, 'negate', df)
    return df, colArr

In [46]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb, featureColumnsImdb = calculateRequiredWordsAroundNegations(sentencesTestSubsetImdb, featureColumnsImdb)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [47]:
tStart = time.time()
polarizedSentencesMpqa, featureColumnsMpqa = calculateRequiredWordsAroundNegations(polarizedSentencesMpqa, featureColumnsMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

LookFor shape: (610, 26)
Timed(activeCount): 58.0090000629
LookFor shape: (369, 26)
Timed(passiveCount): 34.8809998035
LookFor shape: (478, 26)
Timed(hostileCount): 50.4609999657
LookFor shape: (8, 26)
Timed(yesCount): 3.48000001907
LookFor shape: (4, 26)
Timed(noCount): 2.88199996948
LookFor shape: (128, 26)
Timed(negateCount): 13.8519999981
LookFor shape: (1552, 26)
Timed(priorpolaritynegativeCount): 151.440999985
LookFor shape: (1040, 26)
Timed(priorpolaritypositiveCount): 95.2250001431
LookFor shape: (7, 26)
Timed(priorpolaritybothCount): 2.96600008011
LookFor shape: (137, 26)
Timed(priorpolarityneutralCount): 15.2139999866
LookFor shape: (1552, 26)
Timed(priorpolaritynegative): 0.0
LookFor shape: (1040, 26)
Timed(priorpolaritypositive): 0.000999927520752
LookFor shape: (137, 26)
Timed(priorpolarityneutral): 0.000999927520752
LookFor shape: (7, 26)
Timed(priorpolarityboth): 0.0
LookFor shape: (610, 26)
Timed(active): 0.000999927520752
LookFor shape: (369, 26)
Timed(passive): 0.0
Lo

In [48]:
featureColumnsMpqa

{'Allstrongsubj': 1,
 'Allweaksubj': 1,
 'activeAfter': 1,
 'activeBefore': 1,
 'activeCount': 1,
 'activeCountAfter': 1,
 'activeCountBefore': 1,
 'activeCountstrongsubj': 1,
 'activeCountweaksubj': 1,
 'hostileAfter': 1,
 'hostileBefore': 1,
 'hostileCount': 1,
 'hostileCountAfter': 1,
 'hostileCountBefore': 1,
 'hostileCountstrongsubj': 1,
 'hostileCountweaksubj': 1,
 'negateAfter': 1,
 'negateBefore': 1,
 'negateCountAfter': 1,
 'negateCountBefore': 1,
 'negations': 1,
 'negativCount': 1,
 'negativCountstrongsubj': 1,
 'negativCountweaksubj': 1,
 'noAfter': 1,
 'noBefore': 1,
 'noCount': 1,
 'noCountAfter': 1,
 'noCountBefore': 1,
 'passiveAfter': 1,
 'passiveBefore': 1,
 'passiveCount': 1,
 'passiveCountAfter': 1,
 'passiveCountBefore': 1,
 'passiveCountstrongsubj': 1,
 'passiveCountweaksubj': 1,
 'positivCount': 1,
 'positivCountstrongsubj': 1,
 'positivCountweaksubj': 1,
 'priorpolarityCountboth': 1,
 'priorpolarityCountbothstrongsubj': 1,
 'priorpolarityCountbothweaksubj': 1,
 

In [49]:
sentimentDictionary['stemmed'].shape[0]

2736L

In [50]:
def create_n_grams(row, size_ngram):
    row[str(size_ngram)+"_gram"] = list(ngrams(row['tokens'], size_ngram))
    return row

def countOneBeforeOtherNgranRow(row, col, size_ngram, beforeSeries, afterSeries):
    # first, try to find a word from "after series"
    n_gram_col = str(size_ngram)+'_gram' 
    for n_gram in row[n_gram_col]:
        for idx in range(1, size_ngram):
            w = n_gram[idx]
            found = binSearchDfValue(afterSeries, w, 0, afterSeries.shape[0] - 1)
            if found:
                for idx_sub in range(0, idx):
                    w2 = n_gram[idx_sub]
                    found_sub = binSearchDfValue(beforeSeries, w2, 0, beforeSeries.shape[0] - 1)
                    if found_sub:
                        row[col] = row[col] + 1
    return row
    
def countOneBeforeOtherNgramDf(colArr, df, size_ngram, beforeClassCol, afterClassCol, beforeClassVal = None, afterClassVal = None, beforeSerieDefault = None):
    columnName = 'ngram_'+str(size_ngram)+"_"+ beforeClassCol + afterClassCol
    compareValBefore = beforeClassVal
    if beforeClassVal is None:
        compareValBefore = beforeClassCol
    else:
        columnName = columnName + beforeClassVal
    
    if beforeSerieDefault is None:
        lookForBefore = sentimentDictionary[sentimentDictionary[beforeClassCol] == compareValBefore].reset_index(drop=True)
        lookForBefore = lookForBefore['entry']
    else:
        lookForBefore = beforeSerieDefault

    compareValAfter = afterClassVal
    if afterClassVal is None:
        compareValAfter = afterClassCol
    else:
        columnName = columnName + afterClassVal
    
    lookForAfter = sentimentDictionary[sentimentDictionary[afterClassCol] == compareValAfter].reset_index(drop=True)
    lookForAfter = lookForAfter['entry']
    
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOneBeforeOtherNgranRow(row, columnName, size_ngram, lookForBefore, lookForAfter), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    colArr[columnName] = 1
    return df, colArr

In [51]:
tStart = time.time()
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row:create_n_grams(row, 2), axis = 1)
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row:create_n_grams(row, 3), axis = 1)
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(lambda row:create_n_grams(row, 4), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

Timed MPQA: 16.5380001068


In [52]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams(row, 2), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams(row, 3), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams(row, 4), axis = 1)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [53]:
def calculateBeforeAfterNgrams(df, colArr):
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'negativ', 'priorpolarity', None, 'positive')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'negativ', 'priorpolarity', None, 'positive')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 4, 'negativ', 'priorpolarity', None, 'positive')
    
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'negativ', 'priorpolarity', None, 'negative')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'negativ', 'priorpolarity', None, 'negative')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 4, 'negativ', 'priorpolarity', None, 'negative')

    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'negativ', 'priorpolarity', None, 'neutral')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'negativ', 'priorpolarity', None, 'neutral')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 4, 'negativ', 'priorpolarity', None, 'neutral')

    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 4, 'negations', 'priorpolarity', None, 'positive', negations['phraseStemmed'])
    
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 4, 'negations', 'priorpolarity', None, 'negative', negations['phraseStemmed'])
    
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 4, 'negations', 'priorpolarity', None, 'neutral', negations['phraseStemmed'])
    
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'hostile', 'priorpolarity', None, 'negative')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'hostile', 'priorpolarity', None, 'negative')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 4, 'hostile', 'priorpolarity', None, 'negative')

    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'persist', 'priorpolarity', None, 'positive')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'persist', 'priorpolarity', None, 'positive')
    
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'pleasur', 'priorpolarity', None, 'positive')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'pleasur', 'priorpolarity', None, 'positive')
    
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'weak', 'priorpolarity', None, 'negative')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'weak', 'priorpolarity', None, 'negative')

    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 2, 'active', 'priorpolarity', None, 'positive')
    df, colArr = countOneBeforeOtherNgramDf(colArr, df, 3, 'active', 'priorpolarity', None, 'positive')
    return df, colArr

In [54]:
sentimentDictionary.columns

Index([u'positiv', u'negativ', u'active', u'passive', u'affil', u'hostile',
       u'strong', u'power', u'weak', u'submit', u'yes', u'no', u'negate',
       u'intrj', u'pleasur', u'pain', u'feel', u'need', u'persist', u'entry',
       u'othtags', u'type', u'pos', u'stemmed', u'priorpolarity', u'entryRaw'],
      dtype='object')

In [55]:
tStart = time.time()
polarizedSentencesMpqa, featureColumnsMpqa = calculateBeforeAfterNgrams(polarizedSentencesMpqa, featureColumnsMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

Timed(ngram_2_negativpriorpolaritypositive): 29.1039998531
Timed(ngram_3_negativpriorpolaritypositive): 56.6860001087
Timed(ngram_4_negativpriorpolaritypositive): 86.3970000744
Timed(ngram_2_negativpriorpolaritynegative): 29.5710000992
Timed(ngram_3_negativpriorpolaritynegative): 52.9360001087
Timed(ngram_4_negativpriorpolaritynegative): 78.3540000916
Timed(ngram_2_negativpriorpolarityneutral): 19.2510001659
Timed(ngram_3_negativpriorpolarityneutral): 36.9440000057
Timed(ngram_4_negativpriorpolarityneutral): 53.375
Timed(ngram_2_negationspriorpolaritypositive): 27.7059998512
Timed(ngram_3_negationspriorpolaritypositive): 47.6859998703
Timed(ngram_4_negationspriorpolaritypositive): 68.3469998837
Timed(ngram_2_negationspriorpolaritynegative): 30.4709999561
Timed(ngram_3_negationspriorpolaritynegative): 50.9910001755
Timed(ngram_4_negationspriorpolaritynegative): 75.7569999695
Timed(ngram_2_negationspriorpolarityneutral): 20.1700000763
Timed(ngram_3_negationspriorpolarityneutral): 40.8459

In [56]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb, featureColumnsImdb = calculateBeforeAfterNgrams(sentencesTestSubsetImdb, featureColumnsImdb)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [57]:
polarizedSentencesMpqa

Unnamed: 0,index,docName,dirName,idx,startByte,endByte,sentLen,annotsCount,text,sentiment-measured,sentiment-type,sentiment-intensity,textRaw,textLower,punctuation,wordCount,textArr,textStemmed,textArrStemmed,tokens,negations,priorpolarityCountnegative,priorpolarityCountboth,priorpolarityCountneutral,priorpolarityCountpositive,priorpolarityCountnegativestrongsubj,priorpolarityCountpositivestrongsubj,priorpolarityCountneutralstrongsubj,priorpolarityCountbothstrongsubj,priorpolarityCountnegativeweaksubj,priorpolarityCountpositiveweaksubj,priorpolarityCountbothweaksubj,priorpolarityCountneutralweaksubj,Allweaksubj,Allstrongsubj,hostileCountweaksubj,strongCountweaksubj,hostileCountstrongsubj,strongCountstrongsubj,hostileCount,strongCount,activeCountweaksubj,passiveCountweaksubj,activeCountstrongsubj,passiveCountstrongsubj,activeCount,passiveCount,positivCountweaksubj,negativCountweaksubj,positivCountstrongsubj,negativCountstrongsubj,positivCount,negativCount,yesCount,noCount,activeCountBefore,activeCountAfter,passiveCountBefore,passiveCountAfter,hostileCountBefore,hostileCountAfter,yesCountBefore,yesCountAfter,noCountBefore,noCountAfter,negateCountBefore,negateCountAfter,priorpolaritynegativeCountBefore,priorpolaritynegativeCountAfter,priorpolaritypositiveCountBefore,priorpolaritypositiveCountAfter,priorpolaritybothCountBefore,priorpolaritybothCountAfter,priorpolarityneutralCountBefore,priorpolarityneutralCountAfter,priorpolaritynegativeBefore,priorpolaritynegativeAfter,priorpolaritypositiveBefore,priorpolaritypositiveAfter,priorpolarityneutralBefore,priorpolarityneutralAfter,priorpolaritybothBefore,priorpolaritybothAfter,activeBefore,activeAfter,passiveBefore,passiveAfter,hostileBefore,hostileAfter,yesBefore,yesAfter,noBefore,noAfter,negateBefore,negateAfter,2_gram,3_gram,4_gram,ngram_2_negativpriorpolaritypositive,ngram_3_negativpriorpolaritypositive,ngram_4_negativpriorpolaritypositive,ngram_2_negativpriorpolaritynegative,ngram_3_negativpriorpolaritynegative,ngram_4_negativpriorpolaritynegative,ngram_2_negativpriorpolarityneutral,ngram_3_negativpriorpolarityneutral,ngram_4_negativpriorpolarityneutral,ngram_2_negationspriorpolaritypositive,ngram_3_negationspriorpolaritypositive,ngram_4_negationspriorpolaritypositive,ngram_2_negationspriorpolaritynegative,ngram_3_negationspriorpolaritynegative,ngram_4_negationspriorpolaritynegative,ngram_2_negationspriorpolarityneutral,ngram_3_negationspriorpolarityneutral,ngram_4_negationspriorpolarityneutral,ngram_2_hostilepriorpolaritynegative,ngram_3_hostilepriorpolaritynegative,ngram_4_hostilepriorpolaritynegative,ngram_2_persistpriorpolaritypositive,ngram_3_persistpriorpolaritypositive,ngram_2_pleasurpriorpolaritypositive,ngram_3_pleasurpriorpolaritypositive,ngram_2_weakpriorpolaritynegative,ngram_3_weakpriorpolaritynegative,ngram_2_activepriorpolaritypositive,ngram_3_activepriorpolaritypositive
0,0,13.40.05-15087,20010620,0,109,226,117.0,2.0,|the|kimberley|provincial|hospital|said|it|wou...,1.0,0.0,0.000000,The Kimberley Provincial Hospital said it woul...,the kimberley provincial hospital said it woul...,46.0,19,"[the, kimberley, provincial, hospital, said, i...",|the|kimberley|provinci|hospit|said|it|would|p...,"[the, kimberley, provinci, hospit, said, it, w...","[kimberley, provinci, hospit, said, it, would,...",0,1,0,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(kimberley, provinci), (provinci, hospit), (h...","[(kimberley, provinci, hospit), (provinci, hos...","[(kimberley, provinci, hospit, said), (provinc...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,13.40.05-15087,20010620,7,793,884,91.0,2.0,|the|woman|was|admitted|to|the|hospital|on|sat...,1.0,0.0,-0.850000,The woman was admitted to the hospital on Satu...,the woman was admitted to the hospital on satu...,46.0,15,"[the, woman, was, admitted, to, the, hospital,...",|the|woman|was|admit|to|the|hospit|on|saturday...,"[the, woman, was, admit, to, the, hospit, on, ...","[woman, was, admit, to, hospit, on, saturday, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(woman, was), (was, admit), (admit, to), (to,...","[(woman, was, admit), (was, admit, to), (admit...","[(woman, was, admit, to), (was, admit, to, hos...",0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,0,0,0,0,0,0,0,0
2,11,13.40.05-15087,20010620,11,656,749,93.0,3.0,|he|said|it|was|his|opinion|that|the|patient|a...,1.0,0.0,0.000000,He said it was his opinion that the patient --...,he said it was his opinion that the patient --...,46.0,17,"[he, said, it, was, his, opinion, that, the, p...",|he|said|it|was|his|opinion|that|the|patient|a...,"[he, said, it, was, his, opinion, that, the, p...","[he, said, it, was, his, opinion, that, patien...",0,2,0,0,1,1,0,0,0,1,1,0,0,2,1,0,1,0,0,0,1,0,2,0,0,0,2,1,1,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(he, said), (said, it), (it, was), (was, his)...","[(he, said, it), (said, it, was), (it, was, hi...","[(he, said, it, was), (said, it, was, his), (i...",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,13,13.40.05-15087,20010620,13,588,655,67.0,3.0,|saeed|said|indications|were|that|those|tests|...,1.0,0.0,0.000000,Saeed said indications were that those tests w...,saeed said indications were that those tests w...,46.0,11,"[saeed, said, indications, were, that, those, ...",|saeed|said|indic|were|that|those|test|would|b...,"[saeed, said, indic, were, that, those, test, ...","[saeed, said, indic, were, that, those, test, ...",0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(saeed, said), (said, indic), (indic, were), ...","[(saeed, said, indic), (said, indic, were), (i...","[(saeed, said, indic, were), (said, indic, wer...",0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0
4,15,23.46.20-17835,20010627,0,1782,1960,178.0,3.0,|it|is|believed|that|sand|from|northern|areas|...,1.0,0.0,0.000000,"It is believed that sand from northern areas, ...","it is believed that sand from northern areas, ...",46.0,28,"[it, is, believed, that, sand, from, northern,...",|it|is|believ|that|sand|from|northern|area|esp...,"[it, is, believ, that, sand, from, northern, a...","[it, is, believ, that, sand, from, northern, a...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(it, is), (is, believ), (believ, that), (that...","[(it, is, believ), (is, believ, that), (believ...","[(it, is, believ, that), (is, believ, that, sa...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,26,23.46.20-17835,20010627,11,1466,1597,131.0,2.0,|private|organizations|are|also|being|encourag...,1.0,0.0,0.900150,Private organizations are also being encourage...,private organizations are also being encourage...,46.0,18,"[private, organizations, are, also, being, enc...",|privat|organ|are|also|be|encourag|to|help|fig...,"[privat, organ, are, also, be, encourag, to, h...","[privat, organ, are, also, be, encourag, to, h...",0,1,0,0,1,0,0,0,0,1,1,0,0,2,0,1,1,0,0,1,1,2,1,0,0,2,1,1,2,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(privat, organ), (organ, are), (are, also), (...","[(privat, organ, are), (organ, are, also), (ar...","[(privat, organ, are, also), (organ, are, also...",0,1,3,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,2
7,31,00.48.42-17806,20010630,3,2648,2698,50.0,2.0,|we|decided|to|make|some|bold|decisions|he|said|,1.0,0.0,0.967549,"""We decided to make some bold decisions,"" he s...","""we decided to make some bold decisions,"" he s...",46.0,9,"[we, decided, to, make, some, bold, decisions,...",|we|decid|to|make|some|bold|decis|he|said|,"[we, decid, to, make, some, bold, decis, he, s...","[we, decid, to, make, some, bold, decis, he, s...",0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(we, decid), (decid, to), (to, make), (make, ...","[(we, decid, to), (decid, to, make), (to, make...","[(we, decid, to, make), (decid, to, make, some...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2
9,34,00.48.42-17806,20010630,6,4017,4133,116.0,4.0,|but|the|costs|of|failing|to|support|yugoslavi...,1.0,0.0,-0.953667,"""But the costs of failing to support Yugoslavi...","""but the costs of failing to support yugoslavi...",46.0,21,"[but, the, costs, of, failing, to, support, yu...",|but|the|cost|of|fail|to|support|yugoslavia|as...,"[but, the, cost, of, fail, to, support, yugosl...","[but, cost, of, fail, to, support, yugoslavia,...",0,1,0,1,1,0,1,0,0,1,0,0,1,2,1,1,2,0,1,1,3,2,0,1,1,3,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(but, cost), (cost, of), (of, fail), (fail, t...","[(but, cost, of), (cost, of, fail), (of, fail,...","[(but, cost, of, fail), (cost, of, fail, to), ...",0,1,2,0,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0
11,37,00.48.42-17806,20010630,9,2507,2641,134.0,3.0,|we|are|now|fully|back|into|the|international|...,1.0,0.0,0.959061,We are now fully back into the international c...,we are now fully back into the international c...,46.0,20,"[we, are, now, fully, back, into, the, interna...",|we|are|now|fulli|back|into|the|intern|communi...,"[we, are, now, fulli, back, into, the, intern,...","[we, are, now, fulli, back, into, intern, comm...",0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(we, are), (are, now), (now, fulli), (fulli, ...","[(we, are, now), (are, now, fulli), (now, full...","[(we, are, now, fulli), (are, now, fulli, back...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12,38,00.48.42-17806,20010630,10,3931,4016,85.0,2.0,|there|is|no|quick|fix|to|the|yugoslav|problem...,1.0,0.0,-0.960000,"""There is no quick fix"" to the Yugoslav proble...","""there is no quick fix"" to the yugoslav proble...",46.0,16,"[there, is, no, quick, fix, to, the, yugoslav,...",|there|is|no|quick|fix|to|the|yugoslav|problem...,"[there, is, no, quick, fix, to, the, yugoslav,...","[there, is, no, quick, fix, to, yugoslav, prob...",0,1,0,1,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[(there, is), (is, no), (no, quick), (quick, f...","[(there, is, no), (is, no, quick), (no, quick,...","[(there, is, no, quick), (is, no, quick, fix),...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [191]:
import cPickle as pickle
if doSave:
#    with open('./clean-sentiment-mpqa.dump','wb') as fp:
#        pickle.dump(polarizedSentencesMpqa,fp)
#    with open('./clean-sentiment-imdb-sents.dump','wb') as fp:
#        pickle.dump(sentencesTestSubsetImdb,fp)
    with open('./clean-sentiment-sentences.dump','wb') as fp:
        pickle.dump(selectedPolarizedSentencesMpqa,fp)    

In [58]:
sentiment_positive = 1
sentiment_negative = -1
polarizedSentencesMpqa.loc[polarizedSentencesMpqa['sentiment-intensity'] < 0, 'sentiment-intensity'] = sentiment_negative
polarizedSentencesMpqa.loc[polarizedSentencesMpqa['sentiment-intensity'] > 0, 'sentiment-intensity'] = sentiment_positive

In [59]:
def get_equal_sentiment_parts(df, ratio):
    neg_df = df[df['sentiment-intensity'] < 0].shape[0]
    pos_df = df[df['sentiment-intensity'] > 0].shape[0]
    neutral_df = df[df['sentiment-intensity'] == 0].shape[0]
    min_size = min(neg_df, pos_df, neutral_df)
    
    new_df = pd.concat([df[df['sentiment-intensity'] < 0][:min_size],df[df['sentiment-intensity'] > 0][:int(ratio * min_size)], df[df['sentiment-intensity'] == 0][:min_size]])
    return new_df.reindex(np.random.permutation(new_df.index))

def get_equal_neutral_training_sentiment_parts(df, ratio):
    new_df = df.reindex(np.random.permutation(df.index))
    
    neg_df = df[df['sentiment-intensity'] < 0].shape[0]
    pos_df = df[df['sentiment-intensity'] > 0].shape[0]
    neutral_df = df[df['sentiment-intensity'] == 0].shape[0]
    min_size = min(neg_df, pos_df, neutral_df)
    if min_size * 2 > neutral_df:
        min_size = int(neutral_df / 2)
    
    new_df = pd.concat([new_df[new_df['sentiment-intensity'] < 0][:int(min_size *  ratio)],new_df[new_df['sentiment-intensity'] > 0][:int(ratio * min_size)],
                            new_df[new_df['sentiment-intensity'] == 0][:int(min_size * ratio)]])
    
    new_df['sentiment-neutral'] = 0
    new_df.loc[new_df['sentiment-intensity'] == 0, 'sentiment-neutral'] = 1
    return new_df.reindex(np.random.permutation(new_df.index))

In [60]:
if not onlyMpqa:
    test_reviews_even.loc[test_reviews_even['expected'] < 0, 'expected'] = sentiment_negative
    test_reviews_even.loc[test_reviews_even['expected'] > 0, 'expected'] = sentiment_positive

In [61]:
def show_all_value_counts(df):
    skip = ['index', 'docName', 'dirName', 'idx', 'startByte', 'endByte', 'sentLen', 'annotsCount', 'text', 'sentiment-measured',
       'sentiment-type', 'sentiment-intensity', 'textLower', 'textList', 'wordCount', 'textArr', 'textStemmed', 'textArrStemmed',
        'id', 'rating', 'set', 'type', 'real-id']
    
    for col in df.columns:
        if col in skip:
            continue
         
        print
        print("Column: {}".format(col))
        print(df[col].value_counts())

In [62]:
# show_all_value_counts(sentencesTestSubsetImdb)

In [63]:
# show_all_value_counts(selectedPolarizedSentencesMpqa)

In [64]:
def calculateRequiredMoreThanOther(df, colArr):
    df['morePositiveThanNegativeStrong'] = False
    dt = df[df['priorpolarityCountpositivestrongsubj'] > df['priorpolarityCountnegativestrongsubj'] ]
    df.loc[dt.index,'morePositiveThanNegativeStrong'] = True
    colArr['morePositiveThanNegativeStrong'] = 1

    df['morePositiveThanNegativeWeak'] = False
    dt = df[df['priorpolarityCountpositiveweaksubj'] > df['priorpolarityCountnegativeweaksubj'] ]
    df.loc[dt.index,'morePositiveThanNegativeWeak'] = True
    colArr['morePositiveThanNegativeWeak'] = 1

    df['morePositiveThanNegative'] = False
    dt = df[df['priorpolarityCountpositive'] > df['priorpolarityCountnegative'] ]
    df.loc[dt.index,'morePositiveThanNegative'] = True
    colArr['morePositiveThanNegative'] = 1

    df['morePositiveThanNeutral'] = False
    dt = df[df['priorpolarityCountpositive'] > df['priorpolarityCountneutral'] ]
    df.loc[dt.index,'morePositiveThanNeutral'] = True
    colArr['morePositiveThanNeutral'] = 1

    df['moreNegativeThanNeutral'] = False
    dt = df[df['priorpolarityCountnegative'] > df['priorpolarityCountneutral'] ]
    df.loc[dt.index,'moreNegativeThanNeutral'] = True
    colArr['moreNegativeThanNeutral'] = 1
    return df, colArr

In [65]:
polarizedSentencesMpqa, featureColumnsMpqa = calculateRequiredMoreThanOther(polarizedSentencesMpqa, featureColumnsMpqa)
if not onlyMpqa:
    sentencesTestSubsetImdb, featureColumnsImdb = calculateRequiredMoreThanOther(sentencesTestSubsetImdb, featureColumnsImdb)
print




In [66]:
selectedPolarizedSentencesMpqa = get_equal_sentiment_parts(polarizedSentencesMpqa[polarizedSentencesMpqa['annotsCount'] > 0], 1.0)
#with open('./clean-sentiment-sentences.dump','rb') as fp:
#    selectedPolarizedSentencesMpqa = pickle.load(fp)    
selectedPolarizedSentencesMpqa.shape

(4668, 132)

In [67]:
#neutralSelectedSentencesMpqa = get_equal_neutral_training_sentiment_parts(polarizedSentencesMpqa, 1)
#neutralSelectedSentencesMpqa.shape

In [68]:
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger

pos_tagger = PerceptronTagger()

def assign_pos_tags(row):
    tagged_words = TextBlob(row['textRaw'], pos_tagger= pos_tagger)
    row['textPos'] = tagged_words.tags
    return row

In [69]:
tStart = time.time()
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row: assign_pos_tags(row), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

Timed MPQA: 13.4799997807


In [70]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: assign_pos_tags(row), axis = 1)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [71]:
tags = [gram[1] for gram_list in selectedPolarizedSentencesMpqa['textPos'].values for gram in gram_list]
tags_pd = pd.DataFrame(tags, columns=["tag"])
tags_pd['tag'].value_counts()

NN      11948
IN       9946
DT       9227
NNP      8273
JJ       6302
NNS      5184
RB       3575
VB       3395
VBD      3022
PRP      2808
TO       2535
VBZ      2489
CC       2271
VBN      2210
VBG      1684
VBP      1621
MD       1291
PRP$     1036
CD        952
POS       757
WDT       420
NNPS      392
WP        299
WRB       236
RP        226
JJR       214
EX        194
JJS       129
RBR       109
PDT        59
RBS        48
WP$        19
UH         14
FW         11
SYM         4
Name: tag, dtype: int64

In [72]:
def create_n_grams_pos(row, size_ngram):
    row["pos_"+str(size_ngram)+"_gram"] = list(ngrams(row['textPos'], size_ngram))
    return row

def countOneBeforeOtherNgranPosRow(row, col, size_ngram, beforeSeries, afterSeries, beforeSeriesPos = None, afterSeriesPos = None):
    # first, try to find a word from "after series"
    n_gram_col = 'pos_'+str(size_ngram)+'_gram' 
    for n_gram in row[n_gram_col]:
        for idx in range(1, size_ngram):
            w = n_gram[idx]
            if afterSeriesPos is not None and w[1] not in afterSeriesPos:
                continue

            found = binSearchDfValue(afterSeries, w[0], 0, afterSeries.shape[0] - 1)
            if found:
                for idx_sub in range(0, idx):
                    w2 = n_gram[idx_sub]
                    if beforeSeriesPos is not None and w2[1] not in beforeSeriesPos:
                        continue
                    found_sub = binSearchDfValue(beforeSeries, w2[0], 0, beforeSeries.shape[0] - 1)
                    if found_sub:
                        row[col] = row[col] + 1
    return row
    
def countOneBeforeOtherNgramPosDf(colArr, df, size_ngram, beforeClassCol, afterClassCol,
                                  beforeClassPos = None, afterClassPos = None,
                                beforeClassVal = None, afterClassVal = None, beforeSerieDefault = None):
    columnName = 'pos_ngram_'+str(size_ngram)+"_" + '.'.join(beforeClassPos) + '.'.join(afterClassPos)
    if beforeClassCol is not None:
        columnName += beforeClassCol
    if afterClassCol is not None:
        columnName += afterClassCol
    
    
    compareValBefore = beforeClassVal
    if beforeClassVal is None:
        compareValBefore = beforeClassCol
    else:
        columnName = columnName + beforeClassVal
    
    if beforeSerieDefault is None:
        if beforeClassCol is None:
                lookForBefore = sentimentDictionary.reset_index(drop = True)
                lookForBefore = lookForBefore['entryRaw']
        else:
            lookForBefore = sentimentDictionary[sentimentDictionary[beforeClassCol] == compareValBefore].reset_index(drop=True)
            lookForBefore = lookForBefore['entryRaw']
    else:
        lookForBefore = beforeSerieDefault

    compareValAfter = afterClassVal
    if afterClassVal is None:
        compareValAfter = afterClassCol
    else:
        columnName = columnName + afterClassVal
    
    if afterClassCol is None:
        lookForAfter = sentimentDictionary.reset_index(drop = True)
        lookForAfter = lookForAfter['entryRaw']
    else:
        lookForAfter = sentimentDictionary[sentimentDictionary[afterClassCol] == compareValAfter].reset_index(drop=True)
        lookForAfter = lookForAfter['entryRaw']
    
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOneBeforeOtherNgranPosRow(row, columnName, size_ngram, lookForBefore, lookForAfter, beforeClassPos, afterClassPos), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    colArr[columnName] = 1
    return df, colArr

In [73]:
sentencesTestSubsetImdb['textRaw'] = sentencesTestSubsetImdb['text'].map(lambda cell: ' '.join(cell.split('|')))

NameError: name 'sentencesTestSubsetImdb' is not defined

In [None]:
tStart = time.time()
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 4), axis = 1)
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 5), axis = 1)
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 6), axis = 1)
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row:create_n_grams_pos(row, 7), axis = 1)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

In [None]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 2), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 4), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 5), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 6), axis = 1)
    sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row:create_n_grams_pos(row, 7), axis = 1)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [None]:
def calculateBeforeAfterNgramsPos(df, colArr):
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    noun_tags = ['NN', 'NNP', 'NNS']
    adjective_tags = ['JJ', 'JJR', 'JJS']
    adjective_adverb_tags = adjective_tags + ['RB', 'RBR', 'RBS']
    print(adjective_adverb_tags)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'positiv', adjective_tags, noun_tags, None, None)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'positiv', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'positiv', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'positiv', adjective_tags, noun_tags, None, None)

    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'active', adjective_adverb_tags, noun_tags, None, None)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'active', adjective_adverb_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'active', adjective_adverb_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'active', adjective_adverb_tags, noun_tags, None, None)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'strong', adjective_tags, noun_tags, None, None)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'strong', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'strong', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'strong', adjective_tags, noun_tags, None, None)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'negativ', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'negativ', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'negativ', adjective_tags, noun_tags, None, None)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, 'negations', 'priorpolarity', 'ADJ', 'NOUN', None, 'positive', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, 'negations', 'priorpolarity', 'ADJ', 'NOUN', None, 'positive', negations['phraseStemmed'])
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, 'negations', 'priorpolarity', 'ADJ', 'NOUN', None, 'positive', negations['phraseStemmed'])
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, 'negations', 'priorpolarity', 'ADJ', 'NOUN', None, 'positive', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'positive')
    
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'negative')
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'negative')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'negative')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'negative')
    
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'neutral')
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'neutral')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'neutral')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'priorpolarity', adjective_adverb_tags, noun_tags, None, 'neutral')

    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'priorpolarity', adjective_tags, noun_tags, None, 'positive')
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'priorpolarity', adjective_tags, noun_tags, None, 'positive')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'priorpolarity', adjective_tags, noun_tags, None, 'positive')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'priorpolarity', adjective_tags, noun_tags, None, 'positive')
    
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'priorpolarity', adjective_tags, noun_tags, None, 'negative')
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'priorpolarity', adjective_tags, noun_tags, None, 'negative')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'priorpolarity', adjective_tags, noun_tags, None, 'negative')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'priorpolarity', adjective_tags, noun_tags, None, 'negative')
    
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'priorpolarity', adjective_tags, noun_tags, None, 'neutral')
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'priorpolarity', adjective_tags, noun_tags, None, 'neutral')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'priorpolarity', adjective_tags, noun_tags, None, 'neutral')
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'priorpolarity', adjective_tags, noun_tags, None, 'neutral')
    
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, 'negations', 'priorpolarity', adjective_tags, noun_tags, None, 'positive', negations['phraseStemmed'])
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, 'negations', 'priorpolarity', adjective_tags, noun_tags, None, 'positive', negations['phraseStemmed'])
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, 'negations', 'priorpolarity', adjective_tags, noun_tags, None, 'positive', negations['phraseStemmed'])
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, 'negations', 'priorpolarity', adjective_tags, noun_tags, None, 'positive', negations['phraseStemmed'])

    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 4, None, 'hostile', adjective_tags, noun_tags, None, None)
    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 5, None, 'hostile', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 6, None, 'hostile', adjective_tags, noun_tags, None, None)
#    df, colArr = countOneBeforeOtherNgramPosDf(colArr, df, 7, None, 'hostile', adjective_tags, noun_tags, None, None)
    return df, colArr

In [None]:
tStart = time.time()
selectedPolarizedSentencesMpqa, featureColumnsMpqa = calculateBeforeAfterNgramsPos(selectedPolarizedSentencesMpqa, featureColumnsMpqa)
tEnd = time.time()
print("Timed MPQA: {}".format(str(tEnd-tStart)))

In [None]:
if not onlyMpqa:
    tStart = time.time()
    sentencesTestSubsetImdb, featureColumnsImdb = calculateBeforeAfterNgramsPos(sentencesTestSubsetImdb, featureColumnsImdb)
    tEnd = time.time()
    print("Timed IMDB: {}".format(str(tEnd-tStart)))

In [None]:
def generate_description_features(df, features):
    df_pos = df[df['sentiment-intensity'] > 0]
    df_neg = df[df['sentiment-intensity'] < 0]
    df_neu = df[df['sentiment-intensity'] == 0]
    total_counts = [df_pos.shape[0], df_neg.shape[0], df_neu.shape[0]]
    stats = []
    for feature in features.keys():
        if feature not in df.columns:
            continue
        # val1 = # of sentences with 0 val
        # val2 = # of sentences with other val
        # val3 = # of different features
        pos_stats = []
        neg_stats = []
        neutral_stats = []
        if df[feature].dtype == 'bool':
            pos_stats = [
                df_pos[df_pos[feature] == False].shape[0],
                df_pos[df_pos[feature] == True].shape[0],
                2
            ]
            neg_stats = [
                df_neg[df_neg[feature] == False].shape[0],
                df_neg[df_neg[feature] == True].shape[0],
                2
            ]
            neutral_stats = [
                df_neu[df_neu[feature] == False].shape[0],
                df_neu[df_neu[feature] == True].shape[0],
                2
            ]
        else: # counts
            pos_stats = [
                df_pos[df_pos[feature] == 0].shape[0],
                df_pos[df_pos[feature] > 0].shape[0],
                df_pos[feature].value_counts().shape[0]
            ]
            neg_stats = [
                df_neg[df_neg[feature]  == 0].shape[0],
                df_neg[df_neg[feature] > 0].shape[0],
                df_neg[feature].value_counts().shape[0]
            ]
            neutral_stats = [
                df_neu[df_neu[feature]  == 0].shape[0],
                df_neu[df_neu[feature] > 0 ].shape[0],
                df_neu[feature].value_counts().shape[0]
            ]
        
        row = [feature] + pos_stats + neg_stats + neutral_stats + total_counts
        stats.append(row)
        
    return pd.DataFrame(stats, columns = ['feature', 'pos=0', 'pos>1', 'pos-features',
                                                     'neg=0', 'neg>1', 'neg-features'
                                                     , 'neutral=0', 'neutral>1', 'neutral-features',
                                                     'total-pos', 'total-neg', 'total-neutral'])

In [None]:
stats = generate_description_features(selectedPolarizedSentencesMpqa, featureColumnsMpqa)
stats.shape

In [642]:
stats.to_csv('./corpora/processed/df_stats.csv')

In [None]:
selectedPolarizedSentencesMpqa['pos_ngram_4_JJ.JJR.JJSNN.NNP.NNSpositiv'].value_counts().shape

In [None]:
if not onlyMpqa:
    sentencesTestSubsetImdb[sentencesTestSubsetImdb['moreNegativeThanNeutral'] == True].shape

In [None]:
selected_columns = [
        'negations', 'positivCount', 'negativCount',
        'priorpolaritynegativeAfter',
        'priorpolaritypositiveBefore',
        'activeBefore', 'passiveBefore',
        'priorpolaritynegativeBefore',
        'hostileBefore', 'negativCountstrongsubj',
        'priorpolaritynegativeCountBefore', 'priorpolaritypositiveCountBefore',
        'morePositiveThanNegativeStrong', 'morePositiveThanNegativeWeak', 'morePositiveThanNegative',
        'priorpolaritypositiveAfter',
        'priorpolarityCountpositive', 'priorpolarityCountpositiveweaksubj',
        'priorpolarityCountpositivestrongsubj',
        'strongCount', 'passiveCount', 'activeCountweaksubj',
        'priorpolarityCountnegativestrongsubj', 'priorpolarityCountnegativeweaksubj',
'ngram_4_negativpriorpolaritynegative',
'ngram_4_negativpriorpolaritypositive',
'ngram_3_negativpriorpolaritynegative',
'ngram_3_negativpriorpolaritypositive',
'ngram_4_hostilepriorpolaritynegative',
'ngram_3_activepriorpolaritypositive',
'ngram_4_negativpriorpolaritypositive',
'pos_ngram_7_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritynegative',
'pos_ngram_7_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive',    
'pos_ngram_6_JJ.JJR.JJSNN.NNP.NNSpriorpolaritypositive',
'pos_ngram_5_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive',
'pos_ngram_6_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive',
'pos_ngram_6_JJ.JJR.JJSNN.NNP.NNSpriorpolaritynegative',
'pos_ngram_7_JJ.JJR.JJSNN.NNP.NNSstrong',
'pos_ngram_4_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritynegative',
#'pos_ngram_4_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritypositive',
#'pos_ngram_4_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolaritynegative',
#'pos_ngram_4_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSpriorpolarityneutral',
#'pos_ngram_4_JJ.JJR.JJSNN.NNP.NNSpositiv',
'ngram_4_negativpriorpolaritynegative',   
#'pos_ngram_6_JJ.JJR.JJSNN.NNP.NNSstrong',
#'pos_ngram_4_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSactive',
#'pos_ngram_6_JJ.JJR.JJS.RB.RBR.RBSNN.NNP.NNSactive',
        'priorpolaritypositiveCountAfter', 'priorpolaritynegativeCountAfter',
        'morePositiveThanNeutral', 'moreNegativeThanNeutral',
        'priorpolarityCountneutral', 'priorpolarityCountneutralstrongsubj',
        'priorpolarityCountneutralweaksubj',
        'priorpolarityneutralAfter',  'priorpolarityneutralBefore',
    
        'Allweaksubj', 'Allstrongsubj', 'activeCount',
        'strongCountweaksubj', 'positivCountweaksubj', 'positivCountstrongsubj'
    ]

selected_columns_train = selected_columns + ['sentiment-intensity']

print("Number of features: {}".format(len(selected_columns)))

In [None]:
def precalculate_feature_selection(className, stats):
    more1 = stats[className+'>1'].astype(float)    
    stats[stats[className+'-features'] == 0] = 1.00001
    stats[className+'-ratio'] = more1 / stats[className+'-features']
    stats[className+'-portion'] = more1 / stats['total-'+className] 
    return stats
    

def feature_selection(className, stats, ratio = None, portion = None, otherRatio = None):
    other = None
    if className == 'pos':
        other = ['neg', 'neutral']
    elif className == 'neg':
        other = ['pos', 'neutral']
    else:
        other = ['neg', 'pos']
    
    idx = None
    if ratio is not None:
        idx = stats[className+'-ratio'] > ratio
    if portion is not None:
        if idx is None:
            idx = stats[className+'-portion'] > portion
        else:
            idx = idx & (stats[className+'-portion'] > portion)

    if otherRatio is not None:
        idx2 = None
        idx2 = (stats[other[0]+'-ratio'] > otherRatio) | (stats[other[1]+'-ratio'] > otherRatio)

        if idx is None:
            idx = idx2
        else:
            idx = idx & idx2        

    return stats.loc[ idx, 'feature'].values

In [None]:
stats

In [None]:
stats = precalculate_feature_selection( 'pos', stats)
stats = precalculate_feature_selection( 'neg', stats)
stats = precalculate_feature_selection( 'neutral', stats)

In [None]:
def analyzeErrors(df, predicted, expected):
    print("Accuracy: {}".format(calculateAccuracy(df, predicted, expected)))
    print("Accuracy in positive sentiment: {}".format(calculateAccuracy(df[df[expected] == sentiment_positive], predicted, expected)))
    print("Accuracy in negative sentiment: {}".format(calculateAccuracy(df[df[expected] == sentiment_negative], predicted, expected)))
    print("Accuracy in neutral sentiment: {}".format(calculateAccuracy(df[df[expected] == 0], predicted, expected))) 
    cm = confusion_matrix(df[expected], df[predicted])
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm)
    print(cm_normalized)


def calculateAccuracy(df, predicted, expected):
    if df.shape[0] == 0:
        return 0
    else:
        return df[df[predicted] == df[expected]].shape[0] / float(df.shape[0])

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [None]:
def create_n_stratified_parts(df, n_cuts, project_columns):
    copy_df = df[project_columns].copy().reset_index()
    
    copy_df = copy_df.reindex(np.random.permutation(copy_df.index))
    positive = copy_df[copy_df['sentiment-intensity'] == sentiment_positive].reset_index(drop = True)
    neutral = copy_df[copy_df['sentiment-intensity'] == 0].reset_index(drop = True)
    negative = copy_df[copy_df['sentiment-intensity'] == sentiment_negative].reset_index(drop = True)
    
    positive_parts = []
    negative_parts = []
    neutral_parts = []
    positive_step = positive.shape[0] / n_cuts
    negative_step = negative.shape[0] / n_cuts
    neutral_step = neutral.shape[0] / n_cuts
    for part in range(n_cuts):
        if part == n_cuts - 1:
            positive_parts.append(positive[part * positive_step:])
            negative_parts.append(negative[part * negative_step:])
            neutral_parts.append(neutral[part * neutral_step:])
        else:
            positive_parts.append(positive[part * positive_step : (part+1) * positive_step])
            negative_parts.append(negative[part * negative_step : (part+1) * negative_step])
            neutral_parts.append(neutral[part * neutral_step : (part+1) * neutral_step])
            
    
    return positive_parts, negative_parts, neutral_parts

In [None]:
def create_n_stratified_neutral_parts(df, n_cuts, project_columns):
    copy_df = df[project_columns].copy().reset_index()
    
    copy_df = copy_df.reindex(np.random.permutation(copy_df.index))
    neutral = copy_df[copy_df['sentiment-neutral'] == 1].reset_index(drop = True)
    other = copy_df[copy_df['sentiment-neutral'] == 0].reset_index(drop = True)
    
    other_parts = []
    neutral_parts = []
    other_step = other.shape[0] / n_cuts
    neutral_step = neutral.shape[0] / n_cuts
    for part in range(n_cuts):
        if part == n_cuts - 1:
            other_parts.append(other[part * other_step:])
            neutral_parts.append(neutral[part * neutral_step:])
        else:
            other_parts.append(other[part * other_step : (part+1) * other_step])
            neutral_parts.append(neutral[part * neutral_step : (part+1) * neutral_step])
            
    return other_parts, neutral_parts

In [None]:
n_cuts = 8

In [None]:
#pos_features = feature_selection('pos', stats, 1.5)
#neg_features = feature_selection('neg', stats, 3.5, None, 4.0)
#neutral_features = feature_selection('neutral', stats, 1.5, 0.05)
pos_features = feature_selection('pos', stats, 2.5)
neg_features = feature_selection('neg', stats, 2.2, None, 4.0)
neutral_features = feature_selection('neutral', stats, 2.5, 0.05)

selected_columns_test = [] + pos_features.tolist() + neg_features.tolist() + neutral_features.tolist()
selected_columns_test = list(set(selected_columns_test))
selected_columns_train = selected_columns_test  + ['sentiment-intensity']
print("Number of features: pos = {}, neg = {}, neutral = {}, total = {}".format(len(pos_features), len(neg_features), len(neutral_features), len(selected_columns_train)))

In [733]:
test = selectedPolarizedSentencesMpqa[selectedPolarizedSentencesMpqa['wordCount'] < 15]
infoDf(test, 'sentiment-intensity')

Negative: (530, 163)
Neutral: (587, 163)
Positive: (573, 163)


In [734]:
positive_mixed_parts, negative_mixed_parts, neutral_mixed_parts = create_n_stratified_parts(selectedPolarizedSentencesMpqa, n_cuts, selected_columns_train)
#positive_mixed_parts, negative_mixed_parts, neutral_mixed_parts = create_n_stratified_parts(test, n_cuts, selected_columns_train)

In [735]:
best_test = None
best_train = None
best_predicted = None
best_accuracy = 0
best_clasifier = None

worst_accurracy = 1
worst_test = None
worst_predicted = None
classifier = None

accuracy = []
num_repeats = 2
consts = [4, 5, 6, 7, 8, 10, 12, 15]
#consts = [8]

for idx_const in range(len(consts)):
    print("######### Next Constant")
    for part in range(n_cuts):
        test = pd.DataFrame({})
        train = pd.DataFrame({})
        # create test and training set
        for idx in range(n_cuts):
            if idx == part:
                test = pd.concat([positive_mixed_parts[idx], negative_mixed_parts[idx], neutral_mixed_parts[idx]])
            else:
                train = pd.concat([train, positive_mixed_parts[idx], negative_mixed_parts[idx], neutral_mixed_parts[idx]])

        train.reset_index(inplace = True, drop = True)
        train = train.reindex(np.random.permutation(train.index))
        # train & evaluate
        train_results = train['sentiment-intensity'].values
        train.drop('sentiment-intensity', axis = 1, inplace = True)
        train_index = train['index'].values
        train.drop('index', axis = 1, inplace = True)    
        test_results = test['sentiment-intensity'].values
        test.drop('sentiment-intensity', axis = 1, inplace = True)
        test_index = test['index'].values
        test.drop('index', axis = 1, inplace = True)    
        acc = []
        for repeat_test in range(num_repeats):
            clasifier = RandomForestClassifier(n_estimators = 400, max_features = 'log2',
                                                max_depth = consts[idx_const], n_jobs = -1)
#            clasifier = ExtraTreesClassifier(n_estimators = consts[idx_const], max_features = 'sqrt',
#                                                max_depth = 7, n_jobs = -1)

#            clasifier = AdaBoostClassifier( ExtraTreesClassifier(n_estimators = consts[idx_const], max_features = 'sqrt',
#                                                max_depth = 7, n_jobs = -1),
#                                                n_estimators=consts[idx_const],
#                                                learning_rate=1.8)

#            clasifier = AdaBoostClassifier( DecisionTreeClassifier(max_depth=6),
#                                                n_estimators=consts[idx_const],
#                                                learning_rate=1.8)
            clasifier.fit(train.values, train_results)

            predicted = clasifier.predict(test.values)
            #current_score = clasifier.score(train.values, train_results)
            current_score = recall_score(test_results, predicted, average = None)
            list_predicted = list(predicted)
            current_accuracy = accuracy_score(test_results, predicted)
            acc.append(current_accuracy)
            print("idx = {}, round = {}, cut = {}, accuracy = {} ------------- score = {}".format(idx_const, repeat_test, part, current_accuracy, current_score))
            
            if worst_accurracy > current_accuracy:
                worst_accurracy = current_accuracy
                worst_test = test.copy()
                worst_test['index'] = test_index
                worst_test['sentiment-intensity'] = test_results
                worst_test['predicted'] = predicted
                print("@@@ Worst Configuration: idx = {}, round = {}, cut = {}, accuracy = {}".format(idx_const, repeat_test, part, current_accuracy))
            
            if best_accuracy < current_accuracy:
                best_test = test.copy()
                best_test['index'] = test_index
                best_test['sentiment-intensity'] = test_results
                best_test['predicted'] = predicted
                best_train = train.copy()
                best_train['index'] = train_index
                best_train['sentiment-intensity'] = train_results
                best_predicted = predicted.copy()
                best_clasifier = clasifier
                best_accuracy = current_accuracy
                print("@@@ Best Configuration: idx = {}, round = {}, cut = {}, accuracy = {}".format(idx_const, repeat_test, part, current_accuracy))

        accuracy.append(np.mean(acc))


    print("{}) Mean Accuracy: {}".format(consts[idx_const], np.mean(accuracy)))

######### Next Constant
idx = 0, round = 0, cut = 0, accuracy = 0.519916142558 ------------- score = [ 0.51572327  0.52830189  0.51572327]
@@@ Worst Configuration: idx = 0, round = 0, cut = 0, accuracy = 0.519916142558
@@@ Best Configuration: idx = 0, round = 0, cut = 0, accuracy = 0.519916142558
idx = 0, round = 1, cut = 0, accuracy = 0.509433962264 ------------- score = [ 0.51572327  0.51572327  0.49685535]
@@@ Worst Configuration: idx = 0, round = 1, cut = 0, accuracy = 0.509433962264
idx = 0, round = 0, cut = 1, accuracy = 0.448637316562 ------------- score = [ 0.35220126  0.55345912  0.44025157]
@@@ Worst Configuration: idx = 0, round = 0, cut = 1, accuracy = 0.448637316562
idx = 0, round = 1, cut = 1, accuracy = 0.438155136268 ------------- score = [ 0.35220126  0.5408805   0.42138365]
@@@ Worst Configuration: idx = 0, round = 1, cut = 1, accuracy = 0.438155136268
idx = 0, round = 0, cut = 2, accuracy = 0.477987421384 ------------- score = [ 0.44025157  0.52830189  0.46540881]
id

In [738]:
#best_test['predicted'] = best_predicted
analyzeErrors(best_test, 'predicted', 'sentiment-intensity')

Accuracy: 0.526205450734
Accuracy in positive sentiment: 0.528301886792
Accuracy in negative sentiment: 0.522012578616
Accuracy in neutral sentiment: 0.528301886792
[[83 63 13]
 [38 84 37]
 [18 57 84]]
[[ 0.52201258  0.39622642  0.08176101]
 [ 0.23899371  0.52830189  0.2327044 ]
 [ 0.11320755  0.35849057  0.52830189]]


In [696]:
analyzeErrors(best_test, 'predicted', 'sentiment-intensity')

Accuracy: 0.524822695035
Accuracy in positive sentiment: 0.496453900709
Accuracy in negative sentiment: 0.482269503546
Accuracy in neutral sentiment: 0.595744680851
[[68 53 20]
 [21 84 36]
 [21 50 70]]
[[ 0.4822695   0.37588652  0.14184397]
 [ 0.14893617  0.59574468  0.25531915]
 [ 0.14893617  0.35460993  0.4964539 ]]


In [739]:
#best_test['predicted'] = best_predicted
analyzeErrors(worst_test, 'predicted', 'sentiment-intensity')

Accuracy: 0.438155136268
Accuracy in positive sentiment: 0.421383647799
Accuracy in negative sentiment: 0.352201257862
Accuracy in neutral sentiment: 0.540880503145
[[56 67 36]
 [29 86 44]
 [25 67 67]]
[[ 0.35220126  0.42138365  0.22641509]
 [ 0.18238994  0.5408805   0.27672956]
 [ 0.1572327   0.42138365  0.42138365]]


In [665]:
analyzeErrors(worst_test, 'predicted', 'sentiment-intensity')

Accuracy: 0.427895981087
Accuracy in positive sentiment: 0.41134751773
Accuracy in negative sentiment: 0.397163120567
Accuracy in neutral sentiment: 0.475177304965
[[56 60 25]
 [28 67 46]
 [20 63 58]]
[[ 0.39716312  0.42553191  0.17730496]
 [ 0.19858156  0.4751773   0.32624113]
 [ 0.14184397  0.44680851  0.41134752]]


In [93]:
selected_columns_neutral = [
        'negations',
#        'wordCount', 
        'positivCount',
        'negativCount',
#        'priorpolaritynegativeAfter',
        'priorpolaritypositiveBefore',
        'activeBefore', 'passiveBefore',
#        'priorpolaritynegativeBefore',
#        'hostileBefore', 'yesBefore', 'yesAfter', 'noBefore', 'negateBefore',
 #       'activeCountBefore','passiveCountBefore', 'hostileCountBefore', 'yesCountBefore',
#        'noCountBefore', 'negateCountBefore', 'negateCountAfter', 
        #'activeAfter',  'passiveAfter',
        'priorpolaritynegativeCountBefore', 'priorpolaritypositiveCountBefore',
#        'activeCountAfter', 'passiveCountAfter', 'hostileCountAfter', 
        'morePositiveThanNegativeStrong', 'morePositiveThanNegativeWeak', 'morePositiveThanNegative',
#       'yesCountAfter', 'noCountAfter', 
        'priorpolaritynegativeCountAfter',
#        'priorpolaritypositiveAfter',
#        'negateAfter',
#        'strongCount',
        'priorpolaritypositiveCountAfter',
#        'hostileAfter', 'noAfter',
        'morePositiveThanNeutral', 'moreNegativeThanNeutral',
        'priorpolarityCountneutral', 'priorpolarityCountneutralstrongsubj',
#        'priorpolarityCountbothstrongsubj', 'priorpolarityCountbothweaksubj',
        'priorpolarityCountneutralweaksubj',
        'priorpolarityneutralAfter',  'priorpolarityneutralBefore'
#        'priorpolaritybothBefore', 'priorpolaritybothAfter'
    ]

selected_columns_train_neutral = selected_columns_neutral + ['sentiment-neutral']
n_cuts_neutrals = 6
print("Number of features: {}".format(len(selected_columns_neutral)))

Number of features: 20


In [95]:
doNeutral = False

In [96]:
if doNeutral:
    other_parts, neutral_parts = create_n_stratified_neutral_parts(neutralSelectedSentencesMpqa, n_cuts_neutrals, selected_columns_train_neutral)

In [361]:
if doNeutral:
    best_test = None
    best_train = None
    best_predicted = None
    best_accuracy = 0
    best_clasifier = None

    accuracy = []
    num_repeats = 2
    consts = [4, 5, 6, 7, 8, 9]
    #consts = [8]

    for idx_const in range(len(consts)):
        print("######### Next Constant")
        for part in range(n_cuts_neutrals):
            test = pd.DataFrame({})
            train = pd.DataFrame({})
            # create test and training set
            for idx in range(n_cuts_neutrals):
                if idx != part:
                    test = pd.concat([other_parts[idx], neutral_parts[idx]])
                else:
                    train = pd.concat([train, other_parts[idx], neutral_parts[idx]])

            train.reset_index(inplace = True, drop = True)
            train = train.reindex(np.random.permutation(train.index))
            # train & evaluate
            train_results = train['sentiment-neutral'].values
            train.drop('sentiment-neutral', axis = 1, inplace = True)
            train_index = train['index'].values
            train.drop('index', axis = 1, inplace = True)    
            test_results = test['sentiment-neutral'].values
            test.drop('sentiment-neutral', axis = 1, inplace = True)
            test_index = test['index'].values
            test.drop('index', axis = 1, inplace = True)    
            acc = []
            for repeat_test in range(num_repeats):
                clasifier = RandomForestClassifier(n_estimators = 300, max_features = 'log2',
                                                    max_depth = consts[idx_const], n_jobs = -1)

    #            clasifier = AdaBoostClassifier( ExtraTreesClassifier(n_estimators = consts[idx_const], max_features = 'sqrt',
    #                                                max_depth = 7, n_jobs = -1),
    #                                                n_estimators=consts[idx_const],
    #                                                learning_rate=1.8)

    #            clasifier = AdaBoostClassifier( DecisionTreeClassifier(max_depth=6),
    #                                                n_estimators=consts[idx_const],
    #                                                learning_rate=1.8)
                clasifier.fit(train.values, train_results)
                current_score = clasifier.score(train.values, train_results)
                predicted = clasifier.predict(test.values)
                list_predicted = list(predicted)
                current_f1_score = f1_score(test_results, predicted)
                current_accuracy = accuracy_score(test_results, predicted)
                acc.append(current_accuracy)
                print("idx = {}, round = {}, cut = {}, accuracy = {}, train score = {}, f1 score = {}"
                              .format(idx_const, repeat_test, part, current_accuracy, current_score, current_f1_score))

                if best_accuracy < current_score:
                    best_test = test.copy()
                    best_test['index'] = test_index
                    best_test['sentiment-neutral'] = test_results
                    best_test['predicted'] = predicted
                    best_train = train.copy()
                    best_train['index'] = train_index
                    best_train['sentiment-neutral'] = train_results
                    best_predicted = predicted.copy()
                    best_clasifier = clasifier
                    best_accuracy = current_score
                    print("@@@ Best Configuration: idx = {}, round = {}, cut = {}, accuracy = {}".format(idx_const, repeat_test, part, current_accuracy))

            accuracy.append(np.mean(acc))


        print("{}) Mean Accuracy: {}".format(consts[idx_const], np.mean(accuracy)))

In [98]:
if doNeutral:
    best_test['predicted'] = best_predicted
    analyzeErrors(best_test, 'predicted', 'sentiment-neutral')

In [99]:
if doNeutral:
    wrong_sentences = neutralSelectedSentencesMpqa.loc[best_test.loc[best_test['predicted'] != best_test['sentiment-neutral'], 'index'], neutralSelectedSentencesMpqa.columns]

In [377]:
# verification phase
def directClassification(row, pos_column, neg_column, suffix):
    if row[pos_column] < row[neg_column]:
        row['apriori-class-'+suffix] = -1
    elif row[pos_column] > row[neg_column]:
        row['apriori-class-'+suffix] = 1
    else:
        row['apriori-class-'+suffix] = 0
    
    return row

In [570]:
def calculateSentiment(sentences, column_predicted):
    numPositive = sentences[sentences[column_predicted] == sentiment_positive].shape[0]
    numNegative = sentences[sentences[column_predicted] == sentiment_negative].shape[0]
    numNeutral = sentences[sentences[column_predicted] == 0].shape[0]
#    print("Pos = {}; Neg = {}; Neutral = {}".format(numPositive, numNegative, numNeutral))
    if numPositive > numNegative:
        return sentiment_positive
    elif numPositive < numNegative:
        return sentiment_negative
    else:
        return 0    

def calculateSentimentReview(review_id, df, col):
    review_data = review_id.split('#')
    review_set_id = int(review_data[2])
    review_rating = int(review_data[1])
    review_df_id = int(review_data[0])
    sentences = df[(df['id'] == review_df_id) & (df['set'] == review_set_id) & (df['rating'] == review_rating) ]
    return calculateSentiment(sentences, col)

In [571]:
len(selected_columns_test)

86

In [740]:
test_reviews_even['expected'] = test_reviews_even['type']
verificationData = sentencesTestSubsetImdb[selected_columns_test]

In [690]:
sentencesTestSubsetImdb = sentencesTestSubsetImdb.apply(lambda row: directClassification(row, 'priorpolarityCountpositive', 'priorpolarityCountnegative', 'direct' ), axis = 1)
test_reviews_even['predicted-direct'] = test_reviews_even['real-id'].map(lambda cell: calculateSentimentReview(cell, sentencesTestSubsetImdb, 'apriori-class-direct'))
print("Done")
selectedPolarizedSentencesMpqa = selectedPolarizedSentencesMpqa.apply(lambda row: directClassification(row, 'priorpolarityCountpositive', 'priorpolarityCountnegative', 'direct' ), axis = 1)

Done


In [741]:
sentencesTestSubsetImdb['trained-classifier-predicted'] = best_clasifier.predict(verificationData.values)
test_reviews_even['predicted-model'] = test_reviews_even['real-id'].map(lambda cell: calculateSentimentReview(cell, sentencesTestSubsetImdb, 'trained-classifier-predicted'))
print("Done")
print("Direct model: {}".format(calculateAccuracy(test_reviews_even, 'predicted-direct', 'expected')))
print("Trained model: {}".format(calculateAccuracy(test_reviews_even, 'predicted-model', 'expected')))
print("")
print("Direct model (sentences): {}".format(calculateAccuracy(selectedPolarizedSentencesMpqa, 'apriori-class-direct', 'sentiment-intensity')))
print
print
print("Direct model")
analyzeErrors(test_reviews_even, 'predicted-direct', 'expected')
print
print("Trained model")
analyzeErrors(test_reviews_even, 'predicted-model', 'expected')


Done
Direct model: 0.525
Trained model: 0.537

Direct model (sentences): 0.442000523697


Direct model
Accuracy: 0.525
Accuracy in positive sentiment: 0.803
Accuracy in negative sentiment: 0.247
Accuracy in neutral sentiment: 0
[[247 143 610]
 [  0   0   0]
 [103  94 803]]
[[ 0.247  0.143  0.61 ]
 [   nan    nan    nan]
 [ 0.103  0.094  0.803]]

Trained model
Accuracy: 0.537
Accuracy in positive sentiment: 0.505
Accuracy in negative sentiment: 0.569
Accuracy in neutral sentiment: 0
[[569 140 291]
 [  0   0   0]
 [356 139 505]]
[[ 0.569  0.14   0.291]
 [   nan    nan    nan]
 [ 0.356  0.139  0.505]]


In [693]:
doSave = True
version = '11'

In [694]:
if doSave:
    joblib.dump(best_clasifier, './models-sentiment/version_'+version+'/version_'+version+'.pkl')
    with open('./models-sentiment/version_'+version+'/features.dump','wb') as fp:
        pickle.dump(selected_columns_test, fp)    

In [695]:
doSave = False

In [112]:
doNeuralNetwork = False

In [113]:
# Neural network
from tools.thirdparty.nnbook import network2

In [119]:
def preprocessResultsNN(row):
    if row['expected'] > 0:
        row['pos'] = 1
        row['neuron'] = 2
    elif row['expected'] < 0:
        row['neg'] = 1
        row['neuron'] = 0
    else:
        row['neutral'] = 1
        row['neuron'] = 1
    
    return row

def preprocessDataFrameForNN(df, not_test = True):
    df_nn = df.drop('index', axis = 1).reset_index(drop = True)
    expected_dt = pd.DataFrame(np.zeros((df_nn.shape[0], 5)), columns = ['neg', 'neutral', 'pos', 'neuron', 'expected'])
    expected_dt['expected'] = df_nn['sentiment-intensity']
    expected_dt = expected_dt.apply(lambda row: preprocessResultsNN(row), axis = 1)
    df_nn.drop('sentiment-intensity', axis = 1, inplace = True)
    if 'predicted' in df_nn.columns:
        df_nn.drop('predicted', axis = 1, inplace = True)

    inp_size = len(df_nn.columns)
    out_size = 3
    
    for col in df_nn.columns:
        if df_nn[col].dtype == 'bool':
            df_nn[col] = df_nn[col].astype(int)

    res = expected_dt[['neg', 'neutral', 'pos']].values
    if not_test == False:
        res = expected_dt['neuron'].values
        out_size = 1
            
        training_data = [(np.ndarray((inp_size,1), buffer=np.array(x)),
                        np.ndarray((out_size,1), buffer=np.array(y), dtype=int))
                        for (x,y) in zip(df_nn.values, res)]
    else:
        training_data = [(np.ndarray((inp_size,1), buffer=np.array(x)),
                        np.ndarray((out_size,1), buffer=np.array(y), dtype=int))
                        for (x,y) in zip(df_nn.values, res)]

    return training_data, df_nn

In [120]:
if doNeuralNetwork:
    pos_test = best_train[best_train['sentiment-intensity'] == sentiment_positive][1:5]
    neg_test = best_train[best_train['sentiment-intensity'] == sentiment_negative][1:5]
    neutral_test = best_train[best_train['sentiment-intensity'] == 0][1:5]
    train_test_data = pd.concat([pos_test, neg_test, neutral_test])
    train_test_data.shape

In [121]:
if doNeuralNetwork:
    training_data, df_nn = preprocessDataFrameForNN(best_train)
    test_data, df_nn_test = preprocessDataFrameForNN(best_test, False)

In [122]:
if doNeuralNetwork:
    hidden_neurons = 30
    nn = network2.Network([len(training_data[0][0]), hidden_neurons, 3],
                                    cost=network2.CrossEntropyCost)
    #nn.large_weight_initializer()

In [123]:
if doNeuralNetwork:
    epochs = 3
    lmbda = 0.00002
    learningRate = 0.00002
    batchSize = 3

    nn.SGD(
    #        experiment_data
            training_data
           , epochs,batchSize, learningRate,
            lmbda = lmbda, evaluation_data = test_data, 
               monitor_training_accuracy = True
           , monitor_evaluation_accuracy = True,
               monitor_evaluation_cost = True, monitor_training_cost = True
          )