In [1]:
import pandas as pd
import numpy as np
from tools.parsers import generalinquirer as generalInquirerParser
from tools.parsers import negation as negationParser 
from tools.sentimentanalysis import preparation
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import time
import string

In [2]:
stemmer = SnowballStemmer("english")
prepData = preparation.Preparation()
parserInquirer = generalInquirerParser.GeneralInquirer()
parserNegation = negationParser.Negation()
sentimentDictionaries = parserInquirer.readFileCsv(parserInquirer.combinedFileLoc)
sentences = parserInquirer.readFileCsv(prepData.defaultFileNameSentimentSentencesNormalized)
negations = parserNegation.readFileCsv(parserNegation.defaultFileNameProcessed)
print("Shape Sentences: {}".format(sentences.shape))
print("Shape Negations: {}".format(negations.shape))

Shape Sentences: (9554, 12)
Shape Negations: (16, 2)


In [3]:
sentences['textLower'] = sentences['text'].map( lambda cell: cell.lower())
polarizedSentences = sentences[sentences['sentiment-type'] == 0]
polarizedSentences.shape

(5646, 13)

In [4]:
sentimentDictionaries['entry'] = sentimentDictionaries['entry'].map(lambda cell: stemmer.stem(cell))

In [5]:
negations['phraseStemmed'] = negations['phrase'].map(lambda cell: stemmer.stem(cell).lower())

In [6]:
sentimentDictionaries.drop_duplicates(subset = 'entryUpper', inplace = True)

In [7]:
print("Negative: {}".format(polarizedSentences[polarizedSentences['sentiment-intensity'] < 0].shape))
print("Neutral: {}".format(polarizedSentences[polarizedSentences['sentiment-intensity'] == 0].shape))
print("Positive: {}".format(polarizedSentences[polarizedSentences['sentiment-intensity'] > 0].shape))
print("Dictionary: {}".format(sentimentDictionaries.shape))

Negative: (836, 13)
Neutral: (2093, 13)
Positive: (2717, 13)
Dictionary: (2736, 26)


In [8]:
sentimentDictionaries[sentimentDictionaries['passive'] == 'passive']

Unnamed: 0,positiv,negativ,active,passive,affil,hostile,strong,power,weak,submit,...,feel,need,persist,entry,othtags,type,pos,stemmed,priorpolarity,entryUpper
2,,negativ,,passive,,,,,,,...,,,,abat,supv,weaksubj,verb,y,negative,ABAT
3,,negativ,,passive,,,,,weak,submit,...,,,,abdic,supv,weaksubj,verb,y,negative,ABDIC
4,,negativ,,passive,,hostile,,,,,...,,,,abhor,supv,strongsubj,anypos,y,negative,ABHOR
8,,negativ,,passive,,,,,weak,submit,...,,,,abject,modif,strongsubj,adj,n,negative,ABJECT
13,positiv,,,passive,,,,,,,...,,,,abound,supv,weaksubj,verb,y,positive,ABOUND
18,,negativ,,passive,,,,,weak,,...,,,,absent-mind,modif,strongsubj,adj,n,negative,ABSENT-MIND
36,positiv,,,passive,,,,,,submit,...,,,,accept,supv,weaksubj,verb,y,positive,ACCEPT
58,,negativ,,passive,,,,,,,...,,,,ach,supv,strongsubj,verb,y,negative,ACH
86,,,,passive,,,,,,submit,...,,,,admiss,noun,weaksubj,adj,n,positive,ADMISS
88,positiv,,,passive,,,,,weak,submit,...,,,,admit,supv,strongsubj,verb,y,positive,ADMIT


In [9]:
def cleanUpWord(w):
    return w.translate(None, string.punctuation).strip()
polarizedSentences[polarizedSentences['index'] == 13740]

Unnamed: 0,index,docName,dirName,idx,startByte,endByte,sentLen,annotsCount,text,sentiment-measured,sentiment-type,sentiment-intensity,textUpper
8944,13740,IZ-060316-01-Trans-1,ula,24,12239,12406,167,1,LU_ANNOTATE> We have restrictions on beheading...,1,0,0.625,LU_ANNOTATE> WE HAVE RESTRICTIONS ON BEHEADING...


In [10]:
def processWords(row):
    global stemmer

    non_english_text = ['im_401b_e73i32c22_031705-2', 'IZ-060316-01-Trans-1', '20000815_AFP_ARB.0084.IBM-HA-NEW', 'NapierDianne']
    words = [ word for word in [cleanUpWord(w) for w in word_tokenize(row['textUpper'])] if len(word) > 0]
    row['wordsList'] = '|' + '|'.join(words) + '|'
    row['wordCount'] = len(words)
    
    if row['docName'] in non_english_text :
        row['wordsListStemmed'] = unicode('|'.join(words), errors='ignore')
    else:
        row['wordsListStemmed'] = '|'.join([stemmer.stem(unicode(w, errors='ignore')) for w in words]).upper()

    row['wordsListStemmed'] = '|' + row['wordsListStemmed'] + '|'
    return row


tStart = time.time()
polarizedSentences = polarizedSentences.apply(lambda row: processWords(row), axis = 1)
tEnd = time.time()
print("Timed: {}".format(str(tEnd - tStart)))

Timed: 12.2449998856


In [11]:
polarizedSentences

Unnamed: 0,index,docName,dirName,idx,startByte,endByte,sentLen,annotsCount,text,sentiment-measured,sentiment-type,sentiment-intensity,textUpper,wordsList,wordCount,wordsListStemmed
0,0,13.40.05-15087,20010620,0,109,226,117,2,The Kimberley Provincial Hospital said it woul...,1,0,0.000000,THE KIMBERLEY PROVINCIAL HOSPITAL SAID IT WOUL...,|THE|KIMBERLEY|PROVINCIAL|HOSPITAL|SAID|IT|WOU...,19,|THE|KIMBERLEY|PROVINCI|HOSPIT|SAID|IT|WOULD|P...
2,11,13.40.05-15087,20010620,11,656,749,93,3,He said it was his opinion that the patient --...,1,0,0.000000,HE SAID IT WAS HIS OPINION THAT THE PATIENT --...,|HE|SAID|IT|WAS|HIS|OPINION|THAT|THE|PATIENT|A...,17,|HE|SAID|IT|WAS|HIS|OPINION|THAT|THE|PATIENT|A...
3,13,13.40.05-15087,20010620,13,588,655,67,3,Saeed said indications were that those tests w...,1,0,0.000000,SAEED SAID INDICATIONS WERE THAT THOSE TESTS W...,|SAEED|SAID|INDICATIONS|WERE|THAT|THOSE|TESTS|...,11,|SAEED|SAID|INDIC|WERE|THAT|THOSE|TEST|WOULD|B...
4,15,23.46.20-17835,20010627,0,1782,1960,178,3,"It is believed that sand from northern areas, ...",1,0,0.000000,"IT IS BELIEVED THAT SAND FROM NORTHERN AREAS, ...",|IT|IS|BELIEVED|THAT|SAND|FROM|NORTHERN|AREAS|...,28,|IT|IS|BELIEV|THAT|SAND|FROM|NORTHERN|AREA|ESP...
5,26,23.46.20-17835,20010627,11,1466,1597,131,2,Private organizations are also being encourage...,1,0,0.937500,PRIVATE ORGANIZATIONS ARE ALSO BEING ENCOURAGE...,|PRIVATE|ORGANIZATIONS|ARE|ALSO|BEING|ENCOURAG...,18,|PRIVAT|ORGAN|ARE|ALSO|BE|ENCOURAG|TO|HELP|FIG...
6,29,00.48.42-17806,20010630,1,1376,1578,202,1,"Labus said the United States, which had until ...",1,0,0.000000,"LABUS SAID THE UNITED STATES, WHICH HAD UNTIL ...",|LABUS|SAID|THE|UNITED|STATES|WHICH|HAD|UNTIL|...,30,|LABUS|SAID|THE|UNIT|STATE|WHICH|HAD|UNTIL|THE...
7,31,00.48.42-17806,20010630,3,2648,2698,50,2,"""We decided to make some bold decisions,"" he s...",1,0,0.959375,"""WE DECIDED TO MAKE SOME BOLD DECISIONS,"" HE S...",|WE|DECIDED|TO|MAKE|SOME|BOLD|DECISIONS|HE|SAID|,9,|WE|DECID|TO|MAKE|SOME|BOLD|DECIS|HE|SAID|
11,37,00.48.42-17806,20010630,9,2507,2641,134,3,We are now fully back into the international c...,1,0,0.948750,WE ARE NOW FULLY BACK INTO THE INTERNATIONAL C...,|WE|ARE|NOW|FULLY|BACK|INTO|THE|INTERNATIONAL|...,20,|WE|ARE|NOW|FULLI|BACK|INTO|THE|INTERN|COMMUNI...
12,38,00.48.42-17806,20010630,10,3931,4016,85,2,"""There is no quick fix"" to the Yugoslav proble...",1,0,-0.960000,"""THERE IS NO QUICK FIX"" TO THE YUGOSLAV PROBLE...",|THERE|IS|NO|QUICK|FIX|TO|THE|YUGOSLAV|PROBLEM...,16,|THERE|IS|NO|QUICK|FIX|TO|THE|YUGOSLAV|PROBLEM...
13,39,00.48.42-17806,20010630,11,4140,4309,169,3,"""The possibility of a democratic, stable and p...",1,0,0.964583,"""THE POSSIBILITY OF A DEMOCRATIC, STABLE AND P...",|THE|POSSIBILITY|OF|A|DEMOCRATIC|STABLE|AND|PR...,26,|THE|POSSIBL|OF|A|DEMOCRAT|STABL|AND|PROSPER|Y...


In [12]:
sentimentDictionaries

Unnamed: 0,positiv,negativ,active,passive,affil,hostile,strong,power,weak,submit,...,feel,need,persist,entry,othtags,type,pos,stemmed,priorpolarity,entryUpper
0,,negativ,,,,,,,weak,,...,,,,abandon,supv,weaksubj,verb,y,negative,ABANDON
2,,negativ,,passive,,,,,,,...,,,,abat,supv,weaksubj,verb,y,negative,ABAT
3,,negativ,,passive,,,,,weak,submit,...,,,,abdic,supv,weaksubj,verb,y,negative,ABDIC
4,,negativ,,passive,,hostile,,,,,...,,,,abhor,supv,strongsubj,anypos,y,negative,ABHOR
6,positiv,,active,,affil,,,,,,...,,,,abid,supv,strongsubj,anypos,y,positive,ABID
7,positiv,,,,,,strong,,,,...,,,,abil,noun,weaksubj,noun,n,positive,ABIL
8,,negativ,,passive,,,,,weak,submit,...,,,,abject,modif,strongsubj,adj,n,negative,ABJECT
9,positiv,,,,,,strong,,,,...,,,,abl,modif,weaksubj,adj,n,positive,ABL
10,,negativ,,,,,,,,,...,,,,abnorm,modif,weaksubj,adj,n,negative,ABNORM
11,,negativ,active,,,hostile,strong,power,,,...,,,,abolish,supv,weaksubj,verb,y,negative,ABOLISH


In [13]:
def countNegations(row):
    words = row['wordsList']
    negations['found'] = negations.apply(lambda row: 1 if '|'+row['phraseUpper']+'|' in words else 0, axis = 1)
    row['negations'] = negations['found'].sum()
    return row

tStart = time.time()
polarizedSentences = polarizedSentences.apply(lambda row: countNegations(row), axis = 1)
tEnd = time.time()
print("Timed: {}".format(str(tEnd-tStart)))

Timed: 7.99000000954


In [14]:
polarizedSentences[polarizedSentences['index'] == 13617]

Unnamed: 0,index,docName,dirName,idx,startByte,endByte,sentLen,annotsCount,text,sentiment-measured,sentiment-type,sentiment-intensity,textUpper,wordsList,wordCount,wordsListStemmed,negations
8910,13617,im_401b_e73i32c22_031705-2,ula,0,4646,4693,47,4,(%إنقطاع) Line 0032-[respondentA2E]: se,1,0,0.93625,(%إنقطاع) LINE 0032-[RESPONDENTA2E]: SE,|إنقطاع|LINE|0032|RESPONDENTA2E|SE|,5,||LINE|0032|RESPONDENTA2E|SE|,0


In [15]:
def countOccurenceColumn(row, lookFor, columnName, normalize):
    words = row['wordsListStemmed']
    lookFor['found'] = lookFor.apply(lambda rowLook: 1 if '|'+rowLook['entryUpper']+'|' in words else 0, axis = 1)
    row[columnName] = lookFor['found'].sum()
    if normalize:
        row[columnName] = row[columnName] / float(len(words))
    return row

def CalculateColumn(column, df, val = None, typeWord = None, normalize = False):
    global sentimentDictionaries
    
    compareVal = column
    if column == None:
        columnName = "All"+typeWord
    else:
        if val != None:
            compareVal = val
        columnName = column+'Count'
        if val != None:
            columnName = columnName + val
        if typeWord != None:
            columnName = columnName + typeWord
        
    if typeWord == None:
        lookFor = sentimentDictionaries[sentimentDictionaries[column] == compareVal].reset_index()
    else:
        if column == None:
            lookFor = sentimentDictionaries[(sentimentDictionaries['type'] == typeWord)].reset_index()
        else:
            lookFor = sentimentDictionaries[(sentimentDictionaries[column] == compareVal) & (sentimentDictionaries['type'] == typeWord)].reset_index()
    
    print("LookFor shape: {}".format(lookFor.shape))
    df[columnName]  = 0
    tStart = time.time()
    df = df.apply(lambda row: countOccurenceColumn(row, lookFor, columnName, normalize), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    return df

In [43]:
polarizedSentences = CalculateColumn('priorpolarity', polarizedSentences, 'negative')
polarizedSentences = CalculateColumn('priorpolarity', polarizedSentences, 'positive')
polarizedSentences = CalculateColumn('priorpolarity', polarizedSentences, 'negative', 'strongsubj')
polarizedSentences = CalculateColumn('priorpolarity', polarizedSentences, 'positive', 'strongsubj')
polarizedSentences = CalculateColumn('priorpolarity', polarizedSentences, 'negative', 'weaksubj')
polarizedSentences = CalculateColumn('priorpolarity', polarizedSentences, 'positive', 'weaksubj')

LookFor shape: (1552, 27)
Timed(priorpolarityCountnegative): 109.779999971
LookFor shape: (1040, 27)
Timed(priorpolarityCountpositive): 74.8910000324
LookFor shape: (972, 27)
Timed(priorpolarityCountnegativestrongsubj): 69.7039999962
LookFor shape: (584, 27)
Timed(priorpolarityCountpositivestrongsubj): 43.8350000381
LookFor shape: (580, 27)
Timed(priorpolarityCountnegativeweaksubj): 43.2169997692
LookFor shape: (456, 27)
Timed(priorpolarityCountpositiveweaksubj): 34.7119998932


In [44]:
polarizedSentences = CalculateColumn(None, polarizedSentences, None, 'weaksubj')
polarizedSentences = CalculateColumn(None, polarizedSentences, None, 'strongsubj')

LookFor shape: (1129, 27)
Timed(Allweaksubj): 86.8709998131
LookFor shape: (1607, 27)
Timed(Allstrongsubj): 118.191999912


In [45]:
polarizedSentences = CalculateColumn('hostile', polarizedSentences, None, 'weaksubj')
polarizedSentences = CalculateColumn('strong', polarizedSentences, None, 'weaksubj')
polarizedSentences = CalculateColumn('hostile', polarizedSentences, None, 'strongsubj')
polarizedSentences = CalculateColumn('strong', polarizedSentences, None, 'strongsubj')
polarizedSentences = CalculateColumn('hostile', polarizedSentences)
polarizedSentences = CalculateColumn('strong', polarizedSentences)

LookFor shape: (140, 27)
Timed(hostileCountweaksubj): 14.2070000172
LookFor shape: (312, 27)
Timed(strongCountweaksubj): 33.0880000591
LookFor shape: (299, 27)
Timed(hostileCountstrongsubj): 29.9669997692
LookFor shape: (274, 27)
Timed(strongCountstrongsubj): 24.5879998207
LookFor shape: (439, 27)
Timed(hostileCount): 33.7050001621
LookFor shape: (586, 27)
Timed(strongCount): 43.754999876


In [46]:
polarizedSentences = CalculateColumn('active', polarizedSentences, None, 'weaksubj')
polarizedSentences = CalculateColumn('passive', polarizedSentences, None, 'weaksubj')
polarizedSentences = CalculateColumn('active', polarizedSentences, None, 'strongsubj')
polarizedSentences = CalculateColumn('passive', polarizedSentences, None, 'strongsubj')
polarizedSentences = CalculateColumn('active', polarizedSentences)
polarizedSentences = CalculateColumn('passive', polarizedSentences)

LookFor shape: (264, 27)
Timed(activeCountweaksubj): 22.2240002155
LookFor shape: (126, 27)
Timed(passiveCountweaksubj): 12.1900000572
LookFor shape: (290, 27)
Timed(activeCountstrongsubj): 23.3739998341
LookFor shape: (193, 27)
Timed(passiveCountstrongsubj): 17.4790000916
LookFor shape: (554, 27)
Timed(activeCount): 41.9180002213
LookFor shape: (319, 27)
Timed(passiveCount): 25.5300002098


In [47]:
polarizedSentences = CalculateColumn('positiv', polarizedSentences, None, 'weaksubj')
polarizedSentences = CalculateColumn('negativ', polarizedSentences, None, 'weaksubj')
polarizedSentences = CalculateColumn('positiv', polarizedSentences, None, 'strongsubj')
polarizedSentences = CalculateColumn('negativ', polarizedSentences, None, 'strongsubj')
polarizedSentences = CalculateColumn('positiv', polarizedSentences)
polarizedSentences = CalculateColumn('negativ', polarizedSentences)

LookFor shape: (403, 27)
Timed(positivCountweaksubj): 31.4140000343
LookFor shape: (500, 27)
Timed(negativCountweaksubj): 38.3249998093
LookFor shape: (562, 27)
Timed(positivCountstrongsubj): 42.3600001335
LookFor shape: (921, 27)
Timed(negativCountstrongsubj): 68.3980000019
LookFor shape: (965, 27)
Timed(positivCount): 70.2990000248
LookFor shape: (1421, 27)
Timed(negativCount): 100.661999941


In [6]:
import cPickle as pickle

In [67]:
with open('./data-sentiment-polarized-stem.dump','wb') as fp:
    pickle.dump(polarizedSentences,fp)

In [7]:
with open('./data-sentiment-polarized-stem.dump','rb') as fp:
    polarizedSentences = pickle.load(fp)

In [None]:
data2

In [8]:
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
def compareValues(df, column):
    plt.figure()
    plt.plot(df[column][df['sentiment-intensity'] == 0].values)
    plt.title(column + " - Neutral")
    mean = str(df[column][df['sentiment-intensity'] == 0].mean())
    std = str(df[column][df['sentiment-intensity'] == 0].std())
    quantile = str(df[column][df['sentiment-intensity'] == 0].quantile([0.25, 0.5, 0.9, 0.95, 0.99, 1.0]))    
    print("Neutral: mean = {}; std = {}; 0.9".format(mean, std))
    print("Quantile:")
    print(quantile)
    print("")

    plt.figure()
    plt.plot(df[column][df['sentiment-intensity'] > 0 ].values)
    plt.title(column + " - Positive")
    mean = str(df[column][df['sentiment-intensity'] > 0].mean())
    std = str(df[column][df['sentiment-intensity'] > 0].std())
    quantile = str(df[column][df['sentiment-intensity'] > 0].quantile([0.25, 0.5, 0.9, 0.95, 0.99, 1.0]))
    print("Positive: mean = {}; std = {}".format(mean, std))
    print("Quantile:")
    print(quantile)
    print("")
    
    plt.figure()
    plt.plot(df[column][df['sentiment-intensity'] < 0 ].values)
    plt.title(column + " - Negative")
    mean = str(df[column][df['sentiment-intensity'] < 0].mean())
    std = str(df[column][df['sentiment-intensity'] < 0].std())
    quantile = str(df[column][df['sentiment-intensity'] < 0].quantile([0.25, 0.5, 0.9, 0.95, 0.99, 1.0]))
    print("Negative: mean = {}; std = {}".format(mean, std))
    print("Quantile:")
    print(quantile)
    print("")
    plt.show()

In [None]:
compareValues(polarizedSentences, 'priorpolarityCountnegativestrongsubj')

In [52]:
polarizedSentences['morePositiveThanNegativeStrong'] = False
dt = polarizedSentences[polarizedSentences['priorpolarityCountpositivestrongsubj'] > polarizedSentences['priorpolarityCountnegativestrongsubj'] ]
polarizedSentences.loc[dt.index,'morePositiveThanNegativeStrong'] = True
polarizedSentences['morePositiveThanNegativeWeak'] = False
dt = polarizedSentences[polarizedSentences['priorpolarityCountpositiveweaksubj'] > polarizedSentences['priorpolarityCountnegativeweaksubj'] ]
polarizedSentences.loc[dt.index,'morePositiveThanNegativeWeak'] = True
polarizedSentences['morePositiveThanNegative'] = False
dt = polarizedSentences[polarizedSentences['priorpolarityCountpositive'] > polarizedSentences['priorpolarityCountnegative'] ]
polarizedSentences.loc[dt.index,'morePositiveThanNegative'] = True

In [53]:
def compareBinaryValues(df, column, txt):
    val1 = polarizedSentences[(polarizedSentences[column] == 0) & (polarizedSentences['sentiment-intensity'] < 0)].shape[0] / float(polarizedSentences[polarizedSentences['sentiment-intensity'] < 0].shape[0])
    print(txt+": Positive <= Negative for negative, {}".format(str(val1)))
    val2 = polarizedSentences[(polarizedSentences[column] == 0) & (polarizedSentences['sentiment-intensity'] > 0)].shape[0] / float(polarizedSentences[polarizedSentences['sentiment-intensity'] > 0].shape[0])
    print(txt+": Positive <= Negative for positive, {}".format(str(val2)))
    val3 = polarizedSentences[(polarizedSentences[column] == 0) & (polarizedSentences['sentiment-intensity'] == 0)].shape[0] / float(polarizedSentences[polarizedSentences['sentiment-intensity'] == 0].shape[0])
    print(txt+": Positive <= Negative for neutral, {}".format(str(val3)))

    val4 = polarizedSentences[(polarizedSentences[column] == 1) & (polarizedSentences['sentiment-intensity'] < 0)].shape[0] / float(polarizedSentences[polarizedSentences['sentiment-intensity'] < 0].shape[0])
    print(txt+": Positive > Negative for negative, {}".format(str(val4)))
    val5 = polarizedSentences[(polarizedSentences[column] == 1) & (polarizedSentences['sentiment-intensity'] > 0)].shape[0] / float(polarizedSentences[polarizedSentences['sentiment-intensity'] > 0].shape[0])
    print(txt+": Positive > Negative for positive, {}".format(str(val5)))
    val6 = polarizedSentences[(polarizedSentences[column] == 1) & (polarizedSentences['sentiment-intensity'] == 0)].shape[0] / float(polarizedSentences[polarizedSentences['sentiment-intensity'] == 0].shape[0])
    print(txt+": Positive > Negative for neutral, {}".format(str(val6)))
    return [val1, val2, val3, val4, val5, val6]

In [None]:
compareBinaryValues(polarizedSentences, 'morePositiveThanNegativeStrong', "Strong")

In [54]:
def checkOccurenceColumn(row, lookFor, columnName):
    global negations
    for _, neg_row in negations.iterrows():
        neg = '|'+neg_row['phraseUpper']+'|'
        neg_stemmed = '|' + neg_row['phraseStemmed'] + '|'
        posNeg = row['wordsList'].find(neg)
        if posNeg == -1 :
            return row
        posNeg = row['wordsListStemmed'].find(neg_stemmed)

        afterPos = posNeg + len(neg_stemmed)
        for _, rowLook in lookFor.iterrows():
            posBefore = row['wordsListStemmed'].find(rowLook['entryUpper'], 0, posNeg)
            posAfter = row['wordsListStemmed'].find(rowLook['entryUpper'], afterPos)

            if posBefore != -1:
                row[columnName+'Before'] = True
            if posAfter != -1:
                row[columnName+'After'] = True
            if row[columnName+'Before'] and row[columnName+'After']:
                return row

    return row


def countOccurenceColumnBeforeAfter(row, lookFor, columnName):
    global negations
    for _, neg_row in negations.iterrows():
        neg = '|'+neg_row['phraseUpper']+'|'
        neg_stemmed = '|' + neg_row['phraseStemmed'] + '|'
        posNeg = row['wordsList'].find(neg)
        if posNeg == -1 :
            return row
        posNeg = row['wordsListStemmed'].find(neg_stemmed)

        afterPos = posNeg + len(neg_stemmed)
        for _, rowLook in lookFor.iterrows():
            posBefore = row['wordsListStemmed'].find(rowLook['entryUpper'], 0, posNeg)
            posAfter = row['wordsListStemmed'].find(rowLook['entryUpper'], afterPos)

            if posBefore != -1:
                row[columnName+'Before'] = row[columnName+'Before'] + 1 
            if posAfter != -1:
                row[columnName+'After'] = row[columnName+'After'] + 1

    return row


def WordsAroundNegations(column, df, val = None, count = False):
    global sentimentDictionaries

    columnName = column
    compareVal = val
    if val is None:
        compareVal = column
    else:
        columnName = column + val
    
    if count:
        columnName = columnName + "Count"
    
    lookFor = sentimentDictionaries[sentimentDictionaries[column] == compareVal]
    
    print("LookFor shape: {}".format(lookFor.shape))
    if count:
        df[columnName+"Before"]  = 0
        df[columnName+"After"]  = 0
    else:
        df[columnName+"Before"]  = False
        df[columnName+"After"]  = False
    tStart = time.time()
    if count:
        df = df.apply(lambda row: countOccurenceColumnBeforeAfter(row, lookFor, columnName), axis = 1)
    else:
        df = df.apply(lambda row: checkOccurenceColumn(row, lookFor, columnName), axis = 1)
    tEnd = time.time()
    print("Timed({}): {}".format(columnName, str(tEnd-tStart)))
    
    return df

In [55]:
polarizedSentences = WordsAroundNegations('priorpolarity', polarizedSentences, 'negative')
polarizedSentences = WordsAroundNegations('priorpolarity', polarizedSentences, 'positive')
polarizedSentences = WordsAroundNegations('active', polarizedSentences)
polarizedSentences = WordsAroundNegations('passive', polarizedSentences)
polarizedSentences = WordsAroundNegations('hostile', polarizedSentences)
polarizedSentences = WordsAroundNegations('yes', polarizedSentences)
polarizedSentences = WordsAroundNegations('no', polarizedSentences)
polarizedSentences = WordsAroundNegations('negate', polarizedSentences)
polarizedSentences = WordsAroundNegations('active', polarizedSentences, None, True)
polarizedSentences = WordsAroundNegations('passive', polarizedSentences, None, True)
polarizedSentences = WordsAroundNegations('hostile', polarizedSentences, None, True)
polarizedSentences = WordsAroundNegations('yes', polarizedSentences, None, True)
polarizedSentences = WordsAroundNegations('no', polarizedSentences, None, True)
polarizedSentences = WordsAroundNegations('negate', polarizedSentences, None, True)
polarizedSentences = WordsAroundNegations('priorpolarity', polarizedSentences, 'negative', True)
polarizedSentences = WordsAroundNegations('priorpolarity', polarizedSentences, 'positive', True)

LookFor shape: (1552, 26)
Timed(priorpolaritynegative): 84.9679999352
LookFor shape: (1040, 26)
Timed(priorpolaritypositive): 55.4409999847
LookFor shape: (554, 26)
Timed(active): 32.5439999104
LookFor shape: (319, 26)
Timed(passive): 18.9079999924
LookFor shape: (439, 26)
Timed(hostile): 27.3129999638
LookFor shape: (5, 26)
Timed(yes): 1.85299992561
LookFor shape: (3, 26)
Timed(no): 1.76099991798
LookFor shape: (119, 26)
Timed(negate): 8.55900001526
LookFor shape: (554, 26)
Timed(activeCount): 31.3399999142
LookFor shape: (319, 26)
Timed(passiveCount): 18.8489999771
LookFor shape: (439, 26)
Timed(hostileCount): 27.6599998474
LookFor shape: (5, 26)
Timed(yesCount): 2.22500014305
LookFor shape: (3, 26)
Timed(noCount): 1.90999984741
LookFor shape: (119, 26)
Timed(negateCount): 8.77400016785
LookFor shape: (1552, 26)
Timed(priorpolaritynegativeCount): 93.1559998989
LookFor shape: (1040, 26)
Timed(priorpolaritypositiveCount): 63.8999998569


In [9]:
polarizedSentences[polarizedSentences['activeBefore'] == True].shape
polarizedSentences.columns

Index([u'index', u'docName', u'dirName', u'idx', u'startByte', u'endByte',
       u'sentLen', u'annotsCount', u'text', u'sentiment-measured',
       u'sentiment-type', u'sentiment-intensity', u'textUpper', u'wordsList',
       u'wordCount', u'wordsListStemmed', u'negations',
       u'priorpolarityCountnegative', u'priorpolarityCountpositive',
       u'priorpolarityCountnegativestrongsubj',
       u'priorpolarityCountpositivestrongsubj',
       u'priorpolarityCountnegativeweaksubj',
       u'priorpolarityCountpositiveweaksubj', u'Allweaksubj', u'Allstrongsubj',
       u'hostileCountweaksubj', u'strongCountweaksubj',
       u'hostileCountstrongsubj', u'strongCountstrongsubj', u'hostileCount',
       u'strongCount', u'activeCountweaksubj', u'passiveCountweaksubj',
       u'activeCountstrongsubj', u'passiveCountstrongsubj', u'activeCount',
       u'passiveCount', u'positivCountweaksubj', u'negativCountweaksubj',
       u'positivCountstrongsubj', u'negativCountstrongsubj', u'positivCount',


In [10]:
from sklearn.decomposition import PCA

In [11]:
pca = PCA()
pca.fit(polarizedSentences[[
       'positivCount', 'negativCount', 'sentiment-intensity', 'priorpolaritynegativeBefore', 'priorpolaritynegativeAfter',
        'priorpolaritypositiveBefore', 'priorpolaritypositiveAfter', 'activeBefore', 'activeAfter', 'passiveBefore', 'passiveAfter',
        'hostileBefore', 'hostileAfter', 'yesBefore', 'yesAfter', 'noBefore', 'noAfter', 'negateBefore', 'negateAfter',
        'activeCountBefore', 'activeCountAfter', 'passiveCountBefore', 'passiveCountAfter', 'hostileCountBefore', 'hostileCountAfter',
        'yesCountBefore', 'yesCountAfter', 'noCountBefore', 'noCountAfter', 'negateCountBefore', 'negateCountAfter',
        'priorpolaritynegativeCountBefore', 'priorpolaritynegativeCountAfter', 'priorpolaritypositiveCountBefore',
        'priorpolaritypositiveCountAfter']].values)
pca.explained_variance_ratio_

array([  3.20827862e-01,   2.56039600e-01,   1.47232621e-01,
         9.58005469e-02,   5.80041782e-02,   3.60129525e-02,
         2.34791256e-02,   1.15447412e-02,   9.29938489e-03,
         7.60716270e-03,   6.00594705e-03,   4.69987367e-03,
         4.17364890e-03,   4.08769479e-03,   2.40026114e-03,
         1.81914217e-03,   1.54510663e-03,   1.37477551e-03,
         1.18922057e-03,   9.65460475e-04,   9.50901019e-04,
         8.68713376e-04,   7.24852777e-04,   6.64687129e-04,
         6.40167088e-04,   5.96902501e-04,   5.41258799e-04,
         5.25846387e-04,   3.38845801e-04,   2.88672031e-05,
         9.65189773e-06,   2.97819385e-33,   2.97819385e-33,
         2.97819385e-33,   2.97819385e-33])

In [None]:
print(min())

In [None]:
def get_equal_number_sentiments(df):
    
    

In [12]:
def create_n_stratified_parts(df, n_cuts, project_columns):
    copy_df = df[project_columns].copy().reset_index()
    
    copy_df = copy_df.reindex(np.random.permutation(copy_df.index))
    positive = copy_df[copy_df['sentiment-intensity'] > 0].reset_index(drop = True)
    neutral = copy_df[copy_df['sentiment-intensity'] == 0].reset_index(drop = True)
    negative = copy_df[copy_df['sentiment-intensity'] < 0].reset_index(drop = True)
    
    positive_parts = []
    negative_parts = []
    neutral_parts = []
    positive_step = positive.shape[0] / n_cuts
    negative_step = negative.shape[0] / n_cuts
    neutral_step = neutral.shape[0] / n_cuts
    for part in range(n_cuts):
        if part == n_cuts - 1:
            positive_parts.append(positive[part * positive_step:])
            negative_parts.append(negative[part * negative_step:])
            neutral_parts.append(neutral[part * neutral_step:])
        else:
            positive_parts.append(positive[part * positive_step : (part+1) * positive_step])
            negative_parts.append(negative[part * negative_step : (part+1) * negative_step])
            neutral_parts.append(neutral[part * neutral_step : (part+1) * neutral_step])
            
    
    return positive_parts, negative_parts, neutral_parts

In [13]:
polarizedSentences.loc[polarizedSentences['sentiment-intensity'] < 0, 'sentiment-intensity' ] = -1
polarizedSentences.loc[polarizedSentences['sentiment-intensity'] > 0, 'sentiment-intensity' ] = 1

In [24]:
#'priorpolarityCountnegative', 'negations', 'priorpolarityCountpositive', 'priorpolarityCountnegativestrongsubj',
#       'priorpolarityCountpositivestrongsubj', 
n_cuts = 7
positive_parts, negative_parts, neutral_parts = create_n_stratified_parts(polarizedSentences, n_cuts, [
        'sentiment-intensity', 
        'negations',
        'positivCount', 'negativCount', 'priorpolaritynegativeAfter',
#        'priorpolaritypositiveBefore', 'activeBefore', 'passiveBefore', , 'priorpolaritynegativeBefore'
        'hostileBefore', 'yesBefore', 'yesAfter', 'noBefore', 'negateBefore',
#        'activeCountBefore','passiveCountBefore', 'hostileCountBefore',  'noCountBefore', 'negateCountBefore', 'yesCountBefore', 
        'priorpolaritynegativeCountBefore', 'priorpolaritypositiveCountBefore',
        'activeCountAfter', 'passiveCountAfter', 'hostileCountAfter',  'passiveAfter',
        'morePositiveThanNegativeStrong', 'morePositiveThanNegativeWeak', 'morePositiveThanNegative',
 #       'yesCountAfter', 'noCountAfter', 'negateCountAfter',  'activeAfter',
        'priorpolaritynegativeCountAfter', 'priorpolaritypositiveAfter',    'negateAfter',
        'priorpolaritypositiveCountAfter'
#        'hostileAfter', 'noAfter'
    ])

In [25]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB

In [29]:
#        print(train.shape)
#    clasifier = AdaBoostClassifier(
#                    RandomForestClassifier(n_estimators = 50),
#                    DecisionTreeClassifier(max_depth=4, max_features = 'log2'),
#                    OneVsRestClassifier(SVC(kernel='linear')),
#                   n_estimators=500,
#                   algorithm="SAMME",
#                   learning_rate=1.03)
#    clasifier = OneVsRestClassifier(SVC(kernel='sigmoid'))
#    clasifier = ExtraTreesClassifier(n_estimators = 2000, max_features = 'sqrt', max_depth = 8, n_jobs = -1)

best_test = None
best_train = None
best_predicted = None
best_accuracy = 0
best_clasifier = None

accuracy = []
num_repeats = 3
consts = [4, 6, 8, 10, 12]

for idx_const in range(len(consts)):
    print("######### Next Constant")
    for part in range(n_cuts):
        test = pd.DataFrame({})
        train = pd.DataFrame({})
        # create test and training set
        for idx in range(n_cuts):
            if idx == part:
                test = pd.concat([positive_parts[idx], negative_parts[idx], neutral_parts[idx]])
            else:
                train = pd.concat([train, positive_parts[idx], negative_parts[idx], neutral_parts[idx]])

        # train & evaluate
        train_results = train['sentiment-intensity'].values
        train.drop('sentiment-intensity', axis = 1, inplace = True)
        train_index = train['index'].values
        train.drop('index', axis = 1, inplace = True)    
        test_results = test['sentiment-intensity'].values
        test.drop('sentiment-intensity', axis = 1, inplace = True)
        test_index = test['index'].values
        test.drop('index', axis = 1, inplace = True)    
        acc = []
        for repeat_test in range(num_repeats):
            clasifier = RandomForestClassifier(n_estimators = 800, max_features = None, max_depth = consts[idx_const], n_jobs = -1)
            clasifier.fit(train.values, train_results)

            predicted = clasifier.predict(test.values)
            list_predicted = list(predicted)
            current_accuracy = accuracy_score(test_results, predicted)
            acc.append(current_accuracy)

            if best_accuracy < current_accuracy:
                best_test = test.copy()
                best_test['index'] = test_index
                best_test['sentiment-intensity'] = test_results
                best_train = train.copy()
                best_train['index'] = train_index
                best_train['sentiment-intensity'] = train_results
                best_predicted = predicted.copy()
                best_clasifier = clasifier
                best_accuracy = current_accuracy
                print("Best Configuration: idx = {}, round = {}, cut = {}, accuracy = {}".format(idx_const, repeat_test, part, current_accuracy))

        accuracy.append(np.mean(acc))


    print("{}) Mean Accuracy: {}".format(consts[idx_const], np.mean(accuracy)))


######### Next Constant
4) Mean Accuracy: 0.539279040354
######### Next Constant
6) Mean Accuracy: 0.542820819165
######### Next Constant
8) Mean Accuracy: 0.545418668238
######### Next Constant
10) Mean Accuracy: 0.546067850907
######### Next Constant
12) Mean Accuracy: 0.545760966765


In [31]:
joblib.dump(best_clasifier, './models/sentiment/version_1/version_1.pkl') 

['./models/sentiment/version_1.pkl',
 './models/sentiment/version_1.pkl_01.npy',
 './models/sentiment/version_1.pkl_02.npy',
 './models/sentiment/version_1.pkl_03.npy',
 './models/sentiment/version_1.pkl_04.npy',
 './models/sentiment/version_1.pkl_05.npy',
 './models/sentiment/version_1.pkl_06.npy',
 './models/sentiment/version_1.pkl_07.npy',
 './models/sentiment/version_1.pkl_08.npy',
 './models/sentiment/version_1.pkl_09.npy',
 './models/sentiment/version_1.pkl_10.npy',
 './models/sentiment/version_1.pkl_11.npy',
 './models/sentiment/version_1.pkl_12.npy',
 './models/sentiment/version_1.pkl_13.npy',
 './models/sentiment/version_1.pkl_14.npy',
 './models/sentiment/version_1.pkl_15.npy',
 './models/sentiment/version_1.pkl_16.npy',
 './models/sentiment/version_1.pkl_17.npy',
 './models/sentiment/version_1.pkl_18.npy',
 './models/sentiment/version_1.pkl_19.npy',
 './models/sentiment/version_1.pkl_20.npy',
 './models/sentiment/version_1.pkl_21.npy',
 './models/sentiment/version_1.pkl_22.n

In [30]:
accuracy_score(best_test['sentiment-intensity'], best_predicted)

0.5694789081885856

In [None]:
best_test

In [None]:
predicted_dt = pd.DataFrame(best_predicted, columns = ['predicted'])
best_test_noindex = best_test.reset_index(drop = True)
best_test_with_predicted = pd.concat([best_test_noindex, predicted_dt], axis = 1)
best_test_with_predicted

In [None]:
wrongClass = best_test_with_predicted.loc[best_test_with_predicted['sentiment-intensity'] != best_test_with_predicted['predicted'], 'index']
wrongClass.shape

In [None]:
wrongData = data2.loc[wrongClass, :].copy()
wrongData['predicted'] = best_test_with_predicted.loc[best_test_with_predicted['sentiment-intensity'] != best_test_with_predicted['predicted'], 'predicted'].values
wrongData['truth'] = 0
wrongData.loc[wrongData['sentiment-intensity'] < 0, 'truth'] = -1
wrongData.loc[wrongData['sentiment-intensity'] == 0, 'truth'] = 0
wrongData.loc[wrongData['sentiment-intensity'] > 0, 'truth'] = 1
wrongData.drop('sentiment-intensity', axis = 1, inplace = True)

In [None]:
with open('./wrong_fit.csv', 'wb') as csvfile:
    wrongData.to_csv(csvfile, index = False)

In [None]:
len(data2.loc[wrongClass,'docName'].unique())

In [None]:
wrongData[(wrongData['truth'] == -1) & (wrongData['predicted'] != -1)]

In [None]:
best_test_with_predicted[best_test_with_predicted['sentiment-intensity'] < 0].shape

In [53]:
def directClassification(row, pos_column, neg_column, suffix):
    if row[pos_column] < row[neg_column]:
        row['apriori-class-'+suffix] = -1
    elif row[pos_column] > row[neg_column]:
        row['apriori-class-'+suffix] = 1
    else:
        row['apriori-class-'+suffix] = 0
    
    return row

In [54]:
polarizedSentences = polarizedSentences.apply(lambda row: directClassification(row, 'priorpolarityCountpositivestrongsubj', 'priorpolarityCountnegativestrongsubj', 'strong' ), axis = 1)
correctAprioriClassesWeighted = polarizedSentences[polarizedSentences['sentiment-intensity'] == polarizedSentences['apriori-class-strong']].shape[0]
print("Apriori class (strong) accurracy: {}".format( correctAprioriClassesWeighted / float(polarizedSentences.shape[0]) ))

Apriori class (strong) accurracy: 0.525327665604


In [56]:
polarizedSentences = polarizedSentences.apply(lambda row: directClassification(row, 'priorpolarityCountpositive', 'priorpolarityCountnegative', 'direct' ), axis = 1)
correctAprioriClasses = polarizedSentences[polarizedSentences['sentiment-intensity'] == polarizedSentences['apriori-class-direct']].shape[0]
print("Apriori class (direct) accurracy: {}".format( correctAprioriClasses / float(polarizedSentences.shape[0]) ))

Apriori class (direct) accurracy: 0.526567481403


In [57]:
polarizedSentences = polarizedSentences.apply(lambda row: directClassification(row, 'positivCount', 'negativCount', 'pos-neg' ), axis = 1)
correctAprioriClasses = polarizedSentences[polarizedSentences['sentiment-intensity'] == polarizedSentences['apriori-class-pos-neg']].shape[0]
print("Apriori class (direct) accurracy: {}".format( correctAprioriClasses / float(polarizedSentences.shape[0]) ))

Apriori class (direct) accurracy: 0.523556500177


In [None]:
import 