In [2]:
import string
import pandas as pd
import numpy as np
from nltk import word_tokenize
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [5]:
# Read the MPQA sentence table from the processed-corpora directory.
mpqaSentencesPath = './corpora/processed/mpqa-sentences.csv'
with open(mpqaSentencesPath, 'r') as csvHandle:
    df = pd.read_csv(csvHandle)

In [6]:
df.shape

(15802, 8)

In [7]:
df.columns

Index([u'docName', u'dirName', u'idx', u'startByte', u'endByte', u'sentLen',
       u'annotsCount', u'text'],
      dtype='object')

In [63]:
# How many sentences carry each annotation count, restricted to sentences
# that have at least one annotation.
hasAnnotations = df['annotsCount'] > 0
annotCounts = df.loc[hasAnnotations, 'annotsCount'].value_counts()
annotsDf = pd.DataFrame(
    {'Count': annotCounts.index, 'Frequency': annotCounts.values},
    columns=['Count', 'Frequency']
)

In [79]:
# Order the rows by annotation count, store the count as an integer and
# use it as the row label.
annotsDf = annotsDf.sort_values(by='Count', ascending=True)
annotsDf = annotsDf.assign(Count=annotsDf['Count'].astype(int))
annotsDf.index = annotsDf['Count'].values

In [109]:
plt.figure()

<matplotlib.figure.Figure at 0x1d9d7ef0>

In [160]:
# Decorate the current axes: axis labels, y-range spanning the observed
# frequencies, and a dashed vertical marker with its caption.
# NOTE(review): the lower y-limit equals the smallest frequency, so on a log
# axis the rarest bin may be drawn with zero height -- confirm this is intended.
annotsAx = plt.gca()
annotsAx.set_xlabel('Number of annotations per sentence')
annotsAx.set_ylabel('Frequency in dataset (log scale)')
frequencyColumn = annotsDf['Frequency']
annotsAx.set_ylim([frequencyColumn.min(), frequencyColumn.max()])
annotsAx.axvline(9.5, color='b', linestyle='--')
annotsAx.text(10, 1500, 'Cut-off boundary', fontsize=14)

<matplotlib.text.Text at 0x28e72be0>

In [161]:
# Bar chart of sentence frequency per annotation count, log-scaled y axis.
annotsDf['Frequency'].plot.bar(logy=True, rot=0)

<matplotlib.axes._subplots.AxesSubplot at 0x28e4cd68>

In [162]:
plt.show()

In [148]:
# Fraction of the total frequency that sits in bins with fewer than 100 sentences.
rareBinFrequencies = annotsDf.loc[annotsDf['Frequency'] < 100, 'Frequency']
float(rareBinFrequencies.sum()) / annotsDf['Frequency'].sum()

0.019621870209504344

In [149]:
# Total frequency across bins with fewer than 100 sentences.
annotsDf.loc[annotsDf['Frequency'] < 100, 'Frequency'].sum()

192L

In [196]:
df[df['sentLen'] < 2]

Unnamed: 0,docName,dirName,idx,startByte,endByte,sentLen,annotsCount,text
13676,im_401b_e73i32c22_031705-2,ula,59,2130,2131,1.0,0.0,0
14536,sw2078-UTF16-ms98-a-trans,ula,133,9378,9379,1.0,0.0,i
14835,wsj_0160,xbank,0,740,741,1.0,0.0,""""
15466,wsj_0768,xbank,20,4935,4936,1.0,0.0,""""


In [202]:
def wordCount(row):
    """Attach a punctuation-free token count to a sentence row.

    The row's ``'text'`` value is converted with ``str``, every character
    listed in ``string.punctuation`` is removed, surrounding whitespace is
    stripped, and the remainder is tokenized with ``word_tokenize``.

    Parameters
    ----------
    row : mapping-like, e.g. the Series handed over by ``DataFrame.apply(axis=1)``
        Must provide a ``'text'`` entry and support item assignment.

    Returns
    -------
    The same ``row`` object, with ``row['wordCount']`` set to the token count.
    """
    # The two-argument ``str.translate(None, chars)`` form only exists on
    # Python 2 byte strings and raises TypeError on Python 3. Filtering the
    # characters explicitly removes exactly the same set on both versions.
    punctuation = frozenset(string.punctuation)
    cleaned = ''.join(ch for ch in str(row['text']) if ch not in punctuation)
    row['wordCount'] = len(word_tokenize(cleaned.strip()))
    return row

In [203]:
# Token counts for the subset with more than 10 annotations, then for the
# whole table (which gains a 'wordCount' column).
heavilyAnnotated = df[df['annotsCount'] > 10]
dfWC = heavilyAnnotated.apply(wordCount, axis=1)
df = df.apply(wordCount, axis=1)

In [265]:
# Annotation-count distribution for sentences of 41 to 60 tokens.
has41To60Words = (df['wordCount'] > 40) & (df['wordCount'] < 61)
df.loc[has41To60Words, 'annotsCount'].value_counts()

0.0     200
4.0     111
2.0     104
5.0     103
3.0     102
6.0      76
7.0      70
1.0      59
8.0      56
9.0      36
10.0     31
11.0     16
13.0     13
12.0     11
14.0      8
15.0      8
17.0      4
16.0      2
18.0      2
21.0      1
19.0      1
30.0      1
Name: annotsCount, dtype: int64

In [278]:
# 90th percentile of annotation counts among sentences of 31 to 40 tokens
# that carry more than one annotation.
multiAnnot31To40Words = (
    (df['annotsCount'] > 1)
    & (df['wordCount'] > 30)
    & (df['wordCount'] < 41)
)
df.loc[multiAnnot31To40Words, 'annotsCount'].quantile(0.9)

8.0

In [255]:
# Number of annotated sentences per word count, for sentences longer than 31 tokens.
# NOTE(review): `> 31` leaves out sentences of exactly 31 tokens, while the
# short-sentence selections in this notebook use `< 31` -- confirm whether
# `> 30` was intended.
longAnnotated = (df['wordCount'] > 31) & (df['annotsCount'] > 0)
df3142 = df.loc[longAnnotated, 'wordCount'].value_counts()

In [1]:
# Two-column table: word count and how many sentences have it.
df3142Graph = pd.DataFrame(
    {'Words': df3142.index, 'Occureces': df3142.values},
    columns=['Words', 'Occureces']
)

NameError: name 'pd' is not defined

In [279]:
# Cumulative sentence count as a function of sentence length.
# The table is built from `value_counts`, which orders rows by frequency, so
# it is sorted and indexed by word count first. Without that, the running
# total follows the frequency ranking and the x axis shows row positions
# instead of the word counts named in the x label.
occurrencesByWords = (
    df3142Graph.sort_values(by='Words')
               .set_index('Words')['Occureces']
)

plt.figure()
plt.xlabel('Number of words per sentence')
plt.ylabel('Cumulative frequency of occurrence in dataset')
# Upper limit leaves 10% headroom above the final cumulative total.
plt.ylim([occurrencesByWords.min(), occurrencesByWords.sum() * 1.1])
occurrencesByWords.cumsum().plot.line(rot=0, linewidth=2)
plt.show()

In [284]:
# Size of the subset with at least one annotation and fewer than 31 tokens.
shortAnnotated = (df['annotsCount'] > 0) & (df['wordCount'] < 31)
df[shortAnnotated].shape

(7113, 9)

In [159]:
dfWC.describe()

Unnamed: 0,idx,startByte,endByte,sentLen,annotsCount,wordCount
count,192.0,192.0,192.0,192.0,192.0,192.0
mean,13.598958,2276.255208,2541.786458,265.53125,12.869792,48.229167
std,15.97087,1808.325447,1820.174795,96.165319,2.61264,17.455691
min,0.0,2.0,205.0,126.0,11.0,24.0
25%,4.0,924.5,1167.0,196.5,11.0,35.0
50%,9.0,1866.5,2128.0,245.5,12.0,45.0
75%,18.0,3225.5,3467.25,317.25,14.0,58.0
max,129.0,8179.0,8588.0,893.0,30.0,161.0


In [167]:
# Annotation-count distribution among rows of dfWC with more than 57 tokens.
dfWC.loc[dfWC['wordCount'] > 57, 'annotsCount'].value_counts()

11.0    12
12.0     9
13.0     6
14.0     5
15.0     4
17.0     3
16.0     3
21.0     2
18.0     2
30.0     1
26.0     1
22.0     1
Name: annotsCount, dtype: int64

In [289]:
from tools.parsers.corpora_sentiment import generalinquirer as generalInquirerParser
from tools.parsers.corpora_sentiment import largemoviereviews as largeMovieReviewsParser
from tools.parsers.corpora_sentiment import generalinquirer as generalInquirerParser
from tools.parsers.corpora_sentiment import negation as negationParser 

from tools.sentimentanalysis import preparation

In [290]:
# Load the sentence table through the project's CSV reader, using the default
# file name exposed by the preparation helper (presumably the normalized
# sentiment-sentence file -- confirm against `preparation`).
prepData = preparation.Preparation()
parserInquirer = generalInquirerParser.GeneralInquirer()
sentencesMpqa = parserInquirer.readFileCsv(
    prepData.defaultFileNameSentimentSentencesNormalized
)

# Document names that later cells exclude via `docName`.
non_english_text = [
    'im_401b_e73i32c22_031705-2',
    'IZ-060316-01-Trans-1',
    '20000815_AFP_ARB.0084.IBM-HA-NEW',
    'NapierDianne',
]

In [293]:
# Keep only rows whose document is not on the exclusion list.
fromExcludedDoc = sentencesMpqa['docName'].isin(non_english_text)
polarizedSentencesMpqa = sentencesMpqa[~fromExcludedDoc]

In [297]:
# Rows with sentiment-measured == 1 and sentiment-type == 0, after document exclusion.
polarizedSelection = (
    (polarizedSentencesMpqa['sentiment-measured'] == 1)
    & (polarizedSentencesMpqa['sentiment-type'] == 0)
)
polarizedSentencesMpqa[polarizedSelection].shape

(7896, 12)

In [298]:
# Same selection on the full table, before document exclusion.
fullSelection = (
    (sentencesMpqa['sentiment-measured'] == 1)
    & (sentencesMpqa['sentiment-type'] == 0)
)
sentencesMpqa[fullSelection].shape

(7959, 12)

In [308]:
# Add the per-row token count produced by wordCount.
sentencesMpqa = sentencesMpqa.apply(wordCount, axis=1)

In [304]:
# Add the per-row token count produced by wordCount.
polarizedSentencesMpqa = polarizedSentencesMpqa.apply(wordCount, axis=1)

In [310]:
def filterDf(df, minAnnots=0, wordLimit=31, excludedDocs=None):
    """Select annotated, short sentences from documents that are not excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'annotsCount', 'wordCount' and 'docName'.
    minAnnots : number, optional
        Rows are kept only when 'annotsCount' is strictly greater than this
        value (default 0).
    wordLimit : number, optional
        Exclusive upper bound on 'wordCount' (default 31).
    excludedDocs : list-like of document names, optional
        Rows whose 'docName' is listed here are dropped. When omitted, the
        notebook-level `non_english_text` list is used, which matches the
        previous hard-wired behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that satisfy all three conditions.
    """
    if excludedDocs is None:
        # Looked up at call time so the notebook-level list can still be edited.
        excludedDocs = non_english_text
    keep = (
        (df['annotsCount'] > minAnnots)
        & (df['wordCount'] < wordLimit)
        & ~df['docName'].isin(excludedDocs)
    )
    return df[keep]

In [314]:
filterDf(df).shape

(7040, 9)

In [329]:
# Read the normalized sentiment-sentence table from the processed-corpora directory.
sentimentSentencesPath = './corpora/processed/sentiment-sentences-norm.csv'
with open(sentimentSentencesPath, 'r') as csvHandle:
    dfSent = pd.read_csv(csvHandle)

In [330]:
# Add the per-row token count produced by wordCount.
dfSent = dfSent.apply(wordCount, axis=1)

In [331]:
filterDf(dfSent).shape

(6856, 13)

In [372]:
dfSentPos = filterDf(dfSent)

In [373]:
# Split the filtered sentences by the sign of 'sentiment-intensity'; rows with
# an intensity of exactly 0 end up in neither frame.
# The split starts from a freshly filtered frame rather than from `dfSentPos`
# itself. This cell rebinds `dfSentPos`, so on a second run the old code
# selected negatives from an already positive-only frame and got an empty result.
dfSentFiltered = filterDf(dfSent)
dfSentNeg = dfSentFiltered[dfSentFiltered['sentiment-intensity'] < 0].reset_index()
dfSentPos = dfSentFiltered[dfSentFiltered['sentiment-intensity'] > 0].reset_index()

In [374]:
dfSentPos.shape

(2613, 14)

In [379]:
# Frequency of each intensity value; negative intensities are sign-flipped so
# both tables use positive magnitudes.
valsPos = dfSentPos['sentiment-intensity'].value_counts()
valsNeg = (-dfSentNeg['sentiment-intensity']).value_counts()

sentimentColumns = ['Sentiment', 'Count']
dfSentPosFinal = pd.DataFrame(
    {'Sentiment': valsPos.index, 'Count': valsPos.values},
    columns=sentimentColumns
)
dfSentNegFinal = pd.DataFrame(
    {'Sentiment': valsNeg.index, 'Count': valsNeg.values},
    columns=sentimentColumns
)

In [380]:
# Order both tables by sentiment value and label the rows with it.
dfSentPosFinal = dfSentPosFinal.sort_values(by='Sentiment', ascending=True)
dfSentPosFinal.index = dfSentPosFinal['Sentiment']

dfSentNegFinal = dfSentNegFinal.sort_values(by='Sentiment', ascending=True)
dfSentNegFinal.index = dfSentNegFinal['Sentiment']

In [388]:
# Side-by-side line plots of sentiment-value frequency: positive on the left,
# negative on the right. Labels are set on each axes object directly, so the
# right-hand y label no longer depends on which axes pyplot treats as current.
fig, axes = plt.subplots(nrows=1, ncols=2)
positiveAx, negativeAx = axes[0], axes[1]

dfSentPosFinal['Count'].plot(ax=positiveAx, sharex=False)
positiveAx.set_title("Positive")
positiveAx.set_ylabel('Frequency of sentiment in dataset')

dfSentNegFinal['Count'].plot(ax=negativeAx)
negativeAx.set_title("Negative")
negativeAx.set_ylabel('Frequency of sentiment in dataset')
plt.show()

In [382]:
# Number of positive sentences with sentiment strictly between 0.7 and 0.9.
inBand = (dfSentPosFinal['Sentiment'] > 0.7) & (dfSentPosFinal['Sentiment'] < 0.9)
dfSentPosFinal.loc[inBand, 'Count'].sum()

960L

In [231]:
df[df['wordCount'] > 31].shape

(3053, 9)