## Enforcing Deeper Language Understanding of Context: The Case of the Story Cloze Test Set

### Data Load

We load in the data as well create a vector for Right and Wrong answers

In [1]:
import pandas as pd
import nltk
from scipy import stats
import plotly.plotly as py
from plotly import tools
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)


In [2]:
data=pd.read_csv('full_data.csv', delimiter='\t')
data.index = data['InputStoryid']
right_answers=data['rightanswer']
wrong_answers=data['wronganswer']

### Word Distribution
We analyze the frequent unigrams, bigrams and trigrams between Right and Wrong answers. 

In [72]:
right_unigrams = pd.read_csv('word_dist/right_uni_dist.tsv', delimiter='\t', index_col=0)
wrong_unigrams = pd.read_csv('word_dist/wrong_uni_dist.tsv', delimiter='\t', index_col=0)

In [74]:
print(right_unigrams.head(20))

         count
word          
decided     81
time        73
got         73
went        62
day         61
home        57
made        55
great       52
felt        52
new         50
found       49
friends     44
loved       38
became      35
go          33
money       31
back        31
dog         30
enjoyed     28
better      28


In [75]:
print(wrong_unigrams.head(20))

         count
word          
decided    167
went        79
never       78
home        65
didnt       65
hated       50
felt        49
got         47
go          46
day         45
away        44
told        43
gave        39
dog         36
new         35
back        35
threw       34
time        31
like        29
get         28


In [78]:
right_bigrams = pd.read_csv('word_dist/right_bi_dist.tsv', delimiter='\t', index_col=0)
wrong_bigrams = pd.read_csv('word_dist/wrong_bi_dist.tsv', delimiter='\t', index_col=0)
print(right_bigrams.head(20))


               word  count
2689      greattime     21
76         wenthome     16
2980        nextday     14
2929        saidyes     11
1699     decidedbuy      7
1332         goback      6
3858      decidedgo      6
2771  wonderfultime      5
1251      rightaway      5
3793       icecream      5
82          gothome      5
2028         gohome      5
3055     feltbetter      5
3201       nexttime      4
1033     muchbetter      4
2011      madeplans      4
1789         lotfun      4
1884  learnedlesson      4
3802       greatday      4
1379        gotsick      4


In [79]:
print(wrong_bigrams.head(20))

              word  count
82        wenthome     18
3871     decidedgo     18
1430     didntwant     14
188   decidednever     14
3799      icecream     13
3537     didntlike     12
1371     threwaway     11
827       wentback      9
3001       nextday      9
2195    vowednever      8
547   decideddidnt      8
1693    decidedbuy      8
1786      everyday      7
32      walkedaway      6
2436     didntcare      6
1680    decidedget      6
804       backhome      6
2312       nevergo      5
2388     didntknow      5
3562   decidedquit      5


### Sentence Length and Number of Words

Create a vector for sentence length, number of words and the differences between the two. 

In [3]:
right_len=right_answers.str.len()
wrong_len=wrong_answers.str.len()
print'Avg Length of Right Ans:' , right_len.mean(), 'Avg Length of Wrong Ans:' , wrong_len.mean()

lengths = [right_len.as_matrix(),wrong_len.as_matrix()]
len_labels = ['right_answer_len', 'wrong_answer_len']
fig = ff.create_distplot(lengths, len_labels)
py.iplot(fig, filename='Plot of Answers Length')

Avg Length of Right Ans: 39.7637626937 Avg Length of Wrong Ans: 39.1063602352
High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~rsharma441/0 or inside your plot.ly account where it is named 'Plot of Answers Length'


In [4]:
right_numwords=right_answers.apply(nltk.word_tokenize).apply(len)
wrong_numwords=wrong_answers.apply(nltk.word_tokenize).apply(len)
print'Avg NumWords of Right Ans:' , right_numwords.mean(), 'Avg NumWords of Wrong Ans:' , wrong_numwords.mean()
num_words = [right_numwords.as_matrix(),wrong_numwords.as_matrix()]
nw_labels = ['right_answer_numwords', 'wrong_answer_numwords']
fig = ff.create_distplot(num_words, nw_labels)
py.iplot(fig, filename='Plot of Num Word')

Avg NumWords of Right Ans: 8.75734901122 Avg NumWords of Wrong Ans: 8.59540352753


Let us conduct a t-test to see if there is a significant difference between the right and wrong length and numwords.

In [5]:
length_test = stats.ttest_ind(right_len,wrong_len,equal_var=False)
numwords_test = stats.ttest_ind(right_numwords,wrong_numwords,equal_var=False)
print 'Length Test Statistic:', length_test[0], "Length Test pValue:" , length_test[1]
print 'Num Words Test Statistic:', numwords_test[0], "Length Test pValue:", numwords_test[1]

Length Test Statistic: 1.60146923754 Length Test pValue: 0.109358238941
Num Words Test Statistic: 1.89991428244 Length Test pValue: 0.0575218185857


### Sentiment Analysis

In [6]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
headers= ['neg', 'neu', 'pos', 'compound']

right_answers=data['rightanswer'].to_frame()
wrong_answers=data['wronganswer'].to_frame()

right_sent = right_answers.join(pd.DataFrame([analyzer.polarity_scores(sentence) for sentence in right_answers['rightanswer']], index=data.index, columns=headers))
wrong_sent = wrong_answers.join(pd.DataFrame([analyzer.polarity_scores(sentence) for sentence in wrong_answers['wronganswer']], index=data.index, columns=headers))


The twython library has not been installed. Some functionality from the twitter package will not be available.



In [7]:
neg_sent = [right_sent['neg'].as_matrix(), wrong_sent['neg'].as_matrix()]
neu_sent = [right_sent['neu'].as_matrix(), wrong_sent['neu'].as_matrix()]
pos_sent = [right_sent['pos'].as_matrix(), wrong_sent['pos'].as_matrix()]
comp_sent = [right_sent['compound'].as_matrix(), wrong_sent['compound'].as_matrix()]
neg_labels = ['right_neg', 'wrong_neg']
neu_labels = ['right_neu', 'wrong_neu']
pos_labels = ['right_pos', 'wrong_pos']
comp_labels = ['right_comp', 'wrong_comp']
fig_neg = ff.create_distplot(neg_sent, neg_labels, show_hist=False, show_rug=False)
py.iplot(fig_neg, filename='Plot of Neg Sentiment')

In [8]:
fig_neu = ff.create_distplot(neu_sent, neu_labels, show_hist=False, show_rug=False)
py.iplot(fig_neu, filename='Plot of Neu Sentiment')

In [9]:
fig_pos = ff.create_distplot(pos_sent, pos_labels, show_hist=False, show_rug=False)
py.iplot(fig_pos, filename='Plot of Pos Sentiment')

In [10]:
fig_comp = ff.create_distplot(comp_sent, comp_labels, show_hist=False, show_rug=False)
py.iplot(fig_comp, filename='Plot of Comp Sentiment')

### Part of Speech

In [11]:
import re, string
import math
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from statsmodels.stats.proportion import proportions_ztest
import pos_helpers

In [12]:
poslist = ['PRP$','VBG','VBD','VBN',',',"''",'VBP','WDT','JJ','WP','VBZ','DT','RP','$','NN','POS','.','TO','PRP','RB','NNS','NNP','``','WRB','CC','PDT','RBR','CD','EX','IN','MD','NNPS','JJS','JJR','VB','UH','RBS',':','FW']
right_answers=data['rightanswer'].apply(pos_helpers.remove_punc)
wrong_answers=data['wronganswer'].apply(pos_helpers.remove_punc)

In [13]:
right_answers_pos=pd.DataFrame(right_answers.apply(word_tokenize).apply(pos_tag).apply(pos_helpers.first_element))
right_answers_pos['unigrams']=right_answers_pos['rightanswer'].apply(lambda x: ' '.join(x))
right_answers_pos['bigrams'] = right_answers_pos['rightanswer'].apply(pos_helpers.get_bigrams).apply(lambda x: ' '.join(x))

In [14]:
wrong_answers_pos=pd.DataFrame(wrong_answers.apply(word_tokenize).apply(pos_tag).apply(pos_helpers.first_element))
wrong_answers_pos['unigrams']=wrong_answers_pos['wronganswer'].apply(lambda x: ' '.join(x))
wrong_answers_pos['bigrams'] = wrong_answers_pos['wronganswer'].apply(pos_helpers.get_bigrams).apply(lambda x: ' '.join(x))

In [19]:
headers = {}

right_uni_pos_matrix, headers['right_uni'] = pos_helpers.create_pos_matrix(right_answers_pos['unigrams'], data.index)
wrong_uni_pos_matrix, headers['wrong_uni'] = pos_helpers.create_pos_matrix(wrong_answers_pos['unigrams'], data.index)
right_bi_pos_matrix, headers['right_bi'] = pos_helpers.create_pos_matrix(right_answers_pos['bigrams'], data.index)
wrong_bi_pos_matrix, headers['wrong_bi'] = pos_helpers.create_pos_matrix(wrong_answers_pos['bigrams'], data.index)
#right_tri_pos_matrix, headers['right_tri'] = pos_helpers.create_pos_matrix(right_answers_pos['trigrams'], data.index)
#wrong_tri_pos_matrix, headers['wrong_tri'] = pos_helpers.create_pos_matrix(wrong_answers_pos['trigrams'], data.index)


right_pos_matrix = right_uni_pos_matrix.join(right_bi_pos_matrix)
wrong_pos_matrix = wrong_uni_pos_matrix.join(wrong_bi_pos_matrix)

pos_sizes = {}
pos_sizes['r_uni'] = right_answers_pos['unigrams'].str.count(" ")+1
pos_sizes['r_bi'] = right_answers_pos['bigrams'].str.count(" ")+1
#pos_sizes['r_tri'] = right_answers_pos['trigrams'].str.count(" ")+1
pos_sizes['w_uni'] = wrong_answers_pos['unigrams'].str.count(" ")+1
pos_sizes['w_bi'] = wrong_answers_pos['bigrams'].str.count(" ")+1
#pos_sizes['w_tri'] = wrong_answers_pos['trigrams'].str.count(" ")+1

###hypothesis testing
compare_dist=pos_helpers.compare_pos_dist(headers, pos_sizes, right_pos_matrix, wrong_pos_matrix)


In [20]:
pos_sizes = {}
pos_sizes['r_uni'] = right_answers_pos['unigrams'].str.count(" ")+1
pos_sizes['r_bi'] = right_answers_pos['bigrams'].str.count(" ")+1
#pos_sizes['r_tri'] = right_answers_pos['trigrams'].str.count(" ")+1
pos_sizes['w_uni'] = wrong_answers_pos['unigrams'].str.count(" ")+1
pos_sizes['w_bi'] = wrong_answers_pos['bigrams'].str.count(" ")+1
#pos_sizes['w_tri'] = wrong_answers_pos['trigrams'].str.count(" ")+1


In [34]:
compare_dist=pos_helpers.compare_pos_dist(headers, pos_sizes, right_pos_matrix, wrong_pos_matrix)
compare_dist[compare_dist['code']=='B'].head(20).sort_values(by=['z', 'w_ct', 'r_ct'], ascending=[False, False, False])

Unnamed: 0,code,pos,r_ct,w_ct,z
14,B,vb,3418.0,531.0,0.010413
7,B,nn,3711.0,2299.0,0.004478
5,B,jj,973.0,686.0,0.001231
0,B,cc,359.0,253.0,0.000791
21,B,ccnn,70.0,29.0,0.000784
9,B,prp,1178.0,1029.0,0.000658
6,B,md,77.0,61.0,0.000233
1,B,cd,45.0,34.0,0.000207
3,B,ex,14.0,8.0,0.000193
18,B,ccdt,14.0,9.0,0.000181


In [31]:
compare_dist[compare_dist['code']=='W'].head(15).sort_values(by=['w_ct'], ascending=[False])

Unnamed: 0,code,pos,r_ct,w_ct,z
179,W,vbd,0.0,2041.0,
174,W,nnp,0.0,1199.0,
176,W,nns,0.0,491.0,
181,W,vbn,0.0,249.0,
180,W,vbg,0.0,213.0,
182,W,vbp,0.0,145.0,
183,W,vbz,0.0,126.0,
172,W,jjr,0.0,33.0,
177,W,rbr,0.0,28.0,
173,W,jjs,0.0,25.0,


In [33]:
compare_dist[compare_dist['code']=='R'].head(15).sort_values(by=['r_ct'], ascending=[False])

Unnamed: 0,code,pos,r_ct,w_ct,z
68,R,jjvb,10.0,0.0,
44,R,exvb,6.0,0.0,
19,R,ccin,3.0,0.0,
13,R,uh,2.0,0.0,
25,R,cdcd,1.0,0.0,
30,R,cdvb,1.0,0.0,
31,R,cdwdt,1.0,0.0,
39,R,dtrp,1.0,0.0,
41,R,dtwp,1.0,0.0,
42,R,excc,1.0,0.0,


#### Bucketed POS
Instead of having fully treebank of POS, we bucket some (i.e. JJ, JJR and JJS all map to JJ) 

In [46]:
right_answers_pos=pd.DataFrame(right_answers.apply(word_tokenize).apply(pos_tag).apply(pos_helpers.first_element_bucket))
right_answers_pos['unigrams']=right_answers_pos['rightanswer'].apply(lambda x: ' '.join(x))
right_answers_pos['bigrams'] = right_answers_pos['rightanswer'].apply(pos_helpers.get_bigrams).apply(lambda x: ' '.join(x))
#right_answers_pos['trigrams'] = right_answers_pos['rightanswer'].apply(pos_helpers.get_trigrams).apply(lambda x: ' '.join(x))

wrong_answers_pos=pd.DataFrame(wrong_answers.apply(word_tokenize).apply(pos_tag).apply(pos_helpers.first_element_bucket))
wrong_answers_pos['unigrams']=wrong_answers_pos['wronganswer'].apply(lambda x: ' '.join(x))
wrong_answers_pos['bigrams'] = wrong_answers_pos['wronganswer'].apply(pos_helpers.get_bigrams).apply(lambda x: ' '.join(x))
#wrong_answers_pos['trigrams'] = wrong_answers_pos['wronganswer'].apply(pos_helpers.get_trigrams).apply(lambda x: ' '.join(x))

headers = {}

right_uni_pos_matrix, headers['right_uni'] = pos_helpers.create_pos_matrix(right_answers_pos['unigrams'], data.index)
wrong_uni_pos_matrix, headers['wrong_uni'] = pos_helpers.create_pos_matrix(wrong_answers_pos['unigrams'], data.index)
right_bi_pos_matrix, headers['right_bi'] = pos_helpers.create_pos_matrix(right_answers_pos['bigrams'], data.index)
wrong_bi_pos_matrix, headers['wrong_bi'] = pos_helpers.create_pos_matrix(wrong_answers_pos['bigrams'], data.index)
#right_tri_pos_matrix, headers['right_tri'] = pos_helpers.create_pos_matrix(right_answers_pos['trigrams'], data.index)
#wrong_tri_pos_matrix, headers['wrong_tri'] = pos_helpers.create_pos_matrix(wrong_answers_pos['trigrams'], data.index)


right_pos_matrix = right_uni_pos_matrix.join(right_bi_pos_matrix)
wrong_pos_matrix = wrong_uni_pos_matrix.join(wrong_bi_pos_matrix)

pos_sizes = {}
pos_sizes['r_uni'] = right_answers_pos['unigrams'].str.count(" ")+1
pos_sizes['r_bi'] = right_answers_pos['bigrams'].str.count(" ")+1
#pos_sizes['r_tri'] = right_answers_pos['trigrams'].str.count(" ")+1
pos_sizes['w_uni'] = wrong_answers_pos['unigrams'].str.count(" ")+1
pos_sizes['w_bi'] = wrong_answers_pos['bigrams'].str.count(" ")+1
#pos_sizes['w_tri'] = wrong_answers_pos['trigrams'].str.count(" ")+1
compare_dist_bucket=pos_helpers.compare_pos_dist(headers, pos_sizes, right_pos_matrix, wrong_pos_matrix)
compare_dist_bucket[compare_dist_bucket['code']=='B'].head(20).sort_values(by=['z', 'w_ct', 'r_ct'], ascending=[False, False, False])

Unnamed: 0,code,pos,r_ct,w_ct,z
5,B,jj,973.0,744.0,0.000951
0,B,cc,359.0,253.0,0.000791
9,B,prp,1178.0,1029.0,0.000658
14,B,vb,3418.0,3305.0,0.00032
6,B,md,77.0,61.0,0.000233
1,B,cd,45.0,34.0,0.000207
3,B,ex,14.0,8.0,0.000193
18,B,ccdt,14.0,9.0,0.000181
17,B,wrb,58.0,46.0,0.000171
11,B,rp,129.0,114.0,0.000144


In [51]:
compare_dist_bucket[compare_dist_bucket['code']=='W'].head(5).sort_values(by=['w_ct'], ascending=[False])

Unnamed: 0,code,pos,r_ct,w_ct,z
172,W,ccmd,0.0,2.0,
171,W,fw,0.0,1.0,
173,W,ccpdt,0.0,1.0,
174,W,ccto,0.0,1.0,
175,W,ccwrb,0.0,1.0,


In [50]:
compare_dist_bucket[compare_dist_bucket['code']=='R'].head(5).sort_values(by=['r_ct'], ascending=[False])

Unnamed: 0,code,pos,r_ct,w_ct,z
19,R,ccin,3.0,0.0,
13,R,uh,2.0,0.0,
25,R,cdcd,1.0,0.0,
31,R,cdwdt,1.0,0.0,
39,R,dtrp,1.0,0.0,


### Sentence Complexity

#### Yngve Score

In [56]:
right_yngve=pd.read_csv('complexity/yngve_rightout.csv', delimiter='\t')
right_yngve.index = data.index
wrong_yngve=pd.read_csv('complexity/yngve_wrongout.csv', delimiter='\t')
wrong_yngve.index = data.index

In [69]:
yngve_test = stats.ttest_ind(right_yngve['right_yngve'],wrong_yngve['wrong_yngve'],equal_var=True)
print('Avg Yngve Score of Right Ans:' , right_yngve['right_yngve'].mean(), 'Avg Length of Wrong Ans:' , wrong_yngve['wrong_yngve'].mean())

('Avg Yngve Score of Right Ans:', 1.1865285673409645, 'Avg Length of Wrong Ans:', 1.1416608140953382)


In [63]:
print('Yngve Hypothesis Test:', yngve_test)

('Yngve Hypothesis Test:', Ttest_indResult(statistic=0.47413139813903116, pvalue=0.63543401142994316))


#### Frazier Score

In [60]:
right_frazier=pd.read_csv('complexity/frazier_rightout.csv', delimiter='\t')
right_frazier.index = data.index
wrong_frazier=pd.read_csv('complexity/frazier_wrongout.csv', delimiter='\t')
wrong_frazier.index = data.index

In [67]:
frazier_test = stats.ttest_ind(right_frazier['right_frazier'],wrong_frazier['wrong_frazier'],equal_var=True)
print('Avg Frazier Score of Right Ans:' , right_frazier['right_frazier'].mean(), 'Avg Length of Wrong Ans:' , wrong_frazier['wrong_frazier'].mean())


('Avg Frazier Score of Right Ans:', 1.0807256557356726, 'Avg Length of Wrong Ans:', 1.0772400798322708)


In [68]:
print('FrazierHypothesis Test:', frazier_test)

('FrazierHypothesis Test:', Ttest_indResult(statistic=0.47413139813903116, pvalue=0.6354338880800825))
