In [1]:
import pandas as pd
import numpy as np
import os
import imp
import telephone_analysis
import srilm
import roark
import glob
import shutil
import scipy.stats
imp.reload(telephone_analysis)
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import kenlm

In [2]:
# Sentence-level table; each row holds one response (user_candidate_transcription)
# together with the sentence it was produced from (gold_candidate_transcription).
all_runs  = pd.read_csv('output/all_runs.csv')
# Plan for this notebook:
#   1. run these sentences through each of the language models and merge the results
#   2. make a word-level table
#   3. diff the words to see which are not present in the next generation

In [3]:
all_runs.shape

(3193, 85)

In [4]:
all_runs.columns

Index(['Unnamed: 0', 'chain', 'character_levdau', 'check_time', 'condition',
       'flag_type', 'gold_candidate_transcription',
       'gold_comparison_transcription', 'gold_dist', 'length_accept', 'reason',
       'run', 'stimulus', 'stimulus_id', 'subject_id', 'target_length',
       'upload_time', 'upstream_pointer', 'upstream_subject_id', 'user',
       'user_candidate_transcription', 'user_comparison_transcription',
       'user_dist', 'user_short', 'word_distances', 'unique_chain_identifier',
       'global_chain', 'BNC_KNN_trigramProb', 'BNC_KNN_unigramProb',
       'WSJ_gt_unigramProb', 'WSJ_gt_trigramProb', 'WSJ_gt_5gramProb',
       'WSJ_Roark_Negative.Log.Probability', 'biglm_probability',
       'kenlm_probability', 'bllip_probability', 'bllip_wsj_probability',
       'mikolov_wsj_probability', 'length_in_words',
       'normalized_biglm_probability',
       'normalized_WSJ_Roark_Negative.Log.Probability',
       'normalized_BNC_KNN_unigramProb', 'normalized_BNC_KNN_trigra

# Word-Level Language Modeling Results

In [5]:
# Each language-model result list is indexed like all_runs: entry i holds the scores for the response (user_candidate_transcription) in row i, counting from 0, so 3192 is the last one.

In [5]:
lm = {}

In [6]:
bnc_knn_lm = srilm.LM("LMs/BNC_merged.LM", lower=True)

In [7]:
# Score every response word by word (mode='single_word') with the BNC SRILM model, with unigram=True.
# The result is a list with one entry per row of all_runs, in the same order.
lm['bnc_unigram'] =  [telephone_analysis.getSRILMprob(x, {}, bnc_knn_lm, mode='single_word', unigram=True) for x in all_runs['user_candidate_transcription']]

In [8]:
assert(len(lm['bnc_unigram']) == all_runs.shape[0])

In [9]:
# Same call as for 'bnc_unigram', but with unigram=False; stored under 'bnc_trigram'.
# One entry per row of all_runs, in the same order.
lm['bnc_trigram'] =  [telephone_analysis.getSRILMprob(x, {}, bnc_knn_lm, mode='single_word', unigram = False) for x in all_runs['user_candidate_transcription']]

In [10]:
assert(len(lm['bnc_trigram']) == all_runs.shape[0])

In [12]:
# Word-by-word scores from the Roark incremental top-down parser, one entry per response.
# The parser lives outside this repository, so its location is machine-specific: set the
# ROARK_PARSER_DIR environment variable to override it. Without the variable the original
# hard-coded path is used, so existing setups behave exactly as before.
roark_parser_dir = os.environ.get('ROARK_PARSER_DIR',
                                  '/home/stephan/utils/incremental-top-down-parser')
lm['roark_scores'] = roark.parse(
    [str(i) for i in all_runs['user_candidate_transcription']],  # str() guards against non-string cells
    roark_parser_dir,
    numWorkers=24,
    mode='single_word')

Input contains 3193 sentences
Input contains 1982 unique sentences
Finished external parsing


  pfix_df = pd.read_table(StringIO('\n'.join(y)), sep=' ', header=None, names = ['prefix','word']+colnames)
  return _read(filepath_or_buffer, kwds)


In [13]:
#roark_scores.columns = ['WSJ_Roark_'+x.replace(' ','.') for x in roark_scores.columns]
assert len(lm['roark_scores']) == all_runs.shape[0]

In [15]:
# Word-by-word scores from the lm_1b ("big LM") model, one entry per response.
# The lm_1b checkout is machine-specific: set the LM_1B_DIR environment variable to override
# it. Without the variable the original hard-coded path is used, so behaviour is unchanged.
# NOTE(review): 'big_lm' and 'big_lm_cache' look like a working directory and a cache
# directory -- confirm against telephone_analysis.getBigLMscores.
# NOTE(review): the recorded run printed "127" after the shell command; if that is the shell
# exit status it conventionally means "command not found" -- confirm the scores were really
# recomputed rather than read back from an earlier cache.
lm_1b_dir = os.environ.get('LM_1B_DIR', '/home/stephan/python/lm_1b')
lm['big_lm_scores'] = telephone_analysis.getBigLMscores(
    all_runs['user_candidate_transcription'], 'big_lm', 'big_lm_cache',
    colname='BigLM_probability', lm_1b_dir=lm_1b_dir, mode='single_word')


Big LM call:
cd /home/stephan/python/lm_1b && source bin/activate && bazel-bin/lm_1b/lm_1b_eval --mode eval_sentences --pbtxt data/graph-2016-09-10.pbtxt --vocab_file data/vocab-2016-09-10.txt --eval_dir /home/stephan/notebooks/telephone-analysis-public/big_lm --ckpt 'data/ckpt-*'
127


In [16]:
assert(len(lm['big_lm_scores']) == all_runs.shape[0])

In [17]:
# 5-gram KenLM model used for word-by-word scoring.
m = kenlm.Model('LMs/deepspeech_5gram.binary')
# With mode='single_word' every entry is a per-word table for one response (the cells below
# use its .shape and .columns), so it belongs only in `lm`.
# The original chained assignment also wrote this list into all_runs['kenlm_probability'],
# replacing the sentence-level column that all_runs already has with table objects, which
# were then merged into the word-level frame and written to the output CSV.
lm['kenlm_scores'] = [telephone_analysis.getKenLMProb(x, m, mode='single_word') for x in all_runs['user_candidate_transcription']]

In [18]:
assert(len(lm['kenlm_scores']) == all_runs.shape[0])

In [19]:
# Sanity check: for every sentence, all language models must return a table with the same
# number of rows (tokens); otherwise the side-by-side concat below cannot line the words up.
# Failures raise instead of dropping into pdb, so Restart & Run All stops with a clear
# message rather than hanging on an interactive prompt.
languageModelNames = ['bnc_unigram', 'bnc_trigram', 'roark_scores', 'big_lm_scores', 'kenlm_scores']
for i in range(all_runs.shape[0]):
    numWords = {}
    for x in languageModelNames:
        table = lm[x][i]
        if not hasattr(table, 'shape'):
            # e.g. a model returned None or a scalar for this sentence
            raise TypeError('Problem counting words: sentence %d from %s is of type %s'
                            % (i, x, type(table).__name__))
        numWords[x] = table.shape[0]
    if len(set(numWords.values())) > 1:
        raise ValueError('Different number of words for sentence %d: %s' % (i, numWords))

In [20]:
# Build the word-level frame: for each sentence, place the per-word tables of all language
# models side by side (columns prefixed with the model name), then stack all sentences.
# NOTE(review): pd.concat(axis=1) aligns tables on their row index, not on position. If the
# models label their rows differently, the combined table gains extra, partially empty rows
# (the sentence-3192 display further down shows a row that only has big-LM values) -- confirm
# that all models index tokens the same way.
sentence_store = []
for i in range(all_runs.shape[0]):
    word_store = []
    for languageModelName in languageModelNames:
        # add_prefix returns a renamed copy, so the tables stored in `lm` are left untouched
        word_store.append(lm[languageModelName][i].add_prefix(languageModelName + '_'))
    lms_combined = pd.concat(word_store, axis=1)
    lms_combined['sCounter'] = range(lms_combined.shape[0])  # 0-based row position within the sentence
    lms_combined['sentence_index'] = i                       # row of all_runs this sentence came from
    sentence_store.append(lms_combined)
wdf = pd.concat(sentence_store)

In [21]:
wdf['word'] = wdf[u'bnc_unigram_word']

# Identifying Deleted and Changed Words

In [21]:
# for generation n, get the indices of all words that have changed in n+1
# have changed: no longer appear? doesn't handle transpositions
# borrowed the function from the old version of telephone

In [22]:
# take a 2-column subset of all_runs that we can put into R
input_output = all_runs[['gold_candidate_transcription','user_candidate_transcription']]
# gold_candidate_transcription is what a participant heard
# user_candidate_transcription is what the participant produced

# remove the initial sentences -- these are represented as input for the first participant
# (-i copies the Python variable input_output into the R session)
%R -i input_output

  % (name, str(e)))
  % (name, str(e)))


In [24]:
%%R
source('getWordLevenshteinDistance.R')
print(paste(nrow(input_output), 'sentences'))
names(input_output) = c('input','output')  
input_output$input = tolower(as.character(input_output$input))
input_output$output = tolower(as.character(input_output$output))

# Word-level edit table between an input sentence `s` and a response `r`.
# Returns NA when `s` is the literal string 'none' (an initial sentence); otherwise returns
# the table from getReducedEditTable() with sCounter and rCounter shifted to start at 0.
computeEditTable = function(s,r){
    # guard clause: initial sentences have nothing to compare against
    if (s == 'none') return(NA)

    editTable = getReducedEditTable(s,r)
    # R counts from 1; shift both position columns so they line up with Python's 0-based indices
    editTable$sCounter = editTable$sCounter - 1
    editTable$rCounter = editTable$rCounter - 1
    editTable
}

# One edit table (or NA for initial sentences) per row of input_output.
# SIMPLIFY = FALSE guarantees a plain list with one element per sentence. With the default
# (SIMPLIFY = TRUE) mapply tries to collapse results of equal length into a matrix, which
# could happen if no NA result were present and would break editTables[[i]] below.
editTables = mapply(computeEditTable, input_output$input, input_output$output, SIMPLIFY = FALSE)

[1] "3193 sentences"


In [25]:
%%R
editTables[[3193]] #this should correspond to wdf[,3192]

   code      sWord      rWord sCounter rCounter
1     M     before     before        0        0
2     M        you        you        1        1
3     M      leave      leave        2        2
4     M       make       make        3        3
5     M       sure       sure        4        4
6     M        you        you        5        5
11    S       turn     change        6        6
9     M everything everything        7        7
10    D        off       <NA>        8       NA
                                             sentence
1  before you leave make sure you turn everything off
2  before you leave make sure you turn everything off
3  before you leave make sure you turn everything off
4  before you leave make sure you turn everything off
5  before you leave make sure you turn everything off
6  before you leave make sure you turn everything off
11 before you leave make sure you turn everything off
9  before you leave make sure you turn everything off
10 before you leave make sure you 

In [26]:
%%R
print(computeEditTable('this is a', 'this is a mouse'))
print(computeEditTable('this is a mouse', 'this is a'))
print(computeEditTable('this is a mouse', 'this is a mouse'))
print(computeEditTable('this is a mouse', 'this is a house'))

  code sWord rWord sCounter rCounter  sentence        response sLeftSequence
1    M  this  this        0        0 this is a this is a mouse              
2    M    is    is        1        1 this is a this is a mouse              
3    M     a     a        2        2 this is a this is a mouse     this is a
4    I  <NA> mouse       NA        3 this is a this is a mouse              
  rLeftSequence sRightSequence rRightSequence
1                    this is a      this is a
2                                  is a mouse
3     this is a                              
4    is a mouse                              
  code sWord rWord sCounter rCounter        sentence  response sLeftSequence
1    M  this  this        0        0 this is a mouse this is a              
2    M    is    is        1        1 this is a mouse this is a              
3    M     a     a        2        2 this is a mouse this is a     this is a
4    D mouse  <NA>        3       NA this is a mouse this is a    is a mouse


In [27]:
%%R
# Edit codes with respect to the input sentence of entry `sentence_index` in `editTables`.
# Returns a data frame with columns sentence_index, sCounter and code:
#   - a single row with sCounter = NA and code = NA when the entry is not an edit table
#     (initial sentences are stored as a bare NA);
#   - otherwise one row per input word. Rows without an input position (sCounter NA, i.e.
#     insertions) are dropped.
getDSMwrtInput = function(sentence_index, editTables){
    et = editTables[[sentence_index]]
    # Test the type rather than is.na(et): on a data frame is.na() returns a whole matrix,
    # and an `if` condition of length > 1 is an error from R 4.2.0 on (earlier: a warning).
    if (!is.data.frame(et)){
        return(data.frame(sentence_index, sCounter=NA,code=NA)) # these are initial sentences
    }
    et$sentence_index = sentence_index
    # here is where insertions are removed so that we can join back with the word-level frame
    return(subset(et, !is.na(sCounter))[,c('sentence_index','sCounter','code')])
}

# Stack the per-sentence edit-code tables into one data frame.
# seq_along() is used instead of 1:length() so an empty list yields no iterations.
DSMwrtInput = do.call('rbind', lapply(seq_along(editTables), function(i){
    dsm = getDSMwrtInput(i, editTables)
    # The previous guard `if (!is.na(dsm))` tested a whole data frame. R only looked at its
    # first cell (sentence_index, which is never NA), so the shift was applied to every row,
    # placeholder rows included -- and R >= 4.2.0 rejects such a multi-element condition.
    # The shift is therefore applied unconditionally, which keeps the earlier results.
    # -1 because Python indexes from 0
    # another -1 to bring the index of edits into alignment with the language model results
    dsm$sentence_index = dsm$sentence_index - 2
    return(dsm)
}))

#DSMwrtInput = subset(DSMwrtInput, !is.na(code))

In [28]:
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
DSMwrtInput = r['DSMwrtInput']
DSMwrtInput.iloc[0:15,]

Unnamed: 0,sentence_index,sCounter,code
1,-1.0,,NA_character_
110,0.0,0.0,M
2,0.0,1.0,D
5,0.0,2.0,M
6,0.0,3.0,M
7,0.0,4.0,M
8,0.0,5.0,M
9,0.0,6.0,M
11,0.0,7.0,S
12,0.0,8.0,M


In [29]:
# Rows of the R edit table without a source-word position (NaN sCounter) are the
# placeholder rows emitted for initial sentences.
isPlaceholderRow = np.isnan(DSMwrtInput.sCounter)
initialSentenceIndices = np.array(DSMwrtInput[isPlaceholderRow]['sentence_index'].tolist())

# Word-level rows to keep: not from a flagged sentence, and not the '</s>' token.
notInitial = ~wdf['sentence_index'].isin(initialSentenceIndices)
notSentenceEnd = wdf['bnc_unigram_word'] != '</s>'
wdf_initRemoved = wdf[notInitial & notSentenceEnd]

# Edit-table rows that do carry a source-word position.
DSMwrtInput_nansRemoved = DSMwrtInput[~isPlaceholderRow]

In [30]:
# Compare the number of edit-code rows with the number of word-level rows.
print(DSMwrtInput_nansRemoved.shape[0])
print(wdf_initRemoved.shape[0]) #still different, why?
# NOTE(review): possible contributors visible in this notebook -- please confirm:
#   * rows whose bnc_unigram_word is NaN pass the `!= '</s>'` filter and stay in
#     wdf_initRemoved (see the last row shown for sentence 3192 below);
#   * after the index shift applied in R, some sentences have no edit-code rows at all
#     (sentence 3192 below has none).
#assert(DSMwrtInput_nansRemoved.shape[0] == wdf_initRemoved.shape[0])

27290
30298


In [31]:
wdf_initRemoved[wdf_initRemoved.sentence_index == 3192]

Unnamed: 0,bnc_unigram_index,bnc_unigram_prob,bnc_unigram_word,bnc_trigram_index,bnc_trigram_prob,bnc_trigram_word,roark_scores_prefix,roark_scores_word,roark_scores_prefix.1,roark_scores_srprsl,...,big_lm_scores_Unnamed: 0,big_lm_scores_prob,big_lm_scores_word,kenlm_scores_prob,kenlm_scores_preceding,kenlm_scores_unk,kenlm_scores_words,sCounter,sentence_index,word
0,0.0,-3.081292,before,0.0,-3.037731,before,pfix:1,before,6.993,3.037021,...,0.0,-3.060103,Before,-3.105574,2.0,False,before,0,3192,before
1,1.0,-2.186153,you,1.0,-1.185921,you,pfix:2,you,13.198,2.694363,...,1.0,-2.458716,you,-1.241865,3.0,False,you,1,3192,you
2,2.0,-3.713525,leave,2.0,-1.507196,leave,pfix:3,leave,20.94,3.362308,...,2.0,-2.782638,leave,-1.47363,4.0,False,leave,2,3192,leave
3,3.0,-3.121925,make,3.0,-4.336845,make,pfix:4,make,30.85,4.303858,...,3.0,-4.497365,make,-4.662948,1.0,False,make,3,3192,make
4,4.0,-3.643197,sure,4.0,-1.549859,sure,pfix:5,sure,36.383,2.402951,...,4.0,-1.128034,sure,-1.491675,2.0,False,sure,4,3192,sure
5,5.0,-2.186153,you,5.0,-0.752118,you,pfix:6,you,43.568,3.120406,...,5.0,-0.309889,you,-0.993335,3.0,False,you,5,3192,you
6,6.0,-3.512815,change,6.0,-3.870051,change,pfix:7,change,54.143,4.593098,...,6.0,-3.55923,change,-3.91292,2.0,False,change,6,3192,change
7,7.0,-3.76336,everything,7.0,-3.264657,everything,pfix:8,everything,63.772,4.181822,...,7.0,-3.270689,everything,-3.29039,2.0,False,everything,7,3192,everything
9,,,,,,,,,,,...,9.0,-1.231726,</S>,,,,,9,3192,


In [32]:
DSMwrtInput_nansRemoved[DSMwrtInput_nansRemoved.sentence_index == 3192] 
#merging on sentence_index merges with the downstream!

Unnamed: 0,sentence_index,sCounter,code


In [33]:
def _rows_per_sentence(frame, group_key, count_name):
    """Count rows per sentence_index in `frame`; returns columns ['sentence_index', count_name]."""
    counts = frame.groupby(group_key).sCounter.agg(np.size).reset_index()
    counts.columns = ['sentence_index', count_name]
    return counts

# Per-sentence row counts on both sides, to see where the two tables disagree.
dsm_counts = _rows_per_sentence(DSMwrtInput_nansRemoved, ['sentence_index'], 'dsm_count')
wdf_counts = _rows_per_sentence(wdf_initRemoved, 'sentence_index', 'wdf_count')

# The outer merge keeps sentences that appear on only one side.
count_check = wdf_counts.merge(dsm_counts, how='outer')
count_check['difference'] = count_check.wdf_count - count_check.dsm_count
# differences in the counts? largest first
count_check.sort_values(by=['difference'], ascending=False)

Unnamed: 0,sentence_index,wdf_count,dsm_count,difference
0,0,11,10.0,1.0
2003,2141,11,10.0,1.0
1994,2131,10,9.0,1.0
1995,2132,10,9.0,1.0
1996,2133,10,9.0,1.0
1997,2134,10,9.0,1.0
1998,2135,10,9.0,1.0
1999,2136,11,10.0,1.0
2000,2138,11,10.0,1.0
2001,2139,11,10.0,1.0


In [34]:
# merge DSMwrtInput into the word data frame
# The join keys are spelled out instead of relying on whichever column names the two frames
# happen to share, so a column added to either side later cannot silently change the join.
# Inner join: word rows without an edit code (and vice versa) are dropped.
# NOTE(review): a later cell notes that merging on sentence_index "merges with the
# downstream" -- confirm that the index shift applied in R pairs each word with the intended trial.
wdfr = wdf_initRemoved.merge(DSMwrtInput_nansRemoved, on=['sentence_index', 'sCounter'], how='inner') #sCounter is NaN for input sentences

In [35]:
#where did all my happy data go?
wdfr.code.value_counts()

M    22482
D     3442
S     1366
Name: code, dtype: int64

# Merge With Word Properties

In [38]:
# Attach per-word lexical properties from Lexiconch; `word` is the join column.
wdfr['word'] = wdfr['bnc_unigram_word']
lexiconch = pd.read_csv('data/lexiconch.csv', index_col=0)
print('Number of words before merging with Lexiconch: '+str(wdfr.shape[0]))
# Left merge: every word row is kept, with NaN properties where the lexicon has no entry.
wdfl = wdfr.merge(lexiconch, how='left')
print('Number of words after merging with Lexiconch: '+str(wdfl.shape[0]))
# Words in the data that have no Lexiconch entry. The previous check compared wdfr.word with
# wdfl.word, but a left merge keeps every left-hand word, so that difference is always empty.
# Missing values are dropped so that the join only ever sees strings.
' '.join(sorted(set(wdfr['word'].dropna().astype(str)) - set(lexiconch['word'].dropna().astype(str))))

Number of words before merging with Lexiconch: 27290
Number of words after merging with Lexiconch: 27290


  interactivity=interactivity, compiler=compiler, result=result)


''

In [39]:
lexiconch.iloc[0]

word                               roadsweeper
conc_bigram                                  0
conc_conc_m                               4.85
conc_conc_sd                              0.37
conc_unknown                                 1
conc_total                                  27
conc_percent_known                        0.96
conc_subtlex                                 0
conc_dom_pos                                 0
kpm_alternative_spelling          road sweeper
kpm_freq_pm                                NaN
kpm_dom_pos_subtlex                        NaN
kpm_nletters                                11
kpm_nphon                                    8
kpm_nsyll                                    3
kpm_lemma_highest_pos              roadsweeper
kpm_aoa_kup                               9.83
kpm_perc_known                               1
kpm_aoa_kup_lem                           9.83
kpm_perc_known_lem                           1
kpm_aoa_bird_lem                          7.12
kpm_aoa_brist

In [40]:
# Headerless, tab-separated file (tab is read_table's default separator, spelled out here);
# the two columns are named after loading.
yarkoni_pld = pd.read_csv('data/pld20.txt', sep='\t', header=None)
yarkoni_pld.columns = ['word', 'pld20']

  """Entry point for launching an IPython kernel.


In [41]:
# Left merge keeps every word row; the two printed counts should be equal.
print('Number of words before merging with Yarkoni PLD: ', len(wdfl), sep='')
wdfy = wdfl.merge(yarkoni_pld, how='left')
print('Number of words after merging with Yarkoni PLD: ', len(wdfy), sep='')

Number of words before merging with Yarkoni PLD: 27290
Number of words after merging with Yarkoni PLD: 27290


In [43]:
subtlex = pd.read_csv('data/subtlex_augmented.csv')
subtlex['word'] = subtlex.Word
subtlex.iloc[0]

Word                         the
FREQcount                1501908
CDcount                     8388
FREQlow                  1339811
Cdlow                       8388
SUBTLWF                  29449.2
Lg10WF                    6.1766
SUBTLCD                      100
Lg10CD                    3.9237
ipa_                      [ð, ə]
ipa_single                [ð, ə]
ipa_diphthongs            [ð, ə]
unigram_probability    0.0302076
unigram_surprisal        3.49966
word                         the
Name: 0, dtype: object

In [44]:
# Only the SUBTLCD column is taken from Subtlex, joined on `word`.
# Left merge keeps every word row; the two printed counts should be equal.
subtlex_cd = subtlex[['word','SUBTLCD']]
print('Number of words before merging with Subtlex: ', len(wdfy), sep='')
wdfx = wdfy.merge(subtlex_cd, how='left')
print('Number of words after merging with Subtlex: ', len(wdfx), sep='')

Number of words before merging with Subtlex: 27290
Number of words after merging with Subtlex: 27290


# Merge in Sentence Level Predictors

In [45]:
all_runs['sentence_index'] = range(all_runs.shape[0])

In [46]:
wdfs = wdfx.merge(all_runs, on='sentence_index')

In [47]:
' '.join(wdfs.columns)

'bnc_unigram_index bnc_unigram_prob bnc_unigram_word bnc_trigram_index bnc_trigram_prob bnc_trigram_word roark_scores_prefix roark_scores_word roark_scores_prefix.1 roark_scores_srprsl roark_scores_SynSp roark_scores_LexSp roark_scores_ambig roark_scores_open roark_scores_rernk roark_scores_toprr roark_scores_stps big_lm_scores_Unnamed: 0 big_lm_scores_prob big_lm_scores_word kenlm_scores_prob kenlm_scores_preceding kenlm_scores_unk kenlm_scores_words sCounter sentence_index word code conc_bigram conc_conc_m conc_conc_sd conc_unknown conc_total conc_percent_known conc_subtlex conc_dom_pos kpm_alternative_spelling kpm_freq_pm kpm_dom_pos_subtlex kpm_nletters kpm_nphon kpm_nsyll kpm_lemma_highest_pos kpm_aoa_kup kpm_perc_known kpm_aoa_kup_lem kpm_perc_known_lem kpm_aoa_bird_lem kpm_aoa_bristol_lem kpm_aoa_cort_lem kpm_aoa_schock pic_ipa pic_ipa_ss_array pic_ipa_ss pic_ipa_n pic_ortho pic_ortho_ss_array pic_ortho_ss pic_ortho_n pic_character pic_character_ss_array pic_character_ss pic_cha

# Save the Output

In [48]:
wdfs.shape

(27290, 168)

In [49]:
wdfs.to_csv('output/wordLevelChanges.csv')

See R notebook, Error Prediction Model - Logistic Regression