In [1]:
import pandas as pd
import numpy as np
import os
import imp
import telephone_analysis
import srilm
import roark
import glob
import shutil
import scipy.stats
imp.reload(telephone_analysis)
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import kenlm

import new_models
from new_models import gpt2_scores, bert_scores

In [2]:
# Load the table of all runs from disk. Plan for the rest of the notebook:
#   1. run these through each of the language models and merge the results
#   2. make a word-level table
#   3. diff the words to see if they are not present in the next generation
all_runs  = pd.read_csv('output/all_runs.csv')

In [3]:
# Number of rows whose `user` value is not the string "0".
len(all_runs[all_runs['user'] != "0"])

2999

In [4]:
# Number of distinct `user` values among rows whose user is not the string "0".
np.unique(all_runs.loc[all_runs['user'] != "0", 'user']).size

266

In [5]:
all_runs.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'chain', 'character_levdau', 'check_time',
       'condition', 'flag_type', 'gold_candidate_transcription',
       'gold_comparison_transcription', 'gold_dist',
       ...
       'initial_gpt2_normal_probability_rank',
       'initial_gpt2_normal_probability_quartile',
       'initial_gpt2_medium_probability',
       'initial_gpt2_medium_probability_rank',
       'initial_gpt2_medium_probability_quartile', 'initial_bert_probability',
       'initial_bert_probability_rank', 'initial_bert_probability_quartile',
       'thread_id', 'chain_length'],
      dtype='object', length=101)

# Word-Level Language Modeling Results

In [6]:
# index in the LM corresponds to the response: user_candidate_transcription. 3192: last input

In [4]:
lm = {}

In [None]:
RESULTS_FOLDER = './intermediate_results/new_models_probs'

from new_models import prep_probs

# NOTE(review): RESULTS_FOLDER is not referenced by any other visible cell, and the keys
# filled in here ('gpt2_normal', 'gpt2_medium', 'bert') differ from the '*_scores' keys
# that the later sanity-check cell iterates over -- confirm which set is intended.
#
# For consistency the values are called "probability", but they are actually
# negative base-10 surprisals.
lm.update({
    model_name: prep_probs.load_word_scores(model_name)
    for model_name in ('gpt2_normal', 'gpt2_medium', 'bert')
})

In [8]:
# Load the merged BNC SRILM model; the cells below use it for both the unigram
# and the trigram word-level scores.
bnc_knn_lm = srilm.LM("LMs/BNC_merged.LM", lower=True)

In [9]:
# One entry of word-level BNC unigram scores per produced sentence.
lm['bnc_unigram'] = [
    telephone_analysis.getSRILMprob(sentence, {}, bnc_knn_lm, mode='single_word', unigram=True)
    for sentence in all_runs['user_candidate_transcription']
]

In [11]:
# One entry per row of all_runs.
assert len(lm['bnc_unigram']) == len(all_runs)

In [12]:
# One entry of word-level BNC scores per produced sentence, with the unigram flag off.
lm['bnc_trigram'] = [
    telephone_analysis.getSRILMprob(sentence, {}, bnc_knn_lm, mode='single_word', unigram=False)
    for sentence in all_runs['user_candidate_transcription']
]

In [13]:
# One entry per row of all_runs.
assert len(lm['bnc_trigram']) == len(all_runs)

In [14]:
# Location of the incremental top-down parser used by roark.parse. It was previously
# hard-coded to one user's home directory; it can now be overridden with the
# ROARK_PARSER_DIR environment variable, and the default is the original path.
roark_parser_dir = os.environ.get('ROARK_PARSER_DIR',
                                  '/home/nwong/utils/incremental-top-down-parser')

# Sentences are cast to str before being handed to the parser.
lm['roark_scores'] = roark.parse(
    [str(i) for i in all_runs['user_candidate_transcription']],
    roark_parser_dir,
    numWorkers=24, mode='single_word')

Input contains 3193 sentences
Input contains 1982 unique sentences
Finished external parsing


  pfix_df = pd.read_table(StringIO('\n'.join(y)), sep=' ', header=None, names = ['prefix','word']+colnames)
  return _read(filepath_or_buffer, kwds)


In [15]:
#roark_scores.columns = ['WSJ_Roark_'+x.replace(' ','.') for x in roark_scores.columns]
# One entry per row of all_runs.
assert len(lm['roark_scores']) == len(all_runs)

In [16]:
# Directory of the lm_1b checkout. It was previously hard-coded to one user's home
# directory; it can now be overridden with the LM_1B_DIR environment variable, and
# the default is the original path.
lm_1b_dir = os.environ.get('LM_1B_DIR', '/home/nwong/python/lm_1b')

lm['big_lm_scores'] = telephone_analysis.getBigLMscores(
    all_runs['user_candidate_transcription'], 'big_lm', 'big_lm_cache',
    colname='BigLM_probability', lm_1b_dir=lm_1b_dir, mode='single_word')


Big LM call:
cd /home/nwong/python/lm_1b && source bin/activate && bazel-bin/lm_1b/lm_1b_eval --mode eval_sentences --pbtxt data/graph-2016-09-10.pbtxt --vocab_file data/vocab-2016-09-10.txt --eval_dir /home/nwong/chompsky/serial_chain/telephone-analysis-public/big_lm --ckpt 'data/ckpt-*'
2


In [17]:
lm['big_lm_scores']

[    Unnamed: 0      prob        word
 0            0 -3.202744        Each
 1            1 -6.637383  nonfiction
 2            2 -0.854058        book
 3            3 -1.230795         has
 4            4 -0.638269           a
 5            5 -3.972122        call
 6            6 -2.584629      number
 7            7 -1.795881          on
 8            8 -1.581558         its
 9            9 -3.642073       spine
 11          11 -0.165039        </S>,     Unnamed: 0      prob     word
 0            0 -3.199316     Each
 1            1 -6.440331      non
 2            2 -5.589528  fiction
 3            3 -2.305383     book
 4            4 -1.255893      has
 5            5 -0.716491        a
 6            6 -4.127177     call
 7            7 -2.595660   number
 8            8 -1.814087       in
 9            9 -3.086613      its
 10          10 -5.677606    spine
 12          12 -0.230549     </S>,     Unnamed: 0      prob        word
 0            0 -3.196275        Each
 1           

In [18]:
# One entry per row of all_runs.
assert len(lm['big_lm_scores']) == len(all_runs)

In [19]:
m = kenlm.Model('LMs/deepspeech_5gram.binary')

# Word-level KenLM scores, one entry per produced sentence.
# The original chained assignment also wrote this list into
# all_runs['kenlm_probability']. The later cells treat each entry as a per-word table
# (.shape, .columns), so that put whole tables into a sentence-level column which is
# then merged into the final word-level CSV. The scores are now kept in `lm` only.
lm['kenlm_scores'] = [
    telephone_analysis.getKenLMProb(x, m, mode='single_word')
    for x in all_runs['user_candidate_transcription']
]

In [20]:
# One entry per row of all_runs.
assert len(lm['kenlm_scores']) == len(all_runs)

In [None]:
# Score every produced sentence word by word with GPT-2, GPT-2 medium and BERT.
candidate_sentences = all_runs['user_candidate_transcription']

lm['gpt2_scores'] = gpt2_scores.score_inputs(candidate_sentences, mode='single_word')
lm['gpt2_medium_scores'] = gpt2_scores.score_inputs(candidate_sentences, mode='single_word',
                                                    model_type='-medium')
lm['bert_scores'] = bert_scores.score_inputs(candidate_sentences, mode='single_word')

# Assert code from Dr. Meylan above: 3/25
for scores_key in ('gpt2_scores', 'gpt2_medium_scores', 'bert_scores'):
    assert len(lm[scores_key]) == all_runs.shape[0]

gpt2
Scoring with mode: single_word
Index: 0
Index: 50
Index: 100
Index: 150
Index: 200
Index: 250
Index: 300
Index: 350
Index: 400
Index: 450
Index: 500
Index: 550
Index: 600
Index: 650
Index: 700
Index: 750
Index: 800
Index: 850
Index: 900
Index: 950
Index: 1000
Index: 1050
Index: 1100
Index: 1150
Index: 1200
Index: 1250
Index: 1300
Index: 1350
Index: 1400
Index: 1450
Index: 1500
Index: 1550
Index: 1600
Index: 1650
Index: 1700
Index: 1750
Index: 1800
Index: 1850
Index: 1900
Index: 1950
Index: 2000
Index: 2050
Index: 2100
Index: 2150
Index: 2200
Index: 2250
Index: 2300
Index: 2350
Index: 2400
Index: 2450
Index: 2500
Index: 2550
Index: 2600
Index: 2650
Index: 2700
Index: 2750
Index: 2800
Index: 2850
Index: 2900
Index: 2950
Index: 3000
Index: 3050
Index: 3100
Index: 3150
gpt2-medium
Scoring with mode: single_word
Index: 0
Index: 50
Index: 100
Index: 150
Index: 200
Index: 250
Index: 300
Index: 350
Index: 400
Index: 450
Index: 500
Index: 550
Index: 600
Index: 650


In [None]:
# Sanity check: for every sentence, every language model must have scored the same
# number of tokens; otherwise the per-model tables cannot be placed side by side.
# Failures raise instead of dropping into pdb, so a non-interactive "Run All" stops
# with a clear message rather than hanging on a debugger prompt.
languageModelNames = ['bnc_unigram', 'bnc_trigram', 'roark_scores', 'big_lm_scores', 'kenlm_scores',
                     'gpt2_scores', 'gpt2_medium_scores', 'bert_scores']
for i in range(all_runs.shape[0]):
    numWords = []
    for name in languageModelNames:
        try:
            numWords.append(lm[name][i].shape[0])
        except AttributeError as err:
            raise TypeError(
                'Problem counting words: {} entry for sentence {} has no .shape'.format(name, i)
            ) from err
    if len(set(numWords)) > 1:
        raise ValueError(
            'Different number of words for sentence {}: {}'.format(
                i, dict(zip(languageModelNames, numWords))))

In [None]:
# Build the word-level table: for each sentence, put the per-model word tables side by
# side (columns prefixed with the model name), then stack all sentences.
sentence_store = []
for i in range(all_runs.shape[0]):
    word_store = []
    for languageModelName in languageModelNames:
        # pd.concat(axis=1) aligns rows by index label, not by position. At least one
        # model's tables carry gaps in their index (a token row was dropped), so every
        # table is reset to a positional index first to keep word k of each model on
        # row k. add_prefix returns a new frame, so the tables stored in `lm` are
        # left untouched.
        scores = lm[languageModelName][i].reset_index(drop=True)
        word_store.append(scores.add_prefix(languageModelName + '_'))
    lms_combined = pd.concat(word_store, axis=1)
    # Position of the word within its sentence (0-based).
    lms_combined['sCounter'] = range(lms_combined.shape[0])
    lms_combined['sentence_index'] = i
    sentence_store.append(lms_combined)
wdf = pd.concat(sentence_store)

In [None]:
# Use the BNC unigram model's token as the canonical `word` column.
wdf['word'] = wdf['bnc_unigram_word']

In [None]:
wdf.head()

In [None]:
# Each record is a 'produced' word (user_candidate_transcription) 

# Identifying Deleted and Changed Words

In [None]:
# for generation n, get the indices of all words that have changed in n+1
# have changed: no longer appear? doesn't handle transpositions
# borrowed the function from the old version of telephone

In [None]:
all_runs.loc[all_runs.user == "0"].head(5)

In [None]:
# take a 2-column subset of all_trials that we can put into R
input_output = all_runs[['gold_candidate_transcription','user_candidate_transcription','user',
                         'upstream_subject_id']]
#gold_candidate_transcription is what a participant heard
#user_candidate_transcription is what the participant produced

# remove the intitial sentences -- these are represnted as input for the first participant
%R -i input_output

In [None]:
input_output

In [None]:
%%R
source('getWordLevenshteinDistance.R')
print(paste(nrow(input_output), 'sentences'))
# Columns arrive in the order: heard sentence, produced sentence, producing subject,
# upstream subject.
names(input_output) = c('input','output','output_subject','input_subject')
input_output$input = tolower(as.character(input_output$input))
input_output$output = tolower(as.character(input_output$output))

# Compute the word-level edit table between a heard sentence `s` and the produced
# sentence `r`, tagged with the two subject ids.
# Returns NA when `s` is the literal string 'none' (an initial sentence); otherwise
# returns the table from getReducedEditTable with sCounter/rCounter shifted to
# 0-based indices for Python compatibility.
computeEditTable = function(s, r, input_subject, output_subject){
    if (s == 'none'){
        # this is an initial sentence, return NA
        return(NA)
    }
    et = getReducedEditTable(s, r)
    # for python compatibility, use 0-indices
    et$sCounter = et$sCounter - 1
    et$rCounter = et$rCounter - 1
    et$input_subject = input_subject
    et$output_subject = output_subject
    return(et)
}

# SIMPLIFY = FALSE guarantees a plain list with one element per sentence. With the
# default (SIMPLIFY = TRUE) mapply collapses the results into an array whenever they
# all happen to have the same length, which would break the editTables[[i]] lookups
# used in the following cells.
editTables = mapply(computeEditTable, input_output$input, input_output$output,
                    input_output$input_subject, input_output$output_subject,
                    SIMPLIFY = FALSE)

In [None]:
%%R
editTables[[2]] #this should correspond to wdf[,1]

In [None]:
%%R
# Spot checks of computeEditTable: output has an extra word, output is missing a word,
# identical sentences, and one word replaced.
examples = list(
    c('this is a', 'this is a mouse'),
    c('this is a mouse', 'this is a'),
    c('this is a mouse', 'this is a mouse'),
    c('this is a mouse', 'this is a house')
)
for (ex in examples) {
    print(computeEditTable(ex[1], ex[2], NULL, NULL))
}

In [None]:
%%R
# Return the edit codes for one sentence, keyed by position in the input sentence.
#   sentence_index: 1-based position in editTables
#   editTables:     list produced by mapply(computeEditTable, ...)
# For an initial sentence (whose edit table is NA rather than a data frame) a single
# placeholder row with NA fields is returned.
getDSMwrtInput = function(sentence_index, editTables){
    et = editTables[[sentence_index]]
    # is.na() on a data frame yields a matrix, and `if` with a condition of length > 1
    # is an error in R >= 4.2, so test the type of the entry instead.
    if (!is.data.frame(et)){
        return(data.frame(sentence_index, sCounter=NA, code=NA, input_subject=NA, output_subject=NA)) # these are initial sentences
    }
    et$sentence_index = sentence_index
    # here is where insertions are removed so that we can join back with wdf
    return(subset(et, !is.na(sCounter))[,c('sentence_index','sCounter','code','input_subject','output_subject')])
}

DSMwrtInput = do.call('rbind', lapply(seq_along(editTables), function(i){
    dsm = getDSMwrtInput(i, editTables)
    # -1 because Python indexes from 0
    # another -1 to bring the index of edits into alignment with the language model results
    # The shift is applied to every row, placeholder rows included. The original guard
    # `if (!is.na(dsm))` only ever looked at the first cell (sentence_index, never NA)
    # under older R, so it was always true; under R >= 4.2 it is an error.
    dsm$sentence_index = dsm$sentence_index - 2
    return(dsm)
}))

#DSMwrtInput = subset(DSMwrtInput, !is.na(code))

In [None]:
from rpy2.robjects import r, pandas2ri

# Enable the R <-> pandas conversion, then pull the R object into Python.
pandas2ri.activate()
DSMwrtInput = r['DSMwrtInput']

# Preview the first 15 rows.
DSMwrtInput.head(15)

In [None]:
# Rows of DSMwrtInput with a NaN sCounter are the placeholders for initial sentences.
is_placeholder = np.isnan(DSMwrtInput.sCounter)
initialSentenceIndices = np.array(DSMwrtInput.loc[is_placeholder, 'sentence_index'].tolist())

# Drop words from those sentence indices and the end-of-sentence token.
keep_word = (~wdf['sentence_index'].isin(initialSentenceIndices)) & (wdf['bnc_unigram_word'] != '</s>')
wdf_initRemoved = wdf[keep_word]
DSMwrtInput_nansRemoved = DSMwrtInput[~is_placeholder]

In [None]:
# Row counts of the edit-code table and the word table, printed for comparison.
for table in (DSMwrtInput_nansRemoved, wdf_initRemoved):
    print(table.shape[0])
#assert(DSMwrtInput_nansRemoved.shape[0] == wdf_initRemoved.shape[0])

In [None]:
wdf_initRemoved[wdf_initRemoved.sentence_index == 3192]

In [None]:
DSMwrtInput_nansRemoved[DSMwrtInput_nansRemoved.sentence_index == 3192] 

In [None]:
# Per-sentence row counts in the edit-code table and in the word table.
# groupby(...).size() replaces .agg(np.size): passing the NumPy callable is deprecated
# in recent pandas, and size() gives the same counts.
dsm_counts = (DSMwrtInput_nansRemoved.groupby('sentence_index').size()
              .reset_index(name='dsm_count'))
wdf_counts = (wdf_initRemoved.groupby('sentence_index').size()
              .reset_index(name='wdf_count'))

# Outer merge so that sentences present in only one of the tables still show up.
count_check = wdf_counts.merge(dsm_counts, how='outer', on='sentence_index')
count_check['difference'] = count_check.wdf_count - count_check.dsm_count

# Largest positive differences first: differences in the counts?
count_check.sort_values(by=['difference'], ascending=False)

In [None]:
# merge DSMwrtInput into the word data frame
wdfr = wdf_initRemoved.merge(DSMwrtInput_nansRemoved) #sCounter is NaN for input sentences

In [None]:
#where did all my happy data go?
wdfr.code.value_counts()

In [None]:
wdfr

In [None]:
wdfr.columns

# Merge With Word Properties

In [None]:
wdfr['word'] = wdfr['bnc_unigram_word']
lexiconch = pd.read_csv('data/lexiconch.csv', index_col=0)

# Left merge: every row of wdfr is kept. The row counts before and after are printed
# so that any duplication introduced by the merge is visible.
print('Number of words before merging with Lexiconch: {}'.format(wdfr.shape[0]))
wdfl = pd.merge(wdfr, lexiconch, how='left')
print('Number of words after merging with Lexiconch: {}'.format(wdfl.shape[0]))

# NOTE(review): a left merge keeps all left-hand words, so this difference looks like
# it can only ever be empty -- confirm whether unmatched words were meant instead.
' '.join(set(wdfr.word) - set(wdfl.word))

In [None]:
lexiconch.iloc[0]

In [None]:
# Tab-separated file without a header row; name the two columns after loading.
yarkoni_pld = pd.read_csv('data/pld20.txt', sep='\t', header=None)
yarkoni_pld.columns = ['word', 'pld20']

In [None]:
# Left merge with the PLD table; row counts before and after are printed.
print('Number of words before merging with Yarkoni PLD: {}'.format(wdfl.shape[0]))
wdfy = pd.merge(wdfl, yarkoni_pld, how='left')
print('Number of words after merging with Yarkoni PLD: {}'.format(wdfy.shape[0]))

In [None]:
# Load Subtlex and add a `word` column copied from its `Word` column.
subtlex = pd.read_csv('data/subtlex_augmented.csv').assign(word=lambda frame: frame.Word)

# Show the first record.
subtlex.iloc[0]

In [None]:
# Left merge with the `word` and SUBTLCD columns of Subtlex; row counts before and
# after are printed.
print('Number of words before merging with Subtlex: {}'.format(wdfy.shape[0]))
subtlex_columns = subtlex[['word', 'SUBTLCD']]
wdfx = pd.merge(wdfy, subtlex_columns, how='left')
print('Number of words after merging with Subtlex: {}'.format(wdfx.shape[0]))

# Merge in Sentence Level Predictors

In [None]:
# Number the rows of all_runs 0..n-1, matching the sentence_index used in the word table.
all_runs['sentence_index'] = np.arange(all_runs.shape[0], dtype='int64')

In [None]:
#wdfs = wdfr.merge(all_runs, on='sentence_index') # !!! temporarily remove additional predictors
# Attach the sentence-level columns of all_runs to every word of that sentence.
wdfs = pd.merge(wdfx, all_runs, on='sentence_index')

In [None]:
wdfs.loc[wdfs.sentence_index == 1][['bnc_unigram_word',
                                   'user_candidate_transcription',
                                    'gold_candidate_transcription',
                                    'input_subject',
                                    'output_subject'
                                   ]]

In [None]:
' '.join(wdfs.columns)

# Save the Output

In [None]:
wdfs.shape

In [None]:
# Write the merged word-level table to disk, without the pandas index column.
wdfs.to_csv('output/wordLevelChanges.csv', index=False)

See R notebook, Error Prediction Model - Logistic Regression