In [None]:
# Load the rpy2 IPython extension; the %%R cell further down relies on it.
%load_ext rpy2.ipython
from rpy2.robjects import r, pandas2ri
# Enable rpy2's pandas conversion support (presumably so DataFrames can be
# passed between Python and R -- confirm against the R cells that use them).
pandas2ri.activate()

In [None]:
import math
import csv
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import BertTokenizerFast, BertModel, BertForMaskedLM
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re, os
import bisect
import kenlm
import mosestokenizer
from string import punctuation
import scipy

In [None]:
%%R
# Install the R packages lme4, lmerTest, ggplot2 and perm into the R library.
# No versions are pinned, so the installed versions depend on when this runs.
install.packages('lme4')
install.packages('lmerTest')
install.packages('ggplot2')
install.packages('perm')

## Preprocessing

### LM Scoring

In [None]:
# Number of tokens by which consecutive scoring windows overlap (used by
# score_gpt and score_bert).
STRIDE = 200
def score_gpt(sentence, model, tokenizer, BOS=True):
      """Score `sentence` token by token with a causal LM, using sliding windows.

      The text is tokenized in windows of at most 1022 tokens; each window is
      wrapped with the tokenizer's BOS (optional) and EOS ids and run through
      `model`.  Consecutive windows overlap, and only the tokens not already
      scored by the previous window are kept.

      Args:
          sentence: text to score.
          model: LM called as `model(input_ids, labels=input_ids)`; its output
              must expose 'logits' and 'loss', and it must have a `.device`.
          tokenizer: fast tokenizer supporting `return_offsets_mapping=True`
              and exposing `bos_token_id` / `eos_token_id`.
          BOS: prepend the BOS id to every window when True.

      Returns:
          (scores, offsets): a numpy array of per-token cross-entropy values
          (negative log-probabilities, despite the `log_probs` naming) and a
          list of (start, end) character offsets into `sentence`.

      NOTE(review): with BOS=False the first token of a window has no
      prediction, so `scores` looks one entry shorter than `offsets` -- confirm
      before using that mode.
      """
      with torch.no_grad():
        all_log_probs = torch.tensor([], device=model.device)
        offset_mapping = []
        # Character position in `sentence` where the current window starts.
        start_ind = 0

        while True:
            encodings = tokenizer(sentence[start_ind:], max_length=1022, truncation=True, return_offsets_mapping=True)
            if BOS:
                tensor_input = torch.tensor([[tokenizer.bos_token_id] + encodings['input_ids'] + [tokenizer.eos_token_id]], device=model.device)
            else:                
                tensor_input = torch.tensor([encodings['input_ids'] + [tokenizer.eos_token_id]], device=model.device)
            output = model(tensor_input, labels=tensor_input)
            # Position t of the logits predicts token t+1, hence the one-step shift.
            shift_logits = output['logits'][..., :-1, :].contiguous()
            shift_labels = tensor_input[..., 1:].contiguous()
            log_probs = torch.nn.functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), reduction='none')
            # Sanity check: the mean per-token loss must match the loss reported by the model.
            assert torch.isclose(torch.exp(sum(log_probs)/len(log_probs)),torch.exp(output['loss']))
            # Windows after the first skip their first STRIDE-1 tokens, which
            # the previous window already scored.
            offset = 0 if start_ind == 0 else STRIDE-1
            # The final entry is the EOS prediction and is dropped.
            all_log_probs = torch.cat([all_log_probs,log_probs[offset:-1]])
            # Tokenizer offsets are relative to the window; shift them back to `sentence`.
            offset_mapping.extend([(i+start_ind, j+start_ind) for i,j in encodings['offset_mapping'][offset:]])
            # Stop once the last token of the window ends at the end of the text.
            if encodings['offset_mapping'][-1][1] + start_ind == len(sentence):
                break
            # Next window starts right after the STRIDE-th token from the end.
            start_ind += encodings['offset_mapping'][-STRIDE][1]
        return np.asarray(all_log_probs.cpu()), offset_mapping

def score_bert(sentence, model, tokenizer):
    """Score `sentence` with a masked LM by masking one token at a time.

    The text is tokenized in windows of at most 500 tokens.  For each scored
    position the token is replaced by [MASK], the model is run, and the
    negative log-softmax probability of the original token at that position is
    recorded.  The first token of the first window and the last token of every
    window are never scored (presumably the [CLS]/[SEP] specials -- confirm
    against the tokenizer).

    Args:
        sentence: text to score.
        model: masked LM called as `model(input_ids, labels=...)`; its output
            must expose 'logits', and it must have a `.device`.
        tokenizer: fast tokenizer supporting `return_offsets_mapping=True`.

    Returns:
        (scores, offsets): a list of per-token negative log-probabilities and
        a list of (start, end) character offsets into `sentence`.
    """
    mask_id = tokenizer.convert_tokens_to_ids('[MASK]')
    with torch.no_grad():
        all_log_probs = []
        offset_mapping = []
        # Character position in `sentence` where the current window starts.
        start_ind = 0
        while True:
            encodings = tokenizer(sentence[start_ind:], max_length=500, truncation=True, return_offsets_mapping=True)
            tensor_input = torch.tensor([encodings['input_ids']], device=model.device)
            mask_input = tensor_input.clone()
            # First token index to score in this window.
            offset = 1 if start_ind == 0 else STRIDE
            # Step back while the candidate token starts more than one character
            # past the end of the last recorded token, so no token is skipped.
            while offset_mapping and encodings['offset_mapping'][offset][0] + start_ind > offset_mapping[-1][1] + 1:
                offset -= 1
            for i, word in enumerate(encodings['input_ids'][:-1]):
                if i < offset:
                    continue
                mask_input[:,i]=mask_id
                output = model(mask_input, labels=tensor_input)
                log_probs = torch.nn.functional.log_softmax(output['logits'][:,i], dim=-1).squeeze(0)
                all_log_probs.append(-log_probs[tensor_input[0,i]].item())
                # Restore the original token before masking the next position.
                mask_input[:,i] = word
            offset_mapping.extend([(i+start_ind, j+start_ind) for i,j in encodings['offset_mapping'][offset:-1]])
            # Stop once the last scored token reaches (or is one character short of) the end of the text.
            if encodings['offset_mapping'][-2][1] + start_ind >= (len(sentence)-1):
                break
            # Next window starts right after the token at index -STRIDE-1.
            start_ind += encodings['offset_mapping'][-STRIDE-1][1]
            
        return all_log_probs, offset_mapping

def score_transxl(sentence, model, tokenizer):
    """Score `sentence` token by token with a Transformer-XL style LM.

    Args:
        sentence: text to score.
        model: LM called as `model(input_ids)`; its output must expose
            `prediction_scores`, and it must have a `.device`.
        tokenizer: callable returning an object with `.input_ids`, exposing
            `eos_token_id` and `convert_ids_to_tokens`.

    Returns:
        (scores, offsets): a numpy array holding the negated
        `prediction_scores` entry of each target token, and a list of
        (start, end) character offsets into `sentence`.  The offsets are
        reconstructed by searching for each token's text, because the
        tokenizer is not asked for an offset mapping here.
    """
    def create_offset_mapping(sentence, tokens):
        """Locate each token in `sentence`; returns a list of (start, end) spans."""
        start_ind = 0
        mapping = []
        for i,t in enumerate(tokens):
            # finding delimiters for <unk> tokens
            if t == '<unk>':
                # Bounds are checked before indexing so trailing whitespace or
                # an exhausted sentence cannot raise IndexError.
                while start_ind < len(sentence) and sentence[start_ind].isspace():
                    start_ind += 1
                if i == len(tokens) - 1:
                    mapping.append((start_ind, len(sentence)))
                    continue
                next_token = tokens[i+1].strip('@')
                # [:1] rather than [0]: an empty next token is treated as non-alphabetic.
                next_is_alpha = next_token[:1].isalpha()
                end_ind = start_ind
                # Extend the unknown span until whitespace, the end of the
                # sentence, or the place where a non-alphabetic next token begins.
                while end_ind < len(sentence) and \
                        not sentence[end_ind].isspace() and \
                        (next_is_alpha or not sentence.startswith(next_token, end_ind)):
                    end_ind += 1
                mapping.append((start_ind, end_ind))
                start_ind = end_ind
                continue

            t = t.strip('@')
            next_ind = sentence[start_ind:].find(t)
            if next_ind == -1:
                # Token text not found: report it and return the partial mapping.
                print("Error processing sentence...")
                print(t, sentence)
                mapping.append((start_ind, len(sentence)))
                return mapping
            mapping.append((next_ind+start_ind, next_ind+start_ind + len(t)))
            start_ind += next_ind + len(t)
        return mapping

    with torch.no_grad():
        encodings = tokenizer(sentence)
        # The EOS id is prepended as context for predicting the first real token.
        tensor_input = torch.tensor([[tokenizer.eos_token_id] + encodings.input_ids], device=model.device)
        output = model(tensor_input[:,:-1])
        log_probs = output.prediction_scores.squeeze(0)
        # Pick, at every position, the score of the token that actually follows.
        target_log_probs = np.asarray(-log_probs.gather(1, tensor_input[:,1:].T).squeeze(-1).cpu())
        offset_mapping = create_offset_mapping(sentence, tokenizer.convert_ids_to_tokens(encodings.input_ids) )
        return target_log_probs, offset_mapping
    
    
MOSESTOKENIZER = mosestokenizer.MosesTokenizer("en")
MOSESDETOKENIZER = mosestokenizer.MosesDetokenizer("en")
MOSESNORMALIZER = mosestokenizer.MosesPunctuationNormalizer("en")
def score_ngram(sentence, model, oov_nan=False):
    """Score `sentence` with an n-gram model exposing `full_scores`.

    The sentence is Moses-tokenized (wikitext-103 style) and every token is
    passed back through the detokenizer so that escaped characters return to
    their original form.

    Args:
        sentence: text to score.
        model: object whose `full_scores(text, eos=False, bos=True)` yields
            per-token tuples; element 0 is used as the score and element 2 as
            the out-of-vocabulary flag.
        oov_nan: when True, tokens flagged as OOV get NaN instead of a score.

    Returns:
        (scores, spans): numpy array of negated scores rescaled by 1/log10(e),
        and a list of (start, end) character spans of each token in `sentence`.
    """
    tokens = [MOSESDETOKENIZER([piece]) for piece in MOSESTOKENIZER(sentence)]
    joined = " ".join(tokens)

    # Walk through the sentence, anchoring each token at its first character.
    spans = []
    cursor = 0
    for token in tokens:
        while sentence[cursor] != token[0]:
            cursor += 1
        token_end = cursor + len(token)
        spans.append((cursor, token_end))
        cursor = token_end

    per_token = model.full_scores(joined, eos=False, bos=True)
    # Dividing a base-10 log by log10(e) converts it to a natural log.
    log10_e = np.log10(np.exp(1))
    values = []
    for entry in per_token:
        if oov_nan and entry[2]:
            values.append(np.nan)
        else:
            values.append(-entry[0]/log10_e)
    return np.array(values), spans

def get_corpus_mean(filename, model_name, n=10000):
    """Mean per-token score of `model_name` over the first `n` lines of a file.

    Whitespace-only lines are skipped but still count towards `n`.

    Args:
        filename: path of a text file with one passage per line.
        model_name: model identifier forwarded to `score`.
        n: number of lines to read before stopping.

    Returns:
        `np.mean` of all per-token scores collected.
    """
    collected = []
    with open(filename, 'r') as handle:
        for line_no, line in enumerate(handle):
            if line_no == n:
                break
            if not line.isspace():
                collected.extend(score(line.strip(), model_name)[0])
    return np.mean(collected)
#get_corpus_mean('wikitext-103/wiki.train.tokens', "bert", n=100000)

In [None]:
# Currently loaded model/tokenizer, shared across calls so that a model is
# only (re)loaded when a different one is requested.
model, tokenizer = None, None
# Mean per-token score of the currently loaded model; set by score() on load.
CORPUS_MEAN = np.nan
import gc  
def score(sentence, model_name):
    """Score `sentence` with the requested model, loading it on demand.

    Only one model is kept in memory: requesting a different model releases
    the current one first.  Loading also updates the module globals `model`,
    `tokenizer` and `CORPUS_MEAN`.

    Args:
        sentence: text to score.
        model_name: any name containing "ngram" ("ngram_oov_nan" marks OOV
            tokens as NaN), "bert", "transxl" or "dutch_gpt"; every other
            value selects English GPT-2.

    Returns:
        The (scores, offsets) pair produced by the matching `score_*` function.
    """
    global model
    global tokenizer
    global CORPUS_MEAN

    def clear_cache():
        # Rebind to None rather than `del`: if the following load fails, the
        # global still exists and the next call can retry instead of raising
        # NameError.
        global model
        model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def load_transformer(model_cls, tokenizer_cls, checkpoint, corpus_mean):
        # Replace the resident model with `checkpoint` (moved to GPU if available).
        global model, tokenizer, CORPUS_MEAN
        clear_cache()
        CORPUS_MEAN = corpus_mean
        model = model_cls.from_pretrained(checkpoint)
        model.eval()
        if torch.cuda.is_available():
            model = model.cuda()
        tokenizer = tokenizer_cls.from_pretrained(checkpoint)

    if "ngram" in model_name:
        if type(model) != kenlm.Model:
            clear_cache()
            #estimated from wikitext-103 using "get_corpus_mean"
            CORPUS_MEAN = 2.7068
            model = kenlm.Model('wiki.arpa')
        if model_name == "ngram_oov_nan":
            return score_ngram(sentence, model, oov_nan=True)
        return score_ngram(sentence, model)
    if model_name == "bert":
        if type(model) != BertForMaskedLM:
            #estimated from wikitext-103
            load_transformer(BertForMaskedLM, BertTokenizerFast, 'bert-base-cased', 1.4463)
        return score_bert(sentence, model, tokenizer)
    if model_name == "transxl":
        if type(model) != TransfoXLLMHeadModel:
            #estimated from wikitext-103
            load_transformer(TransfoXLLMHeadModel, TransfoXLTokenizer, 'transfo-xl-wt103', 3.7033)
        return score_transxl(sentence, model, tokenizer)
    if model_name == "dutch_gpt":
        # Reload unless the Dutch checkpoint is already resident.  The original
        # comparison was inverted, which reloaded the Dutch model on every call
        # and never replaced a resident English GPT-2.
        if type(model) != GPT2LMHeadModel or model.config._name_or_path != 'GroNLP/gpt2-small-dutch':
            # NOTE(review): the original comment said "estimated from
            # wikitext-103"; confirm which corpus this Dutch value came from.
            load_transformer(GPT2LMHeadModel, GPT2TokenizerFast, "GroNLP/gpt2-small-dutch", 3.9453)
        return score_gpt(sentence, model, tokenizer)
    else: 
        if type(model) != GPT2LMHeadModel or model.config._name_or_path != 'gpt2':
            #estimated from wikitext-103
            load_transformer(GPT2LMHeadModel, GPT2TokenizerFast, 'gpt2', 3.8845)
        return score_gpt(sentence, model, tokenizer)


In [None]:
### Sanity check for above function
# Score three toy inputs with the "transxl" model and print, for each input,
# the sum of its per-token scores (element 0 of what score() returns).
a=['there is a pen on the desk. (HE)'*20,
                'there is a plane on the desk',
                        'there is a books in the desk']
scores = [score(i, "transxl") for i in a]
print([sum(s[0]) for s in scores])

### Unigram model

In [None]:
#Build unigram model from wikitext-103
def get_ngrams(sentence, n=3):
    """Split a sentence into (context, word) n-gram pairs.

    The whitespace-split sentence is padded with n-1 "BOS" markers in front
    and one "EOS" marker at the end.

    Args:
        sentence: text to split on whitespace.
        n: n-gram order; the context of each pair has n-1 words.

    Returns:
        List of (context_tuple, word) pairs, one per position.
    """
    padded = ["BOS"]*(n-1) + sentence.split() + ["EOS"]
    context = n - 1
    return [
        (tuple(padded[start:start+context]), padded[start+context])
        for start in range(len(padded)-n+1)
    ]

def normalize(root, log=True):
    """Turn per-context count tables into (log-)probability tables, in place.

    Args:
        root: mapping context -> {word: count}; its values are replaced.
        log: store natural-log probabilities when True, plain probabilities
            otherwise.

    Returns:
        The same `root` object.
    """
    for prefix in list(root):
        table = root[prefix]
        total = np.sum(list(table.values()))
        if log:
            log_total = np.log(total)
            root[prefix] = {word: np.log(count) - log_total for word, count in table.items()}
        else:
            root[prefix] = {word: count/total for word, count in table.items()}
    return root

def sampling_format(root, normalize=True):
    """Convert per-context tables into (words, log-weights) pairs, in place.

    Args:
        root: mapping context -> {word: weight}; its values are replaced.
        normalize: divide the weights by their sum before taking the log.

    Returns:
        The same `root` object, each value now `(list_of_words, np.ndarray)`.
    """
    for prefix in list(root):
        table = root[prefix]
        words = list(table.keys())
        weights = np.array([table[word] for word in words])
        scaled = weights/np.sum(weights) if normalize else weights
        root[prefix] = (words, np.log(scaled))
    return root

def create_ngram_model(filename, n, outfile):
    """Build a log-probability n-gram table from a text file and pickle it.

    Each non-blank line is lower-cased, stripped and split into n-grams with
    `get_ngrams`; counts are then normalised to natural-log probabilities.

    Args:
        filename: path of the training text, one passage per line.
        n: n-gram order.
        outfile: path the pickled table (a plain dict) is written to.

    Returns:
        The normalised table: context tuple -> {word: log-probability}.
    """
    root = defaultdict(lambda: defaultdict(int))
    with open(filename, 'r') as f:
        for sentence in f:
            # Lines read from a file keep their newline, so a plain truthiness
            # test never skips anything; strip first so that blank lines do not
            # each contribute a spurious "EOS" count.
            if not sentence.strip():
                continue
            for prefix, word in get_ngrams(sentence.lower().strip(), n):
                root[prefix][word] += 1
    root = normalize(root, log=True)
    # dict(...) drops the lambda-based default factory, which cannot be pickled.
    with open(outfile, "wb") as out:
        pickle.dump(dict(root), out)
    return root

class UnigramModel:
    """Unigram score lookup for one language.

    `lang == "en"` reads `unigrams.csv` (column 0 is the word, column 1 a
    number that is stored negated).  Any other language must be a key of
    `lang_mapping` and is read from the corresponding pickle.

    NOTE(review): the CSV values are negated while the pickled values are used
    as stored -- confirm both sources end up with the same sign convention.
    """
    # Pickle file per language code (the plain "en" code uses the CSV instead).
    lang_mapping = {"en_base": "unigram.pkl",
                    "nl": "unigram_nl.pkl"}
    def __init__(self, lang):
        """Load the lookup table for `lang`; raises KeyError for an unknown code."""
        self.lang = lang
        if lang == "en":
            with open('unigrams.csv', mode='r') as infile:
                reader = csv.reader(infile)
                self.lookup = {rows[0]:-float(rows[1]) for rows in reader}
        else:
            # The pickle maps context tuples to word tables; the empty tuple is
            # the context of a unigram (see get_ngrams with n=1).
            # NOTE(review): pickle.load executes arbitrary code -- only load
            # files produced by this notebook.
            with open(UnigramModel.lang_mapping[self.lang], "rb") as infile:
                self.lookup = pickle.load(infile)[()]
    def __getitem__(self, key):
        """Summed score of the Moses tokens of `key`.

        Returns NaN when any token is missing from the table or when
        tokenization fails for any reason (best-effort lookup).
        """
        try:
            tokens = [MOSESDETOKENIZER([t]) for t in MOSESTOKENIZER(key)]
            return np.sum([self.lookup.get(k, np.nan) for k in tokens])
        except Exception:
            # Still best-effort, but KeyboardInterrupt/SystemExit now propagate.
            return np.nan


# Cached UnigramModel, rebuilt whenever a different language is requested.
freq_model = None
def frequency(word, lang="en"):
    """Look up `word` in the unigram model for `lang`.

    The model is kept in the module global `freq_model` and only reloaded
    when none is cached or the cached one is for a different language.
    """
    global freq_model
    needs_load = (not freq_model) or freq_model.lang != lang
    if needs_load:
        freq_model = UnigramModel(lang)
    return freq_model[word]
        
#create_ngram_model("wikitext-103/wiki.train.tokens", 1, "unigram.pkl")


### Corpus Statistics

In [None]:
# Helpers
import nltk
from scipy.special import log_softmax, softmax
POWER_RANGE = np.arange(0., 3, 0.25)
def power(x, y): 
    """Mean of `x**y`, or NaN when every entry of `x` is masked.

    Args:
        x: numpy masked array (its `.mask` attribute is read).
        y: exponent.
    """
    everything_masked = x.mask.all()
    return np.nan if everything_masked else np.nanmean(x**y)

def ent(x):
    """Entropy of the softmax distribution over the negated values of `x`.

    Args:
        x: array-like exposing `.data` (e.g. a numpy masked array); NaN
            entries are replaced by -inf before the values are negated.
    """
    filled = np.nan_to_num(x.data, copy=True, nan=-np.inf)
    log_p = log_softmax(-filled)
    weighted = np.exp(log_p)*log_p
    return -np.sum(weighted)

def ent2(x):
    """Sum of `exp(-x) * x` over all entries of `x`."""
    weights = np.exp(-x)
    return np.sum(weights*x)

def r_ent(x, k=2):
    """Order-`k` Renyi-style entropy of the softmax over the negated `x`.

    Computes `1/(1-k) * log(sum(p**k))` with `p = softmax(-x)`; NaN entries of
    `x.data` are replaced by -inf first.  `k == 1` divides by zero.
    """
    filled = np.nan_to_num(x.data, copy=True, nan=-np.inf)
    probs = softmax(-filled)
    scale = 1/(1-k)
    return scale*np.log(np.sum(probs**k))

def r_ent2(x, k=2):
    """`1/(1-k) * log(sum(exp(-x)**k))`, without normalising `exp(-x)` first."""
    scale = 1/(1-k)
    powered = np.exp(-x)**k
    return scale*np.log(np.sum(powered))

def local_diff(x):
    """Sum of absolute differences between neighbours, divided by `len(x)`."""
    total = 0
    for pos in range(1, len(x)):
        total += abs(x[pos]-x[pos-1])
    return total/len(x)

def local_diff2(x):
    """Sum of squared differences between neighbours, divided by `len(x)`."""
    total = 0
    for pos in range(1, len(x)):
        total += (x[pos]-x[pos-1])**2
    return total/len(x)

def tokenize_to_sents(s):
    """Split `s` into sentences with NLTK, repairing one known mis-split.

    A piece whose first whitespace-delimited token is `',` is glued back onto
    the previous sentence instead of becoming a sentence of its own.
    """
    merged = []
    for piece in nltk.sent_tokenize(s):
        #weird failure case of sentence tokenizer
        starts_with_stray_quote = piece.split()[0] == "',"
        if starts_with_stray_quote:
            merged[-1] += piece
        else:
            merged.append(piece)
    return merged
#nltk.download('punkt')
def string_join(x, j=''):
    """Concatenate the strings in `x`, separated by `j`."""
    joined = j.join(x)
    return joined

def ordered_string_join(x, j=''):
    """Sort (key, string) pairs by key and join the strings.

    Returns:
        (keys, joined): tuple of the sorted keys, and the strings joined with
        `j` in that order.
    """
    ordered = sorted(x, key=lambda pair: pair[0])
    keys, pieces = zip(*ordered)
    return keys, j.join(pieces)

def get_word_mapping(words):
    """Character spans of `words` as if they were joined by single spaces.

    Returns:
        List of (start, end) pairs, one per word, with `end` exclusive.
    """
    spans = []
    start = 0
    for word in words:
        stop = start + len(word)
        spans.append((start, stop))
        # Skip the single separating space.
        start = stop + 1
    return spans

def string_to_log_probs(string, probs, offsets):
    """Aggregate sub-word scores into one score per whitespace-delimited word.

    Token scores are accumulated until a token ends exactly where the current
    word ends; the running sum is then emitted and reset.

    Args:
        string: the scored text; words are assumed to be separated by single
            spaces (see `get_word_mapping`).
        probs: per-token scores.
        offsets: per-token (start, end) character offsets into `string`.

    Returns:
        List with one summed score per completed word.

    Raises:
        AssertionError: if a token ends beyond the end of the current word.
    """
    word_spans = get_word_mapping(string.split())
    per_word = []
    running = 0
    word_idx = 0
    for token_score, (_, token_end) in zip(probs, offsets):
        running += token_score
        word_end = word_spans[word_idx][1]
        if token_end == word_end:
            per_word.append(running)
            running = 0
            word_idx += 1
        assert token_end <= word_end
    return per_word

def string_to_uni_log_probs(string):
    """Unigram score (via `frequency`) of every whitespace-delimited word.

    Each word is lower-cased and stripped of surrounding whitespace and
    punctuation before the lookup.
    """
    cleaned = (tok.strip().strip(punctuation).lower() for tok in string.split())
    return [frequency(word.strip().strip(punctuation)) for word in cleaned]


In [None]:
def corpus_stats(stories, models=["gpt","bert","ngram"], wiki_mean=True, split_sens=True):
    """Compute per-text and per-sentence statistics for a collection of texts.

    Args:
        stories: iterable of (text_id, text) pairs.
        models: model names understood by `score`.
        wiki_mean: use the global `CORPUS_MEAN` of the loaded model as the
            language-level mean; otherwise use the mean score over `stories`.
        split_sens: split each text into sentences with `tokenize_to_sents`;
            when False each text is treated as a single sentence.

    Returns:
        Dict of statistics.  Model-independent entries ('split_string', 'len',
        'sent_markers', 'ch_len', 'uni_log_probs', 'uni_log_prob_power_<p>')
        are keyed by text id; model entries ('log_probs', 'agg_log_probs',
        'log_prob_*', 'prob_power_<p>') are keyed by model name, then text id.
        Per-sentence entries are lists with one value per sentence.
    """
    # Materialise once: the pairs are traversed several times below, which
    # would silently yield nothing for a one-shot iterator.
    stories = list(stories)
    stats = defaultdict(dict)
    for i, s in stories:
        # remove leading and trailing white space
        s = s.strip()
        stats['split_string'][i] = s.split()
        sents = tokenize_to_sents(s) if split_sens else [s]
        lens = [len(sen.split()) for sen in sents]
        # Sentence splitting must not change the number of words.
        assert len(s.split()) == sum(lens)
        stats['len'][i] = np.array(lens)
        # Cumulative word counts: entry j is the index one past sentence j's last word.
        stats['sent_markers'][i] = np.cumsum(lens)
        stats['ch_len'][i] = np.array([sum([len(ch) for ch in sen.split()]) for sen in sents])
        stats['uni_log_probs'][i] = np.array(string_to_uni_log_probs(s))
        for p in POWER_RANGE:
            stats['uni_log_prob_power_'+ str(p)][i] = []
            for j in range(len(stats['sent_markers'][i])):
                prev = 0 if not j else stats['sent_markers'][i][j-1]
                end = stats['sent_markers'][i][j]
                sent_uni_log_probs = np.ma.masked_invalid(stats['uni_log_probs'][i][prev:end])
                stats['uni_log_prob_power_'+ str(p)][i].append(power(sent_uni_log_probs, p))

    log_prob_stats = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for mod in models:
        # Each story is scored exactly once per model.  The original scored
        # everything a second time into 'uni_log_probs', which doubled the
        # cost and replaced the unigram values computed above with model output.
        log_prob_stats['log_probs'][mod] = {i:score(s.strip(), mod) for i,s in stories}
        lang_mean = CORPUS_MEAN if wiki_mean else np.nanmean(np.concatenate([s[0] for s in log_prob_stats['log_probs'][mod].values()]))
        print("Using language mean surprisal:", lang_mean)
        for i, s in stories:
            log_prob_stats['agg_log_probs'][mod][i] = np.array(string_to_log_probs(s.strip(), *log_prob_stats['log_probs'][mod][i]))
            assert len(log_prob_stats['agg_log_probs'][mod][i]) == len(stats['split_string'][i])
            for j in range(len(stats['sent_markers'][i])):
                prev = 0 if not j else stats['sent_markers'][i][j-1]
                end = stats['sent_markers'][i][j]
                sent_log_probs = np.ma.masked_invalid(log_prob_stats['agg_log_probs'][mod][i][prev:end])
                log_prob_stats['log_prob_variance'][mod][i].append(np.var(sent_log_probs))
                log_prob_stats['log_prob_variance_lang'][mod][i].append(np.mean((sent_log_probs - lang_mean)**2))
                log_prob_stats['log_prob_max'][mod][i].append(np.amax(sent_log_probs))
                log_prob_stats['log_prob_mean'][mod][i].append(np.mean(sent_log_probs))
                log_prob_stats['log_prob_ldiff'][mod][i].append(local_diff(sent_log_probs))
                log_prob_stats['log_prob_ldiff2'][mod][i].append(local_diff2(sent_log_probs))

                for p in POWER_RANGE:
                    if p == 0:
                        log_prob_stats['log_prob_power_' + str(p)][mod][i].append(power(sent_log_probs, 0))
                        log_prob_stats['prob_power_' + str(p)][mod][i].append(power(np.exp(-sent_log_probs), 0))
                        continue
                    # Entropy-based statistic: plain value below order 1, and
                    # the reciprocal (0 when the value is 0) from order 1 up.
                    if p < 1:
                        entropy_stat = r_ent(sent_log_probs, p)
                    else:
                        raw = ent(sent_log_probs) if p == 1 else r_ent(sent_log_probs, p)
                        entropy_stat = 1/raw if raw else 0
                    log_prob_stats['log_prob_entropy_' + str(p)][mod][i].append(entropy_stat)
                    log_prob_stats['log_prob_power_' + str(p)][mod][i].append(power(sent_log_probs, p))
                    log_prob_stats['prob_power_' + str(p)][mod][i].append(power(np.exp(-sent_log_probs), p))
    stats.update(log_prob_stats)
    return stats


In [None]:
def add_standard_columns(df, split_strings, lang="en"):
    """Add word-level predictor columns to `df` in place.

    Args:
        df: DataFrame with at least 'text_id', 'new_ind', 'time', 'WorkerId'
            and 'word' columns.
        split_strings: mapping text_id -> list of words of that text.
        lang: language code forwarded to `frequency`.

    Adds 'ref_token', 'centered_time', 'prev_word', 'word_len',
    'prev_word_len', 'freq' and 'prev_freq'.  Returns None.
    """
    # ref token is sanity check. should be same as word
    df['ref_token'] = df.apply(lambda x: split_strings[x['text_id']][x['new_ind']], axis=1)
    # Select 'time' before transforming: averaging every column fails on
    # non-numeric columns in recent pandas and wastes work in older ones.
    df['centered_time'] = df['time'] - df.groupby(by=["WorkerId"])["time"].transform('mean')
    df['prev_word'] = df.apply(lambda x: split_strings[x['text_id']][x['new_ind']-1] if x['new_ind']-1 >= 0 else '', axis=1)
    df['word_len'] = df.apply(lambda x: len(x['word']), axis=1)
    df['prev_word_len'] = df.apply(lambda x: len(x['prev_word']), axis=1)
    df['freq'] = df.apply(lambda x: frequency(x['word'].strip().strip(punctuation).lower(), lang), axis=1)
    df['prev_freq'] = df.apply(lambda x: frequency(x['prev_word'].strip(punctuation).lower(), lang), axis=1)

def nancount(x):
    """Number of missing values in a pandas object."""
    missing = x.isna()
    return missing.sum()
def produce_aggregate_per_subject_sentence(main_df, line_col=False):
    """Aggregate word-level rows to one row per (WorkerId, text, sentence).

    Args:
        main_df: word-level frame with 'WorkerId', 'text_id', 'sentence_num',
            'time', 'word_len', 'freq' and 'outlier' columns.
        line_col: optional column name; when given, a 'line_breaks' column
            holds the number of distinct values of that column per group.

    Returns:
        Frame whose columns are '<column>_<aggregate>' (the group keys become
        'WorkerId_', 'text_id_', 'sentence_num_'), plus an 'id' column of the
        form '<text_id>_<sentence_num>'.
    """
    group_keys = ["WorkerId", "text_id", "sentence_num"]
    agg_spec = {
        "time": [np.sum, np.mean, np.count_nonzero],
        "word_len": [np.sum, np.mean],
        "freq": [np.nansum, np.nanmean, nancount],
        "outlier": np.sum,
    }
    result = main_df.groupby(by=group_keys).agg(agg_spec).reset_index()
    # Flatten the two-level column index into '<column>_<aggregate>' names.
    result.columns = ['_'.join(col).strip() for col in result.columns.values]
    if line_col:
        distinct = main_df.groupby(by=group_keys, as_index = False).agg({line_col: lambda x:len(np.unique(x))})
        result['line_breaks'] = distinct[line_col]
    result['id'] = result.apply(lambda x: str(int(x['text_id_'])) +'_'+str(int(x['sentence_num_'])), axis=1)
    return result
    
def produce_aggregate_per_sentence(aggregate_per_subject_sentence, remove_outliers=True):
    aggregate_per_sentence = aggregate_per_subject_sentence.groupby(by=["text_id_", "sentence_num_"]).agg({"time_sum": np.mean, 
                                                               "time_count_nonzero": lambda x: scipy.stats.mode(x, nan_policy='omit')[0],
                                                               "time_mean": np.mean})
    if remove_outliers:
        aggregate_per_subject_sentence = aggregate_per_subject_sentence.loc[aggregate_per_subject_sentence.outlier_sum ==0]
        tmp = aggregate_per_subject_sentence.groupby(by=["text_id_", "sentence_num_"]).agg({"time_sum": np.mean, 
                                                               "time_count_nonzero": lambda x: scipy.stats.mode(x, nan_policy='omit')[0],
                                                               "time_mean": np.mean})
        aggregate_per_sentence['time_sum_NO'] = tmp['time_sum']
        aggregate_per_sentence['time_count_nonzero_NO'] = tmp['time_count_nonzero']        
        aggregate_per_sentence['time_mean_NO'] = tmp['time_mean']
    aggregate_per_sentence = aggregate_per_sentence.reset_index()
    aggregate_per_sentence['id'] = aggregate_per_sentence.apply(lambda x: str(int(x['text_id_'])) +'_'+str(int(x['sentence_num_'])), axis=1)
    return aggregate_per_sentence
 

In [None]:
def add_log_prob_columns(df, stats, model="gpt", inplace=False, rolling_vals=True):
    """Attach per-word model scores and derived columns for one model.

    Args:
        df: word-level frame with 'text_id', 'new_ind', 'sentence_num' and
            'WorkerId' columns.
        stats: statistics dict as returned by `corpus_stats`; the entries
            'agg_log_probs' and 'log_prob_mean' are read for `model`.
        model: model name used as key into `stats`.
        inplace: add the columns to `df` itself instead of a shallow copy.
        rolling_vals: also compute the cumulative/rolling averages and the
            squared deviations from them.  Defaults to True, which matches the
            behaviour before this flag existed; `create_analysis_dfs` passes
            False.

    Returns:
        The frame with 'model', 'log_prob', 'prev*_log_prob', 'diff*' and,
        when requested, the 'cum_*' / 'rolling_*' columns.

    NOTE(review): 'diff2_lang' uses the module global CORPUS_MEAN, which holds
    the value of whichever model `score` loaded last -- that may not be
    `model` when several models are processed.
    """
    ret_df = df if inplace else df.copy(deep=False)
    ret_df['model'] = model
    word_scores = stats['agg_log_probs'][model]
    sentence_means = stats['log_prob_mean'][model]

    ret_df['log_prob'] = ret_df.apply(lambda x: word_scores[x['text_id']][x['new_ind']], axis=1)
    # Scores of the one, two and three preceding words (NaN before the text start).
    for lag, column in ((1, 'prev_log_prob'), (2, 'prev2_log_prob'), (3, 'prev3_log_prob')):
        ret_df[column] = ret_df.apply(
            lambda x, lag=lag: word_scores[x['text_id']][x['new_ind']-lag] if x['new_ind']-lag >= 0 else np.nan,
            axis=1)

    # Mean score of each text, computed once per text rather than once per row.
    text_means = {text_id: np.mean(word_scores[text_id]) for text_id in ret_df['text_id'].unique()}
    ret_df['diff_par'] = ret_df['log_prob'] - ret_df['text_id'].map(text_means)
    ret_df['diff2_par'] = ret_df['diff_par']**2
    ret_df['diff_sen'] = ret_df.apply(lambda x: x['log_prob']-sentence_means[x['text_id']][x['sentence_num']], axis=1)
    ret_df['diff2_sen'] = ret_df['diff_sen']**2
    ret_df['diff2_lang'] = (ret_df['log_prob']-CORPUS_MEAN)**2

    if rolling_vals:
        # rolling calcs: averages over the words already read in the sentence
        # (shift(1) excludes the current word).
        by_sentence = ret_df.sort_values(by='new_ind').groupby(by=["WorkerId","text_id", "sentence_num"])["log_prob"]
        windows = (1, 2, 3, 4)
        ret_df['cum_average'] = by_sentence.transform(lambda x: x.expanding().mean().shift(1))
        for width in windows:
            ret_df['rolling_average' + str(width)] = by_sentence.transform(
                lambda x, width=width: x.rolling(width, min_periods = 1).mean().shift(1))
        ret_df['cum_lvar'] = (ret_df['log_prob']-ret_df['cum_average'])**2
        for width in windows:
            ret_df['rolling_lvar' + str(width)] = (ret_df['log_prob']-ret_df['rolling_average' + str(width)])**2

    return ret_df

def add_log_prob_aggregate_cols(aggregate_per_sentence, stats, model="gpt", inplace=False):
    """Attach sentence-level statistics of one model to an aggregate frame.

    Args:
        aggregate_per_sentence: frame with 'text_id_' and 'sentence_num_'.
        stats: statistics dict as returned by `corpus_stats`.
        model: model name used as key into the model-specific statistics.
        inplace: add the columns to the input frame instead of a copy.

    Returns:
        The frame with a 'model' column, one column per statistic (looked up
        by text id and sentence number), 'log_prob_std', 'len' and 'ch_len'.
    """
    ret_df = aggregate_per_sentence if inplace else aggregate_per_sentence.copy()
    ret_df['model'] = model

    def attach(name, per_model):
        # Look the statistic up row by row; model statistics have one extra
        # level keyed by the model name.
        def fetch(row):
            table = stats[name][model] if per_model else stats[name]
            return table[row['text_id_']][int(row['sentence_num_'])]
        ret_df[name] = aggregate_per_sentence.apply(fetch, axis=1)

    model_columns = ['log_prob_mean','log_prob_max','log_prob_variance', 
                     'log_prob_variance_lang', 'log_prob_ldiff', 'log_prob_ldiff2']

    for p in POWER_RANGE:
        suffix = str(p)
        attach('uni_log_prob_power_'+suffix, per_model=False)
        # No entropy statistic exists for power 0.
        if p != 0:
            model_columns.append('log_prob_entropy_'+suffix)
        model_columns.append('log_prob_power_'+suffix)
        model_columns.append('prob_power_'+suffix)
    for name in model_columns:
        attach(name, per_model=True)

    ret_df['log_prob_std'] = np.sqrt(ret_df['log_prob_variance'].astype(float))
    for name in ('len', 'ch_len'):
        ret_df[name] = ret_df.apply(lambda x, name=name: stats[name][x['text_id_']][int(x['sentence_num_'])], axis=1)

    return ret_df

In [None]:
def find_outliers(df, field='time', transform=lambda x: x):
    """Flag rows whose (transformed) `field` lies more than 3 SDs from the mean.

    Args:
        df: frame to annotate; an 'outlier' boolean column is written to it.
        field: column to test.
        transform: function applied to the column before z-scoring.

    Returns:
        The same `df`.  Also prints the fraction of rows flagged.
    """
    from scipy.stats import zscore
    deviation = np.abs(zscore(transform(df[field])))
    df.loc[:,'outlier'] = deviation > 3
    flagged_share = sum(df['outlier'])/len(df)
    print("Percentage of outliers:", flagged_share)
    return df
#remove_outliers(dundee, transform=np.log)

In [None]:
def create_analysis_dfs(main, stats, models, inplace=True, lang="en"):
    """Build the word-level, per-subject-sentence and per-sentence frames.

    Args:
        main: word-level reading-time frame.
        stats: statistics dict as returned by `corpus_stats`.
        models: model names; each frame is built once per model and the
            results are stacked with `pd.concat`.
        inplace: add the standard columns to `main` itself instead of a copy.
        lang: language code forwarded to `add_standard_columns`.

    Returns:
        (word_level, per_subject_sentence, per_sentence) frames.
    """
    if not inplace:
        main = main.copy()
    #get standard corpus statistics
    add_standard_columns(main, stats['split_string'], lang=lang)
    per_subject = produce_aggregate_per_subject_sentence(main)
    per_sentence = produce_aggregate_per_sentence(per_subject)
    #get log_prob related corpus statistics
    word_frames = [add_log_prob_columns(main, stats, model=mod, rolling_vals=False) for mod in models]
    subject_frames = [add_log_prob_aggregate_cols(per_subject, stats, model=mod) for mod in models]
    sentence_frames = [add_log_prob_aggregate_cols(per_sentence, stats, model=mod) for mod in models]
    return pd.concat(word_frames), pd.concat(subject_frames), pd.concat(sentence_frames)
    

In [None]:
def pickle_stats(main, subject_sen, sen, name):
    """Pickle the three analysis objects next to each other.

    Writes `<name>_df.pkl`, `<name>_subject_sen.pkl` and `<name>_sen.pkl`.

    Args:
        main: word-level object.
        subject_sen: per-subject-sentence object.
        sen: per-sentence object.
        name: path prefix shared by the three files.
    """
    targets = ((main, '_df.pkl'), (subject_sen, '_subject_sen.pkl'), (sen, '_sen.pkl'))
    for obj, suffix in targets:
        # `with` closes (and therefore flushes) each file deterministically.
        with open(name + suffix, "wb") as handle:
            pickle.dump(obj, handle)

def load_stats(name):
    """Load the three objects written by `pickle_stats` for `name`.

    Returns:
        (main, subject_sen, sen), read from `<name>_df.pkl`,
        `<name>_subject_sen.pkl` and `<name>_sen.pkl`.
    """
    loaded = []
    for suffix in ('_df.pkl', '_subject_sen.pkl', '_sen.pkl'):
        # NOTE(review): unpickling runs arbitrary code; only load files that
        # were produced by this notebook.
        with open(name + suffix, "rb") as handle:
            loaded.append(pickle.load(handle))
    return tuple(loaded)


## Datasets

In [None]:
# Models to run for every dataset below; the names left in the trailing
# comment are currently disabled.
MODELS = ['gpt']#,'transxl', 'ngram', 'bert' ]

### Natural Stories

In [None]:
# Token-level table for the Natural Stories texts, downloaded from the
# naturalstories repository.  It is used here to rebuild the story texts.
gpt3_probs = pd.read_csv("https://raw.githubusercontent.com/languageMIT/naturalstories/master/probs/all_stories_gpt3.csv")
# To get same indexing as stories db
gpt3_probs["story"] = gpt3_probs["story"] + 1
# Distance from each token's offset to the next token's offset within the same
# story (presumably character offsets -- confirm).  The last token of a story
# has no successor, so fill_value=0 makes its distance 0 - offset.
gpt3_probs['len'] = gpt3_probs.groupby("story", sort=False)['offset'].shift(periods=-1, fill_value=0) - gpt3_probs['offset'] 
# Append a space whenever that distance differs from the token's own length,
# so that concatenating `new_token` yields spaced text.
gpt3_probs['new_token'] = gpt3_probs.apply(lambda x: x['token'] if x['len'] == len(x['token']) else x['token'] + ' ', axis=1) 

In [None]:
# Concatenate the tokens of each story (in file order) into one string, pair
# each string with its story id, and compute the corpus statistics.
stories_df = gpt3_probs.groupby(by=["story"], sort=False).agg({"new_token":[string_join]}).reset_index()
stories = list(zip(stories_df['story'], stories_df['new_token', 'string_join']))
ns_stats = corpus_stats(stories, models=MODELS)

In [None]:
# Reading times for Natural Stories, with exact duplicate rows removed.
natural_stories = pd.read_csv("https://raw.githubusercontent.com/languageMIT/naturalstories/master/naturalstories_RTS/processed_RTs.tsv", sep='\t').drop_duplicates()
# Use the column names expected by the analysis helpers.
natural_stories.rename(columns = {'RT':'time', 
                                   'item': 'text_id'}, inplace = True)
# Zero-based word index within the story (assumes `zone` is 1-based -- confirm).
natural_stories['new_ind'] = natural_stories['zone'] - 1
# Sentence index: number of sentence end markers at or before the word index.
natural_stories['sentence_num'] = natural_stories.apply(lambda x: bisect.bisect(ns_stats['sent_markers'][x['text_id']], x['new_ind']), axis=1)
# Flag outliers on log reading time.
natural_stories = find_outliers(natural_stories, transform=np.log)

In [None]:
# Build the word-level, per-subject-sentence and per-sentence frames.  Note
# that this rebinds `natural_stories`, so the cell is not safe to re-run
# without re-running the loading cell first.
natural_stories, ns_agg_per_subject_sentence, ns_mean_per_sen = create_analysis_dfs(natural_stories, ns_stats, MODELS) 
#pickle_stats(natural_stories, ns_agg_per_subject_sentence, ns_mean_per_sen, "ns")

In [None]:
# Rows where the corpus word differs from the reference token rebuilt from
# the story text; looks like there's a small misspelling somewhere ;)
natural_stories[natural_stories['word'] != natural_stories['ref_token']]

In [None]:
# Replace the frames computed above with the pickled versions stored under
# the "ns" prefix.  NOTE(review): the pickle_stats call that writes these
# files is commented out in an earlier cell, so they must already exist on disk.
natural_stories, ns_agg_per_subject_sentence, ns_mean_per_sen = load_stats("ns")

### Provo

In [None]:
# Provo reading data; rename to the column names the helpers expect and keep
# two further timing measures as 'time2' and 'time3'.
provo = pd.read_csv('corpora/provo.csv')
provo.rename(columns = {'IA_DWELL_TIME':'time', 'Participant_ID': 'WorkerId', 'Word':'word', 
                        "Text_ID":"text_id", "Sentence_Number":"sentence_num",
                       "IA_FIRST_RUN_DWELL_TIME": 'time2', 'IA_FIRST_FIXATION_DURATION':'time3'}, inplace = True)
# Rows without a word number cannot be aligned with the text.
provo = provo.dropna(subset=["Word_Number"])
provo = provo.astype({"Word_Number": 'Int64', "sentence_num": 'Int64'})
# Normalise punctuation in the same way as the text used for scoring.
provo['word'] = provo.apply(lambda x: MOSESNORMALIZER(x['word']).strip(), axis=1)
#fixing small discrepancy
provo.loc[provo['word'] == '0.9', 'word'] = '90%'

In [None]:
# One row per (Text_ID, Text) pair, ordered by text id.
provo_text = pd.read_csv('corpora/provo_norms.csv')[['Text_ID','Text']].drop_duplicates().sort_values(by=['Text_ID'])
# Text 27 occurs in several variants; keep only the one(s) containing "doesn't".
provo_text.drop(provo_text[(provo_text.Text_ID == 27) & (~provo_text.Text.str.contains("doesn't", regex=False))].index, inplace=True)
# 1-based word positions of each text.
inds = provo_text.apply(lambda x: list(range(1,len(x['Text'].split())+1)), axis=1)
inds = {i:j for i,j in zip(provo_text['Text_ID'], inds)}
# Text per id, with the Unicode replacement character mapped to "?".
paragraphs = {i:j.replace(u"\uFFFD", "?") for i,j in provo_text[['Text_ID','Text']].itertuples(index=False, name=None)}
# Lower-cased, punctuation-stripped words per text, used to align words below.
paragraphs_split = {i:[k.strip(punctuation) for k in j.lower().split()] for i,j in paragraphs.items()}

In [None]:
# Corpus statistics for the Provo texts, keyed by Text_ID.
provo_stats = corpus_stats(paragraphs.items(), models=MODELS)

In [None]:
# Start the search two positions before the recorded word number, then move
# forward to the first position whose cleaned word matches; raises ValueError
# if the word is not found.
# NOTE(review): a Word_Number of 1 gives a start index of -1, which slices
# from the end of the text -- presumably that value does not occur; confirm.
provo["new_ind"] = provo["Word_Number"] - 2
provo['new_ind'] = provo.apply(lambda x: x["new_ind"] + paragraphs_split[x['text_id']][x["new_ind"]:].index(x["word"].lower().strip(punctuation)), axis=1)
# Sentence index: number of sentence end markers at or before the word index
# (this overwrites the sentence number that came with the data).
provo['sentence_num'] = provo.apply(lambda x: bisect.bisect(provo_stats['sent_markers'][x['text_id']], x['new_ind']), axis=1)
# Drop zero reading times, then flag outliers on log time.
provo = find_outliers(provo.loc[provo['time'] != 0], transform=np.log)

In [None]:
# Build the three analysis frames, then add per-sentence sums of the two
# additional timing measures.
provo, provo_agg_per_subject_sentence, provo_mean_per_sen = create_analysis_dfs(provo, provo_stats, MODELS) 
# NOTE(review): the sums are attached by index after reset_index(); this
# presumes the group order matches the row order (and index) of
# provo_agg_per_subject_sentence -- verify, especially with several models.
provo_agg_per_subject_sentence['time2_sum'] = provo.groupby(by=["WorkerId","text_id", "sentence_num", "model"]).agg({"time2":np.sum}).reset_index()['time2']
provo_agg_per_subject_sentence['time3_sum'] = provo.groupby(by=["WorkerId","text_id", "sentence_num", "model"]).agg({"time3":np.sum}).reset_index()['time3']
#pickle_stats(provo, provo_agg_per_subject_sentence, provo_mean_per_sen, "provo2")

In [None]:
# Restore the three Provo frames via load_stats (defined outside this section)
# under the name "provo".
# NOTE(review): the commented-out pickle_stats call in the previous cell uses
# the name "provo2" — confirm which name is intended.
provo, provo_agg_per_subject_sentence, provo_mean_per_sen = load_stats("provo")

### UCL Reading

In [None]:
# Tab-separated self-paced-reading data. `sep` is passed by keyword because
# pandas only accepts the file path positionally from version 2.0 onwards.
ucl = pd.read_csv('corpora/ucl/selfpacedreading.RT.txt', sep='\t')
# Use the column names shared by the other corpora in this notebook.
ucl.rename(columns = {'RT':'time', 'subj_nr': 'WorkerId', 
                        "sent_nr":"text_id"}, inplace = True)
# Run every word through MOSESNORMALIZER and trim surrounding whitespace.
ucl['word'] = ucl.apply(lambda x: MOSESNORMALIZER(x['word']).strip(), axis=1)

In [None]:
# Distinct (sentence, position, word) triples with no missing values.
ucl_words = ucl[['text_id','word_pos','word']].drop_duplicates().dropna()
# One ordered_string_join result per sentence; each result unpacks into a
# pair that is split into `inds` and `paragraphs` below.
ucl_joined = ucl_words.groupby(by = ['text_id']).apply(
    lambda grp: ordered_string_join(zip(grp['word_pos'], grp['word']), ' ')
)
inds, paragraphs = zip(*ucl_joined)
# Texts are numbered from 1 when handed to corpus_stats.
ucl_numbered_texts = list(enumerate(paragraphs, 1))
ucl_stats = corpus_stats(ucl_numbered_texts, models=MODELS)

In [None]:
def _ucl_word_position(row):
    """Return where the row's word_pos sits within its sentence's entry in `inds`."""
    sentence_positions = inds[row['text_id'] - 1]
    return sentence_positions.index(row["word_pos"])


ucl['new_ind'] = ucl.apply(_ucl_word_position, axis=1)
# Every UCL row gets sentence number 0.
ucl['sentence_num'] = 0
ucl = find_outliers(ucl, transform=np.log)

In [None]:
# create_analysis_dfs (defined outside this section) returns three frames: the
# updated word-level frame, a per-subject-per-sentence frame and a per-sentence frame.
ucl, ucl_agg_per_subject_sentence, ucl_mean_per_sen = create_analysis_dfs(ucl, ucl_stats, MODELS) 
# Store the three frames under the name "ucl" for the load_stats call in the next cell.
pickle_stats(ucl, ucl_agg_per_subject_sentence, ucl_mean_per_sen, "ucl")

In [None]:
# Restore the three UCL self-paced-reading frames stored under the name "ucl".
ucl, ucl_agg_per_subject_sentence, ucl_mean_per_sen = load_stats("ucl")

### UCL Eye

In [None]:
# Tab-separated eye-tracking data. `sep` is passed by keyword because pandas
# only accepts the file path positionally from version 2.0 onwards.
ucl_eye = pd.read_csv('corpora/ucl/eyetracking.RT.txt', sep='\t')
# RTfirstpass is the measure used as `time` for this corpus.
ucl_eye.rename(columns = {'RTfirstpass':'time', 'subj_nr': 'WorkerId', 
                        "sent_nr":"text_id"}, inplace = True)
# Run every word through MOSESNORMALIZER and trim surrounding whitespace.
ucl_eye['word'] = ucl_eye.apply(lambda x: MOSESNORMALIZER(x['word']).strip(), axis=1)

In [None]:
# Distinct (sentence, position, word) triples with no missing values.
ucl_eye_words = ucl_eye[['text_id','word_pos','word']].drop_duplicates().dropna()
# One ordered_string_join result per sentence, indexed by text_id.
joined = ucl_eye_words.groupby(by = ['text_id']).apply(
    lambda grp: ordered_string_join(zip(grp['word_pos'], grp['word']), ' ')
)
inds, paragraphs = zip(*joined)
# Unlike the self-paced-reading cell, texts keep their original text_id here.
ucl_eye_labelled_texts = list(zip(joined.index, paragraphs))
ucl_eye_stats = corpus_stats(ucl_eye_labelled_texts, models=MODELS)

In [None]:
# text_id -> that sentence's entry from `inds`.
inds_dict = dict(zip(joined.index, inds))


def _ucl_eye_word_position(row):
    """Return where the row's word_pos sits within its sentence's entry in `inds_dict`."""
    return inds_dict[row['text_id']].index(row["word_pos"])


ucl_eye['new_ind'] = ucl_eye.apply(_ucl_eye_word_position, axis=1)
# Every UCL row gets sentence number 0.
ucl_eye['sentence_num'] = 0
# Rows with a zero time are excluded before outlier detection on log times.
has_first_pass_time = ucl_eye['time'] != 0
ucl_eye = find_outliers(ucl_eye.loc[has_first_pass_time], transform=np.log)

In [None]:
# create_analysis_dfs (defined outside this section) returns three frames: the
# updated word-level frame, a per-subject-per-sentence frame and a per-sentence frame.
ucl_eye, ucl_eye_agg_per_subject_sentence, ucl_eye_mean_per_sen = create_analysis_dfs(ucl_eye, ucl_eye_stats, MODELS) 
# Store the three frames under the name "ucl_eye" for the load_stats call in the next cell.
pickle_stats(ucl_eye, ucl_eye_agg_per_subject_sentence, ucl_eye_mean_per_sen, "ucl_eye")

In [None]:
# Restore the three UCL eye-tracking frames stored under the name "ucl_eye".
ucl_eye, ucl_eye_agg_per_subject_sentence, ucl_eye_mean_per_sen = load_stats("ucl_eye")

### Dundee Corpus

In [None]:
# Models evaluated on the Dundee corpus.
DUNDEE_MODELS = ['gpt', 'ngram', 'bert' ]
def predict_encoding(file_path, n_lines=50):
    '''Guess a file's text encoding with chardet.

    Reads up to ``n_lines`` lines of ``file_path`` in binary mode, passes the
    concatenated bytes to ``chardet.detect`` and returns the ``'encoding'``
    entry of its result.
    '''
    import chardet

    sample = b''
    with open(file_path, 'rb') as handle:
        # readline() yields b'' once the file is exhausted, so short files are fine.
        for _ in range(n_lines):
            sample += handle.readline()

    guess = chardet.detect(sample)
    return guess['encoding']

In [None]:
# s\w\d+ma2: data in fixation order
# WNUM in eyetracking DF maps to word index in text DF
dundeeDir = 'corpora/dundee/eye-tracking'
fileList = [os.path.join(dundeeDir, f) for f in os.listdir(dundeeDir) if re.match(r's\w\d+ma2p*\.dat', f)]
cols = ['WorkerId', 'text_id', 'WORD','TEXT','LINE','OLEN','WLEN','XPOS','WNUM','FDUR','OBLP','WDLP','FXNO','TXFR']
# Collect the per-file frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
frames = []
for file in fileList:
    # Raw string so that "\s" is a regex class rather than an invalid string escape.
    temp = pd.read_csv(file, sep=r'\s+', encoding='Windows-1252')
    # The file name encodes the subject id (group 1) and the text number (group 2).
    match = re.search(r'(s\w)(\d+)ma2p*\.dat', os.path.basename(file))
    subjId = match.group(1)
    text = int(match.group(2))
    temp.insert(loc=0, column='text_id', value=text)
    temp.insert(loc=0, column='WorkerId', value=subjId)
    frames.append(temp)
# With no matching files, fall back to the empty frame the original loop started from.
dundee = pd.concat(frames) if frames else pd.DataFrame(columns = cols)
dundee.rename(columns = {'FDUR':'time', 'WORD':'word', 'WNUM': 'Word_Number'}, inplace = True)
dundee['time'] = dundee.time.astype('int64')
# Replace the repeated per-file row labels with a fresh index and drop unused columns.
dundee = dundee.reset_index().drop(columns=['index','OLEN','XPOS','OBLP','WDLP','FXNO','TXFR'])

In [None]:
dundeeDir = 'corpora/dundee/texts'
textList = [os.path.join(dundeeDir, f) for f in os.listdir(dundeeDir) if re.match(r'tx\d+wrdp\.dat', f)]
cols = ['word', 'text_id', 'screen_nr', 'line_nr', 'pos_on_line', 'serial_nr', 'initial_letter_position', 'word_len_punct', 'word_len', 'punc_code', 'n_chars_before','n_chars_after', 'Word_Number', 'local_word_freq']
# Collect the per-file frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
text_frames = []
for text in textList:
    # Raw string so that "\s" is a regex class rather than an invalid string escape.
    temp = pd.read_csv(text, sep=r'\s+', names=cols, encoding='Windows-1252')
    text_frames.append(temp)
# With no matching files, fall back to the empty frame the original loop started from.
dundeeTexts = pd.concat(text_frames) if text_frames else pd.DataFrame(columns = cols)
# Normalise the eye-tracking words: trim, collapse doubled quotes, replace
# newlines with spaces, apply MOSESNORMALIZER, then squeeze runs of whitespace.
dundee['word'] = dundee.apply(lambda x: re.sub(r"\s+", ' ', MOSESNORMALIZER(x['word'].strip().replace('""','"').replace('\n',' '))), axis=1)

In [None]:
# Distinct (text, word number, word) triples with no missing values.
dundee_words = dundeeTexts[['text_id','Word_Number','word']].drop_duplicates().dropna()
# One ordered_string_join result per text; each result unpacks into a pair
# that is split into `inds` and `paragraphs` below.
dundee_joined = dundee_words.groupby(by = ['text_id']).apply(
    lambda grp: ordered_string_join(zip(grp['Word_Number'], grp['word']), ' ')
)
inds, paragraphs = zip(*dundee_joined)
# Texts are numbered from 1 when handed to corpus_stats.
dundee_numbered_texts = list(enumerate(paragraphs, 1))
dundee_stats = corpus_stats(dundee_numbered_texts, models=DUNDEE_MODELS)

In [None]:
# Discard rows whose word string is longer than 20 characters.
dundee = dundee.drop(dundee[dundee['word'].map(len) > 20].index)
# Position of the word inside its text's entry in `inds` (texts are 1-based, `inds` is 0-based).
dundee['new_ind'] = dundee.apply(lambda x: inds[x['text_id']-1].index(x["Word_Number"]), axis=1)
# Sentence number = insertion point of the word index among the text's sentence markers.
dundee['sentence_num'] = dundee.apply(lambda x: bisect.bisect(dundee_stats['sent_markers'][x['text_id']], x['new_ind']), axis=1)
#total reading time: NaN-ignoring sum of `time` over all rows of the same subject/text/sentence/word
temp = dundee.groupby(by=["WorkerId","text_id", "sentence_num", "new_ind", 'word']).agg({'time': np.nansum})
# Keep non-zero rows only, then the first remaining row per subject/text/sentence/word ...
dundee = dundee.loc[dundee['time'] != 0].drop_duplicates(subset=["WorkerId","text_id", "sentence_num", "new_ind", 'word'])
# ... and re-index by those keys (each group now holds one row, so the sum is that row's time).
dundee = dundee.groupby(by=["WorkerId","text_id", "sentence_num", "new_ind", 'word']).agg({'time': np.sum})
#first pass reading time (as labelled by the author): the value kept above goes to `time2`
dundee['time2'] = dundee['time']
# `time` becomes the total computed in `temp`, aligned on the shared group index.
dundee['time'] = temp['time']
dundee=dundee.reset_index()
# Outlier detection on log times; rows with non-positive time are excluded first.
dundee = find_outliers(dundee.loc[dundee['time'] > 0], transform=np.log)
#See Smith & Levy 2013
# Every row of subject 'sg' is flagged as an outlier.
dundee.loc[dundee.WorkerId=='sg','outlier'] = True

In [None]:
# Per subject/text/sentence sum of `time2`, computed before create_analysis_dfs replaces `dundee`.
temp_sum = dundee.groupby(by=["WorkerId","text_id", "sentence_num"]).agg({"time2":np.sum}).reset_index()['time2']
dundee, dundee_agg_per_subject_sentence, dundee_mean_per_sen = create_analysis_dfs(dundee, dundee_stats, DUNDEE_MODELS)
# NOTE(review): this assignment aligns on the row index; temp_sum is not grouped
# by model (the Provo cell groups by "model" as well) — confirm the rows line up.
dundee_agg_per_subject_sentence['time2_sum'] = temp_sum
#pickle_stats(dundee, dundee_agg_per_subject_sentence, dundee_mean_per_sen, "dundee")

In [None]:
# Restore the three Dundee frames stored under the name "dundee".
# NOTE(review): the pickle_stats call in the previous cell is commented out, so
# this relies on a file saved in an earlier session.
dundee, dundee_agg_per_subject_sentence, dundee_mean_per_sen = load_stats("dundee")

### Brown Corpus

In [None]:
# Load the Brown data, drop the unnamed first column left by the CSV export
# and use the column names shared by the other corpora.
brown = (
    pd.read_csv('corpora/brown_spr.csv')
    .drop(columns='Unnamed: 0')
    .rename(columns={'subject': 'WorkerId', "text_pos": "Word_Number"})
)
# Run every word through MOSESNORMALIZER and trim surrounding whitespace.
brown['word'] = brown.apply(lambda row: MOSESNORMALIZER(row['word']).strip(), axis=1)

In [None]:
# Distinct (text, word number, word) triples with no missing values.
brown_words = brown[['text_id','Word_Number','word']].drop_duplicates().dropna()
# One ordered_string_join result per text; each result unpacks into a pair
# that is split into `inds` and `paragraphs` below.
brown_joined = brown_words.groupby(by = ['text_id']).apply(
    lambda grp: ordered_string_join(zip(grp['Word_Number'], grp['word']), ' ')
)
inds, paragraphs = zip(*brown_joined)
# Brown texts are numbered from 0 when handed to corpus_stats.
brown_numbered_texts = list(enumerate(paragraphs))
brown_stats = corpus_stats(brown_numbered_texts, models=MODELS)

In [None]:
def _brown_word_position(row):
    """Return where the row's Word_Number sits within its text's entry in `inds`."""
    return inds[row['text_id']].index(row["Word_Number"])


def _brown_sentence_number(row):
    """Return the insertion point of the word index among the text's sentence markers."""
    return bisect.bisect(brown_stats['sent_markers'][row['text_id']], row['new_ind'])


brown['new_ind'] = brown.apply(_brown_word_position, axis=1)
brown['sentence_num'] = brown.apply(_brown_sentence_number, axis=1)
brown = find_outliers(brown, transform=np.log)

In [None]:
# create_analysis_dfs (defined outside this section) returns three frames: the
# updated word-level frame, a per-subject-per-sentence frame and a per-sentence frame.
brown, brown_agg_per_subject_sentence, brown_mean_per_sen = create_analysis_dfs(brown, brown_stats, MODELS) 
#pickle_stats(brown, brown_agg_per_subject_sentence, brown_mean_per_sen, "brown")

In [None]:
# Restore the three Brown frames stored under the name "brown".
# NOTE(review): the pickle_stats call in the previous cell is commented out, so
# this relies on a file saved in an earlier session.
brown, brown_agg_per_subject_sentence, brown_mean_per_sen = load_stats("brown")

### GECO

In [None]:
texts_df = pd.read_csv("corpora/DutchMaterials.csv")
# SENTENCE_ID has the form "<part>-<sentence>"; split it into two columns.
sentence_id_parts = texts_df['SENTENCE_ID'].str.split('-',expand=True)
texts_df[['part','sentence']] = sentence_id_parts
texts_df['sentence'] = texts_df['sentence'].astype('int32')
# The row with the largest CHRON_ID in each (part, sentence) gets a trailing full stop.
idx = texts_df.groupby(["part","sentence"])['CHRON_ID'].idxmax()
texts_df.loc[idx, "WORD"] = texts_df.loc[idx, "WORD"] + '.'
# One ordered_string_join result per part; each result unpacks into a pair
# that is split into `inds` and `paragraphs` below.
geco_joined = texts_df.groupby(by=["part"]).apply(
    lambda grp: ordered_string_join(zip(grp['CHRON_ID'], grp['WORD']), ' ')
)
inds, paragraphs = zip(*geco_joined)

In [None]:
# GECO column names -> the names shared by the other corpora in this notebook.
geco_column_names = {
    'WORD_TOTAL_READING_TIME': 'time',
    'PP_NR': 'WorkerId',
    "WORD_ID_WITHIN_TRIAL": "Word_Number",
    "PART": "text_id",
    "WORD": "word",
}
geco = pd.read_csv("corpora/L1ReadingData.csv").rename(columns=geco_column_names)
# Entries that cannot be parsed as numbers become NaN.
geco['time'] = pd.to_numeric(geco['time'], errors="coerce")

In [None]:
# Run corpus_stats over the GECO parts (numbered from 1) with the single model 'dutch_gpt'.
geco_stats = corpus_stats(list(enumerate(paragraphs,1)), models=['dutch_gpt'])

In [None]:
# IA_ID -> CHRON_ID lookup built from the materials table.
word_mapping = dict(zip(texts_df['IA_ID'], texts_df['CHRON_ID']))


def _geco_word_position(row):
    """Return the word's position within its part, or None when its WORD_ID is not in the materials table."""
    word_id = row["WORD_ID"]
    if word_id not in word_mapping:
        return None
    return inds[row['text_id'] - 1].index(word_mapping[word_id])


def _geco_sentence_number(row):
    """Return the insertion point of the word index among the part's sentence markers."""
    return bisect.bisect(geco_stats['sent_markers'][row['text_id']], row['new_ind'])


geco['new_ind'] = geco.apply(_geco_word_position, axis=1)
geco['sentence_num'] = geco.apply(_geco_sentence_number, axis=1)
# Drop zero times, then rows without a time or without a resolved word position.
has_nonzero_time = geco['time'] != 0
geco = geco[has_nonzero_time].dropna(subset = ["time", "new_ind"])
geco['new_ind'] = geco['new_ind'].astype('int32')
geco = find_outliers(geco, transform=np.log)

In [None]:
# Same three-frame construction as the other corpora, restricted to the
# 'dutch_gpt' model and passing lang="nl" to create_analysis_dfs.
geco, geco_agg_per_subject_sentence, geco_mean_per_sen = create_analysis_dfs(geco, geco_stats, ['dutch_gpt'], lang="nl") 


In [None]:
# Restore the three GECO frames stored under the name "geco".
# NOTE(review): no pickle_stats call for "geco" appears in this section, so
# this relies on a file saved elsewhere — confirm.
geco, geco_agg_per_subject_sentence, geco_mean_per_sen = load_stats("geco")

### CoLA

In [None]:
def get_sentence_freq(sen):
    """Sum the `frequency` values of the words in a sentence.

    The sentence is normalised and tokenised with the Moses helpers, each
    token is detokenised on its own, stripped of whitespace and surrounding
    punctuation, lower-cased and looked up with ``frequency(..., lang="en2")``.
    Lookups that return None are skipped; NaN values are ignored by the sum.
    """
    tokens = MOSESTOKENIZER(MOSESNORMALIZER(sen))
    words = [MOSESDETOKENIZER([token]) for token in tokens]
    known_freqs = []
    for word in words:
        value = frequency(word.strip().strip(punctuation).lower(), lang="en2")
        if value is not None:
            known_freqs.append(value)
    return np.nansum(known_freqs)

def add_lau_accept_measures(df):
    """Add the columns 'log_freq', 'slor' and 'normlp' to ``df`` in place.

    ``df`` must provide the columns 'sentence', 'log_prob_mean' and 'len'.
    Nothing is returned.
    """
    def _abs_sentence_freq(row):
        return abs(get_sentence_freq(row['sentence']))

    df['log_freq'] = df.apply(_abs_sentence_freq, axis=1)
    mean_log_prob = df['log_prob_mean']
    # slor: mean log-probability minus the per-token frequency term.
    df['slor'] = mean_log_prob - df['log_freq']/df['len']
    # normlp: total log-probability divided by the frequency term.
    df['normlp'] = mean_log_prob*df['len']/df['log_freq']
    

In [None]:
# Header-less, tab-separated CoLA training file. `sep` is passed by keyword
# because pandas only accepts the file path positionally from version 2.0 onwards.
cola = pd.read_csv('corpora/cola_public/raw/in_domain_train.tsv', sep='\t', header=None, names=['ID','accept','NA','sentence'])
cola = cola.drop(columns='NA')
# Each sentence is its own "text": the row index is the text id and the sentence number is 0.
cola['text_id_'] = cola.index
cola['sentence_num_'] = 0
cola['sentence'] = cola.apply(lambda x: MOSESNORMALIZER(x['sentence']).strip(), axis=1)
# split_sens=False is passed so that corpus_stats receives each row as a single unit.
cola_stats = corpus_stats(list(enumerate(cola['sentence'])), models=MODELS, split_sens=False)

In [None]:
# Stack one copy of the CoLA frame per model, each produced by
# add_log_prob_aggregate_cols (defined outside this section).
cola = pd.concat(
    [add_log_prob_aggregate_cols(cola, cola_stats, model=mod) for mod in MODELS])
# Adds the 'log_freq', 'slor' and 'normlp' columns in place.
add_lau_accept_measures(cola)
#pickle.dump(cola, open("cola.pkl", "wb"))

In [None]:
# Restore the cached CoLA frame; the context manager closes the file handle,
# which the bare open() call never did.
with open("cola.pkl", "rb") as cola_file:
    cola = pickle.load(cola_file)

### BNC

In [None]:
# Tab-separated BNC ratings. `sep` is passed by keyword because pandas only
# accepts the file path positionally from version 2.0 onwards.
bnc = pd.read_csv('corpora/bnc.csv', sep='\t')
bnc.rename(columns = {'mean_rating':'accept', 'text':'sentence', 'length':'len'}, inplace = True)
# Each sentence is its own "text": the row index is the text id and the sentence number is 0.
bnc['text_id_'] = bnc.index
bnc['sentence_num_'] = 0
# Trim, collapse doubled quotes, apply MOSESNORMALIZER, delete the characters
# matched by the class below, then squeeze runs of whitespace.
bnc['sentence'] = bnc.apply(lambda x: re.sub(r"\s+", ' ', re.sub(r'[\u4e00-\u9fff|\u00b0]+', '', MOSESNORMALIZER(x['sentence'].strip().replace('""','"')))), axis=1)
# NOTE(review): the next cell reads `bnc_stats`, which is only assigned by the
# commented-out line below — re-enable it (or restore bnc_stats another way)
# before running that cell on a fresh kernel.
#bnc_stats = corpus_stats(list(enumerate(bnc['sentence'])), models=MODELS, split_sens=False)

In [None]:
# Stack one copy of the BNC frame per model, each produced by
# add_log_prob_aggregate_cols (defined outside this section).
# NOTE(review): `bnc_stats` must already exist; its only assignment in the
# previous cell is commented out.
bnc = pd.concat(
    [add_log_prob_aggregate_cols(bnc, bnc_stats, model=mod) for mod in MODELS])
# Adds the 'log_freq', 'slor' and 'normlp' columns in place.
add_lau_accept_measures(bnc)
# The context manager flushes and closes the file; the bare open() call left
# the write handle open, so the pickle could stay incomplete on disk.
with open("bnc.pkl", "wb") as bnc_file:
    pickle.dump(bnc, bnc_file)

In [None]:
# Restore the cached BNC frame; the context manager closes the file handle,
# which the bare open() call never did.
with open("bnc.pkl", "rb") as bnc_file:
    bnc = pickle.load(bnc_file)

In [None]:
# Keep the "MOP2" rows only. The explicit copy makes the assignments below
# write to an independent frame rather than to a slice of the loaded one
# (which triggers pandas' SettingWithCopyWarning).
bnc = bnc[bnc.MOP=="MOP2"].copy()
# Binarise `accept` at 2.5. Both masks are taken from the original ratings
# before anything is overwritten, so the result does not depend on the order
# of the two assignments. Missing ratings match neither mask and stay missing.
# NOTE: re-running this cell without reloading `bnc` would turn every value
# into 0, because the already-binarised 0/1 values are all below 2.5.
rated_low = bnc.accept < 2.5
rated_high = bnc.accept >= 2.5
bnc.loc[rated_low, "accept"] = 0
bnc.loc[rated_high, "accept"] = 1

## Experiments

In [None]:
# Stack the per-subject-per-sentence frames of all English reading-time
# corpora, tagging each row with the name of its dataset.
# NOTE(review): ns_agg_per_subject_sentence is not defined in this part of the
# notebook; it must come from an earlier (Natural Stories) section.
agg_per_subject_sentence_full = pd.concat([ns_agg_per_subject_sentence.assign(dataset="Natural Stories"),
                                           provo_agg_per_subject_sentence.assign(dataset="Provo"),
                                           dundee_agg_per_subject_sentence.assign(dataset="Dundee"),
                                           brown_agg_per_subject_sentence.assign(dataset="Brown"),
                                           ucl_eye_agg_per_subject_sentence.assign(dataset="UCL (ET)"),
                                           ucl_agg_per_subject_sentence.assign(dataset="UCL (R)")])
#agg_per_subject_sentence_full = geco_agg_per_subject_sentence.assign(dataset="GECO")
# Subject ids are stored as strings.
agg_per_subject_sentence_full['WorkerId_'] = agg_per_subject_sentence_full['WorkerId_'].astype(str)
try:
    # In case columns from different dfs are of different types
    # `types` maps each column to the set of Python types found in it ...
    types = agg_per_subject_sentence_full.applymap(type).apply(set)
    # ... and every column holding more than one type is cast to float64 (row by row).
    cols = types[types.apply(len) > 1].index
    agg_per_subject_sentence_full[cols] = agg_per_subject_sentence_full[cols].apply(lambda x: x.astype(np.float64), 1)
except TypeError:
    # A failed cast is ignored and the frame keeps its mixed-type columns.
    pass

In [None]:
# Stack the per-sentence frames of all English reading-time corpora, tagging
# each row with the name of its dataset.
# NOTE(review): ns_mean_per_sen is not defined in this part of the notebook;
# it must come from an earlier (Natural Stories) section.
agg_per_sentence_full = pd.concat([ns_mean_per_sen.assign(dataset="Natural Stories"),
                                   provo_mean_per_sen.assign(dataset="Provo"),
                                   dundee_mean_per_sen.assign(dataset="Dundee"),
                                   brown_mean_per_sen.assign(dataset="Brown"),
                                   ucl_eye_mean_per_sen.assign(dataset="UCL (ET)"),
                                   ucl_mean_per_sen.assign(dataset="UCL (R)")])
try:
    # Columns that hold more than one Python type are cast to float64 (row by row).
    types = agg_per_sentence_full.applymap(type).apply(set)
    cols = types[types.apply(len) > 1].index
    agg_per_sentence_full[cols] = agg_per_sentence_full[cols].apply(lambda x: x.astype(np.float64), 1)
except TypeError:
    # A failed cast is ignored and the frame keeps its mixed-type columns.
    pass

In [None]:
# Copy both combined frames into the embedded R session (rpy2 `-i` = input variable).
%R -i agg_per_subject_sentence_full
%R -i agg_per_sentence_full

In [None]:
# Combine the two acceptability corpora, dropping the columns that are not
# needed in R. `axis=1` is passed by keyword: pandas 2.0 no longer accepts the
# axis of DataFrame.drop positionally.
acceptability = pd.concat([cola.drop(['ID', 'sentence'], axis=1).assign(dataset='CoLA'),
                           bnc.drop(['MOP', 'language', 'sentence', 'rating_list'], axis=1).assign(dataset='BNC')])

In [None]:
# Copy the acceptability frame into the embedded R session.
%R -i acceptability

In [None]:
%%R
# R packages used by the analysis cells below.
# NOTE(review): dplyr is loaded here but is not among the packages installed by
# the install.packages cell at the top of the notebook — confirm it is available.
library(lme4)
library(ggplot2)
library(dplyr)
library(perm)

In [None]:
%%R
# k-fold cross-validated log-likelihood of a linear mixed-effects model.
#
# form      model formula handed to lmer (the callers pass it as a string)
# df        data frame holding every variable named in `form`
# d_var     name of the response column in `df`
# num_folds number of contiguous folds the rows are cut into
# shuffle   when TRUE the rows are permuted before the folds are cut
#
# Returns the concatenation, over folds, of the log normal density of each
# held-out response around the model prediction. The standard deviation of
# that density is the root mean squared training residual.
lme_cross_val <- function(form, df, d_var, num_folds=10, shuffle=FALSE){
    if(shuffle){
        df <- df[sample(nrow(df)),]
    }
    fold_of_row <- cut(seq(1,nrow(df)),breaks=num_folds,labels=FALSE)
    per_fold <- vector("list", num_folds)
    for(k in seq(1, num_folds)){
        held_out <- which(fold_of_row==k,arr.ind=TRUE)
        test_rows <- df[held_out,]
        train_rows <- df[-held_out,]
        fit <- lmer(form,  data=train_rows, REML=FALSE)
        # Mean squared training residual, used as the predictive variance.
        resid_var <- mean(residuals(fit)^2)
        # Levels of the grouping factor unseen during training are allowed.
        predicted <- predict(fit, newdata=test_rows, allow.new.levels=TRUE)
        per_fold[[k]] <- log(dnorm(test_rows[[d_var]],
                                   mean=predicted,
                                   sd=sqrt(resid_var)))
    }
    do.call(c, per_fold)
}


In [None]:
%%R
# k-fold cross-validated log-likelihood of a (generalised) linear model.
#
# formula   model formula handed to glm (the callers pass it as a string)
# df        data frame holding every variable named in `formula`
# d_var     name of the response column in `df`
# family    glm family; for `binomial` predictions are taken on the response scale
# num_folds number of contiguous folds the rows are cut into
# shuffle   when TRUE the rows are permuted before the folds are cut
#
# Returns one value per row of `df`: the log normal density of the held-out
# response around the model prediction, with sigma(model) of the fold's fit as
# the standard deviation.
lm_cross_val <- function(formula, df, d_var, family=gaussian, num_folds=10, shuffle=FALSE){
    if(shuffle){
        df <- df[sample(nrow(df)),]
    }
    folds <- cut(seq(1,nrow(df)),breaks=num_folds,labels=FALSE)
    estimates <- c()
    for(i in 1:num_folds){
        testIndexes <- which(folds==i,arr.ind=TRUE)
        testData <- df[testIndexes,]
        trainData <- df[-testIndexes,]
        model <- glm(formula, data=trainData, family=family)
        # Named resid_sd so the local value does not shadow the sigma() function.
        resid_sd <- sigma(model)
        if(identical(binomial, family)){
            predictions <- predict(model, newdata=testData, type="response")
        }else{
            predictions <- predict(model, newdata=testData)
        }
        estimate <- log(dnorm(testData[[d_var]], 
                              mean=predictions, 
                              sd=resid_sd))
        estimates <- c(estimates, estimate)
        
    }
    # Return the scores of ALL folds. The previous code returned `estimate`,
    # i.e. only the last fold, unlike lme_cross_val.
    estimates
}

## EMNLP Experiments

### Case Study 1

#### Psychometric Predictions

In [None]:
%%R
# Exponents k as strings; they are pasted into column names of the form
# "log_prob_power_<k>" by the cells below.
powers_np_format <- c('0.0','0.25', '0.5' , '0.75','1.0', '1.25', '1.5' , '1.75', '2.0', '2.25', '2.5'  )
# The same exponents as numbers, used as the x values of the plots.
labels <- as.numeric(powers_np_format)

In [None]:
%%R
# Case study 1, reading times: for every dataset, predictor set and language
# model, compare a baseline mixed model against one that adds
# log_prob_power_<k>:len (fixed effect and by-subject slope) for each exponent k.
# Each row of `out` holds: V1 = mean per-observation log-likelihood difference
# to the baseline, V2 = its variance divided by the number of observations,
# V3 = mean log-likelihood of the model, V4 = number of NA scores.
# Rows with log_prob_mean >= 15 or a non-zero outlier_sum are excluded up front.
df_full <- filter(agg_per_subject_sentence_full, agg_per_subject_sentence_full$log_prob_mean < 15,
                                                  agg_per_subject_sentence_full$outlier_sum==0)
outcome <- 'time_sum'
predictors <- list(b=c('len', 'I(len*uni_log_prob_power_1.0)*ch_len'))
models <- c('bert','gpt','ngram', 'transxl')
datasets <- c('Dundee','Brown','Provo','Natural Stories')

dataset_func <- function(ds){
    print(ds)
    data_per_ds <- filter(df_full, df_full$dataset == ds)
    if(nrow(data_per_ds) == 0){
                return(NULL)
    }
    predictors_func <- function(preds){
        other_preds <- paste0(preds, collapse="+")
        model_func <- function(model_name){
            data <- filter(data_per_ds, data_per_ds$model == model_name)
            if(nrow(data) == 0){
                return(NULL)
            }
            # Fixed seed: baseline and all exponents are scored on the same row order / folds.
            set.seed(42)
            shuffled_order <- sample(nrow(data))
            agg_baseline <- lme_cross_val(paste0(outcome,"~(len + 0 | WorkerId_)+", other_preds), data[shuffled_order,],outcome)
            power_func <- function(x){
                pred <- paste0("log_prob_power_", x,":len")
                formula <- paste0(outcome, "~ ",pred,"+(",pred,"+ len + 0 | WorkerId_) +", other_preds)
                cv <- lme_cross_val(formula, data[shuffled_order,], outcome, num_folds=10)
                c(mean(cv-agg_baseline, na.rm=TRUE), var(cv-agg_baseline, na.rm=TRUE)/length(cv), mean(cv, na.rm=TRUE), sum(is.na(cv)))
            }
            cbind(labels, as.data.frame(do.call(rbind, lapply(powers_np_format, power_func))), model_name)
        }
        cbind(as.data.frame(do.call(rbind, lapply(models, model_func))), other_preds)
    }
    print(nrow(data_per_ds))
    cbind(as.data.frame(do.call(rbind, lapply(predictors, predictors_func))), ds)
}
out <- as.data.frame(do.call(rbind, lapply(datasets, dataset_func)))

In [None]:
%%R
# Case study 1, acceptability: same scheme as the reading-time cell, but with
# binomial glm fits scored by lm_cross_val on the `acceptability` frame.
# Each row of `out_accept` holds: V1 = mean log-likelihood difference to the
# intercept-only baseline, V2 = its variance divided by the number of scores,
# V3 = mean log-likelihood of the model.
# Note that this cell overwrites df_full, predictors, models, datasets and
# dataset_func from the reading-time cell.
df_full <- acceptability
predictors <- list(b=c('1'))
models <- c('bert', 'ngram', 'gpt', "transxl")
datasets <- c('BNC', 'CoLA')

dataset_func <- function(ds){
    print(ds)
    data_per_ds <- filter(df_full, df_full$dataset == ds)
    if(nrow(data_per_ds) == 0){
                return(NULL)
    }
    family <- binomial
    predictors_func <- function(preds){
        other_preds <- paste0(preds, collapse="+")
        model_func <- function(model_name){
            d <- filter(data_per_ds, data_per_ds$model == model_name)
            if(nrow(d) == 0){
                return(NULL)
            }
            # Fixed seed: baseline and all exponents are scored on the same row order / folds.
            set.seed(42)
            shuffled_order <- sample(nrow(d))
            baseline <- lm_cross_val(paste("accept ~ ", other_preds),
                              d[shuffled_order,], 
                              'accept', 
                              family)

            power_func <- function(x){
                    name <- paste0("log_prob_power_",x, ":len")
                    formula <- paste0("accept ~ ", name," +", other_preds)
                    cv <- lm_cross_val(formula, d[shuffled_order,], 'accept', family)
                    c(mean(cv-baseline, na.rm=TRUE), var(cv-baseline, na.rm=TRUE)/length(cv),mean(cv, na.rm=TRUE))
                }
            cbind(labels, as.data.frame(do.call(rbind,lapply(powers_np_format, power_func))), model_name)
            }
        cbind(as.data.frame(do.call(rbind, lapply(models, model_func))),other_preds)
    }
    cbind(as.data.frame(do.call(rbind, lapply(predictors, predictors_func))), ds)
}

out_accept <- as.data.frame(do.call(rbind, lapply(datasets, dataset_func)))

In [None]:
%%R
# Mean log-likelihood gain (V1) against the exponent k, one panel per dataset
# and one colour per language model; the ribbon spans V1 +/- sqrt(V2).
# V4 (the NA count) exists only in `out`, so it is dropped before the two
# result tables are stacked. The dashed vertical line marks k = 1.
ggplot(aes(x = labels, y = V1, color=model_name), data=rbind(select( out, -c('V4')), out_accept)) + 
    geom_line() +
    geom_vline(aes(xintercept=1),linetype=2) +
    geom_point(size=5) +
    geom_ribbon(aes(ymin=V1-sqrt(V2), ymax=V1+sqrt(V2)), alpha = 0.2) +
    labs(title="",y="Per Sentence ∆LogLik",x=expression(italic("k"))) +
    scale_color_discrete(name = "", labels=c(bert='Bert',gpt='GPT-2', ngram=expression(paste(italic("n"),"-gram")),transxl='TransXL')) +
    facet_wrap(~ds, scales="free", ncol=6) +
    theme_minimal() +
    theme(text=element_text(size=22,family="serif"), 
         axis.text.y=element_text(size=16,family="serif"),
        axis.text.x=element_text(size=16,family="serif"),
          aspect.ratio=1.5) 
#ggsave('test.png', width = 14, height = 8, dpi=300)

In [None]:
%%R
# Cross-validated gain of the two reference predictors ('slor' and 'normlp')
# over the intercept-only baseline, per dataset and language model.
# This cell reads `predictors`, `models` and `datasets` as left behind by the
# acceptability cross-validation cell, so that cell must be run first.
# Columns V1-V3 have the same meaning as in `out_accept`.
df_full <- acceptability
lau_preds <- c('slor', 'normlp')
dataset_func <- function(ds){
    data_per_ds <- filter(df_full, df_full$dataset == ds)
    if(nrow(data_per_ds) == 0){
                return(NULL)
    }
    family <- binomial
    predictors_func <- function(preds){
        other_preds <- paste0(preds, collapse="+")
        model_func <- function(model_name){
            d <- filter(data_per_ds, data_per_ds$model == model_name)
            if(nrow(d) == 0){
                return(NULL)
            }
            # Fixed seed: baseline and both predictors are scored on the same row order / folds.
            set.seed(42)
            shuffled_order <- sample(nrow(d))
            baseline <- lm_cross_val(paste("accept ~ ", other_preds),
                              d[shuffled_order,], 
                              'accept', 
                              family)

            inner <- function(x){
                    formula <- paste0("accept ~ ", x," +", other_preds)
                    cv <- lm_cross_val(formula, d[shuffled_order,], 'accept', family)
                    c(mean(cv-baseline, na.rm=TRUE), var(cv-baseline, na.rm=TRUE)/length(cv),mean(cv, na.rm=TRUE))
                }
            cbind(lau_preds, as.data.frame(do.call(rbind,lapply(lau_preds, inner))), model_name)
            }
        cbind(as.data.frame(do.call(rbind, lapply(models, model_func))),other_preds)
    }
    cbind(as.data.frame(do.call(rbind, lapply(predictors, predictors_func))), ds)
}

lau_out <- as.data.frame(do.call(rbind, lapply(datasets, dataset_func)))

In [None]:
%%R
# Acceptability gains per exponent (points, lines, ribbon = V1 +/- sqrt(V2)),
# with the SLOR / NormLP gains from `lau_out` overlaid as horizontal lines.
# NOTE(review): the filter `ds != "coa"` removes nothing, because the datasets
# are 'BNC' and 'CoLA' — confirm whether a dataset was meant to be excluded.
ggplot(aes(x = labels, y = V1, fill=other_preds, shape=model_name ), data=out_accept) + 
    geom_line() +
    geom_point(size=5) +
    geom_ribbon(aes(ymin=V1-sqrt(V2), ymax=V1+sqrt(V2)), alpha = 0.2) +
    geom_hline(aes(yintercept = V1, linetype=model_name, color=lau_preds ), data=filter(lau_out, ds != "coa")) +
    ylab("Per Sentence ∆LogLik") +
    xlab("k") +
    ggtitle("") +
    scale_shape_discrete(name = "Model", labels=c('Bert','5-gram','GPT-2', "Transxl")) +
    theme_minimal() +
    facet_wrap(~ds, scales="free") +
    theme(text=element_text(size=20,family="serif"))
#ggsave('enwiki_perp_pred.png', width = 8, height = 8, dpi=700)

### Case Study 2

In [None]:
# Copy the combined per-subject-per-sentence frame into the embedded R session (again).
%R -i agg_per_subject_sentence_full

In [None]:
attributes = ['mean','max','variance', 'variance_lang', 'ldiff', 'ldiff2', 'std']
p = [0.25, 1.0, 1.25, 1.5, 1.75, 2.0]
for i in p:
    attributes.extend(['entropy_'+str(i), 'power_'+str(i)])
attributes = sorted(attributes)
%R -i attributes

In [None]:
%%R
# Case study 2: for the 'gpt' model and each dataset, score one mixed model per
# entry of `attributes` (log_prob_<attribute>:len as fixed effect and
# by-subject slope) against a shared baseline. The result `df` has one row per
# attribute and, per dataset, a "<dataset>_mean" and a "<dataset>_var" column
# (mean log-likelihood gain and its variance divided by the number of scores).
aggregate_per_sentence <- filter(agg_per_subject_sentence_full, 
                                 agg_per_subject_sentence_full$model == 'gpt', 
                                 agg_per_subject_sentence_full$log_prob_mean < 15,
                                agg_per_subject_sentence_full$outlier_sum==0)
datasets <- c('Dundee','Brown','Provo','Natural Stories')
#datasets <- c('CoLA','BNC')
df <- c()
names <- c()
for(d in datasets){
    print(d)
    names <- c(names, paste0(d, "_mean"), paste0(d,"_var"))
    # Fixed seed: baseline and all attributes are scored on the same row order / folds.
    set.seed(42)
    data <- filter(aggregate_per_sentence, dataset == d)
    shuffled_order <- sample(nrow(data))
    baseline <- lme_cross_val("time_sum ~   time_count_nonzero +len + I(len*uni_log_prob_power_1.0)*ch_len + (  len+0 | WorkerId_) ", 
                              data[shuffled_order,],
                             'time_sum')
    #baseline <- lm_cross_val("accept~1", data[shuffled_order,], 'accept', binomial)
    out1 <- list()
    out1['var'] <- c()
    out1['mean'] <- c()
    for(v in attributes){
        pred <- paste0("log_prob_",v,":len ")
        formula <- paste0("time_sum ~ ",pred," + time_count_nonzero+len +I(len*uni_log_prob_power_1.0)*ch_len+ (",pred," +len+0 | WorkerId_) ")
        diff <- lme_cross_val(formula, data[shuffled_order,], 'time_sum') - baseline
        #diff <- lm_cross_val(formula, data[shuffled_order,], 'accept', binomial) - baseline
        out1[['var']] <- c(out1[['var']],  var(diff, na.rm=TRUE)/length(diff))
        out1[['mean']] <- c(out1[['mean']],  mean(diff, na.rm=TRUE))
    }
    
    df <- cbind(df, out1[['mean']], out1[['var']])
}
colnames(df) <- names
df <- cbind(attributes, df)

### Case Study 3

In [None]:
# Need to do this each time you switch data sets!
# Select the corpus and language model analysed in case study 3 and hand the
# matching word-level rows to R.
ds = "dundee"
model = "gpt"
# The word strings are dropped before the transfer to R.
data_full = dundee.drop(["word"], axis=1)
# Keep the chosen model's rows that are not flagged as outliers.
data = data_full.loc[(data_full['model'] == model) & (data_full['outlier'] == False)]

%R -i data
%R -i ds
%R -i model

In [None]:
%%R 
# Fixed row permutation shared by every cross-validation run of case study 3.
set.seed(42)
shuffled_order <- sample(nrow(data))
# Exponents 0.25, 0.5, ..., 2.75 (not referenced by the case-study-3 cells shown below).
powers <- seq(0.25, 2.75, by=0.25)

In [None]:
%%R
# Word-level baseline for case study 3 plus a helper that scores one extra predictor.
powers_np_format2 <- c('1.0', '1.25', '1.5' , '1.75', '2.0', '2.25', '2.5' )
# Control predictors shared by the baseline and every candidate model.
other_preds <- paste(c('log_prob', 'prev_log_prob', 'prev_freq*prev_word_len','freq*word_len'), collapse=" + ")
baseline <- lme_cross_val(paste0("time ~", other_preds, "+ (1 | WorkerId)"), data[shuffled_order,], 'time')

# Fit time ~ name + (1 + name | WorkerId) + controls and return
# c(mean gain over the baseline, variance of the gain / number of scores,
#   mean log-likelihood), with infinite values excluded from the means and the variance.
predictor_func <- function(name){
        formula <- paste0("time ~ ",name,"+(1 +", name,"| WorkerId)+", other_preds)
        cv <- lme_cross_val(formula, data[shuffled_order,], 'time')
        diff <- cv-baseline
        c(mean(diff[!is.infinite(diff)], na.rm=TRUE), var(diff[!is.infinite(diff)], na.rm=TRUE)/length(cv), mean(cv[!is.infinite(cv)], na.rm=TRUE))
    }

In [None]:
%%R
# Score every candidate predictor of case study 3 with predictor_func and
# collect the results in a list keyed by predictor name.
candidate_predictors <- c('cum_lvar',
                          'rolling_lvar1', 'rolling_lvar2', 'rolling_lvar3', 'rolling_lvar4',
                          'diff_par', 'diff2_par', 'diff_sen', 'diff2_sen', 'diff2_lang')
out <- list()
for(candidate in candidate_predictors){
    out[[candidate]] <- predictor_func(candidate)
}


In [None]:
%%R
# Predictors shown in the window plot, in plotting order ...
atts <- c("rolling_lvar1", "rolling_lvar2", "rolling_lvar3", "rolling_lvar4", "cum_lvar", "diff2_sen","diff2_par","diff2_lang")
# ... their axis labels ...
att_names <- c("-1","-2","-3","-4","-n","sent","doc","lang")
# ... and their positions 1..length(atts) on the x axis.
levels <- seq(length(atts))

In [None]:
%%R
# Stack the case-study-3 results of the four corpora into one table.
# NOTE(review): out_ns2, out_provo2, out_dundee2 and out_brown2 are not assigned
# anywhere in this section; presumably each is the `out` list saved after
# running the cells above for that corpus — confirm.
all_vars <- as_tibble(rbind(cbind(do.call(rbind, out_ns2[atts]), model="GPT-2", df="Natural\nStories", levels),
                            cbind(do.call(rbind, out_provo2[atts]), model="GPT-2",df="Provo", levels),
                            cbind(do.call(rbind, out_dundee2[atts]), model="GPT-2",df="Dundee", levels),
                            cbind(do.call(rbind, out_brown2[atts]), model="GPT-2",df="Brown", levels)))
# cbind with the string columns turned everything into text; restore the numeric columns.
all_vars[c('V1','V2','V3')] <- lapply(all_vars[c('V1','V2','V3')], as.numeric)


In [None]:
%%R
# Mean log-likelihood gain (V1) per predictor, one panel per corpus;
# error bars span V1 +/- sqrt(V2) and the x axis is labelled with att_names.
ggplot(aes(x = levels, y = V1, color=df), data=all_vars[all_vars$model=="GPT-2",]) + 
    geom_point(size=7) +
    geom_errorbar(aes(ymin=V1-sqrt(V2), ymax=V1+sqrt(V2))) +
    theme_minimal() +
      labs(x = "Window", y="Per Token ∆LogLik", title="")+
    theme(text=element_text(size=16,family="serif"), 
         title=element_text(size=20,family="serif"),
         axis.text.x = element_text(angle=45, size=14),
         axis.text.y = element_text(size=8, angle=45),
          axis.title.y = element_text(size=14),
         strip.text.x = element_text(size=18),
         aspect.ratio = 2,
         legend.position = "none",
         panel.spacing = unit(0, "lines")) +
    scale_x_discrete(labels=att_names) +
    facet_wrap(~df, scales="free",ncol=4) 
#ggsave('windows_re.png', width = 9, height = 8, dpi=300)

#### Misc

### Correlation

In [None]:
#model='bert'
p = POWER_RANGE[(POWER_RANGE>0) & (POWER_RANGE<3)]
names = ['log_prob_power_' + str(i) for i in p]
names2 = ['slor', 'normlp']
cola_r = cola.loc[:,names+names2+['len', 'accept','model']].copy()
cola_r.loc[:,names+names2] = -cola_r[names+names2].multiply(cola_r['len'], axis="index")
total = cola_r.groupby('model').agg(['count']).iloc[0]
cola_r = cola_r.groupby('model').corr().accept.reset_index()
cola_r['se'] = np.sqrt((1-cola_r.accept**2)/(total[0]/2 - 2))
%R -i cola_r
%R -i names
%R -i names2
%R -i p

In [None]:
bnc_r = bnc.loc[:,names+names2+['len', 'accept','model']].copy()
bnc_r.loc[:,names+names2] = -bnc_r[names+names2].multiply(bnc_r['len'], axis="index")
total = bnc_r.groupby('model').agg(['count']).iloc[0]
bnc_r = bnc_r.groupby('model').corr().accept.reset_index()
bnc_r['se'] = np.sqrt((1-cola_r.accept**2)/(total[0]/2 - 2))
%R -i bnc_r

In [None]:
%%R
# `corrs`: correlation of each log_prob_power_<k> column with acceptability,
# per corpus and model, with the exponent attached as column `p`.
# `corrs_base`: the same for the two reference measures ('slor', 'normlp').
corrs <- rbind(cbind(filter(cola_r, cola_r$level_1 %in% unlist(names)),df="CoLA",p=rep(p,length(unique(cola_r$model)))),
              cbind(filter(bnc_r, bnc_r$level_1 %in% unlist(names)),df="BNC",p=rep(p,length(unique(bnc_r$model)))))
corrs_base <- rbind(cbind(filter(cola_r, cola_r$level_1 %in% unlist(names2)),df="CoLA"),
              cbind(filter(bnc_r, bnc_r$level_1 %in% unlist(names2)),df="BNC"))

# Correlation against the exponent, one panel per corpus; the ribbon spans
# +/- one standard error and the horizontal lines show the reference measures.
ggplot(aes(x = p, y = accept, color=model, fill=model), data=corrs) + 
    geom_vline(aes(xintercept = 1),linetype=2) +
    geom_line() +
    geom_point(size=5) +
    geom_ribbon(aes(ymin=accept-se, ymax=accept+se), alpha = 0.2) +
    geom_hline(aes(yintercept = as.numeric(accept), color=model, linetype=level_1), data=corrs_base) +
    theme_minimal() +
    scale_linetype_discrete(name = "", labels=c('NormLP', 'SLOR'))+
    scale_color_discrete(name = "Model", labels=c('Bert','GPT-2', 'n-gram','TransXL')) +
    scale_fill_discrete(name = "Model", labels=c('Bert','GPT-2', 'n-gram','TransXL')) +
    labs(x = "Exponent", y="Pearson's Correlation", title="Surprisal-Acceptability Correlation")+
    theme(text=element_text(size=26,family="serif"), 
         title=element_text(size=23,family="serif"),
          aspect.ratio=1.5) +
    facet_wrap(~df, scales="free") + 
    xlim(0.25,2.75)
#ggsave('front_alt.png', width = 8, height = 8, dpi=700)