import modules

In [None]:
!pip install transformers

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch

### Load Model
We want a GPT-2 model that's trained on German texts: https://huggingface.co/dbmdz/german-gpt2



In [None]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
model = AutoModelWithLMHead.from_pretrained("dbmdz/german-gpt2", pad_token_id=tokenizer.eos_token_id) # check EOS

###Import CSV with texts used in EXNAT-1 
The csv contains 4 columns: 

word_punct = single words with punctuation

word_no_punct = single words without punctuation (exception: dashes between 2 words)

text_nr = Number of text, text_01 - text_09

trial_nr = Index of word in the text, starting with 1. There are 300 words (aka trials) in each text.


---


*Hint: Make sure you upload the csv in Google Colab before trying to read it in. You can upload the csv by clicking the file icon in the side bar.*

In [11]:
# read in CSV with texts
texts_df = pd.read_csv('/content/Texts_surprisal_scores.csv', sep=";")

###Compute Surprisal Scores for each Word

Loop texts in Text_surprisal_scres.csv. For each word, get chunk with x previous words and calculate probability for predicting the actual word in the text. 

In [None]:
""" Compute surprisal scores with different "context" chunk sizes """
# (running this takes about 32 minutes)

# get unique text numbers in texts_df
text_nrs = list(set(texts_df["text_nr"]))
#print("preparing surprisal scores for the following texts:")
#print(text_nrs)

# collect surprisal scores for different chunk sizes here.
# The chunk sizes I use here are completely arbitrary values.
word_surprisal_128 = []
word_surprisal_100 = []
word_surprisal_64  = []
word_surprisal_50  = []
word_surprisal_32  = []
word_surprisal_16  = []
word_surprisal_8   = []
word_surprisal_4   = []
word_surprisal_3   = []
word_surprisal_2   = []
word_surprisal_1   = []


chunk_size = [128, 100, 64, 50, 32, 16, 8, 4, 3, 2, 1]
chunk_size_lists = [word_surprisal_128, word_surprisal_100, word_surprisal_64, 
                    word_surprisal_50, word_surprisal_32, word_surprisal_16,
                    word_surprisal_8, word_surprisal_4, word_surprisal_3, 
                    word_surprisal_2, word_surprisal_1]

# loop chunk sizes so I get 11 lists with surprisal scores
for chunk_idx in range(0, len(chunk_size)):
    print("start preparing surprisal scores for chunk size = " + str(chunk_size[chunk_idx]))

    """ loop texts """
    for text_nr in text_nrs:
      # get subset of df with current text
      curr_text = texts_df[texts_df["text_nr"] == text_nr]

      """ loop words - for chunk size = x, start at idx = x (aka with the x + 1th word) """
      chunk_size_lists[chunk_idx] = chunk_size_lists[chunk_idx] + [None] * chunk_size[chunk_idx] # for chunk size x, the first x words don't get surprisal scores
      
      for word_idx in range(chunk_size[chunk_idx], len(curr_text["word_punct"])):

        # get x previous words as text string (careful, get words + punctuation!)
        previous_words = list(curr_text["word_punct"])[word_idx - chunk_size[chunk_idx] : word_idx]
        # turn list of previous words into 1 string
        previous_words = ' '.join(previous_words)

        """ Predict next word """
        # get current word (with punctuation):
        actual_word = list(curr_text["word_punct"])[word_idx]

        """ Generate prediction for next token(s) """
        ids_list = tokenizer.encode(previous_words) # generate token ids for each of the x previous words
        ids_array = np.expand_dims((ids_list), axis = 0) # put the token IDs into array
        # predict the next x words
        output = model.generate(torch.tensor(ids_array), 
                                return_dict_in_generate = True, 
                                output_scores = True, 
                                max_new_tokens = 1) # set output length here!
        
        """ read out next-word probabilities for all words in the vocabulary """
        logits = output.scores[0] # logits = probabilities with range [0,1] transformed to range [inf, -inf]
        probs = tf.nn.softmax(logits) # transform logits back to probabilities

        """ Get probability for actual next word """
        # We should also predict punctuation. 
        # It's not like the words are shown without punctuation on screen.

        # get probability for actual word
        # problem: actual word might have multiple token IDs
        act_word_id = tokenizer.encode(actual_word) #Output looks somewhat like this: [44, 305, 479, 5283]

        # get probabilities for token ids of the current word
        act_word_probs = []
        for curr_token in act_word_id:
          #print(tokenizer.decode(curr_token)) # print single tokens (e.g. tokens "Gold" & "Grube" for word "Goldgrube")
          # get probability for current token id and append to list of probabilities.
          act_word_probs.append(probs.numpy()[0][curr_token]) 

        # multiply values in the list
        act_word_prob = np.prod(act_word_probs)


        # transform probability value into surprisal score (negative log of the probability)
        # neg log = log(1 / x) with x being the value you want to get the neg log of.
        # I use e as a base value for the log here.
        surprisal_score = np.log( 1 / act_word_prob )

        print("Chunk size = " + str(chunk_size[chunk_idx]) + " - Surprisal score for actual word " + actual_word +" is " + str(surprisal_score) + ".")
        print("Text Nr = " + str(text_nr) + " - Trial Nr = " + str(word_idx))

        # append to array where all surprisal scores are collected
        chunk_size_lists[chunk_idx].append(surprisal_score)

print("finished generating surprisal scores")



""" append new surprisal score columns to texts_df """

surprisal_128 = chunk_size_lists[0]
surprisal_100 = chunk_size_lists[1]
surprisal_64  = chunk_size_lists[2]
surprisal_50  = chunk_size_lists[3]
surprisal_32  = chunk_size_lists[4]
surprisal_16  = chunk_size_lists[5]
surprisal_8   = chunk_size_lists[6]
surprisal_4   = chunk_size_lists[7]
surprisal_3   = chunk_size_lists[8]
surprisal_2   = chunk_size_lists[9]
surprisal_1   = chunk_size_lists[10]


texts_df = texts_df.assign(surprisal_1 = surprisal_1, 
                           surprisal_2 = surprisal_2,
                           surprisal_3 = surprisal_3,
                           surprisal_4 = surprisal_4,
                           surprisal_8 = surprisal_8,
                           surprisal_16 = surprisal_16,
                           surprisal_32 = surprisal_32,
                           surprisal_50 = surprisal_50,
                           surprisal_64 = surprisal_64,
                           surprisal_100 = surprisal_100,
                           surprisal_128 = surprisal_128)

# print first 370 rows of df to check if it looks correct
# (depending on chunk size x, first x values of a text should be NaN)
#print(texts_df.head(370))

""" download texts_df as surprisal_scores.csv """
texts_df.to_csv('surprisal_scores.csv', encoding = 'utf-8-sig') 
#surprisal_scores.csv.download('surprisal_scores.csv')

from google.colab import files
files.download("surprisal_scores.csv")


start preparing surprisal scores for chunk size = 128
Chunk size = 128 - Surprisal score for actual word Petra is 35.31081098538522.
Text Nr = text_08 - Trial Nr = 128
Chunk size = 128 - Surprisal score for actual word kreuzten is 30.21974867532466.
Text Nr = text_08 - Trial Nr = 129
Chunk size = 128 - Surprisal score for actual word sich is 13.867515650127752.
Text Nr = text_08 - Trial Nr = 130
Chunk size = 128 - Surprisal score for actual word mehrere is 32.59679126579894.
Text Nr = text_08 - Trial Nr = 131
Chunk size = 128 - Surprisal score for actual word Handelswege, is 33.480889525337474.
Text Nr = text_08 - Trial Nr = 132
Chunk size = 128 - Surprisal score for actual word darunter is 14.453735272480584.
Text Nr = text_08 - Trial Nr = 133
Chunk size = 128 - Surprisal score for actual word die is 11.889444721091245.
Text Nr = text_08 - Trial Nr = 134
Chunk size = 128 - Surprisal score for actual word uralte is 34.30237511398857.
Text Nr = text_08 - Trial Nr = 135
Chunk size = 128 