Our second hypothesis tests the effect of degree of misunderstanding on the magnitude of effort. 

We operationalize degree of misunderstanding as a conceptual similarity between target concept and answer offered by a guesser. To have a reproducible measure of conceptual similarity, we use the ConceptNet Numberbatch embeddings (REF). Alongside, in online anonymous rating study, we have collected data from XX people (XX English, XX Dutch) who were asked to rate the similarity between each pair of words. We then compare the 'perceived similarity' with cosine similarity computed from ConceptNet embeddings, to validate the use of ConceptNet embeddings as a measure of conceptual similarity.


In [1]:
#| code-fold: true
#| code-summary: "Code to load packages and prepare environment"

import numpy as np
import os
import pandas as pd

curfolder = os.getcwd()
datafolder = curfolder + '\\dataset\\'

# load df_all from datafolder
df_all = pd.read_csv(datafolder + 'all_data_raw.csv')

df_all.head(15)

Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID
0,1,practice,0,koken,combinatie,koken,0,10_1,1,10,1
1,2,practice,0,knippen,combinatie,kapper,0,10_1,1,10,1
2,3,target,0,geur,combinatie,ruiken,0,10_1,1,10,1
3,4,target,0,ademen,combinatie,ademen,0,10_1,1,10,1
4,5,target,0,eten,combinatie,eten,0,10_1,1,10,1
5,6,target,0,zwemmen,combinatie,zwemmen,0,10_1,1,10,1
6,7,target,0,gooien,combinatie,gooien,0,10_1,1,10,1
7,8,target,0,water,combinatie,regen,0,10_1,1,10,1
8,9,target,0,wind,combinatie,wind,0,10_1,1,10,1
9,10,practice,1,rijk,combinatie,geld,0,10_1,1,10,2


In [2]:
# how many unique sessionID
df_all['pcnID'].nunique()

142

First we need to do some data-wrangling to get all in the right format for the embedding extraction and comparison


In [3]:
# concept list
df_concepts = pd.read_excel(datafolder + '/conceptlist_info.xlsx')

# in df_concepts, keep only English and Dutch
df_concepts = df_concepts[['English', 'Dutch']]

# rename Dutch to word
df_concepts = df_concepts.rename(columns={'Dutch': 'word'})

# merge df and df_concepts on word
df = pd.merge(df_all, df_concepts, on='word', how='left')

# show rows where English is NaN
df[df['English'].isnull()]

# add translations manually for each (these are practice trials)
df.loc[df['word'] == 'bloem', 'English'] = 'flower'
df.loc[df['word'] == 'dansen', 'English'] = 'to dance'
df.loc[df['word'] == 'auto', 'English'] = 'car'
df.loc[df['word'] == 'olifant', 'English'] = 'elephant'
df.loc[df['word'] == 'comfortabel', 'English'] = 'comfortable'
df.loc[df['word'] == 'bal', 'English'] = 'ball'
df.loc[df['word'] == 'haasten', 'English'] = 'to hurry'
df.loc[df['word'] == 'gek', 'English'] = 'crazy'
df.loc[df['word'] == 'snijden', 'English'] = 'to cut'
df.loc[df['word'] == 'koken', 'English'] = 'to cook'
df.loc[df['word'] == 'juichen', 'English'] = 'to cheer'
df.loc[df['word'] == 'zingen', 'English'] = 'to sing'
df.loc[df['word'] == 'glimlach', 'English'] = 'smile'
df.loc[df['word'] == 'klok', 'English'] = 'clock'
df.loc[df['word'] == 'fiets', 'English'] = 'bicycle'
df.loc[df['word'] == 'vliegtuig', 'English'] = 'airplane'
df.loc[df['word'] == 'geheim', 'English'] = 'secret'
df.loc[df['word'] == 'telefoon', 'English'] = 'telephone'
df.loc[df['word'] == 'zwaaien', 'English'] = 'to wave'
df.loc[df['word'] == 'sneeuw', 'English'] = 'snow'
df.loc[df['word'] == 'rijk', 'English'] = 'rich'
df.loc[df['word'] == 'leeg', 'English'] = 'empty'
df.loc[df['word'] == 'hond', 'English'] = 'dog'
df.loc[df['word'] == 'knippen', 'English'] = 'to cut'
df.loc[df['word'] == 'eend', 'English'] = 'duck'

# make a list of English answers
#answers_en = ['party', 'to cheer', 'tasty', 'to shoot', 'to breathe', 'zombie', 'bee', 'sea', 'dirty', 'tasty', 'car', 'to eat', 'to eat', 'to blow', 'hose', 'hose', 'to annoy', 'to make noise', 'to make noise', 'to run away', 'elephant', 'to cry', 'cold', 'outfit', 'silence', 'to ski', 'wrong', 'to play basketball', 'to search', 'disturbed', 'to run', 'to lick', 'to lift', 'lightning', 'to think', 'to jump', 'to fall', 'to write', 'to dance', 'shoulder height', 'horn', 'dirty', 'boring', 'to drink', 'strong', 'elderly', 'to mix', 'fish', 'fish', 'dirty', 'wrong', 'smart', 'to box', 'to box', 'dog', 'to catch', 'to cheer', 'to sing', 'pregnant', 'hair', 'to shower', 'pain', 'burnt', 'hot', 'I', 'to chew', 'bird', 'airplane', 'to fly', 'to think', 'to choose', 'to doubt', 'graffiti', 'fireworks', 'bomb', 'to smile', 'to laugh', 'smile', 'clock', 'to wonder', 'height', 'big', 'height', 'space', 'to misjudge', 'to wait', 'satisfied', 'happy', 'fish', 'to smell', 'wind', 'pain', 'to burn', 'hot', 'to cycle', 'to fly', 'airplane', 'bird', 'to crawl', 'to drink', 'waterfall', 'water', 'fire', 'top', 'good', 'to hear', 'to point', 'distance', 'there', 'to whisper', 'quiet', 'to be silent', 'telephone', 'to blow', 'to distribute', 'to give', 'cat', 'to laugh', 'tasty', 'to eat', 'yummy', 'to sleep', 'mountain', 'dirty', 'to vomit', 'to be disgusted', 'to greet', 'hello', 'goodbye', 'to smell', 'nose', 'odor', 'to fly', 'fireworks', 'to blow', 'to cut', 'pain', 'hot', 'to slurp', 'to throw', 'to fall', 'to fall', 'whistle', 'heartbeat', 'mouse', 'to hit', 'to catch', 'to grab', 'to throw', 'to fall', 'to shoot', 'circus', 'trunk', 'to fall', 'to fight', 'pain', 'to push open', 'to growl', 'to cut', 'to eat', 'knife', 'to slurp', 'to drink', 'drink', 'to eat', 'delicious', 'tasty', 'to cough', 'sick', 'to cry', 'to cry']

# get rid of English 'to beat'
df = df[df['English'] != 'to beat']
# and to weep
df = df[df['English'] != 'to weep']
# and loud
df = df[df['English'] != 'noisy']

# add those to df as answers_en
#df['answer_en'] = answers_en

# keep only rows where word is not NaN
df = df[df['word'].notnull()]

# make a list of English targets
#meanings_en = list(df['English'])
df.head(15)

Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID,English
0,1,practice,0,koken,combinatie,koken,0,10_1,1,10,1,to cook
1,2,practice,0,knippen,combinatie,kapper,0,10_1,1,10,1,to cut
2,3,target,0,geur,combinatie,ruiken,0,10_1,1,10,1,odor
3,4,target,0,ademen,combinatie,ademen,0,10_1,1,10,1,to breathe
4,5,target,0,eten,combinatie,eten,0,10_1,1,10,1,to eat
5,6,target,0,zwemmen,combinatie,zwemmen,0,10_1,1,10,1,to swim
6,7,target,0,gooien,combinatie,gooien,0,10_1,1,10,1,to throw
7,8,target,0,water,combinatie,regen,0,10_1,1,10,1,water
8,9,target,0,wind,combinatie,wind,0,10_1,1,10,1,wind
9,10,practice,1,rijk,combinatie,geld,0,10_1,1,10,2,rich


We need to manually repair some incorrect answers (with typo etc.)

In [4]:
# in answer, replace langaam by langzaam
df['answer'] = df['answer'].str.replace('langaam', 'langzaam')
df['answer'] = df['answer'].str.replace('langsaam', 'langzaam')
df['answer'] = df['answer'].str.replace('comfortable', 'comfortabel')
df['answer'] = df['answer'].str.replace('neurien', 'neuriën')
df['answer'] = df['answer'].str.replace('neurieen', 'neuriën')
df['answer'] = df['answer'].str.replace('verdietig', 'verdrietig')
df['answer'] = df['answer'].str.replace('skien', 'skiën')
df['answer'] = df['answer'].str.replace('skieen', 'skiën')
df['answer'] = df['answer'].str.replace('geirriteerd', 'geïrriteerd')
df['answer'] = df['answer'].str.replace('vliegtug', 'vliegtuig')
df['answer'] = df['answer'].str.replace('basketba', 'basketbal')
df['answer'] = df['answer'].str.replace('basketball', 'basketbal')
df['answer'] = df['answer'].str.replace('geirriteerd', 'geïrriteerd')
df['answer'] = df['answer'].str.replace('shift', '')
df['answer'] = df['answer'].str.replace('svhieten', 'schieten', regex=False)
df['answer'] = df['answer'].str.replace('scrheeuwen', 'schreeuwen', regex=False)
df['answer'] = df['answer'].str.replace('neerkomem', 'neerkomen', regex=False)
df['answer'] = df['answer'].str.replace('watet', 'water', regex=False)
df['answer'] = df['answer'].str.replace('mastuberen', 'masturberen', regex=False)
df['answer'] = df['answer'].str.replace('shrikken', 'schrikken', regex=False)
df['answer'] = df['answer'].str.replace('grafiti', 'graffiti', regex=False)
df['answer'] = df['answer'].str.replace('vliegtuid', 'vliegtuig', regex=False)
df['answer'] = df['answer'].str.replace('grinikken', 'grinniken', regex=False)
df['answer'] = df['answer'].str.replace('nurien', 'neuriën', regex=False)
df['answer'] = df['answer'].str.replace('optijd', 'op tijd', regex=False)
df['answer'] = df['answer'].str.replace('ontwetend', 'onwetend', regex=False)
df['answer'] = df['answer'].str.replace('verluisteren', 'fluisteren', regex=False)
df['answer'] = df['answer'].str.replace('luchtballin', 'luchtballon', regex=False)
df['answer'] = df['answer'].str.replace('omhooh', 'omhoog', regex=False)
df['answer'] = df['answer'].str.replace('rodellen', 'roddelen', regex=False)
df['answer'] = df['answer'].str.replace('snappem', 'snappen', regex=False)
df['answer'] = df['answer'].str.replace('indrukwekkebd', 'indrukwekkend', regex=False)
df['answer'] = df['answer'].str.replace('zwaairn', 'zwaaien', regex=False)
df['answer'] = df['answer'].str.replace('heigen', 'hijgen', regex=False)
df['answer'] = df['answer'].str.replace('gestressd', 'gestrest', regex=False)
df['answer'] = df['answer'].str.replace('kouwen', 'kauwen', regex=False)
df['answer'] = df['answer'].str.replace('shouders', 'schouders', regex=False)
df['answer'] = df['answer'].str.replace('ballom', 'ballon', regex=False)
df['answer'] = df['answer'].str.replace('autocoereur', 'autocoureur', regex=False)
df['answer'] = df['answer'].str.replace('lachrn', 'lachen', regex=False)
df['answer'] = df['answer'].str.replace('fitesen', 'fietsen', regex=False)
df['answer'] = df['answer'].str.replace('scieten', 'schieten', regex=False)
df['answer'] = df['answer'].str.replace('stamoen', 'stamperen', regex=False)
df['answer'] = df['answer'].str.replace('blixem', 'bliksem', regex=False)
df['answer'] = df['answer'].str.replace('proefen', 'proeven', regex=False)
df['answer'] = df['answer'].str.replace('blokfuit', 'blokfluit', regex=False)
df['answer'] = df['answer'].str.replace('verdrietig ', 'verdrietig', regex=False)
df['answer'] = df['answer'].str.replace('galloperen', 'galopperen', regex=False)
df['answer'] = df['answer'].str.replace('leegl', 'leeg', regex=False)
df['answer'] = df['answer'].str.replace('kinker', 'klinker', regex=False)
df['answer'] = df['answer'].str.replace('gehiem', 'geheim', regex=False)
df['answer'] = df['answer'].str.replace('voge', 'vogel', regex=False)
df['answer'] = df['answer'].str.replace('vogell', 'vogel', regex=False)
df['answer'] = df['answer'].str.replace('grinnikken', 'grinniken', regex=False)
df['answer'] = df['answer'].str.replace('drinken ', 'drinken', regex=False)
df['answer'] = df['answer'].str.replace('gieberen', 'gibberen', regex=False)
df['answer'] = df['answer'].str.replace('juichenl', 'juichen', regex=False)
df['answer'] = df['answer'].str.replace('juigen', 'juichen', regex=False)
df['answer'] = df['answer'].str.replace('backlash', '', regex=False)
df['answer'] = df['answer'].str.replace('backslash', '', regex=False)
df['answer'] = df['answer'].str.replace('vlief', 'vlieg', regex=False)
df['answer'] = df['answer'].str.replace('leegopen', 'leeglopen', regex=False)
df['answer'] = df['answer'].str.replace('sprinkelen', 'sprenkelen', regex=False)
df['answer'] = df['answer'].str.replace('fohn', 'föhn', regex=False) 
df['answer'] = df['answer'].str.replace('busstop', 'bushalte', regex=False)
df['answer'] = df['answer'].str.replace('buitenadem', 'buiten adem', regex=False)
df['answer'] = df['answer'].str.replace('slowmoting', 'slowmotion', regex=False)
df['answer'] = df['answer'].str.replace('olifcapslockn', 'olifant', regex=False)
df['answer'] = df['answer'].str.replace('ski vakante', 'skivakantie', regex=False)
df['answer'] = df['answer'].str.replace('verbeisterd', 'verbijsterd', regex=False)
df['answer'] = df['answer'].str.replace('fles ontpoppen', 'fles ontkurken', regex=False)
df['answer'] = df['answer'].str.replace('margeren', 'marcheren', regex=False)
df['answer'] = df['answer'].str.replace('knock out', 'knockout', regex=False)
df['answer'] = df['answer'].str.replace('knarzen', 'knarsen', regex=False)
df['answer'] = df['answer'].str.replace('ping pong', 'pingpong', regex=False)
df['answer'] = df['answer'].str.replace('typgeluid', 'typegeluid', regex=False)
df['answer'] = df['answer'].str.replace('typgeluid', 'typegeluid', regex=False)
df['answer'] = df['answer'].str.replace('oorkest', 'orkest', regex=False)
df['answer'] = df['answer'].str.replace('ruizen', 'ruisen', regex=False)
df['answer'] = df['answer'].str.replace('skydiving', 'skydiven', regex=False)
df['answer'] = df['answer'].str.replace('lslash', '', regex=False)
df['answer'] = df['answer'].str.replace('auto rijden', 'autorijden', regex=False)
df['answer'] = df['answer'].str.replace('disgust', 'walging', regex=False)
df['answer'] = df['answer'].str.replace('elektrisiteit', 'elektriciteit', regex=False)
df['answer'] = df['answer'].str.replace('sleeen', 'sleeën', regex=False)
df['answer'] = df['answer'].str.replace('reaching', 'reiken', regex=False)
df['answer'] = df['answer'].str.replace('skippiebal', 'skippybal', regex=False)
df['answer'] = df['answer'].str.replace('kokhalsen', 'kokhalzen', regex=False)
df['answer'] = df['answer'].str.replace('grinnik', 'grinniken', regex=False)
df['answer'] = df['answer'].str.replace('pinguin', 'pinguïn', regex=False)
df['answer'] = df['answer'].str.replace('discodisco', 'disco', regex=False)
df['answer'] = df['answer'].str.replace('oudpersoon', 'oud persoon', regex=False)
df['answer'] = df['answer'].str.replace('griniken', 'grinniken', regex=False)
df['answer'] = df['answer'].str.replace('ping pongen', 'pingpongen', regex=False)
df['answer'] = df['answer'].str.replace('boederij', 'boerderij', regex=False)
df['answer'] = df['answer'].str.replace('paardijden', 'paardrijden', regex=False)
df['answer'] = df['answer'].str.replace('neuril', '', regex=False)
df['answer'] = df['answer'].str.replace('f1', 'formula 1', regex=False)
df['answer'] = df['answer'].str.replace('stamperen', 'stampen', regex=False)
df['answer'] = df['answer'].str.replace('stilzijn', 'stil zijn', regex=False)
df['answer'] = df['answer'].str.replace('bbq', 'barbecue', regex=False)
df['answer'] = df['answer'].str.replace('zur', 'zuur', regex=False)
df['answer'] = df['answer'].str.replace('hgvlgi', '', regex=False)
df['answer'] = df['answer'].str.replace('awkward', 'ongemakkelijk', regex=False)
df['answer'] = df['answer'].str.replace('typem', 'typen', regex=False)
df['answer'] = df['answer'].str.replace('goedzo', 'goed zo', regex=False)
df['answer'] = df['answer'].str.replace('chirpen', 'tjirpen', regex=False)
df['answer'] = df['answer'].str.replace('onwetenheid', 'onwetendheid', regex=False)
df['answer'] = df['answer'].str.replace('verweg', 'ver weg', regex=False)
df['answer'] = df['answer'].str.replace('kotsem', 'kotsen', regex=False)
df['answer'] = df['answer'].str.replace('afgrijzing', 'afgrijzen', regex=False)
df['answer'] = df['answer'].str.replace('kostem', 'kotsen', regex=False)
df['answer'] = df['answer'].str.replace('boxen', 'boksen', regex=False)
df['answer'] = df['answer'].str.replace('blasen', 'blazen', regex=False)
df['answer'] = df['answer'].str.replace('telefooneren', 'telefoneren', regex=False)
df['answer'] = df['answer'].str.replace('motor rijden', 'motorrijden', regex=False)
df['answer'] = df['answer'].str.replace('ademenen', 'ademen', regex=False)
df['answer'] = df['answer'].str.replace('lrijden', 'rijden', regex=False)
df['answer'] = df['answer'].str.replace('pijlenboog', 'pijl-en-boog', regex=False)
df['answer'] = df['answer'].str.replace('giegelen', 'giechelen', regex=False)
df['answer'] = df['answer'].str.replace('nijdi', 'nijdig', regex=False)
df['answer'] = df['answer'].str.replace('banden oppompe', 'banden oppompen', regex=False)
df['answer'] = df['answer'].str.replace('giegelen', 'giechelen', regex=False)
df['answer'] = df['answer'].str.replace('iets pakken', 'pakken', regex=False)
df['answer'] = df['answer'].str.replace('iets geven', 'geven', regex=False)
df['answer'] = df['answer'].str.replace('grinnikenen', 'grinniken', regex=False)
df['answer'] = df['answer'].str.replace('grinneken', 'grinniken', regex=False)
df['answer'] = df['answer'].str.replace('over weging', 'over wegen', regex=False)
df['answer'] = df['answer'].str.replace('grinneken', 'grinniken', regex=False)
df['answer'] = df['answer'].str.replace('feluisteren', 'fluisteren', regex=False)
df['answer'] = df['answer'].str.replace('openhaart', 'open haard', regex=False)
df['answer'] = df['answer'].str.replace('silte vragen', 'stilte vragen', regex=False)
df['answer'] = df['answer'].str.replace('stewardes', 'stewardess', regex=False)
df['answer'] = df['answer'].str.replace('diamand', 'diamant', regex=False)
df['answer'] = df['answer'].str.replace('volzitten', 'vol zitten', regex=False)
df['answer'] = df['answer'].str.replace('bergklimmen', 'bergbeklimmen', regex=False)
df['answer'] = df['answer'].str.replace('uitgleiden', 'uitglijden', regex=False)
df['answer'] = df['answer'].str.replace('formula 1', 'formule 1', regex=False)
df['answer'] = df['answer'].str.replace('openhaard', 'open haard', regex=False)
df['answer'] = df['answer'].str.replace('over wegen', 'overwegen', regex=False)
df['answer'] = df['answer'].str.replace('doodslikken', 'doorslikken', regex=False)
df['answer'] = df['answer'].str.replace('kukkelen', 'kukelen', regex=False)


# sometimes people directly answered they don't know 
df['answer'] = df['answer'].str.replace('geen idee', '', regex=False)
df['answer'] = df['answer'].str.replace('ik weet het niet', '', regex=False)
df['answer'] = df['answer'].str.replace('wtf', '', regex=False)


# if there is any string that has space in the beginning or the end, remove it
df['answer'] = df['answer'].str.strip()

# where word comfortabel and answer illen, change answer to chillen
df.loc[(df['word'] == 'comfortabel') & (df['answer'] == 'illen'), 'answer'] = 'chillen'

# where word is gek and answer llen, change answer to ''
df.loc[(df['word'] == 'gek') & (df['answer'] == 'llen'), 'answer'] = ''

In [5]:
# Dutch targets
meanings_nl = list(df['word'])
# Dutch answers
answers_nl = list(df['answer'])

In [6]:
# Sanity check: print rows that have any NA
df[df.isna().any(axis=1)]

# In 14 trials, the answer have been given as 'geen idee' or 'ik weet het niet', or something unrecognizible/unusable

Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID,English
366,32,target,1,klein,geluiden,,0,12_1,1,12,6,small
3218,1,practice,0,gek,geluiden,,0,28_1,1,28,39,crazy
3704,10,practice,1,koken,geluiden,,0,30_1,1,30,46,to cook
4662,76,target,0,klein,geluiden,,2,35_2,2,35,55,small
4666,80,target,0,kruipen,geluiden,,0,35_2,2,35,55,to crawl
6370,10,practice,1,dansen,gebaren,,0,45_1,1,45,78,to dance
6444,27,target,1,hoorn,geluiden,,1,45_2,2,45,78,horn
7108,44,practice,0,telefoon,gebaren,,0,49_2,2,49,85,telephone
7109,45,practice,0,zwaaien,gebaren,,0,49_2,2,49,85,to wave
10665,20,practice,0,gek,combinatie,,0,6_1,1,6,129,crazy


Now we will load in ConceptNet numberbatch (version XX) and compute cosine similarity for each pair


In [7]:
# Load embeddings from a file
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

We will use multilingual numberbatch to extract words in the original language of experiment - Dutch. While English has better representation in ConceptNet, the English numberbatch does not make distinction between nouns and verbs (so 'a drink' and 'to drink' have common representation - drink). Because this is important distinction for us, we opt for Dutch embeddings to avoid this problem


In [8]:
# load embeddings
embeddings = load_embeddings('numberbatch\\numberbatch.txt') # downloaded from https://github.com/commonsense/conceptnet-numberbatch?tab=readme-ov-file
#embeddings_en = load_embeddings('numberbatch-en.txt') # downloaded from https://github.com/commonsense/conceptnet-numberbatch?tab=readme-ov-file

# this is how words are represented
vec_nl = embeddings.get('/c/nl/skiën')
print(vec_nl)

[ 3.410e-02 -4.640e-02  5.490e-02  1.544e-01  1.800e-02 -5.050e-02
 -6.660e-02 -2.300e-02  5.320e-02  1.104e-01  2.770e-02  5.040e-02
 -2.010e-02  5.900e-03 -1.133e-01 -9.370e-02 -7.890e-02  3.540e-02
  3.780e-02  8.400e-02 -3.880e-02  7.680e-02 -8.010e-02  6.540e-02
 -1.493e-01 -1.036e-01  8.490e-02  1.040e-02 -6.890e-02  6.890e-02
  1.226e-01 -1.850e-02  1.520e-02  2.810e-02 -5.660e-02 -2.670e-02
 -5.700e-02 -4.480e-02  1.924e-01  5.800e-02 -7.800e-02 -7.700e-03
  1.132e-01  6.350e-02 -4.310e-02  1.900e-03 -4.820e-02  1.047e-01
  6.900e-02  7.150e-02  1.660e-02  2.730e-02  4.340e-02  1.130e-02
 -1.427e-01 -9.200e-03 -8.000e-04  2.310e-02  1.234e-01 -1.452e-01
 -1.710e-02 -1.094e-01 -1.518e-01  4.820e-02  1.400e-02 -1.460e-02
  1.023e-01  5.220e-02  1.362e-01  3.190e-02 -2.590e-02  1.220e-01
  1.750e-02  8.810e-02 -9.200e-02 -1.226e-01 -5.560e-02 -6.600e-03
  3.180e-02 -1.113e-01  6.130e-02 -1.202e-01 -2.480e-02 -8.300e-03
 -1.710e-02  3.410e-02  1.550e-02 -8.000e-02 -6.390e-02  1.170

Now we take the list of target-answer pairs, transform them into embedding format and perform cosine similarity.

There will probably be some answers that will not be represented in the numberbatch (e.g., if the answer has more than one word). So we will need to think about how to handle these.


In [10]:
# get the embeddings for the words in the list meanings_en
word_embeddings_t = {}
for word in meanings_nl:
    word_embed = '/c/nl/' + str(word)
    if word_embed in embeddings:
        word_embeddings_t[word] = embeddings[word_embed]

# get the embeddings for the words in the list answers_en
word_embeddings_ans = {}
for word in answers_nl:
    word_embed = '/c/nl/' + str(word)
    if word_embed in embeddings:
        word_embeddings_ans[word] = embeddings[word_embed]

# calculate the similarity between the first word in the list meanings_en and first word in answers_en, second word in meanings_en and second word in answers_en, etc.
cosine_similarities = []

for i in range(len(meanings_nl)):
    word1 = meanings_nl[i]
    word2 = answers_nl[i]
    vec1 = word_embeddings_t.get(word1)
    vec2 = word_embeddings_ans.get(word2)
    if vec1 is not None and vec2 is not None:
        cosine_sim = cosine_similarity(vec1, vec2)
        cosine_similarities.append(cosine_sim)
    else:
        # print which concepts could not be found
        if vec1 is None:
            print(f"Concept not found: {word1}")
        if vec2 is None:
            print(f"Concept not found: {word2}")
        cosine_similarities.append(None)

df['cosine_similarity'] = cosine_similarities
df['cosine_similarity'] = df['cosine_similarity'].round(3)
df.head(15)

Concept not found: slowmotion
Concept not found: catcallen
Concept not found: highfive
Concept not found: sniffen
Concept not found: bergwandeling
Concept not found: ver weg
Concept not found: buiten adem
Concept not found: 
Concept not found: wakker worden
Concept not found: ringtoon
Concept not found: moedergans
Concept not found: föhnen
Concept not found: vies eten
Concept not found: open haard
Concept not found: ver weg
Concept not found: kuikelen
Concept not found: highfive
Concept not found: kukelen
Concept not found: huh
Concept not found: ssst
Concept not found: startsignaal
Concept not found: banden oppompen
Concept not found: ringtoon
Concept not found: fietsbel
Concept not found: oud en nieuw
Concept not found: open haard
Concept not found: traplopen
Concept not found: 
Concept not found: zachtjes lopen
Concept not found: wc rol
Concept not found: stilte vragen
Concept not found: regendrank
Concept not found: hoge toon
Concept not found: koud hebben
Concept not found: slowmo

Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID,English,cosine_similarity
0,1,practice,0,koken,combinatie,koken,0,10_1,1,10,1,to cook,1.0
1,2,practice,0,knippen,combinatie,kapper,0,10_1,1,10,1,to cut,0.366
2,3,target,0,geur,combinatie,ruiken,0,10_1,1,10,1,odor,0.867
3,4,target,0,ademen,combinatie,ademen,0,10_1,1,10,1,to breathe,1.0
4,5,target,0,eten,combinatie,eten,0,10_1,1,10,1,to eat,1.0
5,6,target,0,zwemmen,combinatie,zwemmen,0,10_1,1,10,1,to swim,1.0
6,7,target,0,gooien,combinatie,gooien,0,10_1,1,10,1,to throw,1.0
7,8,target,0,water,combinatie,regen,0,10_1,1,10,1,water,0.337
8,9,target,0,wind,combinatie,wind,0,10_1,1,10,1,wind,1.0
9,10,practice,1,rijk,combinatie,geld,0,10_1,1,10,2,rich,0.171


In [13]:
# print rows where cosine similarity    is NaN
problems = df[df['cosine_similarity'].isnull()]

# save problems now
problems.to_csv(datafolder + 'problems.csv', index=False)

# show
problems.head(15)

  values = values.astype(str)


Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID,English,cosine_similarity
65,11,target,0,langzaam,gebaren,slowmotion,0,10_2,2,10,1,slow,
144,86,practice,1,glimlach,geluiden,catcallen,1,10_2,2,10,2,smile,
180,17,target,1,slaan,combinatie,highfive,0,11_1,1,11,4,to hit,
236,17,target,0,verdrietig,geluiden,sniffen,1,11_2,2,11,3,sad,
288,66,target,1,berg,combinatie,bergwandeling,1,11_2,2,11,4,mountain,
326,104,target,1,ver,gebaren,ver weg,1,11_2,2,11,4,far,
369,35,target,1,ademen,geluiden,buiten adem,0,12_1,1,12,6,to breathe,
431,43,practice,0,sneeuw,geluiden,,2,12_2,2,12,5,snow,
463,74,target,1,oud,geluiden,wakker worden,2,12_2,2,12,6,old,
561,1,practice,0,telefoon,geluiden,ringtoon,0,13_2,2,13,7,telephone,


In [14]:
# we also just want to save df_similarity_only where we have only word, answer, and cosine_similarity
df_similarity_only = df[['word', 'answer', 'cosine_similarity']]

# save df_similarity_only
df_similarity_only.to_csv(datafolder + 'df_similarity_only.csv', index=False)

  values = values.astype(str)


Now we add also binary yes/no for correct guess

In [15]:
# if answer == word, col guess_binary is 1, else 0
df['guess_binary'] = (df['word'] == df['answer']).astype(int)

And expressibility (Dutch)

In [16]:
express = pd.read_csv(datafolder + 'expressibility_dutch.csv')

# get rid of noisy
express = express[express['English'] != 'noisy']

In [None]:
# keep only word, modality, fit, SmenaticSubcat
express = express[['word', 'modality', 'fit', 'SemanticSubcat']]

# rename gesture, multimodal and vocal to gebaren, combinatie, and geluiden
express['modality'] = express['modality'].replace('gesture', 'gebaren')
express['modality'] = express['modality'].replace('multimodal', 'combinatie')
express['modality'] = express['modality'].replace('vocal', 'geluiden')

df_final = pd.merge(df, express, on=['word', 'modality'], how='left')

# rename fit to expressibility_dutch
df_final = df_final.rename(columns={'fit': 'expressibility_dutch'})

df_final.head(15)

  values = values.astype(str)


Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID,English,cosine_similarity,guess_binary,expressibility_dutch,SemanticSubcat
0,1,practice,0,koken,combinatie,koken,0,10_1,1,10,1,to cook,1.0,1,,
1,2,practice,0,knippen,combinatie,kapper,0,10_1,1,10,1,to cut,0.366,0,,
2,3,target,0,geur,combinatie,ruiken,0,10_1,1,10,1,odor,0.867,0,0.665168,olfactory
3,4,target,0,ademen,combinatie,ademen,0,10_1,1,10,1,to breathe,1.0,1,0.808141,olfactory
4,5,target,0,eten,combinatie,eten,0,10_1,1,10,1,to eat,1.0,1,0.799176,vocal_oral
5,6,target,0,zwemmen,combinatie,zwemmen,0,10_1,1,10,1,to swim,1.0,1,0.808464,locomotion
6,7,target,0,gooien,combinatie,gooien,0,10_1,1,10,1,to throw,1.0,1,0.803268,manual
7,8,target,0,water,combinatie,regen,0,10_1,1,10,1,water,0.337,0,0.574876,environment
8,9,target,0,wind,combinatie,wind,0,10_1,1,10,1,wind,1.0,1,0.641803,environment
9,10,practice,1,rijk,combinatie,geld,0,10_1,1,10,2,rich,0.171,0,,


# Add file name

In [None]:
df_final['filename'] = None

# Reset index
df_final.reset_index(drop=True, inplace=True)

# Loop over each row
for index, row in df_final.iterrows():
    if df_final['trial_type'][index] == 'target':
        trialtype = 'trial'
    else:
        trialtype = 'pr'

    number = df_final['trial_order'][index] - 1

    if df_final['exp_part'][index] == 2:
        correction = '_c' + str(df_final['correction'][index])

    else:
        correction = ''

    # This is how our audiovisual stimuli files are named
    filename = df_final['sessionID'][index] + '_' + trialtype + '_' + str(number) + '_p' + str(df_final['participant'][index]) + '_' + df_final['word'][index] + '_' + df_final['modality'][index] + correction + '_final.avi'

    df_final.loc[index, 'filename'] = filename

df_final.head(15)

In [36]:
# save it
df_final.to_csv(datafolder + 'similarity_df_final.csv')

  values = values.astype(str)


# Adding similarity from ratings

In [113]:
# Load in data from dataset folder
data = pd.read_csv(datafolder + 'similarity_df_final.csv')

# in data, replace answer slow motion to slowmotion
data['answer'] = data['answer'].str.replace('slow motion', 'slowmotion')

survey = pd.read_csv(curfolder + '\\survey\\Similarity_nl_responses.csv', header=0)

In [114]:
survey.iloc[0]


Timestamp                                                                                                             11/12/2024 14:22:20
langzaam - slowmotion                                                                                                                   8
glimlach  -  catcallen                                                                                                                  0
slaan - highfive                                                                                                                        1
verdrietig - sniffen                                                                                                                    2
                                                                                                              ...                        
bloem - bloemen ruiken                                                                                                                  6
goed - duim omhoog                

In [115]:
import pandas as pd

# Assuming your data is loaded into a DataFrame called 'survey'
# Extract the header row (the first row)
header = survey.iloc[0]

# Filter out the 'Timestamp' and the last row with comments if they exist
header = header[1:-1]

# Extract the words separated by ' - ' from each column name
word_pairs = header.index.str.split(' - ', expand=True)

# Flatten the resulting DataFrame and create a new DataFrame with the word pairs
concepts_df = pd.DataFrame(word_pairs.values.flatten(), columns=['concept'])

# Now, `concepts_df` contains the word pairs in the 'concept' column
print(concepts_df)


                       concept
0      (langzaam, slowmotion )
1     (glimlach ,  catcallen )
2           (slaan, highfive )
3        (verdrietig, sniffen)
4        (berg, bergwandeling)
..                         ...
136       (onweer, lightsaber)
137    (bloem, bloemen ruiken)
138        (goed, duim omhoog)
139  (bliksem, appels plukken)
140          (niet, niet goed)

[141 rows x 1 columns]


In [116]:

# Now, we'll calculate the mean value for each concept based on the column index
means = []
for i, concept in enumerate(concepts_df['concept']):
    # Use the column index from the original 'survey' DataFrame
    column_name = survey.columns[i+1]  # Add 1 to skip the 'Timestamp' column
    # Calculate the mean for the remaining rows (excluding the header row)
    mean_value = survey[column_name][:].mean()  # Exclude the first row (index 0)
    means.append(mean_value)

# Add the means to the concepts DataFrame
concepts_df['mean_rating'] = means

# round to 2 decimals
concepts_df['mean_rating'] = concepts_df['mean_rating'].round(3)

# Now `concepts_df` contains the word pairs and their corresponding mean ratings
print(concepts_df)

                       concept  mean_rating
0      (langzaam, slowmotion )        8.214
1     (glimlach ,  catcallen )        1.429
2           (slaan, highfive )        5.429
3        (verdrietig, sniffen)        5.357
4        (berg, bergwandeling)        6.786
..                         ...          ...
136       (onweer, lightsaber)        2.357
137    (bloem, bloemen ruiken)        6.786
138        (goed, duim omhoog)        7.786
139  (bliksem, appels plukken)        0.143
140          (niet, niet goed)        5.857

[141 rows x 2 columns]


In [117]:
# make concept a string
concepts_df['concept'] = concepts_df['concept'].astype(str)

# separate concept to word and answer
concepts_df[['word', 'answer']] = concepts_df['concept'].str.split(',', expand=True)

# now get rid of ( ) and '
concepts_df['word'] = concepts_df['word'].str.replace('\'', '')
concepts_df['word'] = concepts_df['word'].str.replace('(', '')
concepts_df['word'] = concepts_df['word'].str.replace(')', '')
concepts_df['answer'] = concepts_df['answer'].str.replace('\'', '')
concepts_df['answer'] = concepts_df['answer'].str.replace('(', '')
concepts_df['answer'] = concepts_df['answer'].str.replace(')', '')

# get rid of superfluous spaces
concepts_df['word'] = concepts_df['word'].str.strip()
concepts_df['answer'] = concepts_df['answer'].str.strip()


In [118]:
# now in data, we want to add the mean ratings to cosine_similarity where there is NA

data.loc[(data['word'] == 'langzaam') & (data['answer'] == 'slowmotion')]
# if cosine_similarity is NaN at all, add the mean rating
if data.loc[(data['word'] == 'langzaam') & (data['answer'] == 'slowmotion')]['cosine_similarity'].isnull().all():
    data.loc[(data['word'] == 'langzaam') & (data['answer'] == 'slowmotion'), 'cosine_similarity'] = concepts_df.loc[(concepts_df['word'] == 'langzaam') & (concepts_df['answer'] == 'slowmotion')]['mean_rating'].values[0]
else:
    print('cosine_similarity is not NaN')




In [None]:
def replace_rawmean(df_final, df_rawmean, word, answer):
    # if cosine_similarity is NaN at all, add the mean rating
    if df_final.loc[(df_final['word'] == word) & (df_final['answer'] == answer)]['cosine_similarity'].isnull().all():
        df_final.loc[(df_final['word'] == word) & (df_final['answer'] == answer), 'cosine_similarity'] = df_rawmean.loc[(df_rawmean['word'] == word) & (df_rawmean['answer'] == answer)]['mean_rating'].values[0]
    else:
        print('cosine_similarity is not NaN for ' + word + ' ' + answer)
    

In [120]:
# apply on all pairs from concepts_df

for index, row in concepts_df.iterrows():
    replace_rawmean(data, concepts_df, row['word'], row['answer'])

cosine_similarity is not NaN forlangzaam slowmotion
cosine_similarity is not NaN formisschien overwegen
cosine_similarity is not NaN fordood uitglijden
cosine_similarity is not NaN forei diamant
cosine_similarity is not NaN fordood doorslikken
cosine_similarity is not NaN forvliegtuig stewardess


In [121]:
# print all rows where cosine_similarity is NaN
data[data['cosine_similarity'].isnull()]

# ok we are left with only those that have no answer, meaning it was not readable/understandable or people answered things like 'i don't know'

Unnamed: 0.1,Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID,English,cosine_similarity,guess_binary,expressibility_dutch,SemanticSubcat,filename
420,420,43,practice,0,sneeuw,geluiden,,2,12_2,2,12,5,snow,,0,,,12_2_pr_42_p0_sneeuw_geluiden_c2_final.avi
1716,1716,32,target,1,koud,geluiden,,2,1_2,2,1,22,cold,,0,0.744751,temperature,1_2_trial_31_p1_koud_geluiden_c2_final.avi
3754,3754,105,target,1,staart,combinatie,,1,30_2,2,30,46,tail,,0,0.613189,quantity,30_2_trial_104_p1_staart_combinatie_c1_final.avi
4532,4532,71,target,0,goed,geluiden,,0,35_2,2,35,55,good,,0,0.42473,valence,35_2_trial_70_p0_goed_geluiden_c0_final.avi
6138,6138,50,target,0,jagen,combinatie,,2,44_2,2,44,75,to hunt,,0,0.682815,survival,44_2_trial_49_p0_jagen_combinatie_c2_final.avi
8172,8172,5,target,0,boos,geluiden,,0,56_2,2,56,101,angry,,0,0.74298,valence,56_2_trial_4_p0_boos_geluiden_c0_final.avi
8750,8750,90,target,1,vuur,gebaren,,0,59_2,2,59,108,fire,,0,0.612736,object,59_2_trial_89_p1_vuur_gebaren_c0_final.avi
8757,8757,97,target,1,man,gebaren,,0,59_2,2,59,108,male,,0,0.57887,animate,59_2_trial_96_p1_man_gebaren_c0_final.avi
9374,9374,71,practice,0,fiets,geluiden,,2,62_2,2,62,115,bicycle,,0,,,62_2_pr_70_p0_fiets_geluiden_c2_final.avi
10198,10198,2,practice,0,gek,geluiden,,0,69_1,1,69,127,crazy,,0,,,69_1_pr_1_p0_gek_geluiden_final.avi


In [122]:
# save data
data.to_csv(datafolder + 'similarity_df_final.csv', index=False)

In [123]:
data

Unnamed: 0.1,Unnamed: 0,trial_order,trial_type,participant,word,modality,answer,correction,sessionID,exp_part,dyad,pcnID,English,cosine_similarity,guess_binary,expressibility_dutch,SemanticSubcat,filename
0,0,1,practice,0,koken,combinatie,koken,0,10_1,1,10,1,to cook,1.000,1,,,10_1_pr_0_p0_koken_combinatie_final.avi
1,1,2,practice,0,knippen,combinatie,kapper,0,10_1,1,10,1,to cut,0.366,0,,,10_1_pr_1_p0_knippen_combinatie_final.avi
2,2,3,target,0,geur,combinatie,ruiken,0,10_1,1,10,1,odor,0.867,0,0.665168,olfactory,10_1_trial_2_p0_geur_combinatie_final.avi
3,3,4,target,0,ademen,combinatie,ademen,0,10_1,1,10,1,to breathe,1.000,1,0.808141,olfactory,10_1_trial_3_p0_ademen_combinatie_final.avi
4,4,5,target,0,eten,combinatie,eten,0,10_1,1,10,1,to eat,1.000,1,0.799176,vocal_oral,10_1_trial_4_p0_eten_combinatie_final.avi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11517,11517,111,target,1,dik,gebaren,dik,0,9_2,2,9,142,thick,1.000,1,0.756105,quantity,9_2_trial_110_p1_dik_gebaren_c0_final.avi
11518,11518,112,target,1,kauwen,gebaren,bijten,0,9_2,2,9,142,to chew,0.712,0,0.732601,gustatory,9_2_trial_111_p1_kauwen_gebaren_c0_final.avi
11519,11519,113,target,1,kauwen,gebaren,eten,1,9_2,2,9,142,to chew,0.497,0,0.732601,gustatory,9_2_trial_112_p1_kauwen_gebaren_c1_final.avi
11520,11520,114,target,1,kauwen,gebaren,tanden,2,9_2,2,9,142,to chew,0.355,0,0.732601,gustatory,9_2_trial_113_p1_kauwen_gebaren_c2_final.avi
