## Load/Clean Data

In [22]:
import pandas as pd

In [23]:
import matplotlib.pyplot as plt

In [24]:
from collections import Counter

In [25]:
import re

In [26]:
df = pd.read_csv('Cognitive Science Experiment.csv')


#### Clean up free-text field responses

In [27]:
def clean_langs(string):
    string = ''.join([x for x in string if re.match(r'[\w\s]', x)])
    if 'Dansk' in string:
        return 'Danish'
    if 'Tysk' in string:
        return 'German'
    return string

In [28]:
# make function to split languages into lists where applicable, then we can get counts of native languages

In [29]:
Counter(df['What is/are your native language(s)? (Can be more than one)'].str.title().str.strip().apply(clean_langs))

Counter({'Bulgarien': 1,
         'Danish': 51,
         'Danish And English': 3,
         'Danish And Russian': 1,
         'Danish English': 1,
         'Dutch': 2,
         'English': 11,
         'English American': 1,
         'EnglishDanish': 1,
         'French': 1,
         'German': 3,
         'Greek': 2,
         'Hebrew': 1,
         'Hindi English': 1,
         'Hungarian': 1,
         'Language': 1,
         'Lithuanian': 2,
         'Nepali': 1,
         'Polish': 2,
         'Romanian': 4,
         'Russian': 2,
         'Slovak': 3,
         'Spanish': 6,
         'Spanish And English': 1})

In [30]:
Counter(df['If so, what other language(s) have you studied/learned?'].dropna().str.title().str.strip().apply(clean_langs))

Counter({'Chinese Uni German School And High School French School Spanish Evening Classes And Since IM A Dane IVe Also Learnt Some Swedish And Norwegian IVe Also Studied A Bit Of Thai And Singlish If You Consider Singaporean English With Its Mixture Of Various Asian Languages A Separate Language': 1,
         'Engelsk': 2,
         'Englisg German Italian': 1,
         'English': 14,
         'English  French': 1,
         'English And French': 2,
         'English And German': 6,
         'English And Spanish': 1,
         'English French': 2,
         'English French German Norwegian Latin': 1,
         'English French Icelandic': 1,
         'English French Italian': 2,
         'English French Latin': 1,
         'English French Little Italian': 1,
         'English French Russian': 1,
         'English French Spanish': 1,
         'English French Spanish Latin Chinese Korean Russian': 1,
         'English German': 9,
         'English German And French': 1,
         'English Germa

In [31]:
set(df['What school did you study in? (e.g. Studieskolen)'].str.title().str.strip())

{'Cbs',
 'Clavis',
 'Hellerup Sprogcenter',
 'Hellerup Sprogskole',
 'I Switched Between Helsingør, Hellerup And Lyngby School, And Were In Each Only For A Short Time.',
 'Ia',
 'Ia Sprog',
 'Ia Sprog, Studieskolen',
 'Københavns Sprogcenter',
 'Leardansk',
 'Næstved Sprogskole',
 'Private Teacher, Stydieskolen, Clavis',
 'Public Language Center',
 'Speak School Of Danish',
 'Sprogcenter Aalborg',
 'Sprogcenter Hellerup',
 'Sprogskole',
 'Sprogskolen',
 'Sprogskolen, Hvidovre Gymnasium',
 'Studieskolen',
 'Studieskolen, Kbh Sprogcenter',
 'Studieskolen, Sprogcenter Hellerup',
 'Studieskolen/ Clavis',
 'University And Studieskolen',
 'Vestegnen Sprog Og Kompetence Center (Vsk)',
 'Vuf',
 nan}

In [32]:
set(df['What was your last level learned? (e.g. level 3.1, level A2)'].str.title().str.strip())


{'2.2',
 '2.3',
 '3.3',
 '3.5',
 '3.5 A3 I Think, The One With The Official Exam L',
 '3.6 - Studieprøve, C1',
 '4',
 '5',
 '5.1',
 'A2',
 'B2',
 'B2 (Pd3)',
 'Completed Level 2',
 'Dansk A',
 "Don'T Know, It Was A Basic Intro Course",
 "Don'T Remember, Not High.",
 'Finished The Danish Education (Level 4)',
 'Kan Ikke Huske',
 'Level 5',
 'Modul 5',
 'Modul 6.3 (Level C1)',
 'Module 5 (Passed Pd3)',
 'Pd3',
 'Prøve I Dansk 3',
 'Studieprøven',
 'Studieprøven, Level 6',
 nan}

Make dict of correct answers:

In [33]:
df.columns


Index(['Timestamp', 'Please pick a color',
       'Please read the text above to the best of your ability. Have you read it?',
       'Please read the text above to the best of your ability. Have you read it?.1',
       'Da Michael var 38 år, begyndte han at løbe længere distancer. Hvorfor begyndte han netop på det tidspunkt?',
       'Hvad sætter Michael pris på ved træningen? ',
       'Hvad synes Michaels kone om hans træning? ',
       'Hvor lang tid trænede Michael at svømme crawl før han gennemførte hans første ironman?',
       'Hvor var Michaels første ironman?', 'Hvor gammel er Michael nu?',
       'Hvornår vil Michael stoppe med at løbe?',
       'Hvordan er stemningen i teksten?',
       'Are you a native speaker of Danish?',
       'Have you studied Danish in a language school?',
       'What school did you study in? (e.g. Studieskolen)',
       'What was your last level learned? (e.g. level 3.1, level A2)',
       'Which of these statements best describes your Danish under

In [34]:
q_a = {
    'Da Michael var 38 år, begyndte han at løbe længere distancer. Hvorfor begyndte han netop på det tidspunkt?':
    'Han kunne nemmere holde op med at ryge, når han trænede.',
    'Hvad sætter Michael pris på ved træningen? ': 'At den foregår sammen med hans venner.',
    'Hvad synes Michaels kone om hans træning? ': 'Hun bryder sig ikke om det.',
    'Hvor lang tid trænede Michael at svømme crawl før han gennemførte hans første ironman?': 'I et år.',
    'Hvor var Michaels første ironman?' : 'Spanien',
    'Hvor gammel er Michael nu?' : '48',
    'Hvornår vil Michael stoppe med at løbe?' : 'Han vil aldrig stoppe.'
    
}

In [35]:
df_control = df[df['Please pick a color'] == 'Blå']

In [39]:
df_experiment = df[df['Please pick a color'] == 'Grøn']
#df_experiment.head()

In [40]:
df_danes = df[df['Are you a native speaker of Danish?'] == 'Yes']
df_L2 = df[df['Are you a native speaker of Danish?'] == 'No']

In [61]:
df_control_danes = df[(df['Please pick a color'] == 'Blå') & (df['Are you a native speaker of Danish?'] == 'Yes')]
df_experiment_danes = df[(df['Please pick a color'] == 'Grøn') & (df['Are you a native speaker of Danish?'] == 'Yes')]
df_control_L2 = df[(df['Please pick a color'] == 'Blå') & (df['Are you a native speaker of Danish?'] == 'No')]
df_experiment_L2 = df[(df['Please pick a color'] == 'Grøn') & (df['Are you a native speaker of Danish?'] == 'No')]

In [71]:
results_df = pd.DataFrame(columns=['Question','Control Danes', 'Experimental Danes', 'Control L2', 'Experimental L2'])
i = 0
for q, a in q_a.items():
    print(q, a)
    dk_control_correct = len(df_control_danes[df[q] == a])
    dk_experiment_correct = len(df_experiment_danes[df[q] == a])
    L2_control_correct = len(df_control_L2[df[q] == a])
    L2_experiment_correct = len(df_experiment_L2[df[q] == a])
    #control_incorrect = len(df_control_danes[df[q] != a])
    #experiment_incorrect = len(df_experiment_danes[df[q] != a])
    #print(control_correct, control_incorrect, (control_correct/len(df_control)))
    #print(experiment_correct,experiment_incorrect, experiment_correct/len(df_experiment))
    #print('Control group:\n # correct: %s # incorrect: %s  percent correct: %s' 
    #      %(control_correct, control_incorrect, (control_correct/len(df_control))))
    #print('Experimental group:\n # correct: %s # incorrect: %s  percent correct: %s' 
    #      %(experiment_correct,experiment_incorrect, experiment_correct/len(df_experiment)))
    results_df.loc[i] = [q, round((dk_control_correct/len(df_control_danes)),3),round((dk_experiment_correct/len(df_experiment_danes)),3),round((L2_control_correct/len(df_control_L2)),3),round((L2_experiment_correct/len(df_experiment_L2)),3)]
    i+=1
    
results_df

Hvad synes Michaels kone om hans træning?  Hun bryder sig ikke om det.
Hvad sætter Michael pris på ved træningen?  At den foregår sammen med hans venner.
Hvornår vil Michael stoppe med at løbe? Han vil aldrig stoppe.
Hvor var Michaels første ironman? Spanien
Hvor lang tid trænede Michael at svømme crawl før han gennemførte hans første ironman? I et år.
Da Michael var 38 år, begyndte han at løbe længere distancer. Hvorfor begyndte han netop på det tidspunkt? Han kunne nemmere holde op med at ryge, når han trænede.
Hvor gammel er Michael nu? 48


  """
  
  import sys
  


Unnamed: 0,Question,Control Danes,Experimental Danes,Control L2,Experimental L2
0,Hvad synes Michaels kone om hans træning?,0.36,0.424,0.522,0.364
1,Hvad sætter Michael pris på ved træningen?,0.76,0.667,0.609,0.773
2,Hvornår vil Michael stoppe med at løbe?,0.56,0.545,0.696,0.682
3,Hvor var Michaels første ironman?,0.92,0.939,0.913,0.864
4,Hvor lang tid trænede Michael at svømme crawl ...,0.72,0.727,0.696,0.773
5,"Da Michael var 38 år, begyndte han at løbe læn...",0.96,0.97,0.783,0.591
6,Hvor gammel er Michael nu?,0.96,0.909,1.0,0.909


In [65]:
len(df_control_danes)

25

In [44]:
results_df = pd.DataFrame(columns=['Question','Control', 'Experimental'])
i = 0
for q, a in q_a.items():
    print(q, a)
    control_correct = len(df_control[df[q] == a])
    experiment_correct = len(df_experiment[df[q] == a])
    control_incorrect = len(df_control[df[q] != a])
    experiment_incorrect = len(df_experiment[df[q] != a])
    #print(control_correct, control_incorrect, (control_correct/len(df_control)))
    #print(experiment_correct,experiment_incorrect, experiment_correct/len(df_experiment))
    print('Control group:\n # correct: %s # incorrect: %s  percent correct: %s' 
          %(control_correct, control_incorrect, (control_correct/len(df_control))))
    print('Experimental group:\n # correct: %s # incorrect: %s  percent correct: %s' 
          %(experiment_correct,experiment_incorrect, experiment_correct/len(df_experiment)))
    results_df.loc[i] = [q, (control_correct/len(df_control)),(experiment_correct/len(df_experiment))]
    i+=1
    
results_df

Hvad synes Michaels kone om hans træning?  Hun bryder sig ikke om det.
Control group:
 # correct: 21 # incorrect: 27  percent correct: 0.4375
Experimental group:
 # correct: 22 # incorrect: 33  percent correct: 0.4
Hvad sætter Michael pris på ved træningen?  At den foregår sammen med hans venner.
Control group:
 # correct: 33 # incorrect: 15  percent correct: 0.6875
Experimental group:
 # correct: 39 # incorrect: 16  percent correct: 0.7090909090909091
Hvornår vil Michael stoppe med at løbe? Han vil aldrig stoppe.
Control group:
 # correct: 30 # incorrect: 18  percent correct: 0.625
Experimental group:
 # correct: 33 # incorrect: 22  percent correct: 0.6
Hvor var Michaels første ironman? Spanien
Control group:
 # correct: 44 # incorrect: 4  percent correct: 0.9166666666666666
Experimental group:
 # correct: 50 # incorrect: 5  percent correct: 0.9090909090909091
Hvor lang tid trænede Michael at svømme crawl før han gennemførte hans første ironman? I et år.
Control group:
 # correct: 34 

  """
  
  import sys
  


Unnamed: 0,Question,Control,Experimental
0,Hvad synes Michaels kone om hans træning?,0.4375,0.4
1,Hvad sætter Michael pris på ved træningen?,0.6875,0.709091
2,Hvornår vil Michael stoppe med at løbe?,0.625,0.6
3,Hvor var Michaels første ironman?,0.916667,0.909091
4,Hvor lang tid trænede Michael at svømme crawl ...,0.708333,0.745455
5,"Da Michael var 38 år, begyndte han at løbe læn...",0.875,0.818182
6,Hvor gammel er Michael nu?,0.979167,0.909091


In [45]:
results_df = pd.DataFrame(columns=['Question','Danes', 'Immigrants'])
i = 0
for q, a in q_a.items():
    print(q, a)
    control_correct = len(df_danes[df[q] == a])
    experiment_correct = len(df_L2[df[q] == a])
    control_incorrect = len(df_danes[df[q] != a])
    experiment_incorrect = len(df_L2[df[q] != a])
    #print(control_correct, control_incorrect, (control_correct/len(df_control)))
    #print(experiment_correct,experiment_incorrect, experiment_correct/len(df_experiment))
    print('Control group:\n # correct: %s # incorrect: %s  percent correct: %s' 
          %(control_correct, control_incorrect, (control_correct/len(df_control))))
    print('Experimental group:\n # correct: %s # incorrect: %s  percent correct: %s' 
          %(experiment_correct,experiment_incorrect, experiment_correct/len(df_experiment)))
    results_df.loc[i] = [q, (control_correct/len(df_danes)),(experiment_correct/len(df_L2))]
    i+=1
    
results_df

Hvad synes Michaels kone om hans træning?  Hun bryder sig ikke om det.
Control group:
 # correct: 23 # incorrect: 35  percent correct: 0.4791666666666667
Experimental group:
 # correct: 20 # incorrect: 25  percent correct: 0.36363636363636365
Hvad sætter Michael pris på ved træningen?  At den foregår sammen med hans venner.
Control group:
 # correct: 41 # incorrect: 17  percent correct: 0.8541666666666666
Experimental group:
 # correct: 31 # incorrect: 14  percent correct: 0.5636363636363636
Hvornår vil Michael stoppe med at løbe? Han vil aldrig stoppe.
Control group:
 # correct: 32 # incorrect: 26  percent correct: 0.6666666666666666
Experimental group:
 # correct: 31 # incorrect: 14  percent correct: 0.5636363636363636
Hvor var Michaels første ironman? Spanien
Control group:
 # correct: 54 # incorrect: 4  percent correct: 1.125
Experimental group:
 # correct: 40 # incorrect: 5  percent correct: 0.7272727272727273
Hvor lang tid trænede Michael at svømme crawl før han gennemførte hans 

  """
  
  import sys
  


Unnamed: 0,Question,Danes,Immigrants
0,Hvad synes Michaels kone om hans træning?,0.396552,0.444444
1,Hvad sætter Michael pris på ved træningen?,0.706897,0.688889
2,Hvornår vil Michael stoppe med at løbe?,0.551724,0.688889
3,Hvor var Michaels første ironman?,0.931034,0.888889
4,Hvor lang tid trænede Michael at svømme crawl ...,0.724138,0.733333
5,"Da Michael var 38 år, begyndte han at løbe læn...",0.965517,0.688889
6,Hvor gammel er Michael nu?,0.931034,0.955556


In [None]:
with open('results.tex','w') as rf:
    rf.write(results_df.to_latex(index=False))