In [None]:
################################################################

# AUDIO & AV SURPRISAL PROJECT

# This script edits design matrix of the replication study (see Zhang et al., 2021, EXP2) 
# Pranava generated surprisal based on n-gram, bert and GPT2 models, which is added to the original design matrix

################################################################

In [39]:
# importing modules and setting path
import pandas as pd

pd.set_option('display.width', 400)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 500)

path = '/Users/yezhang/Library/CloudStorage/OneDrive-UniversityCollegeLondon/surprisal_audio/stimuli'

In [40]:
# Reading and cleaning previous design matrix
data = pd.read_csv(path + '/word_merged_replication.csv')

data.drop(columns = ['maxf0', 'minf0', 'meanf0',
       'mean_intensity', 'Pragmatic', 'Metaphoric', 'meaningfulGesture',
       'beatGesture', 'gestureCorres', 'Duration', 'syllableNum', 'Speed',
       'AveragedDist', 'mouthInfoRedo', 'Frequency_ENCOW'], inplace = True)

data.columns = ['passage_id', 'delay', 'onset', 'is_gesture', 'word',
               'bin_id', 'pos_prev', 'lemma', 'word_length', 'surprisal_prev','word_seq']
data.dropna(subset=['word'], inplace = True)

In [41]:
# Reading and cleaning surprisal values

## Combining surprisal generated from all models
model_list = ['2gram', '3gram','4gram','5gram','6gram', 'bert', 'gpt2']
surprisal_slice_list = []
for model in model_list:
    surprisal_slice = pd.read_csv(path +'/word_quantifications_replication/'+model+'_sentenceInfo_log_probs.csv',
                                 usecols = ['Word', 'Log Probability'])
    surprisal_slice.columns = ['word', 'surprisal_' + model]
    surprisal_slice_list.append(surprisal_slice)
surprisal = pd.concat(surprisal_slice_list,axis=1) 
surprisal = surprisal.loc[:,~surprisal.columns.duplicated()].copy() # dropping duplicate word columns

## Surprisal that Pranava generated are duplicated twice, because the same sentence was produced twice,
## first with gesture and then without. The surprisal is identical across the sets, so only including one
sep_point = 4059 # 4059 is the last word in the first repitation
surprisal = surprisal.iloc[0:sep_point,]
surprisal.dropna(subset=['word'], inplace = True)

In [42]:
# Preparing to combine design matrix with surprisal

surprisal['word'] = surprisal['word'].str.lower().str.strip(' ')
data['word'] = data['word'].str.lower().str.strip(' ')

data_G = data[data['is_gesture'] == 'TRUE']
data_N = data[data['is_gesture'] == 'FALSE']

In [43]:
# Combining surprisal with design matrix. 
# Note that we have to use iteration rather than merge, because there are duplicate words with different surprisal
def surprisal_mapping(surprisal_matrix, design_matrix):

    design_matrix_output = design_matrix
    design_matrix_output.loc[:,'word_mapped'] = 'NaN'
    design_matrix_output.loc[:,'is_seg'] = 'NaN'
    for model in model_list:
        design_matrix_output['surprisal_'+model] = 'NaN'
    
    surprisal_matrix.reset_index(drop=True, inplace=True)
    design_matrix.reset_index(drop=True, inplace=True)
    des_index = 0
    for index, row in surprisal_matrix.iterrows():
        design_word = design_matrix.iloc[des_index, 4]
        surprisal_word = row['word']
        try:
            if surprisal_word == design_word: # Full match
                design_matrix_output.iloc[des_index, -9] = surprisal_word
                design_matrix_output.iloc[des_index, -8] = 'False'
                design_matrix_output.iloc[des_index, -7:] = row[1:]
                des_index = des_index+1
            elif surprisal_word in design_word: # surprisal word is segmented but the design_word is full: repeat for the next surprisal word
                design_matrix_output.iloc[des_index, -9] = surprisal_word
                design_matrix_output.iloc[des_index, -8] = 'True'
                design_matrix_output.iloc[des_index, -7:] = row[1:]
                des_index = des_index+1
            elif design_word in surprisal_word: # design_word is segmented but the surprisal_word is full: skip to the next design word
                design_matrix_output.iloc[des_index, -9] = surprisal_word
                design_matrix_output.iloc[des_index, -8] = 'True'
                design_matrix_output.iloc[des_index, -7:] = row[1:]
                des_index = des_index + 2
            else: # design_word or surprisal_word contains more item
                design_word_next = design_matrix.iloc[des_index+1, 4]
                if surprisal_word == design_word_next: # design_word contains more item. PROBLEM: only searching for the next line, error if 2 addition words are inserted in design_word
                    des_index = des_index+1
                    design_matrix_output.iloc[des_index, -9] = surprisal_word
                    design_matrix_output.iloc[des_index, -8] = 'False'
                    design_matrix_output.iloc[des_index, -7:] = row[1:]
                    des_index = des_index+1
                else: # surprisal_word contains more item
                    design_matrix_output.iloc[des_index, -9] = 'NaN'
                    design_matrix_output.iloc[des_index, -8] = 'True'
                    design_matrix_output.iloc[des_index, -7:] = 'NaN'
                    des_index = des_index
        except:
            print(surprisal_word)
    
    return design_matrix_output

In [44]:
data_G_surprisal = surprisal_mapping(surprisal, data_G)
data_N_surprisal = surprisal_mapping(surprisal, data_N)
data_surprisal = pd.concat([data_G_surprisal, data_N_surprisal], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  design_matrix_output.loc[:,'word_mapped'] = 'NaN'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  design_matrix_output.loc[:,'is_seg'] = 'NaN'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  design_matrix_output['surprisal_'+model] = 'NaN'
A value is trying to be set on a copy of a slice from a DataF

In [45]:
data_surprisal.to_csv(path + '/word_merged_replication_surprisal.csv')