# Calculate word and phrasal level characteristics for reporting in paper

In [1]:
import pandas as pd

In [2]:
stim = pd.read_excel('3_phrases_2023-07-11.xlsx', index_col=0)
stim = stim[stim.denotation == 'baseline']

Look at word-level charateristics

In [3]:
columns_to_summarize = ['Length', 'Zipf', 'AoA','CNC_M','RT','IMAG','Valence']
aggregations = {col: ['mean', 'std'] for col in columns_to_summarize}

summary = stim.groupby('concreteness').agg(aggregations)
summary.columns = ['_'.join(col) for col in summary.columns]  # Flatten MultiIndex columns
summary = summary.reset_index()

print(summary)


  concreteness  Length_mean  Length_std  Zipf_mean  Zipf_std  AoA_mean  \
0     abstract         6.06    1.237626   4.136214  0.674839      5.49   
1     concrete         5.80    1.318095   4.107117  0.676063      4.60   

    AoA_std  CNC_M_mean  CNC_M_std     RT_mean     RT_std  IMAG_mean  \
0  2.847984      2.0871   0.269295  569.271818  46.577238    3.28001   
1  2.817693      4.5536   0.287717  570.020920  51.236124    5.95642   

   IMAG_std  Valence_mean  Valence_std  
0  0.590922      5.075163     2.026802  
1  0.661226      5.001574     1.787206  


Look at phrasal level statistics

In [6]:
stim = pd.read_excel('3_phrases_2023-07-11.xlsx', index_col=0)
stim = stim[stim.denotation != 'baseline']
stim

Unnamed: 0_level_0,noun,set_nr,concreteness,Zipf,Length,Syllables,Phonemes,AoA,CNC_M,Valence,RT,CNC_SD,IMAG,denotation,adjective,phrase,frequency_dep,frequency_seq,frequency_dep_log,frequency_seq_log
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,sulphur,1,concrete,3.351834,7,2,5,6,4.23,4.7647,668.250000,1.25,4.727,subsective,old,old sulphur,5.791099e-10,1.709172e-09,-9.237239,-8.767214
1,sulphur,1,concrete,3.351834,7,2,5,6,4.23,4.7647,668.250000,1.25,4.727,privative,imaginary,imaginary sulphur,0.000000e+00,0.000000e+00,-inf,-inf
3,longing,1,abstract,3.408385,7,2,5,8,2.14,4.3030,679.666667,1.13,3.235,subsective,special,special longing,6.415080e-10,5.920740e-10,-9.192798,-9.227624
4,longing,1,abstract,3.408385,7,2,5,8,2.14,4.3030,679.666667,1.13,3.235,privative,past,past longing,2.164082e-10,1.657856e-10,-9.664726,-9.780453
6,poison,2,concrete,3.916628,6,2,5,4,4.27,1.9706,520.650000,1.01,5.324,subsective,special,special poison,1.729530e-09,2.254840e-09,-8.762072,-8.646884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,failure,136,abstract,4.432371,7,2,5,4,2.08,1.7222,520.000000,1.06,3.794,privative,mock,mock failure,0.000000e+00,0.000000e+00,-inf,-inf
816,syringe,137,concrete,3.122247,7,2,5,10,4.81,3.4375,675.194444,0.57,6.400,subsective,genuine,genuine syringe,0.000000e+00,0.000000e+00,-inf,-inf
817,syringe,137,concrete,3.122247,7,2,5,10,4.81,3.4375,675.194444,0.57,6.400,privative,fictional,fictional syringe,0.000000e+00,0.000000e+00,-inf,-inf
819,wrath,137,abstract,3.409226,5,1,3,12,2.42,2.6364,627.324324,1.33,2.875,subsective,old,old wrath,0.000000e+00,0.000000e+00,-inf,-inf


In [11]:
columns_to_summarize = ['frequency_dep', 'frequency_seq', 'frequency_dep_log','frequency_seq_log']
aggregations = {col: ['mean', 'std'] for col in columns_to_summarize}

summary = stim.groupby(['concreteness','denotation']).agg(aggregations)
summary.columns = ['_'.join(col) for col in summary.columns]  # Flatten MultiIndex columns
summary = summary.reset_index()

print(summary)


  concreteness  denotation  frequency_dep_mean  frequency_dep_std  \
0     abstract   privative        2.046068e-10       1.096170e-09   
1     abstract  subsective        8.230955e-08       7.733098e-07   
2     concrete   privative        7.037095e-10       4.503612e-09   
3     concrete  subsective        6.270969e-10       2.346864e-09   

   frequency_seq_mean  frequency_seq_std  frequency_dep_log_mean  \
0        1.777369e-10       9.774835e-10                    -inf   
1        8.691105e-08       8.213438e-07                    -inf   
2        6.557272e-10       4.069248e-09                    -inf   
3        6.889655e-10       2.734303e-09                    -inf   

   frequency_dep_log_std  frequency_seq_log_mean  frequency_seq_log_std  
0                    NaN                    -inf                    NaN  
1                    NaN                    -inf                    NaN  
2                    NaN                    -inf                    NaN  
3                

In [17]:
measures = ['frequency_dep', 'frequency_seq']
final_table = pd.DataFrame(index=measures)

for index, row in summary.iterrows():
    col_name = f"{row['concreteness']}-{row['denotation']}"
    
    for measure in measures:
        mean = row[f'{measure}_mean']
        std = row[f'{measure}_std']
        final_table.loc[measure, col_name] = f"{mean:.2e} ({std:.2e})"

column_order = [
    'concrete-subsective',
    'concrete-privative',
    'abstract-subsective',
    'abstract-privative'
]
final_table = final_table.reindex(columns=column_order)


print("Descriptive Statistics for Stimulus Frequencies")
print(final_table.to_string())
final_table

Descriptive Statistics for Stimulus Frequencies
               concrete-subsective   concrete-privative  abstract-subsective   abstract-privative
frequency_dep  6.27e-10 (2.35e-09)  7.04e-10 (4.50e-09)  8.23e-08 (7.73e-07)  2.05e-10 (1.10e-09)
frequency_seq  6.89e-10 (2.73e-09)  6.56e-10 (4.07e-09)  8.69e-08 (8.21e-07)  1.78e-10 (9.77e-10)


Unnamed: 0,concrete-subsective,concrete-privative,abstract-subsective,abstract-privative
frequency_dep,6.27e-10 (2.35e-09),7.04e-10 (4.50e-09),8.23e-08 (7.73e-07),2.05e-10 (1.10e-09)
frequency_seq,6.89e-10 (2.73e-09),6.56e-10 (4.07e-09),8.69e-08 (8.21e-07),1.78e-10 (9.77e-10)
