# imports & loads

In [416]:
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme(style='whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [417]:
path = Path.cwd().parent / 'data'

# idiom analysis

In [421]:
literality_threshold = 4.39

with open('idioms_map.json', 'r', encoding='utf-8') as f:
    idioms_df = pd.DataFrame.from_dict(json.load(f), orient='index')[['idiom', 'ipm', 'ipm_bri', 'frequency', 'familiarity', 'literality']]

idioms_df['idiom_id'] = idioms_df.index.astype(int)
idioms_df['is_low'] =idioms_df['literality'].apply(lambda x: 1 if x < 4.39 else 0) 
idioms_df.head(3)

Unnamed: 0,idiom,ipm,ipm_bri,frequency,familiarity,literality,idiom_id,is_low
1,протянуть ноги,0.43,0.41,4.719626,6.130841,5.579439,1,0
2,поставить на ноги,1.31,0.34,5.543103,6.37931,6.0,2,0
3,бить себя в грудь,0.89,0.78,4.922414,6.103448,5.560345,3,0


In [422]:
grouped = list(idioms_df.groupby('is_low'))
print(grouped[0][0], list(grouped[0][1].idiom))
print(grouped[1][0], list(grouped[1][1].idiom))

0 ['протянуть ноги', 'поставить на ноги', 'бить себя в грудь', 'сдувать пылинки', 'утирать сопли', 'считать ворон', 'перекрыть кислород', 'поливать грязью']
1 ['от сердца оторвать', 'включить дурака', 'снять голову', 'поставить на уши', 'торговать воздухом', 'прыгнуть выше головы', 'сложить голову', 'набивать цену']


In [423]:
idioms_df.is_low.value_counts()

0    8
1    8
Name: is_low, dtype: int64

In [424]:
idioms_df[['ipm', 'ipm_bri', 'frequency', 'familiarity', 'literality']].describe()

Unnamed: 0,ipm,ipm_bri,frequency,familiarity,literality
count,16.0,16.0,16.0,16.0,16.0
mean,0.250625,0.1775,4.932397,6.168336,4.374309
std,0.366123,0.221133,0.863965,0.473936,1.357517
min,0.01,0.01,3.230769,4.747664,2.482759
25%,0.0375,0.02,4.871717,6.096983,3.241379
50%,0.085,0.07,5.176724,6.353448,4.366674
75%,0.3325,0.3425,5.456289,6.413793,5.565118
max,1.31,0.78,5.931034,6.675214,6.153846


In [426]:
idioms_df[['ipm', 'ipm_bri', 'frequency', 'familiarity', 'literality']].corr()

Unnamed: 0,ipm,ipm_bri,frequency,familiarity,literality
ipm,1.0,0.768446,-0.013746,-0.129481,0.394085
ipm_bri,0.768446,1.0,-0.299372,-0.401469,0.253137
frequency,-0.013746,-0.299372,1.0,0.858853,0.174935
familiarity,-0.129481,-0.401469,0.858853,1.0,0.206452
literality,0.394085,0.253137,0.174935,0.206452,1.0


# preprocessing

In [427]:
dfs = []

for file in path.iterdir():
    if file.is_file() and file.name.endswith('.xlsx'):
        sheets_dict = pd.read_excel(path / file.name, sheet_name=None)

        sentence_df = pd.concat([df.assign(trial_order=i) for i, (sheet_name, df) in enumerate(sheets_dict.items()) if 'sentence' in sheet_name and 'test' not in sheet_name], axis=0, ignore_index=True)
        question_df = pd.concat([df for sheet_name, df in sheets_dict.items() if 'question' in sheet_name and 'test' not in sheet_name], axis=0, ignore_index=True)
        final_test_df = sheets_dict['trials_3']
        final_test_df = final_test_df[final_test_df.idiom_question_id.astype(str).str.match(r'^\d')]
        # leave only known idioms
        final_test_df[final_test_df['key_resp_9.keys_raw'] == 'y']
        
        sentence_df = sentence_df[sentence_df['sent_id'].astype(str).str.match(r'^\d')][['sent_id', 'sent_text', 'segment_id', 'key_resp.rt_raw', 'type', 'idiom_id', 'group']]
        sentence_df['trial_order'] = pd.factorize(sentence_df['sent_id'])[0] + 1
        # leave only answers for known idioms
        sentence_df = sentence_df[sentence_df['idiom_id'].isin(final_test_df['idiom_question_id'])]
        question_df = question_df[question_df['sent_id'].astype(str).str.match(r'^\d')][['sent_id', 'answer', 'key_resp_4.keys_raw']]

        final_df = pd.merge(sentence_df, question_df, on='sent_id')

        # leave results with correct answers only
        final_df = final_df[final_df['answer'] == final_df['key_resp_4.keys_raw']]
        # downcast to integer
        int_cols = ['segment_id', 'type', 'idiom_id', 'group']
        final_df[int_cols] = final_df[int_cols].astype(int)
        # add segment len
        count_letters = lambda x: len(re.findall(r'[А-Яа-яЁё]', x.sent_text.split(' / ')[x.segment_id - 1]))
        final_df['rt_raw_ms'] = final_df['key_resp.rt_raw'].apply(lambda x: x * 1000)
        final_df['unit_rt_ms'] = final_df.apply(lambda x: x['rt_raw_ms'] / count_letters(x), axis=1)
        final_df['unit_rt_log'] = np.log(final_df.unit_rt_ms)
        # remove unnecessary cols
        final_df = final_df.drop(columns=['key_resp_4.keys_raw', 'answer', 'sent_text', 'key_resp.rt_raw'])
        # leave only the necessary segments
        final_df = final_df[final_df.segment_id.isin((3, 4, 5))].reset_index(drop=True)
        # get participant num
        final_df['participant'] = file.name[:4]

        # add info about idioms - 0.5 for lit, 0.5 for fig
        final_df['biasing_context'] = final_df['type'].apply(lambda x: 0.5 if x in (1, 2) else -0.5)
        final_df['resolution_type'] = final_df['type'].apply(lambda x: 0.5 if x in (1, 3) else -0.5)
                
        dfs.append(final_df)

df = pd.concat(dfs, axis=0)
df = pd.merge(df, idioms_df, on='idiom_id')
df.trial_order = df.groupby('participant')['trial_order'].rank(method='dense').astype(int)

print(df.participant.nunique())
print(df.shape)
df.head()

32
(2814, 19)


Unnamed: 0,sent_id,segment_id,type,idiom_id,group,trial_order,rt_raw_ms,unit_rt_ms,unit_rt_log,participant,biasing_context,resolution_type,idiom,ipm,ipm_bri,frequency,familiarity,literality,is_low
0,42,3,3,11,2,1,1353.8656,123.078691,4.812824,12,-0.5,0.5,снять голову,0.31,0.35,3.299065,4.747664,3.392523,1
1,42,4,3,11,2,1,2672.395,127.256905,4.846208,12,-0.5,0.5,снять голову,0.31,0.35,3.299065,4.747664,3.392523,1
2,42,5,3,11,2,1,8165.3534,480.314906,6.174442,12,-0.5,0.5,снять голову,0.31,0.35,3.299065,4.747664,3.392523,1
3,41,3,2,11,2,4,1008.5834,91.6894,4.518407,12,0.5,-0.5,снять голову,0.31,0.35,3.299065,4.747664,3.392523,1
4,41,4,2,11,2,4,1361.7218,85.107612,4.443916,12,0.5,-0.5,снять голову,0.31,0.35,3.299065,4.747664,3.392523,1


# general preprocessing: outlier removal

In [428]:
new_df = df[(df.rt_raw_ms < 3000) & (df.rt_raw_ms > 100)]
print(f'outliers removed: {df.shape[0] - new_df.shape[0]} ({round((df.shape[0] - new_df.shape[0]) / df.shape[0] * 100, 2)}%)')

outliers removed: 65 (2.31%)


# segmentation and shared preprocessing (scaling)

In [429]:
dfs = {segment_id: group.drop(columns=['segment_id']) for segment_id, group in df.groupby('segment_id')}

# scale each df beforehand
cols_to_scale = ['ipm', 'ipm_bri', 'frequency', 'familiarity', 'literality']

for segment_id, group in dfs.items():
    scaler = StandardScaler()
    group[cols_to_scale] = scaler.fit_transform(group[cols_to_scale])
    dfs[segment_id] = group

In [430]:
three_df = dfs[3]
four_df = dfs[4]
five_df = dfs[5]

path = 'data/'
three_df.to_csv(path + 'three_df.csv')
four_df.to_csv(path + 'four_df.csv')
five_df.to_csv(path + 'five_df.csv')

In [431]:
def find_outliers(data: pd.Series) -> pd.Series:
    """
    Identifies outliers in a pandas Series using the IQR method.
    
    Args:
        data (pd.Series): Input data series
        
    Returns:
        pd.Series: Boolean mask indicating outliers (True for outliers)
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (data < lower_bound) | (data > upper_bound)


def find_segment_outliers(segment_df: pd.DataFrame) -> pd.DataFrame:
    """
    Identifies and removes outliers from log reaction time data in a segment DataFrame.
    
    Parameters:
        segment_df (pd.DataFrame): DataFrame containing 'unit_rt_log' column
        
    Returns:
        pd.DataFrame: DataFrame with outliers removed
    """
    outlier_mask = find_outliers(segment_df['unit_rt_log'])
    n_outliers = outlier_mask.sum()
    total_points = len(segment_df)
    pct_outliers = (n_outliers / total_points) * 100 if total_points > 0 else 0
    
    print(f'found {n_outliers} outliers ({pct_outliers:.2f}%)')
    print("Outlier values:", segment_df[outlier_mask]['unit_rt_log'].values)
    
    return segment_df[~outlier_mask]

## idiom

In [433]:
print(three_df.shape)
three_df.head(3)

(938, 18)


Unnamed: 0,sent_id,type,idiom_id,group,trial_order,rt_raw_ms,unit_rt_ms,unit_rt_log,participant,biasing_context,resolution_type,idiom,ipm,ipm_bri,frequency,familiarity,literality,is_low
0,42,3,11,2,1,1353.8656,123.078691,4.812824,12,-0.5,0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
3,41,2,11,2,4,1008.5834,91.6894,4.518407,12,0.5,-0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
6,40,1,11,1,12,1036.4203,94.220027,4.545633,25,0.5,0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1


In [434]:
three_df = find_segment_outliers(three_df)

found 3 outliers (0.32%)
Outlier values: [6.43808365 6.43808365 6.43808365]


## idiom + 1

In [435]:
print(four_df.shape)
four_df.head(3)

(938, 18)


Unnamed: 0,sent_id,type,idiom_id,group,trial_order,rt_raw_ms,unit_rt_ms,unit_rt_log,participant,biasing_context,resolution_type,idiom,ipm,ipm_bri,frequency,familiarity,literality,is_low
1,42,3,11,2,1,2672.395,127.256905,4.846208,12,-0.5,0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
4,41,2,11,2,4,1361.7218,85.107612,4.443916,12,0.5,-0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
7,40,1,11,1,12,1649.1518,78.531038,4.363494,25,0.5,0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1


In [436]:
four_df = find_segment_outliers(four_df)

found 22 outliers (2.35%)
Outlier values: [5.52420834 5.93962512 5.62568694 5.52420834 5.93962512 5.93962512
 5.62568694 5.52420834 5.51348959 5.49527754 6.58155651 5.51348959
 5.49527754 5.51348959 5.49527754 6.58155651 1.71519155 1.71519155
 1.71519155 5.55894815 5.55894815 5.55894815]


## idiom + 2 (end)

In [437]:
print(five_df.shape)
five_df.head()

(938, 18)


Unnamed: 0,sent_id,type,idiom_id,group,trial_order,rt_raw_ms,unit_rt_ms,unit_rt_log,participant,biasing_context,resolution_type,idiom,ipm,ipm_bri,frequency,familiarity,literality,is_low
2,42,3,11,2,1,8165.3534,480.314906,6.174442,12,-0.5,0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
5,41,2,11,2,4,1834.3675,107.903971,4.681242,12,0.5,-0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
8,40,1,11,1,12,1612.8751,94.875006,4.55256,25,0.5,0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
11,43,4,11,1,26,685.0052,40.294424,3.696213,25,-0.5,-0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1
14,40,1,11,1,10,1779.1472,104.655718,4.650676,28,0.5,0.5,снять голову,0.191618,0.784527,-1.947058,-3.095002,-0.700889,1


In [438]:
five_df = find_segment_outliers(five_df)

found 10 outliers (1.07%)
Outlier values: [6.17444194 6.17444194 6.17444194 1.67388495 1.67388495 5.70446282
 5.70446282 5.70446282 5.69340434 5.69340434]


In [441]:
five_df[['ipm', 'ipm_bri', 'frequency', 'familiarity', 'literality']].corr()

Unnamed: 0,ipm,ipm_bri,frequency,familiarity,literality
ipm,1.0,0.78388,-0.047465,-0.161457,0.410696
ipm_bri,0.78388,1.0,-0.315791,-0.415445,0.285142
frequency,-0.047465,-0.315791,1.0,0.860834,0.158334
familiarity,-0.161457,-0.415445,0.860834,1.0,0.190013
literality,0.410696,0.285142,0.158334,0.190013,1.0


# save

In [439]:
three_df.to_csv('data/three_df.csv', index=False)
four_df.to_csv('data/four_df.csv', index=False)
five_df.to_csv('data/five_df.csv', index=False)