In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from pathlib import Path
import pickle
import warnings
warnings.simplefilter("ignore")

In [2]:
path = Path('../input/linking-writing-processes-to-writing-quality')

### Production Rate

In [3]:
train_logs = pd.read_csv(path/'train_logs.csv')
test_logs = pd.read_csv(path/'test_logs.csv')

In [4]:
DEBUG = False
if DEBUG:
    train_logs = train_logs[train_logs.id.isin(train_logs.id.unique()[:5])]
    print('Debug mode enabled.')
else:
    print('Debug mode disabled. The whole dataset will be utilized ({train_logs.id.nunique()} essays)."')

Debug mode disabled. The whole dataset will be utilized ({train_logs.id.nunique()} essays)."


In [5]:
train_logs.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


### Production Rate

In [6]:
# chars_product = input_char - remove_cut_chars - first_strings_replace_chars + second_strings_replace_chars + paste_chars
id_groups = train_logs.groupby('id')

train_logs['num_events'] = id_groups['event_id'].transform('last')

train_logs['input_chars_aux'] = id_groups['activity'].transform(lambda x: (x == 'Input').sum())
    
train_logs['remove_cut_chars_aux'] = id_groups['activity'].transform(
    lambda x: (train_logs.loc[x.index, 'text_change'][x == 'Remove/Cut']).str.len().sum()
    )

train_logs['paste_chars_aux'] = id_groups['activity'].transform(
    lambda x: (train_logs.loc[x.index, 'text_change'][x == 'Paste']).str.len().sum()
    )

train_logs['total_time_mins'] = np.round(id_groups['up_time'].transform('last') / 60000, 1)

train_logs['total_action_time_mins'] = np.round(id_groups['action_time'].transform('sum') / 60000, 1)

train_logs['mean_action_time_ms'] = np.round(id_groups['action_time'].transform('mean'), 1)

for _, group in id_groups:
    first_strings_replace_chars = 0
    second_strings_replace_chars = 0
    
    for replace_str in group[group.activity == "Replace"].text_change.values:
        arrow_idx = replace_str.find(' => ')
        first_strings_replace_chars += arrow_idx
        len_second_str = len(replace_str) - arrow_idx - len(' => ')
        second_strings_replace_chars += len_second_str
    
    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'first_strings_replace_chars_aux'] = first_strings_replace_chars
    train_logs.loc[group.index, 'second_strings_replace_chars_aux'] = second_strings_replace_chars
    
# chars_product = input_chars - remove_cut_chars - first_strings_replace_chars + second_strings_replace_chars + paste_chars
train_logs['chars_product'] = train_logs.input_chars_aux - train_logs.remove_cut_chars_aux - \
        train_logs.first_strings_replace_chars_aux + train_logs.second_strings_replace_chars_aux + train_logs.paste_chars_aux

train_logs['chars_per_min_product'] = np.round(train_logs.chars_product / train_logs.total_time_mins, 1)

# chars_process
train_logs['chars_process'] = train_logs.input_chars_aux + train_logs.second_strings_replace_chars_aux +\
                              train_logs.paste_chars_aux
train_logs['chars_per_min_process'] = np.round(train_logs.chars_process / train_logs.total_time_mins, 1)

In [7]:
# words_per_min_product
train_logs['last_word_count_aux'] = id_groups['word_count'].transform(lambda x: x.values[-1])
train_logs['words_per_min_product'] = np.round(train_logs.last_word_count_aux / train_logs.total_time_mins, 1)

In [8]:
# words_per_min_process, sentences_per_min_process, sentences_per_min_product, paragraphs_per_min_process
for _, group in id_groups:
    diff_word_count = group['word_count'].shift(1) - group['word_count']
    num_deleted_words = np.maximum(0, diff_word_count).sum()
    
    input_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                                (group.activity=='Input')].event_id.count()
    
    remove_cut_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                                (group.activity=='Remove/Cut')].event_id.count()
    
    input_paragraphs = group[(group.activity=='Input') & (group.text_change.str.contains('\n'))].event_id.count()
        
    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'num_deleted_words_aux'] = num_deleted_words
    train_logs.loc[group.index, 'input_sentences_aux'] = input_sentences
    train_logs.loc[group.index, 'remove_cut_sentences_aux'] = remove_cut_sentences
    train_logs.loc[group.index, 'input_paragraphs_aux'] = input_paragraphs
    
sentences_process = input_sentences
sentences_product = input_sentences - remove_cut_sentences
paragraphs_process = input_paragraphs

train_logs = (
    train_logs.assign(
    total_words_aux=lambda x: x['last_word_count_aux'] + x['num_deleted_words_aux'],
    words_per_min_process=lambda x: x['total_words_aux'] / x['total_time_mins'],
    sentences_per_min_process=lambda x: x['input_sentences_aux'] / x['total_time_mins'],
    sentences_per_min_product=lambda x: x['input_sentences_aux'] -\
                                                  x['remove_cut_sentences_aux'] / x['total_time_mins'],
    paragraphs_per_min_process=lambda x: x['input_paragraphs_aux'] / x['total_time_mins']
    )
    .round(1)
)

### Pause

In [9]:
# num_pauses, pauses_per_min, pause_time_proportion_perc, mean_pause_length
for _, group in id_groups:
    iki = group['down_time'] - group['up_time'].shift(1)
    filtered_iki = [num for num in iki if num >= 2000]
    num_pauses = len(filtered_iki)
    pause_time = sum(filtered_iki)

    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'num_pauses'] = num_pauses
    train_logs.loc[group.index, 'pause_time_aux'] = pause_time  
    
train_logs = (
    train_logs.assign(
    pauses_per_min=np.round(train_logs.num_pauses / train_logs.total_time_mins, 1),
    pause_time_proportion_perc=np.round((100 * train_logs.pause_time_aux) / (60000 * train_logs.total_time_mins), 1),
    mean_pause_length=np.round(train_logs.pause_time_aux / train_logs.num_pauses, 1)
    )
    .round(1)
)

In [10]:
# mean_pause_length_btw_paragraphs
for _, group in id_groups:
    enter_input_rows_idxs = group[(group.down_event == 'Enter') & (group.activity == 'Input')].index

    # Filter consecutive Enter events
    filtered_enter_rows_idx = [idx for idx in enter_input_rows_idxs if idx - 1 not in enter_input_rows_idxs]

    # Look for time when previous paragraph ended and time when following paragraph started
    pause_time_btw_paragraphs, pauses_btw_paragraphs = 0, 0
    for idx in filtered_enter_rows_idx:
        word_count_enter = group.at[idx, 'word_count']
        if word_count_enter > 0:
            slice_i = group[(group.index >= idx - 6) & (group.index <= idx - 1) & 
                                (group.activity=='Input')]
            if not slice_i.empty:
                initial_time = slice_i['up_time'].values[-1]
                
                slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 6) & 
                                (group.word_count == word_count_enter + 1)]
                if not slice_f.empty:
                    final_time = slice_f['down_time'].values[0]
                    pause_time_btw_paragraphs += final_time - initial_time
                    pauses_btw_paragraphs += 1
                 
    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'pause_time_btw_paragraphs_aux'] = pause_time_btw_paragraphs
    if pauses_btw_paragraphs == 0:
        train_logs.loc[group.index, 'pauses_btw_paragraphs_aux'] = np.nan
    else:
        train_logs.loc[group.index, 'pauses_btw_paragraphs_aux'] = pauses_btw_paragraphs

train_logs['mean_pause_length_btw_paragraphs'] = np.round(train_logs.pause_time_btw_paragraphs_aux / \
                                                          train_logs.pauses_btw_paragraphs_aux, 1)

In [11]:
# mean_pause_length_btw_sentences
for _, group in id_groups:

    period_rows_idxs = group[(group.text_change.str.contains('[\.\;\?\!\:]', regex=True)) & 
                                                (group.activity == 'Input')].index

    # Filter consecutive events
    filtered_period_rows_idxs = [idx for idx in period_rows_idxs if idx - 1 not in period_rows_idxs]

    # Look for time when previous sentence ended and time when following sentence started
    pause_time_btw_sentences, pauses_btw_sentences = 0, 0
    for idx in filtered_period_rows_idxs:
        word_count_period = group.at[idx, 'word_count']
        if word_count_period > 0:
            slice_i = group[(group.index >= idx - 6) & (group.index <= idx - 1) & 
                            (group.activity=='Input') & (group.text_change=='q')]
            if not slice_i.empty:
                initial_time = slice_i['up_time'].values[-1]

                slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 6) & 
                                (group.word_count == word_count_period + 1)]
                if not slice_f.empty:
                    final_time = slice_f['down_time'].values[0]
                    pause_time_btw_sentences += final_time - initial_time
                    pauses_btw_sentences += 1

    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'pause_time_btw_sentences_aux'] = pause_time_btw_sentences
    if pauses_btw_sentences == 0:
        train_logs.loc[group.index, 'pauses_btw_sentences_aux'] = np.nan
    else:
        train_logs.loc[group.index, 'pauses_btw_sentences_aux'] = pauses_btw_sentences

train_logs['mean_pause_length_btw_sentences'] = np.round(train_logs.pause_time_btw_sentences_aux / \
                                                         train_logs.pauses_btw_sentences_aux, 1)

In [12]:
# mean_pause_length_btw_words
for _, group in id_groups:

    space_rows_idxs = group[(group.down_event=='Space') & (group.activity=='Input')].index

    # Filter consecutive events
    filtered_space_rows_idxs = [idx for idx in space_rows_idxs if idx - 1 not in space_rows_idxs]

    # Look for time when previous word ended and time when following word started
    pause_time_btw_words, pauses_btw_words = 0, 0
    for idx in filtered_space_rows_idxs:
        word_count_space = group.at[idx, 'word_count']
        if word_count_space > 0:
            slice_i = group[(group.index >= idx - 6) & (group.index <= idx - 1) & 
                            (group['activity'] == 'Input') & (group['text_change'] == 'q')]

            if not slice_i.empty:
                initial_time = slice_i['up_time'].values[-1]

                slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 6) & 
                                (group.word_count == word_count_space + 1)]
                if not slice_f.empty:
                    final_time = slice_f['down_time'].values[0]
                    pause_time_btw_words += final_time - initial_time
                    pauses_btw_words += 1

    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'pause_time_btw_words_aux'] = pause_time_btw_words
    if pauses_btw_words == 0:
        train_logs.loc[group.index, 'pauses_btw_words_aux'] = np.nan
    else:
        train_logs.loc[group.index, 'pauses_btw_words_aux'] = pauses_btw_words

train_logs['mean_pause_length_btw_words'] = np.round(train_logs.pause_time_btw_words_aux / \
                                                     train_logs.pauses_btw_words_aux, 1)

In [13]:
# mean_pause_length_w_in_words
for _, group in id_groups:
    group['word_count_up_aux'] = group.word_count > group.shift().word_count
    word_count_up_idxs = group[(group.word_count_up_aux) & (group.text_change=='q')].index

    # Look for time when word ended
    pause_time_w_in_words, pauses_w_in_words = 0, 0
    for idx in word_count_up_idxs:
        initial_time = group.at[idx, 'up_time']
        slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 10) 
                        & (group.activity=='Input') & (group.text_change.isin(['.', ',', ';', ':', ' ', '!', '?']))] 
        if not slice_f.empty:
            final_time = slice_f['down_time'].values[0]
            pause_time_w_in_words += final_time - initial_time
            pauses_w_in_words += 1

    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'pause_time_w_in_words_aux'] = pause_time_w_in_words
    if pauses_w_in_words == 0:
            train_logs.loc[group.index, 'pauses_w_in_words_aux'] = np.nan
    else:
        train_logs.loc[group.index, 'pauses_w_in_words_aux'] = pauses_w_in_words

train_logs['mean_pause_length_w_in_words'] = np.round(train_logs.pause_time_w_in_words_aux / \
                                                      train_logs.pauses_w_in_words_aux, 1)

### Revision

In [14]:
# deletions, deletions_per_min, mean_length_deletions, deletions_proportion_perc, 
# imm_deletions, distant_deletions, distant_deletion_ratio
for _, group in id_groups:
    deletion_idxs = group[group.activity=='Remove/Cut'].index

    # Filter consecutive rows
    filtered_deletion_idxs = [idx for idx in deletion_idxs if idx - 1 not in deletion_idxs]

    train_logs.loc[group.index, 'deletions'] = len(filtered_deletion_idxs)
    train_logs.loc[group.index, 'deletions_per_min'] = np.round(len(filtered_deletion_idxs) / \
                                                                train_logs.total_time_mins, 1)
    if len(filtered_deletion_idxs) == 0:
            train_logs.loc[group.index, 'mean_length_deletions'] = np.nan
    else:
        train_logs.loc[group.index, 'mean_length_deletions'] = np.round(len(deletion_idxs) / len(filtered_deletion_idxs), 1)

    aux_cursor_descending_row_idxs = group[group.cursor_position < \
                                           group.shift().cursor_position].index
    deletions_time, distant_deletions = 0, 0
    for idx in filtered_deletion_idxs:
        
        initial_time = group.at[idx, 'down_time']
        if (group.at[idx - 1, 'activity'] == 'Nonproduction') & \
        (idx - 1 in aux_cursor_descending_row_idxs):
            distant_deletions += 1
        for i in range(idx + 1, len(group)):
            if (i not in deletion_idxs) & (group.at[i, 'activity'] != 'Nonproduction'):
                final_time = group.at[i - 1, 'up_time']
                deletions_time += final_time - initial_time
                break
                           
    train_logs.loc[group.index, 'deletions_time_aux'] = deletions_time
    train_logs.loc[group.index, 'distant_deletions'] = distant_deletions
                       
train_logs = (
    train_logs.assign(
        deletions_proportion_perc=lambda x: 100 * x.deletions_time_aux / (60000 * x.total_time_mins),
        imm_deletions=lambda x: x.deletions - x.distant_deletions,
        distant_deletion_ratio=lambda x: x.distant_deletions / x.deletions
        )
    .round(1)
)

In [15]:
# product vs. process ratio 
train_logs['product_process_ratio'] = np.round(train_logs.chars_product / train_logs.chars_process, 1)

### Bursts

In [16]:
# p_bursts, p_bursts_per_min, r_bursts, r_bursts_per_min, mean_p_bursts_chars, 
# mean_r_bursts_chars, p_bursts_proportion_perc, r_bursts_proportion_perc
for _, group in id_groups:
    # Extract indices of 'Input' activities
    input_idxs = group[group.activity == 'Input'].index

    # Filter out consecutive events
    filtered_input_idxs = [idx for idx in input_idxs if idx - 1 not in input_idxs]

    # Find stretches of at least 20 consecutive input events
    p_bursts, p_bursts_chars, pb_time, r_bursts, r_bursts_chars, rb_time = 0, 0, 0, 0, 0, 0

    for idx in filtered_input_idxs[:-1]:
        i = 0
        # Check for consecutive 'Input' events and duration conditions
        while (group.at[idx, 'activity'] == 'Input') and \
        (group.at[idx + 1, 'down_time'] - group.at[idx, 'up_time'] < 2000):
            i += 1
            idx += 1
        # Update counters for bursts
        if i >= 20:
            if group.at[idx, 'activity'] == 'Input':
                p_bursts += 1
                p_bursts_chars += i
                pb_time += group.at[idx - 1, 'up_time'] - group.at[idx - i, 'down_time']
            else:
                r_bursts += 1
                r_bursts_chars += i
                rb_time += group.at[idx - 1, 'up_time'] - group.at[idx - i, 'down_time']

    # Update the corresponding columns in train_logs
    columns_to_update = ['p_bursts', 'p_bursts_chars_aux', 'pb_time_aux', 'r_bursts', 'r_bursts_chars_aux', 'rb_time_aux']
    train_logs.loc[group.index, columns_to_update] = p_bursts, p_bursts_chars, pb_time, r_bursts, r_bursts_chars, rb_time
    
train_logs = (
    train_logs.assign(
        p_bursts_per_min=lambda x: x.p_bursts / x.total_time_mins,
        r_bursts_per_min=lambda x: x.r_bursts / x.total_time_mins,
        mean_p_bursts_chars = lambda x: np.where(x.p_bursts != 0, x.p_bursts_chars_aux / x.p_bursts, np.nan),
        mean_r_bursts_chars = lambda x: np.where(x.r_bursts != 0, x.r_bursts_chars_aux / x.r_bursts, np.nan),
        p_bursts_proportion_perc=lambda x: (100 * x.pb_time_aux) / (60000 * x.total_time_mins),
        r_bursts_proportion_perc=lambda x: (100 * x.rb_time_aux) / (60000 * x.total_time_mins)
        )
    .round(1)
)


### Process Variance

In [17]:
# std_chars_interval
train_logs['time_interval_aux'] = 0

for _, group in id_groups:
    time_intervals = pd.cut(group['up_time'], bins=10)
    train_logs.loc[group.index, 'time_interval_aux'] = time_intervals
    
for _, time_group in train_logs.groupby(['id', 'time_interval_aux']):
    
    input_chars_group = time_group[time_group.activity=='Input'].event_id.count()
    
    second_str_replace_chars_group = 0
    for replace_str in time_group[time_group.activity=="Replace"].text_change.values:
            arrow_idx = replace_str.find(' => ')
            len_second_str = len(replace_str) - arrow_idx - len(' => ')
            second_str_replace_chars_group += len_second_str
    
    paste_chars_group = len(''.join(time_group[time_group.activity=='Paste'].text_change.values))
    chars_process_group = input_chars_group + second_str_replace_chars_group + paste_chars_group

    train_logs.loc[time_group.index, 'chars_time_group_aux'] = chars_process_group
    
train_logs['chars_proportion_aux'] = np.round(100 * train_logs.chars_time_group_aux / train_logs.chars_process, 1)

for _, group in train_logs.groupby('id'):
    train_logs.loc[group.index, 'std_chars_interval'] = np.round(group.chars_proportion_aux.std(), 1)

In [18]:
train_logs = train_logs.filter(regex='^(?!.*aux).*$')

In [19]:
train_logs

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,num_events,total_time_mins,total_action_time_mins,mean_action_time_ms,chars_product,chars_per_min_product,chars_process,chars_per_min_process,words_per_min_product,words_per_min_process,sentences_per_min_process,sentences_per_min_product,paragraphs_per_min_process,num_pauses,pauses_per_min,pause_time_proportion_perc,mean_pause_length,mean_pause_length_btw_paragraphs,mean_pause_length_btw_sentences,mean_pause_length_btw_words,mean_pause_length_w_in_words,deletions,deletions_per_min,mean_length_deletions,distant_deletions,deletions_proportion_perc,imm_deletions,distant_deletion_ratio,product_process_ratio,p_bursts,r_bursts,p_bursts_per_min,r_bursts_per_min,mean_p_bursts_chars,mean_r_bursts_chars,p_bursts_proportion_perc,r_bursts_proportion_perc,std_chars_interval
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,2557,30.0,5.0,116.2,1528.0,50.9,2017.0,67.2,8.5,11.6,0.7,21.8,0.1,124.0,4.1,65.7,9537.1,19790.5,8271.9,1835.9,780.7,96.0,3.2,4.3,3.0,8.5,93.0,0.0,0.8,16.0,7.0,0.5,0.2,28.1,25.9,4.5,2.0,5.2
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,2557,30.0,5.0,116.2,1528.0,50.9,2017.0,67.2,8.5,11.6,0.7,21.8,0.1,124.0,4.1,65.7,9537.1,19790.5,8271.9,1835.9,780.7,96.0,3.2,4.3,3.0,8.5,93.0,0.0,0.8,16.0,7.0,0.5,0.2,28.1,25.9,4.5,2.0,5.2
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,2557,30.0,5.0,116.2,1528.0,50.9,2017.0,67.2,8.5,11.6,0.7,21.8,0.1,124.0,4.1,65.7,9537.1,19790.5,8271.9,1835.9,780.7,96.0,3.2,4.3,3.0,8.5,93.0,0.0,0.8,16.0,7.0,0.5,0.2,28.1,25.9,4.5,2.0,5.2
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,2557,30.0,5.0,116.2,1528.0,50.9,2017.0,67.2,8.5,11.6,0.7,21.8,0.1,124.0,4.1,65.7,9537.1,19790.5,8271.9,1835.9,780.7,96.0,3.2,4.3,3.0,8.5,93.0,0.0,0.8,16.0,7.0,0.5,0.2,28.1,25.9,4.5,2.0,5.2
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,2557,30.0,5.0,116.2,1528.0,50.9,2017.0,67.2,8.5,11.6,0.7,21.8,0.1,124.0,4.1,65.7,9537.1,19790.5,8271.9,1835.9,780.7,96.0,3.2,4.3,3.0,8.5,93.0,0.0,0.8,16.0,7.0,0.5,0.2,28.1,25.9,4.5,2.0,5.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,240,3619,34.5,5.0,83.2,1492.0,43.2,2471.0,71.6,7.0,11.8,0.6,18.8,0.6,120.0,3.5,50.1,8645.0,22132.5,15997.4,1345.1,980.8,108.0,3.1,2.9,5.0,0.0,103.0,0.0,0.6,9.0,25.0,0.3,0.7,38.4,32.4,4.1,8.4,2.2
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,240,3619,34.5,5.0,83.2,1492.0,43.2,2471.0,71.6,7.0,11.8,0.6,18.8,0.6,120.0,3.5,50.1,8645.0,22132.5,15997.4,1345.1,980.8,108.0,3.1,2.9,5.0,0.0,103.0,0.0,0.6,9.0,25.0,0.3,0.7,38.4,32.4,4.1,8.4,2.2
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,240,3619,34.5,5.0,83.2,1492.0,43.2,2471.0,71.6,7.0,11.8,0.6,18.8,0.6,120.0,3.5,50.1,8645.0,22132.5,15997.4,1345.1,980.8,108.0,3.1,2.9,5.0,0.0,103.0,0.0,0.6,9.0,25.0,0.3,0.7,38.4,32.4,4.1,8.4,2.2
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,240,3619,34.5,5.0,83.2,1492.0,43.2,2471.0,71.6,7.0,11.8,0.6,18.8,0.6,120.0,3.5,50.1,8645.0,22132.5,15997.4,1345.1,980.8,108.0,3.1,2.9,5.0,0.0,103.0,0.0,0.6,9.0,25.0,0.3,0.7,38.4,32.4,4.1,8.4,2.2


In [20]:
file_path = Path('../working/train_logs.pkl')

with open(file_path, 'wb') as file:
        pickle.dump(train_logs, file)

### Apply all transformations to `test_logs`

In [21]:
id_groups = test_logs.groupby('id')

test_logs['num_events'] = id_groups['event_id'].transform('last')

test_logs['input_chars_aux'] = id_groups['activity'].transform(lambda x: (x == 'Input').sum())
    
test_logs['remove_cut_chars_aux'] = id_groups['activity'].transform(
    lambda x: (test_logs.loc[x.index, 'text_change'][x == 'Remove/Cut']).str.len().sum()
    )

test_logs['paste_chars_aux'] = id_groups['activity'].transform(
    lambda x: (test_logs.loc[x.index, 'text_change'][x == 'Paste']).str.len().sum()
    )

test_logs['total_time_mins'] = np.round(id_groups['up_time'].transform('last') / 60000, 1)

test_logs['total_action_time_mins'] = np.round(id_groups['action_time'].transform('sum') / 60000, 1)

test_logs['mean_action_time_ms'] = np.round(id_groups['action_time'].transform('mean'), 1)

for _, group in id_groups:
    first_strings_replace_chars = 0
    second_strings_replace_chars = 0
    
    for replace_str in group[group.activity == "Replace"].text_change.values:
        arrow_idx = replace_str.find(' => ')
        first_strings_replace_chars += arrow_idx
        len_second_str = len(replace_str) - arrow_idx - len(' => ')
        second_strings_replace_chars += len_second_str
    
    test_logs.loc[group.index, 'first_strings_replace_chars_aux'] = first_strings_replace_chars
    test_logs.loc[group.index, 'second_strings_replace_chars_aux'] = second_strings_replace_chars
    
test_logs['chars_product'] = test_logs.input_chars_aux - test_logs.remove_cut_chars_aux - \
        test_logs.first_strings_replace_chars_aux + test_logs.second_strings_replace_chars_aux + test_logs.paste_chars_aux

test_logs['chars_per_min_product'] = np.round(test_logs.chars_product / test_logs.total_time_mins, 1)

test_logs['chars_process'] = test_logs.input_chars_aux + test_logs.second_strings_replace_chars_aux +\
                              test_logs.paste_chars_aux
test_logs['chars_per_min_process'] = np.round(test_logs.chars_process / test_logs.total_time_mins, 1)

test_logs['last_word_count_aux'] = id_groups['word_count'].transform(lambda x: x.values[-1])
test_logs['words_per_min_product'] = np.round(test_logs.last_word_count_aux / test_logs.total_time_mins, 1)

for _, group in id_groups:
    diff_word_count = group['word_count'].shift(1) - group['word_count']
    num_deleted_words = np.maximum(0, diff_word_count).sum()
    
    input_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                                (group.activity=='Input')].event_id.count()
    
    remove_cut_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                                (group.activity=='Remove/Cut')].event_id.count()
    
    input_paragraphs = group[(group.activity=='Input') & (group.text_change.str.contains('\n'))].event_id.count()
        
    test_logs.loc[group.index, 'num_deleted_words_aux'] = num_deleted_words
    test_logs.loc[group.index, 'input_sentences_aux'] = input_sentences
    test_logs.loc[group.index, 'remove_cut_sentences_aux'] = remove_cut_sentences
    test_logs.loc[group.index, 'input_paragraphs_aux'] = input_paragraphs
    
sentences_process = input_sentences
sentences_product = input_sentences - remove_cut_sentences
paragraphs_process = input_paragraphs

test_logs = (
    test_logs.assign(
    total_words_aux=lambda x: x['last_word_count_aux'] + x['num_deleted_words_aux'],
    words_per_min_process=lambda x: x['total_words_aux'] / x['total_time_mins'],
    sentences_per_min_process=lambda x: x['input_sentences_aux'] / x['total_time_mins'],
    sentences_per_min_product=lambda x: x['input_sentences_aux'] -\
                                                  x['remove_cut_sentences_aux'] / x['total_time_mins'],
    paragraphs_per_min_process=lambda x: x['input_paragraphs_aux'] / x['total_time_mins']
    )
    .round(1)
)

for _, group in id_groups:
    iki = group['down_time'] - group['up_time'].shift(1)
    filtered_iki = [num for num in iki if num >= 2000]
    num_pauses = len(filtered_iki)
    pause_time = sum(filtered_iki)

    test_logs.loc[group.index, 'num_pauses'] = num_pauses
    test_logs.loc[group.index, 'pause_time_aux'] = pause_time  
    
test_logs = (
    test_logs.assign(
    pauses_per_min=np.round(test_logs.num_pauses / test_logs.total_time_mins, 1),
    pause_time_proportion_perc=np.round((100 * test_logs.pause_time_aux) / (60000 * test_logs.total_time_mins), 1),
    mean_pause_length=np.round(test_logs.pause_time_aux / test_logs.num_pauses, 1)
    )
    .round(1)
)

for _, group in id_groups:
    enter_input_rows_idxs = group[(group.down_event == 'Enter') & (group.activity == 'Input')].index

    filtered_enter_rows_idx = [idx for idx in enter_input_rows_idxs if idx - 1 not in enter_input_rows_idxs]

    pause_time_btw_paragraphs, pauses_btw_paragraphs = 0, 0
    for idx in filtered_enter_rows_idx:
        word_count_enter = group.at[idx, 'word_count']
        if word_count_enter > 0:
            slice_i = group[(group.index >= idx - 6) & (group.index <= idx - 1) & 
                                (group.activity=='Input')]
            if not slice_i.empty:
                initial_time = slice_i['up_time'].values[-1]
                
                slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 6) & 
                                (group.word_count == word_count_enter + 1)]
                if not slice_f.empty:
                    final_time = slice_f['down_time'].values[0]
                    pause_time_btw_paragraphs += final_time - initial_time
                    pauses_btw_paragraphs += 1
                 
    test_logs.loc[group.index, 'pause_time_btw_paragraphs_aux'] = pause_time_btw_paragraphs
    if pauses_btw_paragraphs == 0:
        test_logs.loc[group.index, 'pauses_btw_paragraphs_aux'] = np.nan
    else:
        test_logs.loc[group.index, 'pauses_btw_paragraphs_aux'] = pauses_btw_paragraphs

test_logs['mean_pause_length_btw_paragraphs'] = np.round(test_logs.pause_time_btw_paragraphs_aux / \
                                                          test_logs.pauses_btw_paragraphs_aux, 1)

for _, group in id_groups:

    period_rows_idxs = group[(group.text_change.str.contains('[\.\;\?\!\:]', regex=True)) & 
                                                (group.activity == 'Input')].index

    filtered_period_rows_idxs = [idx for idx in period_rows_idxs if idx - 1 not in period_rows_idxs]

    pause_time_btw_sentences, pauses_btw_sentences = 0, 0
    for idx in filtered_period_rows_idxs:
        word_count_period = group.at[idx, 'word_count']
        if word_count_period > 0:
            slice_i = group[(group.index >= idx - 6) & (group.index <= idx - 1) & 
                            (group.activity=='Input') & (group.text_change=='q')]
            if not slice_i.empty:
                initial_time = slice_i['up_time'].values[-1]

                slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 6) & 
                                (group.word_count == word_count_period + 1)]
                if not slice_f.empty:
                    final_time = slice_f['down_time'].values[0]
                    pause_time_btw_sentences += final_time - initial_time
                    pauses_btw_sentences += 1

    test_logs.loc[group.index, 'pause_time_btw_sentences_aux'] = pause_time_btw_sentences
    if pauses_btw_sentences == 0:
        test_logs.loc[group.index, 'pauses_btw_sentences_aux'] = np.nan
    else:
        test_logs.loc[group.index, 'pauses_btw_sentences_aux'] = pauses_btw_sentences

test_logs['mean_pause_length_btw_sentences'] = np.round(test_logs.pause_time_btw_sentences_aux / \
                                                         test_logs.pauses_btw_sentences_aux, 1)

for _, group in id_groups:

    space_rows_idxs = group[(group.down_event=='Space') & (group.activity=='Input')].index

    filtered_space_rows_idxs = [idx for idx in space_rows_idxs if idx - 1 not in space_rows_idxs]

    pause_time_btw_words, pauses_btw_words = 0, 0
    for idx in filtered_space_rows_idxs:
        word_count_space = group.at[idx, 'word_count']
        if word_count_space > 0:
            slice_i = group[(group.index >= idx - 6) & (group.index <= idx - 1) & 
                            (group['activity'] == 'Input') & (group['text_change'] == 'q')]

            if not slice_i.empty:
                initial_time = slice_i['up_time'].values[-1]

                slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 6) & 
                                (group.word_count == word_count_space + 1)]
                if not slice_f.empty:
                    final_time = slice_f['down_time'].values[0]
                    pause_time_btw_words += final_time - initial_time
                    pauses_btw_words += 1

    test_logs.loc[group.index, 'pause_time_btw_words_aux'] = pause_time_btw_words
    if pauses_btw_words == 0:
        test_logs.loc[group.index, 'pauses_btw_words_aux'] = np.nan
    else:
        test_logs.loc[group.index, 'pauses_btw_words_aux'] = pauses_btw_words

test_logs['mean_pause_length_btw_words'] = np.round(test_logs.pause_time_btw_words_aux / \
                                                     test_logs.pauses_btw_words_aux, 1)

for _, group in id_groups:
    group['word_count_up_aux'] = group.word_count > group.shift().word_count
    word_count_up_idxs = group[(group.word_count_up_aux) & (group.text_change=='q')].index

    pause_time_w_in_words, pauses_w_in_words = 0, 0
    for idx in word_count_up_idxs:
        initial_time = group.at[idx, 'up_time']
        slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 10) 
                        & (group.activity=='Input') & (group.text_change.isin(['.', ',', ';', ':', ' ', '!', '?']))] 
        if not slice_f.empty:
            final_time = slice_f['down_time'].values[0]
            pause_time_w_in_words += final_time - initial_time
            pauses_w_in_words += 1

    test_logs.loc[group.index, 'pause_time_w_in_words_aux'] = pause_time_w_in_words
    if pauses_w_in_words == 0:
        test_logs.loc[group.index, 'pauses_w_in_words_aux'] = np.nan
    else:
        test_logs.loc[group.index, 'pauses_w_in_words_aux'] = pauses_w_in_words

test_logs['mean_pause_length_w_in_words'] = np.round(test_logs.pause_time_w_in_words_aux / \
                                                      test_logs.pauses_w_in_words_aux, 1)

for _, group in id_groups:
    deletion_idxs = group[group.activity=='Remove/Cut'].index

    filtered_deletion_idxs = [idx for idx in deletion_idxs if idx - 1 not in deletion_idxs]

    test_logs.loc[group.index, 'deletions'] = len(filtered_deletion_idxs)
    test_logs.loc[group.index, 'deletions_per_min'] = np.round(len(filtered_deletion_idxs) / \
                                                                test_logs.total_time_mins, 1)
    if len(filtered_deletion_idxs) == 0:
            test_logs.loc[group.index, 'mean_length_deletions'] = np.nan
    else:
        test_logs.loc[group.index, 'mean_length_deletions'] = np.round(len(deletion_idxs) / len(filtered_deletion_idxs), 1)

    aux_cursor_descending_row_idxs = group[group.cursor_position < \
                                           group.shift().cursor_position].index
    deletions_time, distant_deletions = 0, 0
    for idx in filtered_deletion_idxs:
        
        initial_time = group.at[idx, 'down_time']
        if (group.at[idx - 1, 'activity'] == 'Nonproduction') & \
        (idx - 1 in aux_cursor_descending_row_idxs):
            distant_deletions += 1
        for i in range(idx + 1, len(group)):
            if (i not in deletion_idxs) & (group.at[i, 'activity'] != 'Nonproduction'):
                final_time = group.at[i - 1, 'up_time']
                deletions_time += final_time - initial_time
                break
                           
    test_logs.loc[group.index, 'deletions_time_aux'] = deletions_time
    test_logs.loc[group.index, 'distant_deletions'] = distant_deletions
                       
test_logs = (
    test_logs.assign(
        deletions_proportion_perc=lambda x: 100 * x.deletions_time_aux / (60000 * x.total_time_mins),
        imm_deletions=lambda x: x.deletions - x.distant_deletions,
        distant_deletion_ratio=lambda x: x.distant_deletions / x.deletions
        )
    .round(1)
)

test_logs['product_process_ratio'] = np.round(test_logs.chars_product / test_logs.chars_process, 1)

for _, group in id_groups:
    input_idxs = group[group.activity == 'Input'].index

    filtered_input_idxs = [idx for idx in input_idxs if idx - 1 not in input_idxs]

    p_bursts, p_bursts_chars, pb_time, r_bursts, r_bursts_chars, rb_time = 0, 0, 0, 0, 0, 0

    for idx in filtered_input_idxs[:-1]:
        i = 0
        while (group.at[idx, 'activity'] == 'Input') and \
        (group.at[idx + 1, 'down_time'] - group.at[idx, 'up_time'] < 2000):
            i += 1
            idx += 1
        if i >= 20:
            if group.at[idx, 'activity'] == 'Input':
                p_bursts += 1
                p_bursts_chars += i
                pb_time += group.at[idx - 1, 'up_time'] - group.at[idx - i, 'down_time']
            else:
                r_bursts += 1
                r_bursts_chars += i
                rb_time += group.at[idx - 1, 'up_time'] - group.at[idx - i, 'down_time']

    columns_to_update = ['p_bursts', 'p_bursts_chars_aux', 'pb_time_aux', 'r_bursts', 'r_bursts_chars_aux', 'rb_time_aux']
    test_logs.loc[group.index, columns_to_update] = p_bursts, p_bursts_chars, pb_time, r_bursts, r_bursts_chars, rb_time
    
test_logs = (
    test_logs.assign(
        p_bursts_per_min=lambda x: x.p_bursts / x.total_time_mins,
        r_bursts_per_min=lambda x: x.r_bursts / x.total_time_mins,
        mean_p_bursts_chars = lambda x: np.where(x.p_bursts != 0, x.p_bursts_chars_aux / x.p_bursts, np.nan),
        mean_r_bursts_chars = lambda x: np.where(x.r_bursts != 0, x.r_bursts_chars_aux / x.r_bursts, np.nan),
        p_bursts_proportion_perc=lambda x: (100 * x.pb_time_aux) / (60000 * x.total_time_mins),
        r_bursts_proportion_perc=lambda x: (100 * x.rb_time_aux) / (60000 * x.total_time_mins)
        )
    .round(1)
)

test_logs['time_interval_aux'] = 0

for _, group in id_groups:
    time_intervals = pd.cut(group['up_time'], bins=10)
    test_logs.loc[group.index, 'time_interval_aux'] = time_intervals
    
for _, time_group in test_logs.groupby(['id', 'time_interval_aux']):
    
    input_chars_group = time_group[time_group.activity=='Input'].event_id.count()
    
    second_str_replace_chars_group = 0
    for replace_str in time_group[time_group.activity=="Replace"].text_change.values:
            arrow_idx = replace_str.find(' => ')
            len_second_str = len(replace_str) - arrow_idx - len(' => ')
            second_str_replace_chars_group += len_second_str
    
    paste_chars_group = len(''.join(time_group[time_group.activity=='Paste'].text_change.values))
    chars_process_group = input_chars_group + second_str_replace_chars_group + paste_chars_group

    test_logs.loc[time_group.index, 'chars_time_group_aux'] = chars_process_group
    
test_logs['chars_proportion_aux'] = np.round(100 * test_logs.chars_time_group_aux / test_logs.chars_process, 1)

for _, group in test_logs.groupby('id'):
    test_logs.loc[group.index, 'std_chars_interval'] = np.round(group.chars_proportion_aux.std(), 1)

test_logs = test_logs.filter(regex='^(?!.*aux).*$')

In [22]:
file_path = Path('../working/test_logs.pkl')

with open(file_path, 'wb') as file:
        pickle.dump(test_logs, file)