In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import pickle

In [2]:
path = Path('../input/linking-writing-processes-to-writing-quality')

In [3]:
train_logs = pd.read_csv(path/'train_logs.csv')

In [4]:
train_logs['num_events'] = train_logs.groupby('id')['event_id'].transform('last')
train_logs['total_time_mins'] = np.round(train_logs.groupby('id')['up_time'].transform('last') / 60000, 1)
train_logs['input_chars_aux'] = train_logs.groupby('id')['activity'].transform(lambda x: (x == 'Input').sum())
    
for _, group in train_logs.groupby('id'):
    first_strings_replace_chars = 0
    second_strings_replace_chars = 0

    for replace_str in group[group.activity == "Replace"].text_change.values:
        arrow_idx = replace_str.find(' => ')
        len_second_str = len(replace_str) - arrow_idx - len(' => ')
        second_strings_replace_chars += len_second_str

    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'second_strings_replace_chars_aux'] = second_strings_replace_chars

train_logs['paste_chars_aux'] = train_logs.groupby('id')['activity'].transform(
lambda x: (train_logs.loc[x.index, 'text_change'][x == 'Paste']).str.len().sum()
)

train_logs['chars_process'] = train_logs.input_chars_aux + train_logs.second_strings_replace_chars_aux +\
                          train_logs.paste_chars_aux
train_logs['chars_per_min_process'] = np.round(train_logs.chars_process / train_logs.total_time_mins, 1)

train_logs['last_word_count_aux'] = train_logs.groupby('id')['word_count'].transform(lambda x: x.values[-1])
train_logs['words_per_min_product'] = np.round(train_logs.last_word_count_aux / train_logs.total_time_mins, 1)

for _, group in train_logs.groupby('id'):
    input_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                            (group.activity=='Input')].event_id.count()

    remove_cut_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                            (group.activity=='Remove/Cut')].event_id.count()

    train_logs.loc[group.index, 'input_sentences_aux'] = input_sentences
    train_logs.loc[group.index, 'remove_cut_sentences_aux'] = remove_cut_sentences


train_logs['sentences_per_min_product'] = np.round((train_logs['input_sentences_aux'] - 
                                            train_logs['remove_cut_sentences_aux']) / train_logs['total_time_mins'],1)

for _, group in train_logs.groupby('id'):
    iki = group['down_time'] - group['up_time'].shift(1)
    filtered_iki = [num for num in iki if num >= 2000]
    pause_time = sum(filtered_iki)

    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'pause_time_aux'] = pause_time

train_logs['pause_time_proportion_perc'] = np.round((100 * train_logs.pause_time_aux) / (60000 * train_logs.total_time_mins), 1)

for _, group in train_logs.groupby('id'):
    group['word_count_up_aux'] = group.word_count > group.shift().word_count
    word_count_up_idxs = group[(group.word_count_up_aux) & (group.text_change=='q')].index

    # Look for time when word ended
    pause_time_w_in_words, pauses_w_in_words = 0, 0
    for idx in word_count_up_idxs:
        initial_time = group.at[idx, 'up_time']
        slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 10) 
                        & (group.activity=='Input') & (group.text_change.isin(['.', ',', ';', ':', ' ', '!', '?']))] 
        if not slice_f.empty:
            final_time = slice_f['down_time'].values[0]
            pause_time_w_in_words += final_time - initial_time
            pauses_w_in_words += 1

    # Assign accumulated values to the corresponding columns for the group
    train_logs.loc[group.index, 'pause_time_w_in_words_aux'] = pause_time_w_in_words
    train_logs.loc[group.index, 'pauses_w_in_words_aux'] = pauses_w_in_words


train_logs['mean_pause_length_w_in_words'] = np.round(train_logs.pause_time_w_in_words_aux / train_logs.pauses_w_in_words_aux, 1)

simplified_train_logs = train_logs[['num_events', 'chars_per_min_process', 
                                    'words_per_min_product', 'sentences_per_min_product', 
                                    'pause_time_proportion_perc', 'mean_pause_length_w_in_words']]

simplified_train_logs.drop_duplicates(inplace=True)
simplified_train_logs.fillna(0, inplace=True)
simplified_train_logs.reset_index(drop=True, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplified_train_logs.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplified_train_logs.fillna(0, inplace=True)


In [5]:
file_path = Path('./simplified_train_logs.pkl')

with open(file_path, 'wb') as file:
        pickle.dump(simplified_train_logs, file)