## Imports

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
SEED=7

## Reading data

In [2]:
path = Path('../input/linking-writing-processes-to-writing-quality')
path_keystroke_measures = Path('../input/keystroke-measures')

In [3]:
test_logs = pd.read_csv(path/'test_logs.csv')
train_scores = pd.read_csv(path/'train_scores.csv')

with open(path_keystroke_measures/'simplified_train_logs.pkl', 'rb') as file:
    simplified_train_logs = pd.read_pickle(file)

In [4]:
from xgboost import XGBRegressor

X = simplified_train_logs
y = train_scores.score

regr = XGBRegressor(alpha=0.3, colsample_bytree=0.9, gamma=1, learning_rate=0.1, 
                    max_depth=3, n_estimators=100, subsample=0.8, random_state=SEED)

## Preparation for submission

In [5]:
regr.fit(X,y)      # Fitting the Model to the whole dataset

def transformations_train_test(df):     # Apply to test_logs the same transformations applied to train_logs
    df['num_events'] = df.groupby('id')['event_id'].transform('last')
    df['total_time_mins'] = np.round(df.groupby('id')['up_time'].transform('last') / 60000, 1)
    df['input_chars_aux'] = df.groupby('id')['activity'].transform(lambda x: (x == 'Input').sum())
    
    for _, group in df.groupby('id'):
        first_strings_replace_chars = 0
        second_strings_replace_chars = 0
    
        for replace_str in group[group.activity == "Replace"].text_change.values:
            arrow_idx = replace_str.find(' => ')
            len_second_str = len(replace_str) - arrow_idx - len(' => ')
            second_strings_replace_chars += len_second_str
    
        # Assign accumulated values to the corresponding columns for the group
        df.loc[group.index, 'second_strings_replace_chars_aux'] = second_strings_replace_chars

    df['paste_chars_aux'] = df.groupby('id')['activity'].transform(
    lambda x: (df.loc[x.index, 'text_change'][x == 'Paste']).str.len().sum()
    )
    
    df['chars_process'] = df.input_chars_aux + df.second_strings_replace_chars_aux +\
                              df.paste_chars_aux
    df['chars_per_min_process'] = np.round(df.chars_process / df.total_time_mins, 1)

    df['last_word_count_aux'] = df.groupby('id')['word_count'].transform(lambda x: x.values[-1])
    df['words_per_min_product'] = np.round(df.last_word_count_aux / df.total_time_mins, 1)
    
    for _, group in df.groupby('id'):
        input_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                                (group.activity=='Input')].event_id.count()
        
        remove_cut_sentences = group[(group['text_change'].str.contains('[\.\;\?\!\:]', regex=True)) &
                                (group.activity=='Remove/Cut')].event_id.count()
        
        df.loc[group.index, 'input_sentences_aux'] = input_sentences
        df.loc[group.index, 'remove_cut_sentences_aux'] = remove_cut_sentences

    
    df['sentences_per_min_product'] = np.round((df['input_sentences_aux'] - 
                                                df['remove_cut_sentences_aux']) / df['total_time_mins'],1)
    
    for _, group in df.groupby('id'):
        iki = group['down_time'] - group['up_time'].shift(1)
        filtered_iki = [num for num in iki if num >= 2000]
        pause_time = sum(filtered_iki)

        # Assign accumulated values to the corresponding columns for the group
        df.loc[group.index, 'pause_time_aux'] = pause_time
    
    df['pause_time_proportion_perc'] = np.round((100 * df.pause_time_aux) / (60000 * df.total_time_mins), 1)
    
    for _, group in df.groupby('id'):
        group['word_count_up_aux'] = group.word_count > group.shift().word_count
        word_count_up_idxs = group[(group.word_count_up_aux) & (group.text_change=='q')].index

        # Look for time when word ended
        pause_time_w_in_words, pauses_w_in_words = 0, 0
        for idx in word_count_up_idxs:
            initial_time = group.at[idx, 'up_time']
            slice_f = group[(group.index >= idx + 1) & (group.index <= idx + 10) 
                            & (group.activity=='Input') & (group.text_change.isin(['.', ',', ';', ':', ' ', '!', '?']))] 
            if not slice_f.empty:
                final_time = slice_f['down_time'].values[0]
                pause_time_w_in_words += final_time - initial_time
                pauses_w_in_words += 1

        # Assign accumulated values to the corresponding columns for the group
        df.loc[group.index, 'pause_time_w_in_words_aux'] = pause_time_w_in_words
        df.loc[group.index, 'pauses_w_in_words_aux'] = pauses_w_in_words

    
    df['mean_pause_length_w_in_words'] = np.round(df.pause_time_w_in_words_aux / df.pauses_w_in_words_aux, 1)
    
    simplified_df = df[['id', 'num_events',
                        'chars_per_min_process', 'words_per_min_product',
                        'sentences_per_min_product', 'pause_time_proportion_perc',
                        'mean_pause_length_w_in_words']]
    
    simplified_df.drop_duplicates(inplace=True)
    simplified_df.fillna(0, inplace=True)
    simplified_df.reset_index(drop=True, inplace=True)
    return simplified_df


test_logs_transformed = transformations_train_test(test_logs)
predictions = regr.predict(test_logs_transformed[['num_events',
                        'chars_per_min_process', 'words_per_min_product',
                        'sentences_per_min_product', 'pause_time_proportion_perc',
                        'mean_pause_length_w_in_words']])

test_logs_transformed['score'] = predictions
test_logs_transformed[['id', 'score']].to_csv('./submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplified_df.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplified_df.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_logs_transformed['score'] = predictions
