## Imports

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter(action="ignore")

# Shared random seed, reused for the K-fold shuffling and the XGBoost model.
SEED = 7

## Reading data

In [2]:
# Directory holding the competition CSV files (logs, scores, sample submission).
path = Path('../input/linking-writing-processes-to-writing-quality')
# Directory holding the pickled training logs (`train_logs.pkl`) read further below.
path_keystroke_measures = Path('../input/keystroke-measures')

In [3]:
# Show which files are available in the competition data directory.
os.listdir(path)

['sample_submission.csv',
 'test_logs.csv',
 'train_scores.csv',
 'train_logs.csv']

In [4]:
# Set to True to iterate quickly on a small subset of essays.
DEBUG = False

# Raw competition files.
train_scores = pd.read_csv(path / 'train_scores.csv')
test_logs = pd.read_csv(path / 'test_logs.csv')

# Training logs come from a pickle stored in the keystroke-measures directory.
# NOTE: unpickling executes arbitrary code, so only load pickles from a trusted source.
with open(path_keystroke_measures / 'train_logs.pkl', 'rb') as file:
    train_logs = pd.read_pickle(file)

if not DEBUG:
    print("Debug mode has been deactivated. The whole dataset of train_logs and train_scores will be utilized:",
          f"{train_logs.id.nunique()} essays.")
else:
    # Keep only the first `num_essays` essay ids, in both the logs and the scores.
    num_essays = 100
    train_logs = train_logs[train_logs.id.isin(train_logs.id.unique()[:num_essays])]
    train_scores = train_scores[train_scores.id.isin(train_logs.id.unique()[:num_essays])]
    print(f"Debug mode activated. Utilizing {num_essays} essays from train_logs and train_scores.")

Debug mode has been deactivated. The whole dataset of train_logs and train_scores will be utilized: 2471 essays.


In [5]:
# Broadcast each essay's largest cursor position onto all of its log rows.
train_logs['max_cursor_position'] = (
    train_logs.groupby('id')['cursor_position'].transform('max')
)

In [6]:
# Keep only the columns that are fed to the model.
simplified_train_logs = train_logs.loc[:, [
    'max_cursor_position',
    'num_events',
    'chars_per_min_process',
    'words_per_min_product',
    'sentences_per_min_product',
    'pause_time_proportion_perc',
    'mean_pause_length_w_in_words',
]]

In [7]:
# Collapse the event-level rows to exactly one feature row per essay, ordered like
# `train_scores`, so that row i of the features belongs to row i of the scores.
#
# De-duplicating the feature-only rows does not guarantee that: two essays with
# identical feature values would collapse into a single row and silently shift
# every label after them, and the row order would depend on the log order only.
# Selecting by essay id instead makes the alignment explicit; a KeyError here
# means `train_scores` contains an essay that has no logs.
# Assumes the selected feature columns are constant within each essay's rows
# (TODO confirm for the columns that come from the pickle).
simplified_train_logs = (
    train_logs
    .drop_duplicates(subset='id')
    .set_index('id')
    .loc[train_scores['id'], list(simplified_train_logs.columns)]
    .reset_index(drop=True)
)

## Baselines 

### Baseline with Random Forest Regressor

In [8]:
# from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import make_scorer, mean_squared_error

# rf = RandomForestRegressor(random_state=SEED)

# X = simplified_train_logs
# y = train_scores.score

# num_folds = 5
# kfold = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)

# # Define custom scorer for RMSE
# rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)

# # Perform cross-validation with RMSE as the scoring metric and obtain predictions
# cv_predictions = cross_val_predict(rf, X, y, cv=kfold)

# # Display real values, predictions, max, and min predicted values for each fold
# for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
#     X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#     y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
#     predictions = cv_predictions[test_idx]

#     print(f"\nFold {fold + 1}:")
# #     print("Real Values:", y_test.values)
# #     print("Predictions:", predictions)
#     print("Max Predicted Value:", np.round(np.max(predictions),1))
#     print("Min Predicted Value:", np.round(np.min(predictions),1))

#     # Calculate and print RMSE for the fold
#     fold_rmse = np.round(np.sqrt(mean_squared_error(y_test, predictions)),2)
#     print("Fold RMSE:", fold_rmse)

# # Display the cross-validation scores
# cv_scores = cross_val_score(rf, X, y, cv=kfold, scoring=rmse_scorer)
# print("\nCross-validation RMSE scores:", -cv_scores)  # Negate scores since make_scorer uses greater_is_better=False
# print("Mean RMSE:", np.round(-cv_scores.mean(), 2))  # Negate mean as well


### Hyperparameter tuning and cross-validation with XGBoost

In [9]:
# from sklearn.model_selection import KFold, GridSearchCV
# from xgboost import XGBRegressor

# X = simplified_train_logs
# y = train_scores.score

# # Set the number of folds
# num_folds = 5

# # Instantiate the KFold cross-validator
# kf = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)

# # Set the parameter grid for grid search
# param_grid = {
#     'learning_rate': [0.05, 0.1, 0.2],
#     'n_estimators': [50, 100, 150, 200],
#     'max_depth': [3, 4, 5, 6],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9],
#     'gamma': [0, 1, 5],
#     'alpha': [0, 0.3]
# }

# # Initialize XGBRegressor
# regr = XGBRegressor(random_state=SEED)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(regr, param_grid, scoring='neg_root_mean_squared_error', cv=kf, verbose=1, n_jobs=-1)

# # Fit the model on the training data using GridSearchCV
# grid_search.fit(X, y)

# # Get the results
# results_df = pd.DataFrame(grid_search.cv_results_)

# # Display top 5 combinations of parameters for the mean score
# print("\nTop 5 combinations of parameters for the mean score:\n")
# top_5_mean = results_df[results_df['rank_test_score'] <= 5]

# # Sort by the absolute mean test score in ascending order
# top_5_mean = top_5_mean.assign(abs_mean_test_score=top_5_mean['mean_test_score'].abs())
# top_5_mean = top_5_mean.sort_values(by='abs_mean_test_score', ascending=True)

# for index, row in top_5_mean.iterrows():
#     print(f"Parameters: {row['params']}")
#     print(f"Mean Test Score: {np.round(row['mean_test_score'],4)}")
#     print(f"Standard Deviation: {np.round(row['std_test_score'],4)}\n")


In [10]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

X = simplified_train_logs
y = train_scores.score

# The folds index X and y by position, so both must describe the same essays row by row.
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)}: features and scores are misaligned"

# Set the number of folds
num_folds = 5

# Instantiate the KFold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)

# Unrounded RMSE of every fold (rounded only when displayed, so the mean is exact)
rmse_scores = []

# `eval_metric` is a constructor argument: passing it to `fit()` is deprecated since
# XGBoost 1.6 and no longer accepted by XGBoost 2.x.
regr = XGBRegressor(alpha=0.3, colsample_bytree=0.9, gamma=1, learning_rate=0.05,
                    max_depth=4, n_estimators=200, subsample=0.9, eval_metric="rmse",
                    random_state=SEED)

# Perform cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on the training data; the eval sets are only monitored, not used for stopping
    regr.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

    # Per-iteration training and held-out errors of the current fold
    train_errors_fold = regr.evals_result()["validation_0"]["rmse"]
    test_errors_fold = regr.evals_result()["validation_1"]["rmse"]

    # Make predictions on the held-out data
    y_pred = regr.predict(X_test)

    # Calculate RMSE for the fold
    rmse = np.sqrt(((y_test - y_pred) ** 2).mean())
    rmse_scores.append(rmse)

    # Print results for each fold
    print(f"\nFold {fold}: Max Predicted Value: ", np.round(np.max(y_pred), 2), "; Min Predicted Value: ", np.round(np.min(y_pred), 2), f"; Fold RMSE: {np.round(rmse, 2)}")

# Print the per-fold scores and their average
print(f"\nCross-validation RMSE scores: {[float(np.round(score, 2)) for score in rmse_scores]}")
print(f"\nMean RMSE: {np.round(np.mean(rmse_scores), 3)}")


Fold 1: Max Predicted Value:  5.26 ; Min Predicted Value:  1.54 ; Fold RMSE: 0.68

Fold 2: Max Predicted Value:  5.25 ; Min Predicted Value:  1.61 ; Fold RMSE: 0.67

Fold 3: Max Predicted Value:  5.3 ; Min Predicted Value:  1.36 ; Fold RMSE: 0.65

Fold 4: Max Predicted Value:  5.19 ; Min Predicted Value:  1.5 ; Fold RMSE: 0.68

Fold 5: Max Predicted Value:  5.25 ; Min Predicted Value:  1.81 ; Fold RMSE: 0.7

Cross-validation RMSE scores: [0.68, 0.67, 0.65, 0.68, 0.7]

Mean RMSE: 0.676


## Preparation for submission

In [11]:
# Refit the tuned model on every training essay before predicting the test set.
regr.fit(X=X, y=y)

def _essay_aux_measures(group):
    """Compute the row-scanning aggregates for the log rows of a single essay.

    Parameters
    ----------
    group : pandas.DataFrame
        Log rows of one essay with the columns `activity`, `text_change`,
        `event_id`, `down_time`, `up_time` and `word_count`.

    Returns
    -------
    dict
        One value per `*_aux` column used by `transformations_train_test`.
    """
    activity = group['activity']
    text_change = group['text_change']

    # Characters produced by 'Replace' events: the text to the right of ' => '.
    replaced_chars = 0
    for change in text_change[activity == 'Replace']:
        if not isinstance(change, str):
            continue  # missing text_change: nothing to count
        _, arrow, new_text = change.partition(' => ')
        if arrow:  # entries without the separator carry no replacement text
            replaced_chars += len(new_text)

    # Characters produced by 'Paste' events (missing texts are ignored).
    pasted_chars = sum(len(text) for text in text_change[activity == 'Paste']
                       if isinstance(text, str))

    # Events whose text contains sentence-ending punctuation; missing texts never match.
    has_sentence_end = text_change.str.contains(r'[\.\;\?\!\:]', regex=True, na=False)
    input_sentences = group.loc[has_sentence_end & (activity == 'Input'), 'event_id'].count()
    removed_sentences = group.loc[has_sentence_end & (activity == 'Remove/Cut'), 'event_id'].count()

    # Gaps between releasing one key and pressing the next; gaps of 2000+ count as pauses.
    gaps = group['down_time'] - group['up_time'].shift(1)
    pause_time = gaps[gaps >= 2000].sum()

    # A word "starts" on a single-character 'q' event that increased the word count.
    # It "ends" at the first punctuation/space Input event among the index labels
    # idx+1 .. idx+10; this relies on an integer row index (e.g. the default RangeIndex).
    word_count_rose = group['word_count'] > group['word_count'].shift()
    word_starts = group.index[(word_count_rose & (text_change == 'q')).to_numpy()]
    is_word_end = ((activity == 'Input')
                   & text_change.isin(['.', ',', ';', ':', ' ', '!', '?'])).to_numpy()

    pause_time_w_in_words, pauses_w_in_words = 0, 0
    for idx in word_starts:
        in_window = (group.index >= idx + 1) & (group.index <= idx + 10)
        word_end_times = group.loc[in_window & is_word_end, 'down_time']
        if not word_end_times.empty:
            pause_time_w_in_words += word_end_times.iloc[0] - group.at[idx, 'up_time']
            pauses_w_in_words += 1

    return {
        'input_chars_aux': (activity == 'Input').sum(),
        'second_strings_replace_chars_aux': replaced_chars,
        'paste_chars_aux': pasted_chars,
        'last_word_count_aux': group['word_count'].iloc[-1],
        'input_sentences_aux': input_sentences,
        'remove_cut_sentences_aux': removed_sentences,
        'pause_time_aux': pause_time,
        'pause_time_w_in_words_aux': pause_time_w_in_words,
        'pauses_w_in_words_aux': pauses_w_in_words,
    }


def transformations_train_test(df):
    """Derive the per-essay model features from raw keystroke logs.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level logs with the columns `id`, `event_id`, `down_time`,
        `up_time`, `activity`, `text_change`, `cursor_position` and `word_count`.
        Rows are expected in chronological order within each essay, because the
        "last" event supplies the event count, total time and final word count.

    Returns
    -------
    pandas.DataFrame
        One row per essay, in order of first appearance, with the columns `id`,
        `max_cursor_position`, `num_events`, `chars_per_min_process`,
        `words_per_min_product`, `sentences_per_min_product`,
        `pause_time_proportion_perc` and `mean_pause_length_w_in_words`.
        Ratios that are undefined (no time elapsed, no within-word pause found)
        are NaN; infinite values are never returned.

    Notes
    -----
    For backward compatibility the per-essay columns (features and `*_aux`
    helpers) are also written onto `df` itself, broadcast to every row.
    """
    feature_columns = ['max_cursor_position', 'num_events',
                       'chars_per_min_process', 'words_per_min_product',
                       'sentences_per_min_product', 'pause_time_proportion_perc',
                       'mean_pause_length_w_in_words']
    aux_columns = ['input_chars_aux', 'second_strings_replace_chars_aux', 'paste_chars_aux',
                   'last_word_count_aux', 'input_sentences_aux', 'remove_cut_sentences_aux',
                   'pause_time_aux', 'pause_time_w_in_words_aux', 'pauses_w_in_words_aux']

    # sort=False keeps the essays in order of first appearance.
    grouped = df.groupby('id', sort=False)

    per_essay = pd.DataFrame({
        'max_cursor_position': grouped['cursor_position'].max(),
        'num_events': grouped['event_id'].last(),
        'total_time_mins': np.round(grouped['up_time'].last() / 60000, 1),
    }).rename_axis('id')

    # Single pass over the essays for everything that needs row-level scanning.
    essay_ids, aux_rows = [], []
    for essay_id, group in grouped:
        essay_ids.append(essay_id)
        aux_rows.append(_essay_aux_measures(group))
    aux = pd.DataFrame(aux_rows, index=pd.Index(essay_ids, name='id'),
                       columns=aux_columns).astype('float64')
    per_essay = per_essay.join(aux)

    total_time_mins = per_essay['total_time_mins']
    per_essay['chars_process'] = (per_essay['input_chars_aux']
                                  + per_essay['second_strings_replace_chars_aux']
                                  + per_essay['paste_chars_aux'])
    per_essay['chars_per_min_process'] = np.round(per_essay['chars_process'] / total_time_mins, 1)
    per_essay['words_per_min_product'] = np.round(per_essay['last_word_count_aux'] / total_time_mins, 1)
    per_essay['sentences_per_min_product'] = np.round(
        (per_essay['input_sentences_aux'] - per_essay['remove_cut_sentences_aux']) / total_time_mins, 1)
    per_essay['pause_time_proportion_perc'] = np.round(
        (100 * per_essay['pause_time_aux']) / (60000 * total_time_mins), 1)
    per_essay['mean_pause_length_w_in_words'] = np.round(
        per_essay['pause_time_w_in_words_aux'] / per_essay['pauses_w_in_words_aux'], 1)

    # A total time that rounds to 0.0 minutes turns the ratios into +/-inf, which
    # XGBoost rejects at predict time; report them as missing values instead.
    ratio_columns = feature_columns[2:]
    per_essay[ratio_columns] = per_essay[ratio_columns].replace([np.inf, -np.inf], np.nan)

    # Backward compatibility: broadcast every per-essay column onto the log rows.
    for column in per_essay.columns:
        df[column] = df['id'].map(per_essay[column])

    return per_essay.reset_index()[['id'] + feature_columns]


# Build the per-essay features of the test logs with the same transformation pipeline.
test_logs_transformed = transformations_train_test(test_logs)

# Predict with the feature columns in the order the model was trained on.
predictions = regr.predict(
    test_logs_transformed.loc[:, [
        'max_cursor_position', 'num_events',
        'chars_per_min_process', 'words_per_min_product',
        'sentences_per_min_product', 'pause_time_proportion_perc',
        'mean_pause_length_w_in_words',
    ]]
)

# Attach the predicted scores and write the two-column submission file.
test_logs_transformed = test_logs_transformed.assign(score=predictions)
test_logs_transformed.to_csv('./submission.csv', columns=['id', 'score'], index=False)