# Load

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from tqdm.auto import tqdm
tqdm.pandas()
from collections import Counter
import re

In [2]:
%%time
home_path = '/kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs = pd.read_csv(home_path + 'train_logs.csv')
train_scores = pd.read_csv(home_path + 'train_scores.csv')
test_logs = pd.read_csv(home_path + 'test_logs.csv')
submission = pd.read_csv(home_path + 'sample_submission.csv')

print(train_logs.shape, train_scores.shape, test_logs.shape, submission.shape)

(8405898, 11) (2471, 2) (6, 11) (3, 2)
CPU times: user 6.34 s, sys: 2.85 s, total: 9.19 s
Wall time: 15.1 s


In [3]:
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
          'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']


def activity_counts(df):
    tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
    ret = list()
#     for li in tqdm(tmp_df['activity'].values):
    for li in tmp_df['activity'].values:
        items = list(Counter(li).items())
        di = dict()
        for k in activities:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


def event_counts(df, colname):
    tmp_df = df.groupby('id').agg({colname: list}).reset_index()
    ret = list()
#     for li in tqdm(tmp_df[colname].values):
    for li in tmp_df[colname].values:
        items = list(Counter(li).items())
        di = dict()
        for k in events:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


def text_change_counts(df):
    tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
    ret = list()
#     for li in tqdm(tmp_df['text_change'].values):
    for li in tmp_df['text_change'].values:
        items = list(Counter(li).items())
        di = dict()
        for k in text_changes:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
def match_punctuations(df):
    tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
    ret = list()
#     for li in tqdm(tmp_df['down_event'].values):
    for li in tmp_df['down_event'].values:
        cnt = 0
        items = list(Counter(li).items())
        for item in items:
            k, v = item[0], item[1]
            if k in punctuations:
                cnt += v
        ret.append(cnt)
    ret = pd.DataFrame({'punct_cnt': ret})
    return ret


def get_input_words(df):
    tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
    tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
    tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
    tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df.drop(['text_change'], axis=1, inplace=True)
    return tmp_df


def make_feats(df):
    
    # id
    feats = pd.DataFrame({'id': df['id'].unique().tolist()})
    
    # time shift
    df['up_time_shift1'] = df.groupby('id')['up_time'].shift(1)
    df['action_time_gap'] = df['down_time'] - df['up_time_shift1']
    df.drop('up_time_shift1', axis=1, inplace=True)
    
    # cursor position shift
    df['cursor_position_shift1'] = df.groupby('id')['cursor_position'].shift(1)
    df['cursor_position_change'] = np.abs(df['cursor_position'] - df['cursor_position_shift1'])
    df.drop('cursor_position_shift1', axis=1, inplace=True)
    
    # word count shift
    df['word_count_shift1'] = df.groupby('id')['word_count'].shift(1)
    df['word_count_change'] = np.abs(df['word_count'] - df['word_count_shift1'])
    df.drop('word_count_shift1', axis=1, inplace=True)
    
    # stats feats
#     for item in tqdm([
#         ('event_id', ['max']),
#         ('up_time', ['max']),
#         ('action_time', ['sum', 'max', 'mean', 'std']),
#         ('activity', ['nunique']),
#         ('down_event', ['nunique']),
#         ('up_event', ['nunique']),
#         ('text_change', ['nunique']),
#         ('cursor_position', ['nunique', 'max', 'mean']),
#         ('word_count', ['nunique', 'max', 'mean']),
#         ('action_time_gap', ['max', 'min', 'mean', 'std', 'sum']),
#         ('cursor_position_change', ['max', 'mean', 'std', 'sum']),
#         ('word_count_change', ['max', 'mean', 'std', 'sum'])
#     ]):
    for item in [
        ('event_id', ['max']),
        ('up_time', ['max']),
        ('action_time', ['sum', 'max', 'mean', 'std']),
        ('activity', ['nunique']),
        ('down_event', ['nunique']),
        ('up_event', ['nunique']),
        ('text_change', ['nunique']),
        ('cursor_position', ['nunique', 'max', 'mean']),
        ('word_count', ['nunique', 'max', 'mean']),
        ('action_time_gap', ['max', 'min', 'mean', 'std', 'sum']),
        ('cursor_position_change', ['max', 'mean', 'std', 'sum']),
        ('word_count_change', ['max', 'mean', 'std', 'sum'])
    ]:
        colname, methods = item[0], item[1]
        for method in methods:
            tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method}'})
            feats = feats.merge(tmp_df, on='id', how='left')
    
    # counts
    tmp_df = activity_counts(df)
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = event_counts(df, 'down_event')
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = event_counts(df, 'up_event')
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = text_change_counts(df)
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = match_punctuations(df)
    feats = pd.concat([feats, tmp_df], axis=1)
    
    # input words
    tmp_df = get_input_words(df)
    feats = pd.merge(feats, tmp_df, on='id', how='left')
    
    # compare feats
    feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
    feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
    feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
    feats['idle_time_ratio'] = feats['action_time_gap_sum'] / feats['up_time_max']
    
    # made by Soo.Y
    feats['final_word_count'] = df.groupby('id')['word_count'].last().values
    feats['writing_time'] = (df.groupby('id')['up_time'].last() - df.groupby('id')['up_time'].first()).values
    
    return feats

# Downcast

In [4]:
def downcast(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            df[col] = df[col].astype('category')
#             pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
                                             df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('{:.1f}% 압축됨'.format(100*(start_mem - end_mem) / start_mem))
    
    return df

In [5]:
train_logs = downcast(train_logs)
test_logs = downcast(test_logs)

69.3% 압축됨
-29.6% 압축됨


In [6]:
%%time
train_original = make_feats(train_logs)
test_original = make_feats(test_logs)

CPU times: user 14.3 s, sys: 1.32 s, total: 15.6 s
Wall time: 15.6 s


# 이상치 제거

In [7]:
def my_outlier_condition(df):
    # final_word_count < 200 is outlier
    df = df[df['final_word_count'] >= 200]
    # writing_time >= 3000 is outlier
    df = df[df['writing_time'] <= 50*60*1000]
    return df

In [8]:
process_outlier = True
if process_outlier:
    train_feats_outlier = my_outlier_condition(train_original)
    test_feats_outlier = test_original
else:
    train_feats_outlier = train_original
    test_feats_outlier = test_original

In [9]:
# 2471 -> 2434
print(f'original train : {len(train_original.id)}')
print(f'train without outlier : {len(train_feats_outlier.id)}')

original train : 2471
train without outlier : 2434


In [10]:
# merge train data set and Target data set
train_feats_outlier = train_feats_outlier.merge(train_scores, on='id', how='left')

In [11]:
# train_feats = train_feats_outlier.drop(columns=['final_word_count', 'writing_time']) # using this data set
# test_feats = test_feats_outlier.drop(columns=['final_word_count', 'writing_time']) # using this data set

In [12]:
# # trying 영수's drop columns
train_feats = train_feats_outlier.drop(columns = ['activity_4_count', 'up_event_9_count', 'up_event_10_count', 'up_event_14_count', 'up_event_15_count', 'down_event_15_count'])
test_feats = test_feats_outlier.drop(columns = ['activity_4_count', 'up_event_9_count', 'up_event_10_count', 'up_event_14_count', 'up_event_15_count', 'down_event_15_count'])

In [13]:
drop_columns = ['up_event_4_count', 'up_event_5_count', 'up_event_6_count',
       'up_event_8_count', 'up_event_3_count', 'up_event_2_count',
       'up_event_1_count', 'up_event_13_count', 'up_event_12_count',
       'up_event_11_count', 'text_change_2_count', 'up_event_7_count']

train_feats = train_feats.drop(columns = drop_columns)
test_feats = test_feats.drop(columns = drop_columns)

In [14]:
print(train_feats.shape, test_feats.shape)

(2434, 76) (3, 75)


# Feature Test

In [15]:
# #############################################
# # action time gap
# #############################################
# # col 추가
# count = train_logs[train_logs['action_time_gap'] >=120000].groupby('id')['id'].count() # -> raw 수가 안 맞아서 NAN 발생
# count = pd.DataFrame(data=count)
# count = count.rename({'id': 'penalty1'}, axis='columns')

# train_feats = pd.merge(train_feats,count, on='id', how='left')
# test_feats = pd.merge(test_feats,count, on='id', how='left')

# train_feats['penalty1'] = train_feats['penalty1'].fillna(0)
# test_feats['penalty1'] = test_feats['penalty1'].fillna(0)

In [16]:
# ['input_word_length_mean', 'down_event_8_count',
#        'cursor_position_max', 'down_event_3_count',
#        'input_word_length_std', 'down_event_7_count',
#        'down_event_11_count', 'input_word_length_max',
#        'text_change_8_count', 'word_count_change_mean',
#        'cursor_position_mean', 'action_time_gap_min',
#        'word_count_nunique', 'down_event_0_count',
#        'cursor_position_change_mean', 'text_change_5_count',
#        'action_time_std', 'action_time_gap_sum', 'up_time_max',
#        'cursor_position_nunique', 'cursor_position_change_max',
#        'activity_2_count', 'down_event_5_count',
#        'cursor_position_change_sum', 'down_event_1_count',
#        'action_time_max', 'word_time_ratio', 'word_event_ratio',
#        'text_change_nunique', 'cursor_position_change_std',
#        'text_change_1_count', 'action_time_gap_std', 'action_time_sum',
#        'text_change_4_count', 'activity_1_count', 'action_time_gap_max',
#        'punct_cnt', 'up_event_11_count', 'idle_time_ratio',
#        'action_time_mean', 'word_count_mean', 'word_count_change_std',
#        'text_change_3_count', 'word_count_max', 'down_event_12_count',
#        'up_event_3_count', 'down_event_13_count', 'up_event_0_count',
#        'event_time_ratio', 'event_id_max', 'down_event_2_count',
#        'text_change_0_count', 'text_change_9_count', 'down_event_6_count',
#        'up_event_8_count', 'action_time_gap_mean', 'down_event_4_count',
#        'activity_0_count', 'down_event_nunique', 'input_word_count',
#        'text_change_7_count', 'text_change_13_count',
#        'text_change_14_count', 'word_count_change_sum',
#        'text_change_11_count', 'up_event_12_count',
#        'text_change_10_count', 'text_change_6_count',
#        'text_change_2_count', 'word_count_change_max', 'up_event_nunique',
#        'up_event_5_count', 'up_event_1_count', 'up_event_2_count',
#        'down_event_10_count', 'activity_3_count', 'down_event_9_count',
#        'down_event_14_count', 'text_change_12_count', 'up_event_13_count',
#        'activity_nunique', 'up_event_7_count', 'up_event_6_count',
#        'up_event_4_count']

In [17]:
# # my test features selection 
# # it need 'id', 'score'
# select_colname = ['input_word_length_mean', 'down_event_8_count',
#        'cursor_position_max', 'down_event_3_count',
#        'input_word_length_std', 'down_event_7_count',
#        'down_event_11_count', 'input_word_length_max',
#        'text_change_8_count', 'word_count_change_mean',
#        'cursor_position_mean', 'action_time_gap_min',
#        'word_count_nunique', 'down_event_0_count',
#        'cursor_position_change_mean', 'text_change_5_count',
#        'action_time_std', 'action_time_gap_sum', 'up_time_max',
#        'cursor_position_nunique', 'cursor_position_change_max',
#        'activity_2_count', 'down_event_5_count',
#        'cursor_position_change_sum', 'down_event_1_count',
#        'action_time_max', 'word_time_ratio', 'word_event_ratio',
#        'text_change_nunique', 'cursor_position_change_std',
#        'text_change_1_count', 'action_time_gap_std', 'action_time_sum',
#        'text_change_4_count', 'activity_1_count', 'action_time_gap_max',
#        'punct_cnt', 'up_event_11_count', 'idle_time_ratio',
#        'action_time_mean', 'word_count_mean', 'word_count_change_std',
#        'text_change_3_count', 'word_count_max', 'down_event_12_count',
#        'up_event_3_count', 'down_event_13_count', 'up_event_0_count']
# select_colname.append('id')
# select_colname.append('score')
# train_feats = train_feats[select_colname]
# select_colname.remove('score')
# test_feats = test_feats[select_colname]

In [18]:
# cont_corr = train_feats.drop(columns =['id']).corr()     # 연속형 피처 간 상관관계
# # cont_corr = train_feats.corr()
# plt.figure(figsize=(12,6))
# sns.heatmap(cont_corr, annot=True, cmap='OrRd');

In [19]:
# cont_corr

# Optimizing Hyperparameters with Optuna

In [20]:
# from sklearn.model_selection import train_test_split
# is_Testing = False
# if is_Testing:
#     train_feats, test_feats = train_test_split(train_feats, test_size=0.1,random_state=42)

In [21]:
# %%time
# # TODO: change this to CV OOF using kfold cross-validation instead of train test split
# # can also change the categorical of parameters into "suggest_float"

import optuna
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from lightgbm.sklearn import log_evaluation
# from lightgbm import early_stopping

# X = train_feats.drop(columns=['id', 'score'])
# Y = train_feats.score

# def objective(trial,data=X,target=Y):
    
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.1,random_state=42)
#     param = {
#         'metric': 'rmse', 
#         'random_state': 42,
#         'max_depth': trial.suggest_int('max_depth', 5, 25, step = 5),
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 100000, step = 1000),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
# #         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
# #         'subsample': trial.suggest_float('subsample', 0.5, 1),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
#         'num_leaves' : trial.suggest_categorical('num_leaves' ,[32, 64, 128, 256]),
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#         'early_stopping_round': 100,
#     }
#     model = lgb.LGBMRegressor(**param)  
    
#     model.fit(train_x,train_y,eval_set=[(test_x,test_y)],
# #               early_stopping_round=100,
# #               callbacks=[log_evaluation(1000), early_stopping(100)],
#               callbacks=[log_evaluation(1000)],
#               )
    
#     preds = model.predict(test_x)
    
#     rmse = mean_squared_error(test_y, preds,squared=False)
    
#     return rmse


In [22]:
# %%time

# use_optuna_tuning = False
# if use_optuna_tuning:
#     study = optuna.create_study(direction='minimize', study_name='Optimize boosting hyperparameters')
#     study.optimize(objective, n_trials=200)

In [23]:
# if use_optuna_tuning:
#     print('Best trial:', study.best_trial.params)

# LGBM Model

In [24]:
%%time
import gc
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from lightgbm import early_stopping
ycol = 'score'
feature_names = list(
    filter(lambda x: x not in [ycol, 'id'], train_feats.columns))

model = lgb.LGBMRegressor(metric='rmse', 
                          random_state=42,
                          n_estimators=10000,
                          reg_alpha=0.11191757875268019,
                          reg_lambda=2.767891890352492,
#                           colsample_tytree=0.8130568466013137,
#                           subsample=0.9324597831546558,
                          subsample=1.0,
                          learning_rate=0.0059971707517773795,
                          num_leaves=28,
                          min_child_samples=37,
                         )

# model = lgb.LGBMRegressor(metric='rmse', 
#                           max_depth=15,
#                           random_state=42,
#                           n_estimators=61000,
#                           reg_alpha=0.007654008530059538,
#                           reg_lambda=0.007679306840372993,
#                           colsample_bytree=1.0,
# #                           colsample_bytree=0.8144580038492835,
# #                           subsample=0.6094218361178689,
#                           subsample=1.0,
#                           learning_rate=0.001494529719393843,
#                           num_leaves=32,
#                           min_child_samples=17,
#                          )

oof = []
prediction = test_feats[['id']]
prediction[ycol] = 0
df_importance_list = []

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_feats[feature_names])):
    X_train = train_feats.iloc[trn_idx][feature_names]
    Y_train = train_feats.iloc[trn_idx][ycol]

    X_val = train_feats.iloc[val_idx][feature_names]
    Y_val = train_feats.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          eval_metric='rmse',
                          callbacks=[log_evaluation(500), early_stopping(100)]
                         )

    pred_val = lgb_model.predict(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train_feats.iloc[val_idx][['id', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict(
        test_feats[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[ycol] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()
    
    
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Training until validation scores don't improve for 100 rounds
[500]	train's rmse: 0.486566	valid's rmse: 0.663107
Early stopping, best iteration is:
[620]	train's rmse: 0.454132	valid's rmse: 0.657839




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[500]	train's rmse: 0.491082	valid's rmse: 0.644093
Early stopping, best iteration is:
[840]	train's rmse: 0.407911	valid's rmse: 0.632443


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Training until validation scores don't improve for 100 rounds
[500]	train's rmse: 0.48517	valid's rmse: 0.680437
[1000]	train's rmse: 0.373589	valid's rmse: 0.669779
Early stopping, best iteration is:
[1090]	train's rmse: 0.357806	valid's rmse: 0.669419


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Training until validation scores don't improve for 100 rounds
[500]	train's rmse: 0.496705	valid's rmse: 0.605964
Early stopping, best iteration is:
[575]	train's rmse: 0.474113	valid's rmse: 0.605194




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[500]	train's rmse: 0.490235	valid's rmse: 0.645026
Early stopping, best iteration is:
[568]	train's rmse: 0.470249	valid's rmse: 0.643293
CPU times: user 48.6 s, sys: 17.9 s, total: 1min 6s
Wall time: 34.2 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,column,importance
0,input_word_length_mean,999.8
1,down_event_8_count,822.6
2,down_event_3_count,730.8
3,final_word_count,604.0
4,down_event_11_count,603.6
...,...,...
69,down_event_9_count,39.6
70,down_event_10_count,32.4
71,down_event_14_count,31.8
72,activity_nunique,29.6


In [25]:
# pd.set_option('display.max_columns', 100)
# display(df_importance.T)
# sns.lineplot(df_importance)

In [26]:
# df_importance[df_importance.importance <= 0].column.values

In [27]:
# df_oof.info()

In [28]:
# temp_1 = pd.merge(df_oof, train_feats, on='id', how='left')
# temp_1.columns

In [29]:
# # sns.scatterplot(x=df_oof.pred, y=train_feats.score, )
# fig, ax = plt.subplots()
# sns.scatterplot(x='pred', y='score_x', data=temp_1, ax=ax, hue='penalty1')
# ax.set_xlim(0, 6.5)
# ax.set_ylim(0, 6.5)
# plt.show()

In [30]:
# # sns.scatterplot(x=df_oof.pred, y=train_feats.score, )
# fig, ax = plt.subplots()
# sns.scatterplot(x=round(temp_1.pred, 1), y='score_x', data=temp_1, ax=ax, hue='penalty1')
# ax.set_xlim(0, 6.5)
# ax.set_ylim(0, 6.5)
# plt.show()

In [31]:
# def x_round(x):
#     return [ round(x_value*2)/2 for x_value in x ]

# # fig, ax = plt.subplots()
# # sns.scatterplot(x=x_round(temp_1.pred), y='score_x', data=temp_1, ax=ax, hue='penalty1')
# # ax.set_xlim(0, 6.5)
# # ax.set_ylim(0, 6.5)
# # plt.show()

In [32]:
from sklearn.metrics import mean_squared_error
df_oof = pd.concat(oof)
rmse = mean_squared_error(df_oof[ycol], np.clip(df_oof['pred'], a_min=0.5, a_max=6.0), squared=False)
print('rmse:', rmse)

rmse: 0.6420189467797773


In [33]:
# from sklearn.metrics import mean_squared_error
# df_oof = pd.concat(oof)
# rmse = mean_squared_error(df_oof[ycol], np.clip(x_round(df_oof['pred']), a_min=0.5, a_max=6.0), squared=False)
# print('rmse:', rmse)

In [34]:
# from sklearn.metrics import mean_squared_error
# df_oof = pd.concat(oof)
# rmse = mean_squared_error(df_oof[ycol], np.clip(round(df_oof['pred'], 1), a_min=0.5, a_max=6.0), squared=False)
# print('rmse:', rmse)

In [35]:
# display(prediction)
# prediction['score'] = np.clip(round(prediction['score'], 1), a_min=0.5, a_max=6.0)
# prediction.to_csv('submission.csv', index=False)

In [36]:
display(prediction)
prediction['score'] = np.clip(prediction['score'], a_min=0.5, a_max=6.0)
prediction.to_csv('submission.csv', index=False)

Unnamed: 0,id,score
0,0000aaaa,1.625999
1,2222bbbb,1.577628
2,4444cccc,1.575059


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['score'] = np.clip(prediction['score'], a_min=0.5, a_max=6.0)
