In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('/code/data/processed/proceeded_train_20200116_102618.csv', index_col=0)
test = pd.read_csv('/code/data/processed/proceeded_test_20200116_102618.csv', index_col=0)

In [3]:
from sklearn.preprocessing import PowerTransformer


EFFECTIVE_FEATURES = [
    'session_title', 'label_session_title_count_accuracy_label',
    'count_accuracy', 'label_session_title_count_action_label', '2000',
    'mean_accuracy_group', '3afb49e6', 'acc_Bird Measurer (Assessment)',
    'Clip', 'mean_game_round',
    'label_mean_accuracy_group_label_description_val', 'good_comment_ratio',
    '4070', 'acc_Chest Sorter (Assessment)', '7372e1a5',
    'count_label_session_title_description_val', '04df9b66',
    'label_session_title_description_val', '3020', 'args_6'
    ]

NUMERICAL_FEATURES = [
    'args_6', 'count_accuracy', '2000', 'mean_accuracy_group', '3afb49e6',
    'acc_Bird Measurer (Assessment)', 'Clip', 'mean_game_round',
    'good_comment_ratio', '4070', 'acc_Chest Sorter (Assessment)', '7372e1a5',
    'count_label_session_title_description_val', '04df9b66', '3020'
    ]

CATEGORICAL_FEATURES = [
    'session_title', 'label_session_title_count_accuracy_label',
    'label_session_title_count_action_label',
    'label_mean_accuracy_group_label_description_val',
    'label_session_title_description_val'
    ]


class PreprocessForNN:

    def process(self, train: pd.DataFrame, test: pd.DataFrame):

        train = train[EFFECTIVE_FEATURES]
        test = test[EFFECTIVE_FEATURES]

        # 数値データをBox-Cox変換
        pt = PowerTransformer()
        pt.fit(train[NUMERICAL_FEATURES])
        train[NUMERICAL_FEATURES] = pt.transform(train[NUMERICAL_FEATURES])
        test[NUMERICAL_FEATURES] = pt.transform(test[NUMERICAL_FEATURES])

        # Category data を One hot encoding
        all_df = pd.concat([train, test])
        all_df = pd.get_dummies(all_df, columns=CATEGORICAL_FEATURES)
        train = all_df.iloc[:train.shape[0], :].reset_index(drop=True)
        test = all_df.iloc[:test.shape[0], :].reset_index(drop=True)

        # 欠損値埋め
        train = train.fillna(0)
        test = test.fillna(0)

        return train, test

In [4]:
preprocess = PreprocessForNN()
train_x, test_x = preprocess.process(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

In [5]:
y = train.accuracy_group.values

In [6]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [40]:
knn = KNeighborsRegressor(n_neighbors=80)

In [41]:
tra_x, val_x, tra_y, val_y = train_test_split(train_x, y)

In [42]:
knn.fit(tra_x, tra_y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=80, p=2,
                    weights='uniform')

In [43]:
pred = knn.predict(val_x)

In [44]:
pred

array([1.2125, 2.5875, 2.3125, ..., 1.8625, 2.325 , 1.5375])

In [45]:
PARAMS = {
        'threshold_0': 1.12,
        'threshold_1': 1.62,
        'threshold_2': 2.20
        }

def threshold(x, params):
    if x < params['threshold_0']:
        y = 0
    elif x < params['threshold_1']:
        y = 1
    elif x < params['threshold_2']:
        y = 2
    else:
        y = 3
    return y

def qwk(a1, a2):
    """
    Source:
    https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o += (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

In [48]:
%%time
func = np.frompyfunc(threshold, 2, 1)
pred_class = pred
loss = qwk(func(pred_class, PARAMS), val_y)

CPU times: user 11.9 ms, sys: 10 µs, total: 11.9 ms
Wall time: 11.7 ms


In [51]:
%%time
tet_pred = knn.predict(test_x)

CPU times: user 5.61 s, sys: 8 µs, total: 5.61 s
Wall time: 5.62 s


In [53]:
concat_test = pd.concat([test_x,test_x,test_x,test_x,test_x])

In [54]:
%%time
concat_test_pred = knn.predict(concat_test)


CPU times: user 28.3 s, sys: 18 µs, total: 28.3 s
Wall time: 28.4 s


In [47]:
loss

0.5322318435353558