In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import gc, pickle, os
# Print the full path of every file found under the Kaggle input directory.
input_walk = os.walk('/kaggle/input')
for dirname, _, filenames in input_walk:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
import janestreet

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Each column whose dtype is listed in ``numerics`` is inspected: integer
    columns are cast to the narrowest of int8/int16/int32/int64 whose range
    contains the column's min and max, and float columns to float16 or
    float32 when the range fits (float64 otherwise). Range checks are
    inclusive, so a column whose values touch a dtype's limits (e.g. exactly
    -128..127) still gets that dtype.

    Note that float downcasting only checks the value *range*; precision is
    reduced when a column is narrowed to float16/float32.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are replaced on this object (it is mutated).
    verbose : bool, default True
        When True, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit a narrower float (or min/max is NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory usage.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Load the competition training set.
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
# Impute missing values by carrying the previous row's value forward.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
train.ffill(inplace=True)
# Rows at the very top that still contain NaN (no earlier value to copy) are dropped.
train.dropna(inplace=True)

In [None]:
# Standardize the target column and keep its (mean, std) so that predictions
# can be mapped back to the original scale later.
resp_mean = train['resp'].mean()
resp_std = train['resp'].std()
resp_params = (resp_mean, resp_std)
resp_standardized = ((train['resp'] - resp_mean) / resp_std).values
resp_info = (resp_params, resp_standardized)

In [None]:
# Select the model input columns: everything except the date, the response
# columns and the row id.
non_feature_columns = ['date', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp', 'ts_id']
columns = train.columns.drop(non_feature_columns)

# Fit a standardizer (zero mean, unit variance) on the selected columns.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(train[columns])

In [None]:
# Replace `train` with its standardized feature matrix, then downcast the
# dtypes to save memory.
Z = sc.transform(train[columns])
train = reduce_mem_usage(pd.DataFrame(data=Z, columns=columns))

# 主成分分析

In [None]:
import sklearn
from sklearn.decomposition import PCA

# Feature matrix for PCA: every standardized column except 'weight'.
# Built once and shared by fit() and transform().
pca_features = train.drop(['weight'], axis=1).values

# Principal component analysis
pca = PCA()
pca.fit(pca_features)

# Project the data onto the principal component space
score = pca.transform(pca_features)
del pca_features

In [None]:
# Combine the standardized resp and the first 16 principal component scores
# into a single DataFrame.
n_pcs = 16
resp_column = resp_info[1].reshape(-1, 1)
target = pd.DataFrame(np.concatenate([resp_column, score[:, :n_pcs]], axis=1))
target.columns = pd.Index(['resp'] + ['PC{}'.format(k) for k in range(1, n_pcs + 1)])

# Append 'weight' as the last column
target = pd.concat([target, train['weight']], axis=1).copy()

# Free memory that is no longer needed
del score, train
gc.collect()

# K-Means

In [None]:
from sklearn.cluster import KMeans # K-means

# Cluster on every column except resp (the first column of `target`).
cluster_inputs = target.iloc[:, 1:]
kmeans_model = KMeans(n_clusters=5, random_state=0).fit(cluster_inputs)


def _resp_sign_label(x):
    """Return 'p' for a strictly positive value, 'n' otherwise."""
    return 'p' if x > 0 else 'n'


# Collect the result in one DataFrame: target columns + cluster label + resp sign.
cluster_frame = pd.DataFrame(kmeans_model.labels_, columns=['cluster'])
km_result = pd.concat([target, cluster_frame], axis=1)
km_result['resp_pn'] = km_result['resp'].apply(_resp_sign_label)
km_result.head()

In [None]:
# Save the fitted K-Means model for later reuse
#with open('./kmeans_model.pickle', 'wb') as f: pickle.dump(kmeans_model,f)

# Save the clustering result for later reuse
#km_result['resp_pn'] = km_result['resp'].apply(lambda x:'p' if x>0 else 'n')
#km_result.to_pickle('./km_result.pickle')

# モデル構築

In [None]:
# Split into training and validation data. The data is a time series, so no
# shuffling is done and the most recent 20% of rows become the validation set.
from sklearn.model_selection import train_test_split
train_data, valid_data = train_test_split(km_result, test_size=0.2, shuffle=False)

In [None]:
import lightgbm as lgb

# Model inputs are every column except the target ('resp') and its sign label.
non_input_columns = ['resp', 'resp_pn']
train_inputs = train_data.drop(non_input_columns, axis=1)
valid_inputs = valid_data.drop(non_input_columns, axis=1)
lgb_train = lgb.Dataset(train_inputs, train_data['resp'])
lgb_eval = lgb.Dataset(valid_inputs, valid_data['resp'])

# LightGBM parameters
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression', # objective: regression
        'metric': {'rmse'}, # evaluation metric: RMSE (root mean squared error)
        'num_iteration': 10000, # number of boosting iterations
        'verbose': 0
}

# Train the model. Early stopping is left disabled (see the commented argument),
# so all iterations are run.
model = lgb.train(
    params,
    train_set=lgb_train, # training data
    valid_sets=lgb_eval # validation data
    #early_stopping_rounds=100 # stop when the validation metric has not improved for 100 rounds (guards against overfitting)
)

In [None]:
# Save the trained LightGBM model for later reuse
#with open('./lgb_model.pickle', 'wb') as f: pickle.dump(model,f)

# 検証データについて

In [None]:
# Predict on the validation rows and map the (standardized) model output back
# to the original resp scale using the stored mean and standard deviation.
resp_mean_std = resp_info[0]
mu, sigma = resp_mean_std
valid_inputs = valid_data.drop(['resp', 'resp_pn'], axis=1)
valid_predict = model.predict(valid_inputs) * sigma + mu

In [None]:
# Draw histograms of the predictions and of the validation target side by side.
# NOTE(review): valid_predict has been rescaled with sigma/mu while
# valid_data['resp'] holds the standardized resp, so the x-axes are on
# different scales — confirm this is intended.
plt.figure()
fig = plt.gcf()
axes = fig.subplots(1, 2)

axes[0].hist(valid_predict, bins=50, label='valid_predict', color='blue')
axes[0].set_title('predict')

valid_data['resp'].hist(ax=axes[1], bins=50, histtype='step', label='valid_resp', color='red')
axes[1].set_title('true resp')

#plt.legend()

予測値と実際の resp のヒストグラムは、形状が似通っているように見える。

In [None]:
# valid_predict was already mapped back to the original resp scale when it was
# computed (model output * sigma + mu), so it is stored as is. Rescaling it a
# second time would put the thresholds below on a different scale from the
# predictions they are later compared against.
valid_data = valid_data.assign(predict=valid_predict)

# Candidate action thresholds: the 0th..99th percentiles (1% steps) of the
# strictly positive predictions. The filter does not depend on the loop
# variable, so it is evaluated once.
positive_predictions = valid_data.loc[valid_data['predict'] > 0, 'predict']

# score_data maps percentile index -> [threshold, total score], where the total
# score is the sum of valid_data['resp'] over rows predicted above the threshold.
# NOTE(review): valid_data['resp'] is the standardized resp, so the score is in
# standardized units — confirm that is the intended objective.
score_data = {}
for i in range(100):
    threshold = np.percentile(positive_predictions, i)
    score = valid_data.loc[valid_data['predict'] > threshold, 'resp'].sum()
    # save
    score_data[i] = [threshold, score]

# Total score as a function of the threshold percentile
plt.plot([score for _, score in score_data.values()], label='total score')
plt.title('total score in each thresholds')
plt.xlabel('threshold')
plt.ylabel('total score')

In [None]:
# Pick the percentile whose threshold produced the highest total score.
total_scores = [total for _, total in score_data.values()]
best_score_point = np.argmax(total_scores)
best_score_threshold, best_score = score_data[best_score_point]

for line_template, value in (
    ('best score: {}', best_score),
    ('best score point: {}', best_score_point),
    ('best threshold: {}', best_score_threshold),
):
    print(line_template.format(value))

# テストデータの推定

In [None]:
# Create the competition environment and the iterator that the inference loop
# consumes; each item it yields is unpacked there as (test_df, sample_prediction_df).
env = janestreet.make_env()
iter_test = env.iter_test()

In [None]:
# Online inference: for each test record, rebuild the training-time feature
# pipeline (standardize -> PCA -> append weight -> K-Means cluster -> LightGBM)
# and submit a 0/1 action.
# The code assumes each test_df holds exactly one row (see the .item() calls).
first_step = True
second_step = False

for (test_df, sample_prediction_df) in iter_test:
    null_pos = test_df.isnull().values  # boolean mask of missing cells
    with_null = null_pos.any()  # does this record contain any missing value?

    # Warm-up phase: until the first record without missing values arrives
    # there is no previous record to copy from, so submit "no action" and skip.
    if first_step:
        if with_null:
            sample_prediction_df["action"] = 0
            env.predict(sample_prediction_df)
            continue
        first_step = False
        second_step = True

    # From the first complete record onwards (later records may contain gaps).
    if with_null:
        # Fill the missing cells with the values of the previous record.
        null_columns = np.where(null_pos)[1]
        test_df.iloc[:, null_columns] = test_df_prv.iloc[:, null_columns].values

    # Remember this (already filled) record for the next iteration.
    test_df_prv = test_df.copy()

    # Records with zero weight: submit "no action" without running the models.
    # Series.item() returns the single scalar; the previous .items() call
    # returned an iterator, which never compares equal to 0.
    if test_df['weight'].item() == 0:
        sample_prediction_df["action"] = 0
        env.predict(sample_prediction_df)
        continue

    # Standardize with the scaler fitted on the training data.
    Z = sc.transform(test_df[columns])

    # Project the feature columns (everything after column 0, 'weight') onto
    # the principal component space.
    score_test = pca.transform(Z[:, 1:])

    # Keep the first 16 components and append the standardized weight.
    score_test = np.append(score_test[:, :16], Z[:, 0].item())

    # Predicted cluster id, appended as the last model input.
    cluster_num = kmeans_model.predict(score_test[np.newaxis, :])
    model_input = np.append(score_test, cluster_num)[np.newaxis, :]

    # Predict the standardized resp and map it back to the original scale.
    y_pred = model.predict(model_input)[0] * sigma + mu

    # Convert to an action in {0, 1}.
    action = 1 if y_pred > best_score_threshold else 0

    # Submit the result.
    sample_prediction_df["action"] = action
    env.predict(sample_prediction_df)