# Library

In [None]:
import os
import os.path as osp
import pickle

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Relative path to the training table CSV that the rest of this notebook reads.
TRAIN = "../usr/lib/orvp_table_data_generator/train_df.csv"

In [None]:
# Load the full training table into memory; later cells filter its columns in place.
train_df = pd.read_csv(TRAIN)

# Create the cluster list

In [None]:
# Build a stock_id -> c1 lookup table (mean of c1 within each stock_id) and
# write it to CSV. The built-in groupby mean is used instead of
# agg(np.mean): newer pandas warns that a NumPy callable passed to agg will
# stop being mapped to the built-in implementation.
cluster_list = train_df.groupby('stock_id')[['c1']].mean().reset_index()
cluster_list.to_csv("cluster_list.csv", index=False)

# Identify highly correlated features

In [None]:
def get_fe_remove(df, threshold=0.95):
    """Return the columns of ``df`` that duplicate an earlier column.

    Columns are scanned in the order of the correlation matrix. For every
    column that has not itself been marked for removal, each *later* column
    whose absolute correlation with it is strictly greater than ``threshold``
    is marked for removal, so the first column of a correlated group is the
    one that survives.

    The bookkeeping columns ``stock_id``, ``fold``, ``oof``, ``row_id``,
    ``target`` and ``time_id`` never cause other columns to be removed.
    They can still show up in the result when they correlate with an
    earlier non-bookkeeping column, so callers must filter out any name
    they need to keep.

    Args:
        df: DataFrame to analyse. Only numeric columns take part in the
            correlation; other columns are ignored and never returned.
        threshold: Absolute-correlation cut-off (exclusive).

    Returns:
        List of column names to drop, without duplicates, in the order in
        which they were first marked.
    """
    protected = {'stock_id', 'fold', 'oof', 'row_id', 'target', 'time_id'}

    try:
        # pandas >= 2.0 raises on non-numeric columns unless asked to skip them.
        corr_df = df.corr(numeric_only=True).abs()
    except TypeError:
        # pandas < 1.5 has no ``numeric_only`` argument; there ``corr()``
        # already ignores non-numeric columns.
        corr_df = df.corr().abs()

    # A dict is used as an insertion-ordered set: O(1) membership tests and a
    # reproducible result order (``list(set(...))`` varies between runs).
    fe_remove = {}

    for i, feature in enumerate(corr_df.columns.tolist()):
        if feature in fe_remove or feature in protected:
            continue

        # Only look below the diagonal so each pair is examined once.
        # NaN correlations (e.g. constant columns) compare False and are kept.
        later = corr_df.iloc[i + 1:, i]
        fe_corr = later[later > threshold].index.tolist()
        fe_remove.update(dict.fromkeys(fe_corr))

    return list(fe_remove)

In [None]:
# Find columns whose absolute correlation with an earlier column exceeds 0.97.
fe_remove = get_fe_remove(train_df, threshold=0.97)

# The categorical id columns are kept even when flagged; everything else
# that was flagged is dropped from the training table.
fe_remove_ = [name for name in fe_remove if name not in ('c1', 'stock_id')]
train_df = train_df.drop(columns=fe_remove_)

In [None]:
# Display the names of the columns that were just dropped from train_df.
fe_remove_

# Fit the imputer and scaler for numeric features

In [None]:
# Split the table into model inputs (x) and the label column (y).
x = train_df.drop(['row_id', 'target'], axis=1)
y = train_df['target']

cols_drop = ['time_id']          # excluded from the numeric feature matrix
cols_cat = ['stock_id', 'c1']    # handled separately as categorical inputs

# Cast the categorical id columns to plain integers.
for fe in cols_cat:
    x[fe] = x[fe].astype(int)

# Treat +/-inf as missing, then forward-fill gaps from the previous row.
# DataFrame.ffill() replaces fillna(method='ffill'), whose ``method``
# argument is deprecated in recent pandas releases.
x = x.replace([np.inf, -np.inf], np.nan)
x = x.ffill()
x_num = x.drop(cols_cat + cols_drop, axis=1).values
x_cat = x.loc[:, cols_cat].values

# Forward-fill cannot fill NaNs in the leading rows, so a mean imputer is
# fitted for whatever is still missing.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x_num)

# The scaler is fitted on x_num as-is (not on the imputer's output).
scaler = QuantileTransformer(n_quantiles=2000, random_state=28, output_distribution='normal')
scaler.fit(x_num)

In [None]:
# Serialize the fitted imputer and scaler to disk with pickle, one file each.
for fitted, pkl_name in ((imputer, "imputer.pkl"), (scaler, "scaler.pkl")):
    with open(pkl_name, "wb") as f:
        pickle.dump(fitted, f)

# Fit the label encoders for categorical features

In [None]:
# Fit one LabelEncoder per categorical column, record how many distinct
# classes it saw, and pickle each encoder to "le_<column>.pkl".
cat_dims = []
for col in cols_cat:  # the loop index was never used, so enumerate() is dropped
    le = LabelEncoder()
    le.fit(x.loc[:, col])
    cat_dims.append(len(le.classes_))

    with open(f"le_{col}.pkl", "wb") as f:
        pickle.dump(le, f)

In [None]:
# Display the number of distinct classes per categorical column, in cols_cat order.
cat_dims  # stock_id, c1

In [None]:
# Write train_df to CSV without the index column.
# NOTE: train_df has had the highly correlated columns dropped, but it is not
# imputed or scaled here; the imputer and scaler were only fitted on x.
train_df.to_csv("train_preprocessed.csv", index=False)