In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import optuna
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Load the training data into `train` (only the first 2,000,000 rows are read).
train = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    nrows=2000000,
)
train.info()

In [None]:
# Drop the rows whose weight is 0.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignment below does not raise SettingWithCopyWarning.
train = train[train['weight'] != 0].copy()

# Create the binary target `action`.
# `resp` is what the classifier is trained to predict, so `action` is derived
# from it: action = 1 when resp > 0, otherwise 0.
# The quantity to maximize is p_i = sum_j(weight_ij * resp_ij * action_ij);
# taking trades with positive resp increases p_i.
# The vectorised comparison yields the same values as the previous row-wise
# `.apply(lambda x: x > 0)`, without a Python-level loop.
train['action'] = (train['resp'] > 0).astype(int)

In [None]:
# Every column whose name contains 'feature' is used as a model input.
features = [name for name in train.columns if 'feature' in name]

In [None]:
X = train[features]
y = train['action']

# Hold out 20% of the rows as a validation set.
# A fixed random_state makes the split (and everything derived from it:
# medians, scaler, PCA, tuning results) reproducible across kernel restarts.
# NOTE(review): this is a random row split; if the rows are time-ordered a
# chronological split may be more appropriate - confirm.
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Check whether the target classes are balanced in the training split.
sns.set_palette("colorblind")
# Share of each action value, computed once.
action_share = train_y.value_counts(normalize=True)
# x/y are passed by keyword: positional data arguments are deprecated in
# seaborn 0.11 and rejected with a TypeError from 0.12 onwards.
ax = sns.barplot(x=action_share.index, y=action_share.values)
ax.set_title("Proportion of trades with action=0 and action=1")
ax.set_ylabel("Percentage")
ax.set_xlabel("Action")
sns.despine();
# Author's observation: the target is well balanced, with each action value
# covering close to 50% of the trades.

In [None]:
# Draw a lower-triangle correlation heatmap to inspect how the features relate
# to each other.
corr = train_x.corr()

# Boolean mask that is True on and above the diagonal; those cells are hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Diverging colour map centred on zero correlation.
cmap = sns.diverging_palette(20, 230, as_cmap=True)

f, ax = plt.subplots(figsize=(12, 10))

sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)
# Author's observation: several features are strongly correlated.

In [None]:
# Count the missing values of every feature in the training split (they are
# imputed in the next step).
missing_values = pd.DataFrame({
    'feature': features,
    'num_missing': train_x[features].isna().sum().to_numpy(),
})
missing_values.T

In [None]:
# Per-feature medians, computed on the training split only.
train_median = train_x.median()
# Fill missing values in both the training and the hold-out validation split
# with those training medians.
train_x, valid_x = (split.fillna(train_median) for split in (train_x, valid_x))

In [None]:
# Standardize every feature before running PCA.
scaler = StandardScaler()
train_x_norm = scaler.fit_transform(train_x)

# Full PCA (all components kept) to inspect the explained-variance profile.
pca = PCA()
comp = pca.fit(train_x_norm)

# Plot the cumulative explained variance against the number of components.
# Author's observations from the original run (129 features):
#   - the first 15 components cover roughly 80% of the variance
#   - the first 40 components cover roughly 95% of the variance
cumulative_variance = np.cumsum(comp.explained_variance_ratio_)
# The x axis starts at 1 so that position k really means "k components";
# plotting against the default 0-based index shifted the curve by one.
n_components_axis = np.arange(1, len(cumulative_variance) + 1)

plt.plot(n_components_axis, cumulative_variance)
plt.grid()
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance')
sns.despine();

In [None]:
# Keep only the first 50 principal components instead of every feature, to
# speed up training.
pca = PCA(n_components=50)
pca.fit(train_x_norm)
train_x_transform = pca.transform(train_x_norm)

In [None]:
# Apply the scaler and PCA fitted on the training split to the validation set.
valid_x_norm = scaler.transform(valid_x)
valid_x_transform = pca.transform(valid_x_norm)

In [None]:
# Wrap the PCA-transformed splits and their labels in XGBoost DMatrix objects
# for use with the native `xgb.train` API.
dtrain = xgb.DMatrix(data=train_x_transform, label=train_y)
dvalid = xgb.DMatrix(data=valid_x_transform, label=valid_y)

In [None]:
def objective(trial):
    """Optuna objective: train an XGBoost booster and score it on the hold-out set.

    Hyper-parameters are sampled from `trial`, a booster is trained on the
    module-level `dtrain`, and the accuracy of its rounded predictions on
    `dvalid` / `valid_y` is returned.

    Args:
        trial: the `optuna.trial.Trial` used to sample hyper-parameters.

    Returns:
        Validation accuracy (the study is expected to maximize it).
    """
    # Number of boosting rounds. The native `xgb.train` API takes this through
    # `num_boost_round`; an 'n_estimators' entry inside `params` is not used by
    # it, so the previous code always trained the default 10 rounds no matter
    # what was sampled. The Optuna parameter name is kept as 'n_estimators' so
    # `trial.params` can still be passed straight to `XGBClassifier`.
    # NOTE: trials now train 200-600 rounds and are correspondingly slower.
    num_boost_round = trial.suggest_int('n_estimators', 200, 600)

    # Search ranges for the XGBoost parameters being tuned, plus fixed settings.
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 25),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic'
    }

    bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    preds = bst.predict(dvalid)
    # 'binary:logistic' yields probabilities; round them to 0/1 labels.
    pred_labels = np.rint(preds)
    # Evaluate on the hold-out validation set.
    accuracy = accuracy_score(valid_y, pred_labels)
    return accuracy

In [None]:
if __name__ == "__main__":
    # Search for the hyper-parameters that maximize `objective`, stopping after
    # 25 trials or once the 600 (seconds, per the Optuna docs) timeout is hit.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=25, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    # `trial` is read by later cells to build the final classifier.
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for param_name, param_value in trial.params.items():
        print("    {}: {}".format(param_name, param_value))

In [None]:
# Configure the final XGBoost classifier with the best parameters found.
# Work on a copy so the fixed settings added below do not modify the
# trial object's own `params` dict.
best_params = dict(trial.params)
best_params['tree_method'] = 'gpu_hist'
best_params['objective'] = 'binary:logistic'


In [None]:
# Build the final classifier from `best_params`, passed as keyword arguments.
optimal_clf = xgb.XGBClassifier(**best_params)

In [None]:
# Fit the final classifier on the PCA-transformed training features.
optimal_clf.fit(train_x_transform, train_y)

In [None]:
# Plot how the best accuracy evolves as the number of trials increases.
fig = optuna.visualization.plot_optimization_history(study)
fig.show();

In [None]:
# Plot the relative importance of each tuned hyper-parameter.
fig = optuna.visualization.plot_param_importances(study)
fig.show();

In [None]:
# Impute missing values (used below with the training medians).
def fillna_npwhere(array, values):
    """Replace the NaN entries of `array` with the matching entries of `values`.

    Args:
        array: numeric NumPy array that may contain NaN.
        values: replacement values, broadcast against `array` by `np.where`.

    Returns:
        `array` itself when its sum is not NaN; otherwise a new array in which
        every NaN position holds the corresponding entry of `values`.
    """
    # Cheap pre-check: the sum is NaN whenever any element is NaN.
    needs_fill = np.isnan(array.sum())
    if not needs_fill:
        return array
    return np.where(np.isnan(array), values, array)

In [None]:
# Set up the submission environment.
# NOTE(review): `janestreet` is presumably the Kaggle competition module and
# only importable inside the competition environment - confirm.
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
# Training medians as an array in `features` column order. This is
# loop-invariant, so it is computed once instead of on every test batch.
median_values = train_median[features].values

# Each iteration yields a test frame and the prediction frame to fill in and
# submit through `env.predict`.
for (test_df, sample_prediction_df) in iter_test:
    wt = test_df.iloc[0].weight
    if wt == 0:
        # Zero-weight rows get action 0 without running the model
        # (weight-0 rows were also excluded from training).
        sample_prediction_df.action = 0
    else:
        # Same preprocessing as training: median imputation -> scaling -> PCA.
        batch_x = fillna_npwhere(test_df[features].values, median_values)
        batch_x = pca.transform(scaler.transform(batch_x))
        sample_prediction_df.action = optimal_clf.predict(batch_x)
    env.predict(sample_prediction_df)