# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
# 支持向量机
from sklearn.svm import SVC, LinearSVC
# 随机森林
from sklearn.ensemble import RandomForestClassifier
# KNN算法
from sklearn.neighbors import KNeighborsClassifier
# 朴素贝叶斯算法
from sklearn.naive_bayes import GaussianNB
# SGD算法
from sklearn.linear_model import SGDClassifier
# 决策树算法
from sklearn.tree import DecisionTreeClassifier
# XGBoost
from xgboost import XGBClassifier
import janestreet

# 建模过程
def modeling(train):
    """Fit an XGBoost classifier on the feature columns of ``train``.

    Args:
        train: DataFrame holding the predictor columns (every column whose
            name contains ``'feature'``) and a label column named
            ``'action'``.

    Returns:
        The fitted ``XGBClassifier``.

    Note:
        Only 80% of the rows are used for fitting. The remaining 20% are
        split off with a fixed seed and then discarded; nothing is
        evaluated on them here.
    """
    print("开始建模")

    feature_mask = train.columns.str.contains('feature')
    features = train.loc[:, feature_mask]
    labels = train.loc[:, 'action']

    # Fixed random_state keeps the subset of rows used for fitting
    # reproducible between runs.
    fit_features, _, fit_labels, _ = train_test_split(
        features, labels, random_state=666, test_size=0.2
    )

    # NOTE(review): tree_method="gpu_hist" presumably needs a GPU-enabled
    # XGBoost runtime -- confirm before running on a CPU-only session.
    classifier = XGBClassifier(
        tree_method="gpu_hist",
        n_estimators=95,
        learning_rate=0.099,
        max_depth=9,
        subsample=0.7648,
    )
    classifier.fit(fit_features, fit_labels)

    return classifier

    
# 评分函数
def Score(model, data_path="/kaggle/input/jane-street-market-prediction/train.csv"):
    """Compute the utility score of ``model`` on a labelled CSV file.

    For each distinct ``date`` the daily P&L is
    ``p_i = sum(weight * resp * action)``, where ``action`` is 1 for every
    row on which the model's prediction is strictly positive and 0
    otherwise. The score is then::

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / number_of_dates)
        u = clip(t, 0, 6) * sum(p_i)

    Args:
        model: Object with a ``predict(X)`` method returning one value per
            row (class labels or predicted returns).
        data_path: CSV file with ``date``, ``weight``, ``resp`` and
            ``feature*`` columns. Defaults to the competition training
            file, so the default score is an in-sample score.

    Returns:
        The utility ``u`` as a float. ``0.0`` is returned when the file has
        no rows or when every daily P&L is zero (``t`` is undefined there).
    """
    test_df = pd.read_csv(data_path).fillna(0.0)
    if test_df.empty:
        return 0.0

    X_test = test_df.loc[:, test_df.columns.str.contains('feature')]

    # The model only decides *whether* to trade; the payoff must come from
    # the realised return stored in the data, not from the prediction.
    action = (np.asarray(model.predict(X_test)) > 0).astype("int")
    resp = test_df["resp"].values
    weight = test_df["weight"].values

    # Map every row to a dense 0..k-1 day index so the per-day sums work for
    # arbitrary (sparse, non-zero-based) date labels.
    days, day_index = np.unique(test_df["date"].values, return_inverse=True)
    count_i = len(days)
    Pi = np.bincount(day_index, weights=weight * resp * action, minlength=count_i)

    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # No trades (or all-zero P&L): avoid 0/0 producing NaN.
        return 0.0

    t = total / norm * np.sqrt(250 / count_i)
    return float(np.clip(t, 0, 6) * total)


# 特征工程
def featureEngineer(data):
    """Prepare the raw training frame and attach the binary ``action`` label.

    Steps, in order:
      1. Drop the first fifth of the rows (by position).
      2. Drop rows whose ``weight`` equals 0.
      3. Replace every missing value with 0.0.
      4. Add ``action``: 1 where ``weight * resp`` is strictly positive,
         otherwise 0.

    Args:
        data: DataFrame with at least ``weight`` and ``resp`` columns.

    Returns:
        A new DataFrame with the surviving rows and the extra ``action``
        column; ``data`` itself is not modified.
    """
    skip = int(len(data) / 5)
    kept = data.iloc[skip:, :]
    kept = kept[kept['weight'] != 0].fillna(0.0)

    weighted_return = kept['weight'].values * kept['resp'].values
    kept['action'] = (weighted_return > 0).astype('int')
    return kept


# 进行预测，求值版本
def predict_value(model):
    """Stream the competition test set and submit actions from a value model.

    For every batch yielded by the ``janestreet`` environment:
      * if the row's ``weight`` is not positive, the action is 0;
      * otherwise the ``feature*`` columns (missing values filled with 0.0)
        are passed to ``model.predict`` and the first prediction is turned
        into an action: 0 when it is negative, 1 otherwise.

    The action is written to ``sample_prediction_df.action`` and handed to
    ``env.predict``.

    Args:
        model: Object whose ``predict(X)`` returns a numeric value per row
            (presumably a predicted return -- confirm against the caller).

    Note:
        ``test_df['weight'].item()`` is used, so each batch is expected to
        hold exactly one row.
    """
    env = janestreet.make_env()  # initialise the competition environment
    iter_test = env.iter_test()  # iterator over the test-set batches

    for test_df, sample_prediction_df in iter_test:
        decision = 0
        if test_df['weight'].item() > 0:
            feature_mask = test_df.columns.str.contains('feature')
            features = test_df.loc[:, feature_mask].fillna(0.0)
            predicted = model.predict(features)[0]
            # Anything that is not strictly negative counts as "trade".
            decision = 0 if predicted < 0 else 1
        sample_prediction_df.action = decision
        env.predict(sample_prediction_df)
    
    
# 进行预测，生成提交文件，分类版
def predict_clf(model):
    """Stream the competition test set and submit a classifier's actions.

    For every batch yielded by the ``janestreet`` environment:
      * if the row's ``weight`` is not positive, the action is 0;
      * otherwise the ``feature*`` columns (missing values filled with 0.0)
        are passed to ``model.predict`` and its first output is used
        directly as the action.

    Each action is printed, written to ``sample_prediction_df.action`` and
    handed to ``env.predict``.

    Args:
        model: Fitted classifier whose ``predict(X)`` output is used as the
            action value without further conversion.

    Note:
        ``test_df['weight'].item()`` is used, so each batch is expected to
        hold exactly one row.
    """
    env = janestreet.make_env()
    batches = env.iter_test()
    for test_df, sample_prediction_df in batches:
        decision = 0
        if test_df['weight'].item() > 0:
            feature_mask = test_df.columns.str.contains('feature')
            features = test_df.loc[:, feature_mask].fillna(0.0)
            decision = model.predict(features)[0]
        print(decision)
        sample_prediction_df.action = decision
        env.predict(sample_prediction_df)
        
    
# Load the competition training data, then filter it and attach the
# `action` label via featureEngineer.
train = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")
train = featureEngineer(train)

# Train the XGBoost classifier.
print("XGBoost")
model = modeling(train)
# Optional: compute and print the model's utility score (currently disabled).
# score = Score(model)
# print("模型评分:%.2f" % score)
# Run inference through the competition environment and submit the actions.
predict_clf(model)
print("结束。")