In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt
import warnings
# Suppress every Python warning for the rest of the session (applies to all
# categories and all modules, including deprecation notices from libraries).
warnings.filterwarnings("ignore")

import os

# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(root, fname)
    for root, _, names in os.walk('/kaggle/input')
    for fname in names
)
for path in input_file_paths:
    print(path)

In [None]:
import janestreet
# `env` is reused by the inference loop (via `env.predict`), and `iter_test`
# is the iterable that loop consumes as (test_df, sample_prediction_df) pairs.
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
%%time
# Single place to change the dataset location; every file below is read from
# this directory with datatable's `fread`.
INPUT_DIR = "/kaggle/input/jane-street-market-prediction"

example_sample_submission = dt.fread(os.path.join(INPUT_DIR, "example_sample_submission.csv"))
features = dt.fread(os.path.join(INPUT_DIR, "features.csv"))
example_test = dt.fread(os.path.join(INPUT_DIR, "example_test.csv"))
train = dt.fread(os.path.join(INPUT_DIR, "train.csv"))

print("ready")

In [None]:
def _to_pandas_once(frame):
    """Convert a datatable Frame to a pandas DataFrame; return anything else unchanged.

    Makes this cell safe to re-run: after the first run the names below already
    hold pandas DataFrames, which have no `to_pandas` method.
    """
    return frame.to_pandas() if isinstance(frame, dt.Frame) else frame


# Converted one at a time so each datatable Frame can be released as soon as
# its name is rebound to the pandas copy.
example_sample_submission = _to_pandas_once(example_sample_submission)
features = _to_pandas_once(features)
example_test = _to_pandas_once(example_test)
train = _to_pandas_once(train)

In [None]:
def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in `candidates` whose range holds [c_min, c_max], else None."""
    for candidate in candidates:
        bounds = limits(candidate)
        # Inclusive bounds: a column whose max is exactly 127 still fits int8.
        # NaN or infinite extremes fail every comparison, so the result is None.
        if bounds.min <= c_min and c_max <= bounds.max:
            return np.dtype(candidate)
    return None


def reduce_mem_usage(df):
    """Downcast the columns of `df` in place to reduce memory usage.

    Credits to Guillaume Martin for the original version.

    Per column:
      * object columns are converted to 'category';
      * signed / unsigned integer columns are cast to the narrowest integer
        type of the same signedness that holds the column's min and max;
      * float columns are cast to float32 when min and max fit its range;
      * every other dtype (bool, datetime, timedelta, categorical, pandas
        extension types, ...) is left untouched.

    A column is never widened, and a column whose min/max cannot be compared
    (empty, all-NaN, or containing infinities) keeps its current dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.

    Prints the memory usage before and after, and the percentage saved.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes have a min/max that can be compared
        # against iinfo/finfo; bool, datetime and extension dtypes are skipped
        # rather than being pushed through the float branch.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_ints, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        else:
            continue

        target = _smallest_fitting_dtype(df[col].min(), df[col].max(), candidates, limits)
        # Cast only when it actually shrinks the column.
        if target is not None and target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
%%time

# Downcast each loaded frame, printing its label before the memory report.
# `reduce_mem_usage` modifies the frame it is given and returns that same
# object, so its return value does not need to be kept.
for label, frame in (
    ("submission", example_sample_submission),
    ("features", features),
    ("train", train),
    ("test", example_test),
):
    print("{} \n".format(label))
    reduce_mem_usage(frame)
    print("\n")

In [None]:
# Keep only the rows whose weight is non-zero.
train = train.loc[train["weight"] != 0]
# Model inputs: every column whose name contains 'feature'.
ls_feature = [column for column in train.columns if 'feature' in column]

X = train.loc[:, ls_feature]
# Binary target: 1 where resp is strictly positive, 0 otherwise.
y = train["resp"].gt(0).to_numpy().astype("int")

In [None]:
# Drop the `train` name now that X and y have been built from it; the cells
# below only use X and y. Re-running earlier cells that read `train` will
# fail until the data is loaded again.
del train

In [None]:
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Hyperparameters passed as keyword arguments to xgb.XGBClassifier.
xgboost_params = dict(
    n_estimators=500,
    max_depth=11,
    min_child_weight=9.15,
    gamma=0.59,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.7,
    alpha=10.4,
    nthread=5,
    missing=-999,
    random_state=2020,
    tree_method='gpu_hist',
)

In [None]:
# Build the classifier, passing every entry of `xgboost_params` as a keyword argument.
clf = xgb.XGBClassifier(**xgboost_params)

In [None]:
%%time

# Train on the feature matrix X and the binary labels y (1 where resp > 0)
# that were built from the non-zero-weight training rows.
clf.fit(X,y)

In [None]:
# Score each test batch handed out by the environment and submit the result.
for (test_df, sample_prediction_df) in iter_test:
    # Select exactly the columns (and order) the model was trained on,
    # instead of re-deriving them by substring match on every batch.
    X_test = test_df[ls_feature]
    # Stored under its own name so the training labels `y` are not
    # overwritten, which would break a later re-run of the fit cell.
    action_pred = clf.predict(X_test)
    # Bracket assignment always targets the column; attribute assignment
    # would silently set a plain attribute if the column were missing.
    sample_prediction_df["action"] = action_pred
    env.predict(sample_prediction_df)