<a href="https://colab.research.google.com/github/pratik-poudel/jane-street/blob/main/1%20JS%20auto%20sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download gdonchyts/jane-street-market-prediction-parquet -f train.parquet
import zipfile
zip_ref = zipfile.ZipFile('/content/train.parquet.zip', 'r')
zip_ref.extractall('files')
zip_ref.close()

Downloading train.parquet.zip to /content
 99% 1.07G/1.08G [00:08<00:00, 147MB/s]
100% 1.08G/1.08G [00:08<00:00, 130MB/s]


In [None]:
!sudo apt-get install build-essential swig
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
!pip install auto-sklearn

In [6]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import gc
# pd.options.display.float_format| = "{:.2f}".format
pd.options.display.max_columns = 500

In [7]:
df = pd.read_parquet('/content/files/train.parquet')

In [8]:
features = [f for f in df.columns if 'feature' in f]
features = features[1:]

In [79]:
train = df.query('date > 85').reset_index(drop = True) 
train = df[df['weight'] != 0]
train.shape

(1981287, 138)

In [80]:
train['action'] = ((train['weight'].values * train['resp'].values) > 0).astype('int')

In [81]:
train['action'].value_counts()

1    999387
0    981900
Name: action, dtype: int64

In [82]:
train.shape

(1981287, 139)

In [83]:
valid = train[train['date'] > 457].sample(2000)
train = train[train['date'] <= 457].sample(8500)

In [84]:
x_train = train[features]
y_train = train['action']
x_test = valid[features]
y_test = valid['action']

In [113]:
y_train

1131020    0
873855     1
1948001    0
742472     0
818080     1
          ..
777858     1
15991      0
221269     1
2003423    0
361407     0
Name: action, Length: 8500, dtype: int64

In [85]:
from autosklearn.classification import AutoSklearnClassifier

In [109]:
automl = AutoSklearnClassifier(time_left_for_this_task=1500,per_run_time_limit=750,
                               memory_limit=20000, 
                               include_estimators = ['random_forest', 'sgd'], exclude_estimators=None, 
                               include_preprocessors = ['no_preprocessing'],exclude_preprocessors=None,
                               seed=60,n_jobs=8, metric=autosklearn.metrics.roc_auc)

In [118]:
import sklearn
feat_type = ['Categorical' if x.name == 'category' else 'Numerical' for x in x_train[features].dtypes]


In [None]:
feat_type

In [None]:
automl.fit(x_train.to_numpy(np.float), y_train.values, x_test.to_numpy(np.float), y_test.values,feat_type=feat_type )


In [122]:
y_pred= automl.predict_proba(x_test.values)
print(sklearn.metrics.roc_auc_score(y_test.values, y_pred[:, 1]))

0.5304135181864755


In [123]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: 9553fd0508fa3a5a7671ef6ceda44de6
  Metric: roc_auc
  Best validation score: 0.526217
  Number of target algorithm runs: 53
  Number of successful target algorithm runs: 41
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 12
  Number of target algorithms that exceeded the memory limit: 0



In [124]:
print(automl.show_models())

[(0.460000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'sgd', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'normalize', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:sgd:alpha': 0.04572354220586163, 'classifier:sgd:average': 'True', 'classifier:sgd:fit_intercept': 'True', 'classifier:sgd:learning_rate': 'constant', 'classifier:sgd:loss': 'hinge', 'classifier:sgd:penalty': 'l1', 'classifier:sgd:tol': 0.019009557451729487, 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.0007965069711082688, 'classifier:sgd:eta0': 7.622836075055025e-07},
dataset_properties={
  'task': 1,
  'spa

In [125]:
automl.cv_results_['params'][np.argmax(automl.cv_results_['mean_test_score'])]

{'balancing:strategy': 'weighting',
 'classifier:__choice__': 'random_forest',
 'classifier:random_forest:bootstrap': 'False',
 'classifier:random_forest:criterion': 'gini',
 'classifier:random_forest:max_depth': 'None',
 'classifier:random_forest:max_features': 0.4846858122343987,
 'classifier:random_forest:max_leaf_nodes': 'None',
 'classifier:random_forest:min_impurity_decrease': 0.0,
 'classifier:random_forest:min_samples_leaf': 1,
 'classifier:random_forest:min_samples_split': 6,
 'classifier:random_forest:min_weight_fraction_leaf': 0.0,
 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding',
 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer',
 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.02004575263163203,
 'data_preprocessing:numerical_transformer:imputation:strategy': 'median',
 'data_preprocessing:numerical_transformer:rescaling:__c