In [None]:
import gc

from tqdm import tqdm

import pandas as pd
import numpy as np
import janestreet as jane
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score

import matplotlib.pyplot as plt

import multiprocessing
from joblib import delayed, Parallel, parallel_backend

import warnings
warnings.filterwarnings("ignore")

Reading the data, creating the target

In [None]:
%%time

# Load the full training set and keep only the rows whose weight is non-zero.
train = pd.read_csv("../input/jane-street-market-prediction/train.csv")
train = train[train["weight"] != 0]

# Binary target: 1 where weight * resp is strictly positive, 0 otherwise.
train["Y"] = train["weight"].mul(train["resp"]).gt(0).astype("int")

# The first 100000 remaining rows (by position) form the cross-validation subset.
trainCV = train.iloc[:100000, :]

# Bare inspection expressions; their results are not stored anywhere.
train.ndim       # number of axes of the data frame
train.size       # total number of elements (rows * columns)
train.shape      # (rows, columns)
train.head(n=5)

# Date column of the full set and of the cross-validation subset
# (positional slice, so it covers the same rows as trainCV).
date = train["date"]
dateCV = date[:100000]

Creating targets and features

In [None]:
# Pre-computed feature-importance table.
IMPORTANCE_CSV = "../input/market-prediction-feature-importance-lofo-lgbm/importanceResults.csv"
impFeat = pd.read_csv(IMPORTANCE_CSV)

# First column of the first 60 rows; later cells use these values as the names
# of the training columns to keep (presumably ordered by importance - TODO confirm).
featVecGen = impFeat.iloc[:60, 0]

In [None]:
# Only the first 20 rows of the importance table are plotted.
impFeat = impFeat.iloc[:20, :]

# Columns by position: 0 -> bar labels, 1 -> bar lengths, 2 -> error-bar sizes.
bar_labels = impFeat.iloc[:, 0]
bar_lengths = impFeat.iloc[:, 1]
bar_errors = impFeat.iloc[:, 2]

plt.figure(figsize=(10, 7))
plt.barh(bar_labels, bar_lengths, xerr=bar_errors, align='center', color="green")
plt.title('LOFO Feature Importance')
plt.xlabel('Importance')

plt.show()

In [None]:
%%time

# Feature matrices: keep the columns whose names appear in featVecGen.
# The mask is computed once; train and trainCV share the same columns.
# Note that the resulting column order is the frame's own order, not the
# order of featVecGen.
feature_mask = train.columns.isin(featVecGen)
X, XCV = train.loc[:, feature_mask], trainCV.loc[:, feature_mask]

# Targets: select the "Y" column by exact name instead of a substring match,
# which would also pick up any other column whose name merely contains "Y".
# They are kept as single-column DataFrames because later cells index them
# with `.loc[row_mask, :]`.
Y, YCV = train.loc[:, ["Y"]], trainCV.loc[:, ["Y"]]

Removing train dataset

In [None]:
# Drop the names of the full frames (the derived feature/target frames are
# used from here on) and trigger a garbage-collection pass.
del train
del trainCV
gc.collect()

Define function for normalization and subsetting

Define models

In [None]:
def models(k, n_estimators=70, max_depth=12, max_samples=0.75, min_samples_leaf=3):
    """Build a single, unfitted random forest classifier.

    Parameters
    ----------
    k : int
        Seed passed to the classifier as ``random_state``.
    n_estimators : int, default 70
        Number of trees in the forest.
    max_depth : int, default 12
        Maximum depth of each tree.
    max_samples : float, default 0.75
        Value passed through as the classifier's ``max_samples`` argument.
    min_samples_leaf : int, default 3
        Minimum number of samples required in a leaf.

    Returns
    -------
    RF
        One classifier instance (not a list), configured with ``n_jobs=-1``.
    """
    return RF(
        n_jobs=-1,
        random_state=k,
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_samples=max_samples,
        min_samples_leaf=min_samples_leaf,
    )

help(models)

Read in important feature names

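The cross-validation is split by time: each model is trained on one window of dates and predicts the window that follows it. The fitted models are stored in a list and their quality is assessed by accuracy (ROC AUC, recall and precision are recorded as well).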

A random forest cannot extrapolate trends, so it needs stationary data.
A logit model is able to extrapolate and is therefore a good supplement.

In [None]:
%%time

# --- configuration of the time-based split -------------------------------
folds = 10   # the date range is cut into this many equally wide bins
shift = 4    # width, in bins, of both the training and the test window
denom = (dateCV.max() + 1) / folds   # width of one bin in date units

# Bin number of every row. It does not depend on the fold, so it is
# computed once here instead of four times per iteration.
date_bin = (dateCV / denom).astype("int")

# The training window starts at bin D and the test window ends before bin
# D + 2*shift, so D can take the values 0 .. folds - 2*shift (inclusive).
n_splits = folds - shift*2 + 1
print("total folds: ", n_splits)

# Per-fold metrics and fitted models.
AccRFC = []
AucRFC = []
RecRFC = []
PreRFC = []
RFCs = []

D = 0
k = 0

for k in range(n_splits):
    D = k   # first bin of the training window; also used as the model seed

    # Training rows: bins [D, D+shift); test rows: bins [D+shift, D+2*shift).
    train_index = (date_bin >= D) & (date_bin < D + shift)
    test_index = (date_bin >= D + shift) & (date_bin < D + shift*2)
    print("D: ", D, "D+shift: ", D+shift)

    # train test split
    X_train, X_test = XCV.loc[train_index,:], XCV.loc[test_index,:]
    y_train, y_test = YCV.loc[train_index,:], YCV.loc[test_index,:]

    # Missing values are replaced by the sentinel -999 before fitting/predicting.
    X_test_filled = X_test.fillna(value=-999)

    # learn; the single-column target frame is flattened to a 1-d array
    RFC = models(k)
    RFC.fit(X_train.fillna(value=-999), y_train.values.ravel())

    # store fitted models
    RFCs.append(RFC)

    # predict: hard labels for the threshold metrics, positive-class
    # probabilities for the ranking metric (column 1 of predict_proba
    # corresponds to the second entry of RFC.classes_)
    PredRFC = RFC.predict(X_test_filled)
    ProbRFC = RFC.predict_proba(X_test_filled)[:, 1]

    # store accuracy
    AccRFC.append(accuracy_score(y_test, PredRFC))

    # store AUC, computed from scores rather than from 0/1 labels
    AucRFC.append(roc_auc_score(y_test, ProbRFC))

    # Recall
    RecRFC.append(recall_score(y_test, PredRFC))

    # Precision
    PreRFC.append(precision_score(y_test, PredRFC))

In [None]:
# Report the per-fold scores collected during cross-validation, one metric per line.
metric_report = (
    ("Accuracy Random Forest is: ", AccRFC),
    ("Auc Roc Random Forest is: ", AucRFC),
    ("Recall Random Forest is: ", RecRFC),
    ("Precision Random Forest is: ", PreRFC),
)
for metric_label, fold_scores in metric_report:
    print(metric_label, fold_scores)

In [None]:
No2plt = 10   # number of top features shown in the plot

# One row per fitted cross-validation model, one column per feature.
ImpMatrix = np.vstack([model.feature_importances_ for model in RFCs])

# Mean importance per feature and its spread (max - min) across the models.
RFImpMean = ImpMatrix.mean(axis=0)
ImpRange = ImpMatrix.max(axis=0) - ImpMatrix.min(axis=0)

FeatureNames = np.asarray(X_train.columns, dtype=str)

# Sort on the numeric importances, largest first (ties broken by name).
# Names and values are kept in separate arrays: stacking them into one array
# would turn the floats into strings, and strings such as '9e-05' sort above
# '0.05'.
order = np.lexsort((FeatureNames, RFImpMean))[::-1]
top = order[:No2plt]

# The same index selects names, means and error bars, so every bar is drawn
# with the spread of its own feature.
Features = FeatureNames[top]
Importance = np.round(RFImpMean[top].astype("float32"), 4)
error = ImpRange[top]


plt.figure(figsize=(12,7))
plt.barh(Features,
         Importance,
         xerr=error,
         align='center',
         color = "green")
plt.xlabel('Importance')
plt.title('RF Gini Feature Importance')

plt.show()

In [None]:
# Cross-validation is finished: drop its intermediate objects before the final fit.
del X_train, X_test, y_train, y_test      # last fold's split
del PredRFC, AccRFC                       # last predictions and accuracy list
del RFC, RFCs                             # fitted cross-validation models
del impFeat, date, dateCV                 # importance table and date columns
gc.collect()

### Final learning

In [None]:
%%time

# Final model: seed 10, fitted on the full feature matrix with missing values
# replaced by -999 (fit returns the fitted estimator itself).
RFC = models(10).fit(X.fillna(value=-999), Y)
gc.collect()


Initialize the environment, prepare the data and submit.

In [None]:
%%time

cores = multiprocessing.cpu_count()

env = jane.make_env()
iter_test = env.iter_test()

# Columns of the matrix the final model was fitted on, in fitting order.
# Selecting the test columns with featVecGen would hand them over in
# featVecGen's order, which is not the column order of X, so the forest
# would read values from the wrong feature positions.
feature_cols = list(X.columns)

for (test_df, sample_prediction_df) in tqdm(iter_test):

    if test_df["weight"].item() == 0:
        # Rows with zero weight get action 0 without calling the model.
        sample_prediction_df.action = 0
    else:
        # Same missing-value sentinel as used for training.
        X_test = test_df[feature_cols].fillna(value=-999)
        sample_prediction_df.action = RFC.predict(X_test)

    env.predict(sample_prediction_df)