In [None]:
import gc
from tqdm import tqdm
import pandas as pd
import numpy as np
import janestreet as jane
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import multiprocessing
from joblib import delayed, Parallel, parallel_backend
from sklearn.linear_model import LinearRegression
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation / feature-name warnings
# that may matter for the cells below -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Read the competition training file and keep only the rows whose `weight`
# is non-zero; zero-weight rows are dropped before any modelling happens.
train_df = (
    pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
    .loc[lambda frame: frame['weight'] != 0]
)

In [None]:
# Preview the first rows of the filtered training frame.
train_df.head()

In [None]:
# Model inputs: every column whose name contains 'feature', followed by 'date'.
features = [name for name in train_df.columns if 'feature' in name]
features.append('date')

# Pull the inputs and the `resp` target out as plain NumPy arrays.
X = train_df[features].to_numpy()
y = train_df['resp'].to_numpy()

# The DataFrame is no longer needed once the arrays exist; release it.
del train_df
gc.collect()

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer, MissingIndicator

# Preprocessing: median-impute every column and append binary "was missing"
# indicator columns for the columns that contained NaNs during fit.
#
# error_on_new=False: by default MissingIndicator raises a ValueError at
# transform time when a batch has NaNs in a column that had none during fit.
# Batches transformed later (e.g. the submission loop) can contain such NaNs,
# so tolerate them; the set of indicator columns produced is unchanged.
#
# NOTE(review): the transformer is fitted on all rows before the train/test
# split in the following cell, so the hold-out rows influence the medians.
transformer = FeatureUnion(
    transformer_list=[
        ('features', SimpleImputer(strategy='median')),
        ('indicators', MissingIndicator(error_on_new=False)),
    ]
)

# Fit and transform in one pass; `transformer` stays fitted for later reuse.
X = transformer.fit_transform(X, y)

In [None]:
# Hold out the last 10% of rows (no shuffling, so row order is preserved)
# for evaluation; everything before that is used for fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    shuffle=False,
)

# The split arrays replace the full ones; drop the originals to free memory.
del X
del y
gc.collect()

In [None]:
# Fit a linear regression on the training split, using all available cores.
# `fit` returns the estimator itself, so construction and fitting are chained.
clf = LinearRegression(n_jobs=-1).fit(X_train, y_train)

# Predictions on the rows the model was fitted on ...
y_pred = clf.predict(X_train)

# ... and on the held-out rows.
y_pred_test = clf.predict(X_test)



In [None]:
# Report root-mean-squared error on the training split and on the hold-out.
# mean_squared_error is documented as (y_true, y_pred); pass the arguments in
# that order so the call stays correct if options such as sample_weight are
# added later. The printed values are unchanged.
print(f'train rmse  {np.sqrt(mean_squared_error(y_train, y_pred))}')
print(f'test rmse  {np.sqrt(mean_squared_error(y_test, y_pred_test))}')

In [None]:
# Submission loop: pull batches from the competition environment, run them
# through the fitted preprocessing + model, and hand back the actions.
# The module is already imported as `jane` in the first cell, so no second
# import is needed here.
env = jane.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    # Use a dedicated name instead of overwriting the hold-out `X_test`, so
    # earlier cells can be re-run after this one without picking up a live
    # batch. The transformer was fitted on a NumPy array, so pass one here too.
    X_live = transformer.transform(test_df.loc[:, features].to_numpy())
    preds = clf.predict(X_live)

    # Act (1) only when weight * predicted response is strictly positive;
    # otherwise the action is 0.
    action = ((test_df['weight'].to_numpy() * preds) > 0).astype('int')

    # Assign by key: attribute assignment only updates a column that already
    # exists and would silently create a plain attribute otherwise.
    sample_prediction_df['action'] = action
    env.predict(sample_prediction_df)