In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Load the data.  The training CSV is fairly large, so only the first 2,500 rows are read to keep memory usage low.

In [None]:
# Read only the first N_ROWS rows of the training file instead of the whole CSV.
TRAIN_CSV = '/kaggle/input/jane-street-market-prediction/train.csv'
N_ROWS = 2500
df1 = pd.read_csv(TRAIN_CSV, nrows=N_ROWS)

Keep only the feature columns.

Instead of trying to predict the value of the `resp` variable, let's just predict its direction: 1 when `resp` is positive (up) and 0 otherwise (down).

In [None]:
# Feature matrix: every column whose name contains 'feature'.
feature_mask = df1.columns.str.contains('feature')
X = df1.loc[:, feature_mask]

# Binary target: 1 where resp is strictly positive, 0 otherwise.
y = (df1['resp'] > 0) * 1

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=69
)

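Some simple cleaning.  Replace each NaN with the mean of its column, then scale every feature to mean = 0 and sd = 1.  Both the imputer and the scaler are fit on the training split only and then applied unchanged to the test split.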

In [None]:
# Mean imputation: the column means are learned from the training split only
# and then applied to both splits. X_train / X_test are rebound to the
# imputed output.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(X_train)
X_train, X_test = imp.transform(X_train), imp.transform(X_test)

# Standardisation: the scaler is likewise fit on the (imputed) training split
# and reused as-is for the test split.
ss = StandardScaler()
ss.fit(X_train)
X_train_scaled, X_test_scaled = ss.transform(X_train), ss.transform(X_test)

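Fit a simple logistic regression.  An elastic-net penalty (`l1_ratio = .5`, an equal mix of L1 and L2) is used so that the L1 part can shrink the coefficients of useless features to zero.  `max_iter` is raised well above the default so that the `saga` solver converges instead of emitting a convergence warning.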

In [None]:
# Elastic-net logistic regression. The 'saga' solver is the one that supports
# the elasticnet penalty; l1_ratio=.5 weights the L1 and L2 terms equally.
# 'saga' shuffles the data while optimising, so random_state is fixed (same
# seed as the train/test split) to make the fitted model and the reported
# score reproducible from run to run.
logreg = LogisticRegression(
    solver='saga',
    penalty='elasticnet',
    l1_ratio=.5,
    max_iter=5000,  # raised so the solver has room to converge
    random_state=69,
)
logreg.fit(X_train_scaled, y_train)

# Mean accuracy on the held-out split.
test_accuracy = logreg.score(X_test_scaled, y_test)
print(f'Log Reg Score: {test_accuracy:.3f}')

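So a simple model, neither tuned nor trained on the whole data set, already reaches roughly 61% accuracy on the held-out split.  Note that the split contains only 625 rows (25% of the 2,500 loaded), so this figure is a rough estimate.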

In [None]:
# janestreet is imported here, next to its only use, rather than with the
# other imports at the top of the notebook.
import janestreet
import tqdm

env = janestreet.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set

for (test_df, sample_prediction_df) in tqdm.tqdm(iter_test):
    # Select the feature columns the same way as for the training frame.
    batch_features = test_df.loc[:, test_df.columns.str.contains('feature')]

    # Reuse the imputer and scaler fitted on the training split; the result is
    # already a 2-D (rows, features) array, so no reshape is needed.
    batch_scaled = ss.transform(imp.transform(batch_features))

    # One 0/1 action per row of the batch, written with explicit column
    # assignment rather than attribute assignment.
    sample_prediction_df['action'] = logreg.predict(batch_scaled)
    env.predict(sample_prediction_df)