## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [None]:
# Load the full competition training set from the Kaggle input mount.
TRAIN_CSV_PATH = "/kaggle/input/jane-street-market-prediction/train.csv"
train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# (number of rows, number of columns) of the raw training frame.
train.shape

In [None]:
# Per-column summary statistics of the raw training frame.
train.describe()

## Missing values

In [None]:
# Count missing values per column and list the worst-affected columns first.
missing_per_column = train.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Number of columns that contain at least one missing value.
# (Message typo "colas" corrected to "columns".)
print('Train Nan Valued columns: %d' % train.isna().any().sum())

In [None]:
import missingno as msno

# Visualise the missing-value pattern of the rows whose date equals 0.
is_day0 = train['date'].eq(0)
day0 = train.loc[is_day0]
msno.matrix(day0);

## Handling Missing Data

In [None]:
# Fill every missing entry with the mean of its column, then rebuild a
# DataFrame that carries the original column labels and row index.
imputer = SimpleImputer(strategy='mean')
filled_values = imputer.fit_transform(train)
imputed_train = pd.DataFrame(filled_values, columns=train.columns, index=train.index)

# True would mean at least one column still contains a missing value.
has_missing = imputed_train.isna().sum().any()
print(f"Is there any missing values left? {has_missing}")
imputed_train.head()

In [None]:
# Drop rows holding an extreme value in any column: a row is kept only when
# every column's absolute z-score is below `threshold`.
threshold = 4
# np.asarray gives a plain array to work with whatever container type
# stats.zscore hands back for DataFrame input.
z = np.abs(np.asarray(stats.zscore(imputed_train, nan_policy='omit')))
# A zero-variance column yields NaN z-scores, and `NaN < threshold` is False,
# which would silently discard every row. A NaN score carries no evidence of
# an outlier, so treat it as within bounds.
within_threshold = (z < threshold) | np.isnan(z)
clean_train = imputed_train[within_threshold.all(axis=1)].reset_index(drop=True)
clean_train

In [None]:
# Remove every row that has an exact duplicate elsewhere in the frame.
# keep=False discards all copies of a duplicated row, not only the repeats.
clean_train = clean_train.drop_duplicates(keep=False)
clean_train

## Correlation

In [None]:
# Pearson correlation between every pair of columns in the cleaned data.
correlations = clean_train.corr(method='pearson')

fig, ax = plt.subplots(figsize=(16, 16))
# Draw on the axes created above instead of relying on the implicit
# "current axes" state.
sns.heatmap(correlations, ax=ax)
ax.set_title('Pearson correlation of the cleaned training columns')
# plt.show() renders under the notebook's inline backend; Figure.show() only
# emits a "non-GUI backend" warning there.
plt.show()

In [None]:
# Correlation matrix restricted to the rows of dates 0 and 1 in the cleaned data.
day0 = clean_train.loc[clean_train['date'] == 0]
day1 = clean_train.loc[clean_train['date'] == 1]
day0and1 = pd.concat([day0, day1])
# Styler.set_precision was deprecated and later removed from pandas; an
# explicit two-decimal format string renders the same on old and new versions.
(
    day0and1.corr()
    .style
    .background_gradient(cmap='coolwarm', axis=None)
    .format("{:.2f}")
)

## Outlier Analysis

In [None]:
def find_skewed_boundaries(df, variable, distance):
    """Return inter-quartile-range fences for one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    variable : hashable
        Label of the column in ``df``.
    distance : float
        How many IQRs beyond the quartiles the fences are placed.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- the upper fence comes first.
    """
    # Each quartile is computed once and reused for both the IQR and the fences.
    first_quartile = df[variable].quantile(0.25)
    third_quartile = df[variable].quantile(0.75)
    iqr = third_quartile - first_quartile

    lower_boundary = first_quartile - (iqr * distance)
    upper_boundary = third_quartile + (iqr * distance)

    return upper_boundary, lower_boundary

# IQR fences for `resp` on the raw training frame, 1.5 IQRs beyond the quartiles.
# The helper returns the upper fence first.
upper_resp, lower_resp = find_skewed_boundaries(df=train, variable='resp', distance=1.5)

print('Capping are', lower_resp, upper_resp)

## EDA

In [None]:
# One distribution panel per column, laid out on a 2 x 3 grid.
# Every panel gets a title; previously only the first two had one.
dist_columns = ['resp', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4']

fig, axs = plt.subplots(2, 3, figsize=(12, 10))
axs = axs.flatten()
for ax, column in zip(axs, dist_columns):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the cell runs on the seaborn version the notebook targets.
    sns.distplot(clean_train[column], ax=ax)
    ax.set_title(column)
fig.tight_layout()

In [None]:
# Running totals of `resp` and of the four resp_N columns, drawn on one axes.
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_xlabel ("Trade", fontsize=18)
ax.set_title ("Cumulative return of resp and time horizons 1, 2, 3, and 4", fontsize=18)

balance = clean_train['resp'].cumsum()
resp_1 = clean_train['resp_1'].cumsum()
resp_2 = clean_train['resp_2'].cumsum()
resp_3 = clean_train['resp_3'].cumsum()
resp_4 = clean_train['resp_4'].cumsum()

# Each series keeps its column name, which becomes its legend label.
for cumulative in (balance, resp_1, resp_2, resp_3, resp_4):
    cumulative.plot(lw=3)
plt.legend(loc="upper left");

In [None]:
# Disabled per-feature plotting loop, kept inside a bare string literal so
# that none of it executes.
# NOTE(review): as the only expression in its cell, the string itself will
# presumably be echoed as the cell output; convert it to `#` comments or
# delete it if that is unwanted. The disabled code also uses a bare `except`
# and samples from `train` rather than `clean_train` -- review before
# re-enabling.
'''feat= [c for c in clean_train.columns if 'feature' in c]
for f in feat:
    fig, axs = plt.subplots(1, 4, figsize=(15, 4))
    sns.distplot(clean_train[f], ax=axs[0])
    sns.distplot(clean_train.query('weight > 0')[f], ax=axs[1])
    try:
        sns.distplot(clean_train.query('weight > 0 and resp > 0')[f].dropna().apply(np.log1p), ax=axs[2])
        sns.distplot(clean_train.query('weight > 0 and resp < 0')[f].dropna().apply(np.log1p), ax=axs[2])
    except:
        pass
    train.sample(5000).plot(kind='scatter', x=f, y='resp', ax=axs[3])
    fig.suptitle(f, fontsize=15, y=1.1)
    
    axs[0].set_title('feature distribution')
    axs[1].set_title('only weight > 0')
    axs[2].set_title('log transform')
    axs[3].set_title('feature vs. response')
    
    plt.tight_layout()
    plt.show()
'''

In [None]:
# Keep only the rows with a strictly positive weight and renumber the index.
# Note this rebinds `train`, so later cells see the filtered raw frame.
original_size = len(train)
has_positive_weight = train['weight'] > 0
train = train[has_positive_weight].reset_index(drop=True)

# # use data later than DATE_BEGIN
# train = train.query(f'date >= {DATE_BEGIN}')

print('Train size reduced from {:,} to {:,}.'.format(original_size, len(train)))

In [None]:
# Binary target: 1 where resp * weight is strictly positive, otherwise 0.
weighted_resp = train['resp'] * train['weight']
train['action'] = (weighted_resp > 0).astype(int)

In [None]:
# Model inputs: every column whose name begins with 'feature'.
feats = [name for name in train.columns if name.startswith('feature')]
print('There are {:,} features.'.format(len(feats)))

In [None]:
import lightgbm as lgb

In [None]:
# Gradient-boosted classifier predicting `action` from the feature columns.
# Hyper-parameter values are kept exactly as they were.
model = lgb.LGBMClassifier(
    num_leaves=264,
    max_depth=8,
    min_child_weight=3,
    feature_fraction=0.5059148739717305,
    bagging_fraction=0.5984799881722351,
    bagging_freq=3,
    min_child_samples=68,
    lambda_l1=0.0009060749477250305,
    lambda_l2=1.9356247580596065e-06,
)
# `verbose=` was dropped from the call: it only controls evaluation logging,
# which needs an eval_set (none is passed here), and newer LightGBM releases
# no longer accept the argument in fit().
model.fit(train[feats], train['action'])

In [None]:
import janestreet

In [None]:
# Run configuration.
# NOTE(review): among the lines visible in this notebook, only
# TRADING_THRESHOLD is read by executable code (the submission loop). SEED and
# INPUT_DIR are not referenced, and DATE_BEGIN appears only in a commented-out
# filter -- confirm whether they are still needed.
SEED = 20201225 # Merry Christmas!
# INPUT_DIR = '../input/jane-street-market-prediction/'
INPUT_DIR = '../input/janestreet-save-as-feather/'
# Probability cut-off above which a trade is taken.
TRADING_THRESHOLD = 0.5 # 0 ~ 1: The smaller, the more aggressive
DATE_BEGIN = 0 # 0 ~ 499: set 0 for model training using the complete data

In [None]:
# Feed each batch from the competition environment through the trained model
# and submit the resulting actions.
env = janestreet.make_env()
test = env.iter_test()

for test_df, pred_df in test:
    # Second predict_proba column is the probability assigned to class 1.
    positive_proba = model.predict_proba(test_df[feats])[:, 1]
    # Act (1) only when that probability exceeds the configured threshold.
    pred_df.action = (positive_proba > TRADING_THRESHOLD).astype('int')
    env.predict(pred_df)