In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, train_test_split, KFold

import matplotlib.pyplot as plt
import seaborn as sns

import gc
import time


In [None]:
# Read the four competition CSV files from the input directory, in this order:
# training rows, feature/tag table, example test rows, example submission.
data_path = '/kaggle/input/jane-street-market-prediction'
train, features, test, submission = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        'train.csv',
        'features.csv',
        'example_test.csv',
        'example_sample_submission.csv',
    )
)


In [None]:
import time

# Time three ways of replacing NaN with 0 on the full training frame.
# Each entry is (label to print, zero-argument callable that does the fill).
fill_strategies = [
    ('Pandas fillna:', lambda: train.fillna(0, inplace = False)),
    ('nan_to_num fillna:', lambda: np.nan_to_num(train)),
    ('np.where fillna:', lambda: np.where(np.isnan(train), 0, train)),
]

for label, fill in fill_strategies:
    start = time.time()
    x_ = fill()
    stop = time.time()
    print(label, stop - start)

In [None]:
# downcast dataset to save memory
gc.collect()

def downcast_type(df):
    """Narrow 64-bit numeric columns of ``df`` to 32 bits.

    Columns whose dtype is ``float64`` are converted to ``float32`` and columns
    whose dtype is ``int64`` are converted to ``int32``. All other columns are
    left untouched. The frame is modified in place and also returned.
    """
    narrowing = (('float64', np.float32), ('int64', np.int32))

    for wide, narrow in narrowing:
        # Pick the columns that currently have the wide dtype, then replace them.
        wide_cols = [name for name in df.columns if df[name].dtype == wide]
        df[wide_cols] = df[wide_cols].astype(narrow)

    return df

# Downcast all three loaded frames (training rows, tag table, example test rows).
train, features, test = map(downcast_type, (train, features, test))
    


In [None]:
# Report (rows, columns) for each loaded frame, then trigger a garbage collection.
for frame in (train, features, test):
    print(frame.shape)
gc.collect()

In [None]:
# Show the first five rows of the training frame.
train.head(n=5)

In [None]:
# Show the first five rows of the feature/tag table.
features.head(n=5)

In [None]:
# Show the first five rows of the example test frame.
test.head(n=5)

In [None]:
# Number of training rows per date, largest count first.
train['date'].value_counts().sort_values(ascending=False)

In [None]:
# Plot the date column against the row index.
plt.plot(train['date'], color='blue')

In [None]:
# Raw model: without explicit feature engineering or domain knowledge, get a baseline prediction
gc.collect()

threshold = 0

# Binarize both targets: 1 when the value is greater than threshold, otherwise 0.
#
# Only the target column itself is written. The earlier form `train[mask] = 1`
# (and `train[mask] = 0`) assigns to EVERY column of the matching rows, which
# overwrote all features, weights and responses with 0/1.
#
# Comparisons against NaN evaluate to False, so a missing value maps to 0.
resp_total = train['resp_1'] + train['resp_2'] + train['resp_3'] + train['resp_4']
train['action_num'] = (resp_total > threshold).astype(int)

gc.collect()
train['action'] = (train['weight'] * train['resp'] > threshold).astype(int)


In [None]:
# combine the tag_x influence for each features in each row:
# 1. convert impact of tag_x to binary
# 2. calculate product of each tag_x for each feature_x in each row
# 3. combine the tag_x impact of each feature_x in each row together
# Integer copy of the tag table: every column after the first is cast to int.
#
# astype() with a column mapping returns a new frame whose columns really have
# the requested dtype. Assigning through `.iloc[:, 1:] = ...` instead writes into
# the existing columns, and depending on the pandas version the columns may keep
# their old dtype or be upcast to a non-numeric one, which is not what the
# matrix product in the next cell needs.
cast_columns = list(features.columns[1:])
features_bin = features.astype({name: int for name in cast_columns})
gc.collect()
print(features_bin.head())

In [None]:
# Column names: everything in train containing 'feature', and everything in the
# tag table containing 'tag'.
feature_col = [name for name in train.columns if 'feature' in name]
tags = [name for name in features.columns if 'tag' in name]

# Matrix product of the per-row feature values with the integer tag table,
# giving one combined value per tag for each training row.
# presumably (n_rows x n_features) . (n_features x n_tags) — verify shapes against the data
product = np.dot(train.loc[:, feature_col], features_bin.loc[:, tags])

gc.collect()
# Store the combined values on the training frame under the tag column names.
train[tags] = product
gc.collect()

In [None]:
# Feature processing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, train_test_split

# Replace every missing value in the training frame with 0 (in place).
train.fillna(value=0, inplace=True)


# Generate X & y: keep the label aside, then strip the label, the response
# columns and the row id from the training frame (in place).
target = train['action']
drop_feature = ['action', 'action_num', 'resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'ts_id']
train.drop(columns=drop_feature, inplace=True)
gc.collect()



In [None]:
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score

lr = LogisticRegression()

# scikit-learn expects a 1-D label vector; passing an (n, 1) column raises a
# DataConversionWarning. np.asarray(...).ravel() also keeps this cell re-runnable:
# it accepts the original Series as well as an array left by a previous run,
# whereas `target.values` fails once `target` is already an ndarray.
target = np.asarray(target).ravel()
lr.fit(train, target)

# Hard class labels for the training rows.
predict = lr.predict(train)

# ROC AUC measures ranking quality, so it is computed from the predicted
# probability of the positive class (second column of predict_proba) rather
# than from the thresholded labels.
# NOTE: this is evaluated on the same rows the model was fitted on, so it is an
# in-sample figure and not a validation score.
positive_proba = lr.predict_proba(train)[:, 1]
score = roc_auc_score(target, positive_proba)

print('LR score:', score)





In [None]:
# Shell escape: install numba with pip before it is imported in the next cell.
# NOTE(review): no version is pinned, and `%pip install` is the form that targets
# the running kernel's environment — consider switching.
!pip install numba

In [None]:
# fillna function
from numba import njit

@njit
def fillna_arr(df):
    """Return ``df`` with NaN entries replaced by 0.

    ``df`` is an array accepted by ``np.isnan``. When it contains no NaN the
    input is returned as-is; otherwise a new array built with ``np.where`` is
    returned. The function is compiled with numba's ``njit``.
    """
    nan_mask = np.isnan(df)

    # Only build a replacement array when there is at least one NaN.
    if (nan_mask.sum()):
        df = np.where(nan_mask, 0, df)

    return df

In [None]:
# Make submission

import janestreet

# Clear the `__called__` flag on make_env before creating the environment.
# NOTE(review): presumably this lets make_env() be called again in the same
# kernel session — confirm against the janestreet module.
janestreet.competition.make_env.__called__ = False
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# Columns the model was fitted on, in fitting order. Each test batch is aligned
# to this layout by name, so a different column order or an extra column in the
# batch cannot silently shift values into the wrong model input.
model_columns = list(train.columns)

for (test_df, sample_prediction_df) in iter_test:

    # Keep a handle on the most recent batch so it can be inspected after the loop.
    a = test_df
    # (A stray bare `stop` expression used to sit here. It did nothing except
    # raise NameError when the timing cell had not been run, so it was removed.)
    test_df = downcast_type(test_df)
    gc.collect()
    # Same tag projection as applied to the training frame.
    test_df[tags] = np.dot(test_df[feature_col], features_bin[tags])
    gc.collect()

    # NOTE: the original comment mentioned forcing action = 0 for zero-weight
    # rows to save time; that shortcut is not implemented here.
    X_test = test_df[model_columns].values
    X_test = fillna_arr(X_test)

    sample_prediction_df.action = lr.predict(X_test) #make your 0/1 prediction here

    env.predict(sample_prediction_df)
    



In [None]:
# Show only the columns of the last test batch whose name contains 'feature'.
is_feature_col = a.columns.str.contains('feature')
a.loc[:, is_feature_col]