In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
import pandas as pd 
import numpy as np 
import janestreet
from sklearn.preprocessing import MinMaxScaler
import tensorflow.keras
import time
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation, Lambda, dot, Activation, concatenate,  Layer, BatchNormalization
from tensorflow.keras.callbacks import Callback
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [None]:
def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max], else None.

    ``limits`` is ``np.iinfo`` or ``np.finfo``. A NaN bound never compares
    true, so an empty or all-NaN column yields None.
    """
    for candidate in candidates:
        bounds = limits(candidate)
        if bounds.min <= c_min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Per column:
      * object / pandas string columns become ``category``;
      * signed and unsigned integer columns become the narrowest integer type
        of the same signedness that contains their min and max (bounds are
        inclusive, so 127 still fits in int8);
      * float columns become the narrowest float type whose finite range
        contains their min and max, falling back to float64;
      * everything else (bool, datetime, timedelta, category, other extension
        dtypes) is left untouched, which also makes the function safe to call
        again on its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    use_float16 : bool, default True
        When False, float32 is the narrowest float type used. float16 is
        chosen purely on value range and keeps only about three significant
        decimal digits, so pass False when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        is_numpy_dtype = isinstance(col_type, np.dtype)
        if isinstance(col_type, pd.StringDtype) or (is_numpy_dtype and col_type.kind == 'O'):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast; min/max range
        # checks are meaningless (or raise) for bool, datetime and category.
        if not is_numpy_dtype or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            new_type = _smallest_fitting_dtype(c_min, c_max, float_types, np.finfo)
            # NaN/inf bounds or values beyond float32 keep full precision.
            df[col] = df[col].astype(np.float64 if new_type is None else new_type)
        else:
            int_types = signed_types if col_type.kind == 'i' else unsigned_types
            new_type = _smallest_fitting_dtype(c_min, c_max, int_types, np.iinfo)
            if new_type is not None:
                df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame that reports zero bytes cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df
def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # keep_date_col only applies when parse_dates combines several columns into
    # one, which never happens with parse_dates=True. It is deprecated since
    # pandas 2.2, so it is omitted to keep the loader working on newer versions.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Folder holding the competition CSV files.
path = '../input/jane-street-market-prediction/'

# Read both CSVs through import_data (train first, then features) so each
# frame has its memory usage reduced as it is loaded.
train_data, features = (
    import_data(path + csv_name) for csv_name in ('train.csv', 'features.csv')
)

In [None]:
# Names of the 130 input columns: feature_0 ... feature_129.
features = ['feature_{}'.format(idx) for idx in range(130)]
target_feature = 'action'

# Missing values are filled with 0 in place.
train_data.fillna(0, inplace = True)

# Binary label: 1 where resp_1 is strictly positive, 0 everywhere else.
train_data['action'] = (train_data['resp_1'] > 0.0).astype(np.int64)

# Pull the date and label columns out as numpy arrays.
dates = train_data['date'].to_numpy()
target = train_data[target_feature].to_numpy()

# From this point on train_data is the numpy feature matrix, not the DataFrame.
train_data = train_data[features].to_numpy()
train_data

In [None]:
# Baseline: class counts and the share of rows labelled 1 (buy).
buy, sell = (np.count_nonzero(target == label) for label in (1, 0))

for caption, count in (('Buy: ', buy), ('Sell: ', sell)):
    print(caption, count)

total = buy + sell
print('Baseline Buy Rate: ', buy/total)

In [None]:
#print("Number of features with null values:",np.sum(train_data.isna().sum()>0))

In [None]:
#Developing Naive Bayes
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Cost is O(n*p): n = number of datapoints, p = number of features

# Split 75:25 train:validation. random_state pins the shuffle so the numbers
# below are the same on every Restart & Run All.
# NOTE(review): the split shuffles rows regardless of date, so validation rows
# can come from the same days as training rows - confirm this is intended.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data, target, test_size = 0.25, random_state = 42)

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_valid)

n_valid = X_valid.shape[0]
n_mislabeled = (y_valid != y_pred).sum()

print('Number of mislabeled points out of a total %d points : %d' % (n_valid, n_mislabeled))
# Accuracy counts the correctly labelled points and is reported in percent,
# matching the label on the printed line.
print('Validation Accuracy Percentage: ', 100 * (n_valid - n_mislabeled) / n_valid)



In [None]:
#sample = import_data(path + 'example_sample_submission.csv')
#test_data = import_data(path + 'example_test.csv')

In [None]:
#env = janestreet.make_env() # initialize the environment
#iter_test = env.iter_test() # an iterator which loops over the test set

# for (test_df, sample_prediction_df) in iter_test:
#     sample_prediction_df.action = 0 #make your 0/1 prediction here
#     env.predict(sample_prediction_df)
#for (test_df, sample_prediction_df) in iter_test:
    
 #   prediction = 0
 #   for model in lgb_models:
 #       prediction += model.predict(test_df[features])[0]
    
 #   prediction /= len(lgb_models)
 #   prediction = prediction >= 0.5
 #   sample_prediction_df.action = prediction.astype(int)
 #   env.predict(sample_prediction_df)


In [None]:
# Preview the submission file if one exists. The file is not written by any
# active cell of this notebook, so a missing file is reported instead of
# raising FileNotFoundError and breaking a full run.
submission_path = './submission.csv'
if os.path.exists(submission_path):
    submission = pd.read_csv(submission_path)
else:
    print('No submission file found at', submission_path)
    # Empty placeholder keeps `submission` a DataFrame either way.
    submission = pd.DataFrame()
submission.head()