In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load only the first 100,000 rows of the training file into memory,
# then preview the first few of them.
train = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    nrows=100000,
)
train.head()

In [None]:
import csv
from csv import DictReader


def train_data(path):
    """Stream training samples from a CSV file, one row at a time.

    The first value yielded is the number of feature columns (columns whose
    header contains the substring 'feature'). Every later value is a tuple
    describing one data row with a non-zero weight.

    INPUT:
        path: path to the CSV file; its header must contain the columns
              'date', 'weight' and 'resp'

    YIELDS:
        first:  int, the feature dimension
        then:   (row_index, x, y, t, date) where
                row_index: zero-based index of the data row in the file
                           (header excluded)
                x:         1-D float array of the feature values, with empty
                           fields replaced by 0
                y:         1 if weight * resp > 0, else 0
                t:         zero-based counter of the rows actually yielded
                date:      the raw 'date' field, as a string
    """
    t = 0
    # newline='' is what the csv module asks for so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(path, 'r', newline='') as f:
        buffer = csv.reader(f)
        for ID, line in enumerate(buffer):
            if ID == 0:
                date_idx = line.index('date')
                weight_idx = line.index('weight')
                resp_idx = line.index('resp')
                cols = [i for (i, field) in enumerate(line) if 'feature' in field]
                yield len(cols)  # dimension
                continue
            # preprocess
            weight = float(line[weight_idx])
            # Compare numerically: the raw field may be spelled '0', '0.0',
            # '0.00', ... and a string comparison against '0' misses those.
            if weight == 0:
                continue
            y = int((weight * float(line[resp_idx])) > 0)
            x = np.array([float(line[i]) if line[i] else 0 for i in cols])
            yield ID - 1, x, y, t, line[date_idx]
            t += 1
                

In [None]:
import numba

# Column names that `utility` reads from the data frame it is given.
DATE, WEIGHT, TARGET = 'date', 'weight', 'resp'

@numba.njit(fastmath = True)
def utility_score_numba(date, weight, resp, action):
    """Compute the utility score from per-row date, weight, resp and action.

    Steps, as implemented below:
      * Pi[d] = sum of weight * resp * action over all rows whose date is d
      * t     = sum(Pi) / sqrt(sum(Pi ** 2)) * sqrt(250 / len(Pi))
      * u     = clip(t, 0, 6) * sum(Pi)

    `date` is used as the bin index of np.bincount, so it has to be an array
    of non-negative integers; the other three arguments are multiplied
    elementwise and therefore need matching lengths.
    NOTE(review): if every Pi is zero the division yields NaN/inf (and
    fastmath makes that case unspecified) - confirm callers never hit it.
    """
    # Per-date total of the weighted, action-masked responses.
    Pi = np.bincount(date, weight * resp * action)
    # Ratio of the summed to the root-sum-squared daily totals, scaled by
    # sqrt(250 / number_of_bins).
    t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / len(Pi))
    # t is clamped to [0, 6] before scaling the total.
    u = min(max(t, 0), 6) * np.sum(Pi)
    return u

def utility(data, action):
    """Utility score of `action` on the rows of the data frame `data`.

    Pulls the DATE, WEIGHT and TARGET columns out of `data` as raw arrays and
    forwards them, together with `action`, to `utility_score_numba`.
    `action` is presumably a 0/1 (or boolean) array with one entry per row
    of `data` - verify against the caller.
    """
    dates = data[DATE].values
    weights = data[WEIGHT].values
    responses = data[TARGET].values
    return utility_score_numba(dates, weights, responses, action)

In [None]:
# Distinct values of the 'date' column in the loaded training sample.
train['date'].unique()

In [None]:
# Per-column summary statistics of the loaded training sample.
train.describe()

In [None]:
from math import exp


def projection(x, V):
    """
    Project vector x onto the L2 ball of radius V.

    Vectors whose norm does not exceed V are returned unchanged (the very same
    object); longer vectors are rescaled so that their norm becomes V.
    """
    magnitude = np.linalg.norm(x)
    return x / magnitude * V if magnitude > V else x


class ogd(object):
    """
    Online gradient descent for linear (logistic) prediction, with a step size
    that shrinks as squared gradient norms accumulate and an optional
    projection of the weights onto an L2 ball.
    """

    def __init__(self, d, beta=1, rand_init=False, project=False):
        """
        d         - dimension of the weight vector
        beta      - numerator of the step size; also the radius handed to
                    `projection` when `project` is True
        rand_init - draw the initial weights with np.random.rand instead of
                    starting from zeros
        project   - whether to project the weights after every update
        """
        self.d, self.beta = d, beta
        self.rand_init, self.project = rand_init, project
        self.w = None
        self.reset()

    def reset(self):
        """
        Restart the step-size schedule (eta = 1, G_t = 0).

        The weights are created only if they do not exist yet; weights that
        already exist are left untouched.
        """
        if self.w is None:
            make_weights = np.random.rand if self.rand_init else np.zeros
            self.w = make_weights(self.d)
        self.eta = 1
        self.G_t = 0

    def get_model(self):
        """Return the current weight vector (the live array, not a copy)."""
        return self.w

    def predict(self, x):
        """
        Sigmoid of the linear score x . w, with the score clamped to
        [-35, 35] before exponentiation.
        """
        score = x.dot(self.w)
        score = np.minimum(score, 35.)
        score = np.maximum(score, -35.)
        return 1. / (1. + np.exp(-score))

    def update(self, x, p, y):
        """
        Take one gradient step for sample x with prediction p and label y.
        """
        # gradient of the log loss with respect to the weights
        grad = (p - y) * x

        # accumulate the squared gradient norm and derive the step size from it
        self.G_t += np.linalg.norm(grad)**2
        self.eta = self.beta / np.sqrt(1 + self.G_t)

        # in-place step, so arrays obtained from get_model() see the change
        self.w -= self.eta * grad
        if self.project:
            self.w = projection(self.w, self.beta)

    def __repr__(self):
        return "OGD"

        
def logloss(p, y):
    ''' FUNCTION: Bounded logloss

        INPUT:
            p: predicted probability of the positive class
            y: true label; treated as positive only when it equals 1

        OUTPUT:
            negative log-likelihood of y under p, with p first clamped to
            [10e-15, 1 - 10e-15] so the logarithm stays finite
    '''
    eps = 10e-15
    clamped = max(min(p, 1. - eps), eps)
    likelihood = clamped if y == 1. else 1. - clamped
    return -np.log(likelihood)


In [None]:
class ftrl(ogd):
    """
    FTRL (follow-the-regularized-leader) algorithm for linear prediction
    problems, with Euclidean regularization.

    Instead of stepping from the previous weights, each update recomputes the
    weights from the running sum of all gradients seen so far.
    """

    def __init__(self, d, beta=1, rand_init=False, project=False):
        """
        d         - dimension of the weight vector
        beta      - numerator of the step size; also the radius handed to
                    `projection` when `project` is True
        rand_init - forwarded to ogd: random instead of zero initial weights
        project   - whether to project the weights after every update
        """
        # Forward the caller's beta; a hard-coded 1 here would silently
        # ignore any non-default value.
        ogd.__init__(self, d, beta=beta, rand_init=rand_init, project=project)
        self.grad_sum = np.zeros(d)  # running sum of all gradients

    def update(self, x, p, y):
        """
        Update the model with sample x, its prediction p and its label y.
        """

        # gradient of log loss
        g_t = (p - y) * x

        # params
        self.grad_sum += g_t
        self.G_t += np.linalg.norm(g_t)**2
        self.eta = self.beta / np.sqrt(1 + self.G_t)
        # closed-form FTRL solution: a new array is bound to self.w each time
        self.w = -self.eta * self.grad_sum
        if self.project:
            self.w = projection(self.w, self.beta)

    def __repr__(self):
        return "FTRL"

In [None]:
from datetime import datetime


def training(learner, path, epoch=1, verbose=True):
    """Progressively train `learner` on the samples streamed by `train_data`.

    Every sample is first predicted and the log loss of that prediction is
    added to the cumulative training loss. Samples whose counter `t` is a
    multiple of 1000 are held out: they only contribute to the validation
    loss and the learner is not updated with them. All other samples update
    the learner and are yielded.

    INPUT:
        learner: object exposing predict(x) and update(x, p, y)
        path:    path of the CSV file handed to `train_data`
        epoch:   number of passes over the file
        verbose: print cumulative average losses every 20000 samples

    YIELDS:
        (ID, p) for every sample used for an update, where ID is the row
        index reported by `train_data` and p is the prediction made before
        the update.
    """
    for _ in range(epoch):
        count = 0
        valid_loss = 0.
        train_loss = 0.

        it = train_data(path)
        # The first item is the feature dimension, not a sample. The default
        # keeps a completely empty file from leaking StopIteration out of
        # this generator (which Python would turn into a RuntimeError).
        next(it, None)

        for ID, x, y, t, _date in it:  # data is a generator
            # ID  : index of the line in the file
            # x   : features
            # y   : label
            # t   : just a instance counter

            # step 1, get prediction from learner
            p = learner.predict(x)
            train_loss += logloss(p, y)

            if t % 1000 == 0:
                # step 2-1, calculate validation loss
                #           we do not train with the validation data so that our
                #           validation loss is an accurate estimation
                valid_loss += logloss(p, y)
                count += 1
            else:
                # step 2-2, update learner with label information
                learner.update(x, p, y)
                yield ID, p

            if verbose and t > 0 and t % 20000 == 0:
                # t is zero-based, so t + 1 losses have been accumulated.
                avg_train = train_loss / (t + 1)
                print(f"Processed: {t} datapoints", datetime.now())
                print(f"\tAVG Cum Train Loss: {avg_train:.3f}")
                # count >= 1 here: the sample with t == 0 is always held out
                print(f"\tAVG Cum Valid Loss: {valid_loss / count:.3f}")
                


In [None]:
from tqdm import tqdm
import pickle


##############################################################################
# parameters #################################################################
##############################################################################


# A, paths
train_path = '/kaggle/input/jane-street-market-prediction/train.csv'    # path to training file
epoch = 1                                                               # learn training data for N passes


##############################################################################
# start training #############################################################
##############################################################################

# Only the first item (the feature dimension) is needed here; close the
# generator explicitly so the file it holds open is released immediately.
_header_probe = train_data(train_path)
dim = next(_header_probe)
_header_probe.close()

# initialize a learner
# learners = [ogd(dim, project=True), ftrl(dim)]
learners = [ftrl(dim)]

for learner in learners:
    print(f"----- {str(learner)} -----\n")
    ids = []
    pred = []
    for ID, p in training(learner, train_path, epoch):
        ids.append(ID)
        pred.append(p)
    print()
    # utility function
    pred_train = np.array(pred)
    try:
        print(f"Train Score = {utility(train.iloc[ids], pred_train > 0.5):.4f}\n")
    except IndexError:
        # Raised by .iloc when `ids` refers to rows beyond the end of `train`
        # (presumably because `train` holds only a prefix of the file -
        # confirm). Swap in the pickled frame for later cells.
        print("Maybe next time!")
        train_pickle_file = '/kaggle/input/pickling/train.csv.pandas.pickle'
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        with open(train_pickle_file, 'rb') as pickle_file:
            train = pickle.load(pickle_file)


# Persist the weights of the last learner trained above.
try:
    with open(os.path.join('../working', 'model.pkl'), 'wb') as model_file:
        pickle.dump(learner.get_model(), model_file, protocol=pickle.HIGHEST_PROTOCOL)
except TypeError:
    # NOTE(review): fallback kept as-is; it expects the model object to offer
    # a .save() method and opens model.h5 in read mode - confirm it is still
    # needed for the learners defined in this notebook.
    learner.get_model().save(open(os.path.join('../working', 'model.h5')))

In [None]:
def fillna_npwhere_njit(array, values):
    """Replace NaN entries of `array` with `values`.

    The sum of the array is used as a cheap NaN detector: when it is not NaN
    the input is returned as-is (same object); otherwise a new array is built
    in which every NaN entry is taken from `values`.
    Despite the name, this function is plain NumPy and is not jit-compiled.
    """
    if not np.isnan(array.sum()):
        return array
    return np.where(np.isnan(array), values, array)


def test_data(df_object, cols):
    ''' Build the feature matrix for prediction.

        INPUT:
            df_object: tabular data (the test set is given as a pandas
                       dataframe)
            cols: names of the feature columns to keep, in order

        OUTPUT:
            NumPy array holding the `cols` columns of `df_object`, with every
            NaN replaced by 0
    '''
    selected = pd.DataFrame(df_object, columns=cols)
    return fillna_npwhere_njit(selected.to_numpy(), 0)
            

In [None]:
# Run the trained learner over the first 100,000 rows of `train` and display
# the predicted probabilities.
cols = [f'feature_{idx}' for idx in range(130)]
small_train = train.iloc[:100000]
small_features = test_data(small_train, cols)
y = learner.predict(small_features)
y

In [None]:
##############################################################################
# start testing, and build Kaggle's submission file ##########################
##############################################################################

import janestreet

env = janestreet.make_env()

test_path = '/kaggle/input/jane-street-market-prediction/example_test.csv'
iter_test = env.iter_test()
cols = ['feature_' + str(i) for i in range(130)]
# test_all_data = []

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # predict() on the 2-D feature matrix returns one probability per row.
    # Threshold elementwise rather than calling int() on the array: that
    # implicit array-to-scalar conversion is deprecated in recent NumPy and
    # fails outright for more than one row.
    # Assumes sample_prediction_df has one row per row of test_df - confirm.
    probabilities = learner.predict(test_data(test_df, cols))
    sample_prediction_df.action = (probabilities > .5).astype(int)
    env.predict(sample_prediction_df)
    # test_all_data.append(test_df)

"""
# sanity check
zeros = []
ones = []

for idx, x in enumerate(test_all_data):
    y_pred = int(learner.predict(test_data(x, cols)) > .5)
    if y_pred == 1:
        ones.append(idx)
    else:
        zeros.append(idx)

print(len(ones))
print(len(zeros))
"""