In [None]:
import pandas
import os
import numpy
import math
import matplotlib.pyplot

In [None]:
# Names of the model input columns: feature_0 through feature_129.
feature_columns = [f'feature_{i}' for i in range(130)]

# Load both preprocessed frames (gzip-compressed pickles) in one pass.
# NOTE(review): pickle files execute arbitrary code on load — only read
# pickles from a dataset you trust.
train_csv, features_csv = (
    pandas.read_pickle(pickle_path)
    for pickle_path in (
        '/kaggle/input/jsmp-preprocess-1/train_csv.pickle.gz',
        '/kaggle/input/jsmp-preprocess-1/features_csv.pickle.gz',
    )
)

In [None]:
import lightgbm
import sklearn.model_selection
import sklearn.metrics

def calc_utility_score(actions, df):
    """Return the utility score of ``actions`` on ``df``, divided by the number of days.

    For every date ``i`` the daily P&L is ``p_i = sum(weight * resp * action)``.
    The score is then::

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / days)
        u = min(max(t, 0), 6) * sum(p_i)

    and the function returns ``u / days`` so that frames covering a different
    number of dates can be compared.

    Args:
        actions: One value per row of ``df`` (array-like of 0/1 or booleans),
            non-zero meaning the trade is taken.
        df: DataFrame providing the ``date``, ``resp`` and ``weight`` columns.

    Returns:
        The per-day utility as a float. ``0.0`` is returned when ``df`` holds
        no dates or when every daily P&L is zero (e.g. no trade was taken),
        cases in which the ratio ``t`` is undefined.
    """
    result = pandas.DataFrame({
        'date': df.loc[:, 'date'],
        'resp': df.loc[:, 'resp'],
        'weight': df.loc[:, 'weight'],
        'action': actions,
    })

    days = result['date'].nunique()
    if days == 0:
        # Nothing to score; also avoids dividing by zero in 250 / days.
        return 0.0

    result['p'] = result['weight'] * result['resp'] * result['action']

    # Select 'p' before aggregating so only the needed column is summed.
    p = result.groupby('date')['p'].sum()
    total_p = p.sum()
    sum_of_squares = (p ** 2).sum()
    if sum_of_squares == 0:
        # Every daily P&L is zero: t would be 0 / 0 (NaN). No profit, no utility.
        return 0.0

    t = total_p / math.sqrt(sum_of_squares) * math.sqrt(250 / days)
    # t is clipped to [0, 6] before being applied to the total P&L.
    u = min(max(t, 0), 6) * total_p

    normalized_u = u / days

    return normalized_u

# Learning-curve experiment: for each fraction in train_rates, fit LightGBM on
# the leading rows of train_csv and score both the fitted rows and the
# remaining rows with calc_utility_score.
train_rates = numpy.linspace(0.05, 0.95, 10)
train_scores = []
test_scores = []

for train_rate in train_rates:
    # NOTE(review): the cut is a row count, not a date boundary. If train_csv
    # is ordered by date, one date can end up in both partitions — confirm
    # whether a date-aligned split is wanted.
    train_n_rows = int(len(train_csv) * train_rate)
    train_df = train_csv.iloc[:train_n_rows]
    test_df = train_csv.iloc[train_n_rows:]

    train_features = train_df[feature_columns].to_numpy()
    test_features = test_df[feature_columns].to_numpy()
    train_resps = train_df['resp']
    test_resps = test_df['resp']

    train = lightgbm.Dataset(train_features, label=train_resps)
    test = lightgbm.Dataset(test_features, label=test_resps, reference=train)

    # Empty parameter dict: training runs with the library defaults.
    params = {}
    model = lightgbm.train(params, train, valid_sets=test)

    # A row is traded whenever the predicted resp is positive.
    train_score = calc_utility_score(model.predict(train_features) > 0, train_df)
    test_score = calc_utility_score(model.predict(test_features) > 0, test_df)

    train_scores.append(train_score)
    test_scores.append(test_score)

    print(f'Train score : {train_score}')
    print(f'Test score : {test_score}')

# Plot both score curves against the training fraction.
fig, ax = matplotlib.pyplot.subplots()
ax.plot(train_rates, train_scores, label='Train scores')
ax.plot(train_rates, test_scores, label='Test scores')
ax.set_xlabel('Train data rate')
ax.set_ylabel('Utility score')
ax.legend()
fig.tight_layout()
matplotlib.pyplot.show()

In [None]:
# Fit the final model on every row of train_csv (no hold-out set).
# The arrays are passed inline rather than bound to names so that no extra
# reference to the large feature matrix is kept alive.
params = {}  # empty dict: library default parameters
train = lightgbm.Dataset(
    data=train_csv[feature_columns].to_numpy(),
    label=train_csv['resp'].to_numpy(),
)
model = lightgbm.train(params, train)

In [None]:
import janestreet
# Submission loop: the environment yields one test batch at a time together
# with the prediction frame to fill in and hand back.
env = janestreet.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    batch_features = test_df[feature_columns].to_numpy()
    # Trade (action = 1) whenever the predicted resp is positive.
    action = model.predict(batch_features) > 0

    sample_prediction_df['action'] = action.astype(numpy.int8)
    env.predict(sample_prediction_df)