In [None]:
import warnings
warnings.filterwarnings('ignore')

import os, gc
# import cudf
import pandas as pd
import numpy as np
# import cupy as cp
# import janestreet
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load

import tensorflow as tf
tf.random.set_seed(41)
import tensorflow.keras.backend as K

import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from typing import List
from sklearn.cluster import KMeans
from scipy.stats import mstats
from scipy.spatial.distance import cdist 

In [None]:
!pip install seaborn --upgrade --quiet

In [None]:
import seaborn as sns

In [None]:
# Read the competition's train.csv into a DataFrame.
train = pd.read_csv(filepath_or_buffer='/kaggle/input/jane-street-market-prediction/train.csv')

In [None]:
# Every column whose name contains 'feature'.
features = [col for col in train.columns if 'feature' in col]

# Column means are computed BEFORE the weight filter below, over every feature
# column except the first one matched (presumably feature_0 - TODO confirm).
fill_cols = features[1:]
f_mean = train[fill_cols].mean()

# Keep only rows with positive weight; the binary target is the sign of resp.
train = train.query('weight > 0').reset_index(drop=True)
train['action'] = (train['resp'] > 0).astype('int')

# Impute missing feature values with the pre-filter column means.
train[fill_cols] = train[fill_cols].fillna(f_mean)

# Columns fed to the clustering below (feature_64 is deliberately not listed).
time_features = [f'feature_{n}' for n in (60, 61, 62, 63, 65, 66, 67, 68)]
new_time_features = []

In [None]:
def utility_score(date, weight, resp, action):
    """Compute the utility of a set of trade decisions.

    For every date i the daily P&L is ``p_i = sum(weight * resp * action)``
    over the rows of that date. With ``n`` the number of distinct dates:

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / n)
        u = clip(t, 0, 6) * sum(p_i)

    Parameters
    ----------
    date : array-like of non-negative ints
        Day index of each row (used as bin index for ``np.bincount``).
    weight, resp, action : array-like of numbers/bools
        Per-row weight, response and decision; combined element-wise by
        position, so all four inputs must have the same length.

    Returns
    -------
    float
        The utility ``u``. Returns ``0.0`` when there are no rows or when every
        daily P&L is zero (e.g. no trade selected), where the formula would
        otherwise evaluate 0/0 and yield NaN.
    """
    date = np.asarray(date)
    if date.size == 0:
        # No rows: nothing traded, and 250 / n would divide by zero.
        return 0.0

    # Work on plain arrays so the product is positional and does not depend on
    # pandas index alignment between Series and ndarray inputs.
    pnl = (np.asarray(weight, dtype=float)
           * np.asarray(resp, dtype=float)
           * np.asarray(action, dtype=float))

    count_i = len(np.unique(date))
    Pi = np.bincount(date, pnl)

    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # All daily P&L values are zero -> t is undefined; utility is zero.
        return 0.0

    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u

In [None]:
# Elbow-method bookkeeping, following
# https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/
# Each list receives one entry per candidate cluster count.
distortions, inertias = [], []


In [None]:
# Candidate cluster counts to evaluate: 2 through 9 inclusive.
# NOTE(review): this rebinds `K`, which the import cell also uses as the alias
# for tensorflow.keras.backend; after this cell `K` is the range.
K = range(2, 10)

In [None]:
# Rows where feature_0 equals -1. Take an explicit copy because a 'clusters'
# column is assigned onto this subset later; without it that assignment is a
# write to a filtered slice (SettingWithCopyWarning on pandas < 3, currently
# hidden only by the global warnings filter).
train_0 = train.query('feature_0==-1').copy()

In [None]:
# Start from empty accumulators (same list objects) so that re-running this
# cell does not append a second set of results to the lists the plots read.
distortions.clear()
inertias.clear()

# The clustering input does not change between iterations; slice it once.
X_time = train_0[time_features]

for i in K:
    gc.collect()
    kmeans_ = KMeans(n_clusters=i, random_state=0).fit(X_time)

    # Distortion: mean Euclidean distance from each row to its nearest centroid.
    # Vectorised numpy reductions replace the builtin sum() over a large array.
    nearest_dist = cdist(X_time, kmeans_.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_dist.mean())
    inertias.append(kmeans_.inertia_)

    print(f'# clusters, cluster eval, score')
    for j in range(i):
        # Utility obtained by trading only the rows assigned to cluster j.
        score = utility_score(date=train_0['date'], weight=train_0['weight'],
                              resp=train_0['resp'], action=(kmeans_.labels_ == j))
        score = round(score, 2)
        print(f'{i}, {j}, {score}')

In [None]:
# Cluster count passed as n_clusters to the final KMeans fit.
k = 5

In [None]:
# Fit KMeans with k clusters on the time features and store each row's label.
train_0['clusters'] = KMeans(n_clusters=k, random_state=0).fit_predict(train_0[time_features])

In [None]:
# Slice to len(K) instead of a hard-coded 9: K holds 8 candidate counts, so a
# 9-element slice raises a length mismatch as soon as the list has more than
# 8 entries (e.g. after the sweep cell has been run twice).
fig, ax = plt.subplots()
ax.plot(K, distortions[:len(K)], 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method using Distortion')
plt.show()

In [None]:
# Slice to len(K) instead of a hard-coded 9: K holds 8 candidate counts, so a
# 9-element slice raises a length mismatch as soon as the list has more than
# 8 entries (e.g. after the sweep cell has been run twice).
fig, ax = plt.subplots()
ax.plot(K, inertias[:len(K)], 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Inertia')
ax.set_title('The Elbow Method using Inertia')
plt.show()

In [None]:
def plotFeatureSplits(df: pd.DataFrame, feature_list: List[int]) -> None:
    """Plot each requested feature against feature_64.

    For every index in ``feature_list`` except 64, two figures are produced:
    a scatter of ``feature_<i>`` versus ``feature_64``, and a seaborn bivariate
    KDE of the same pair coloured by the ``clusters`` column.

    Args:
        df: Frame containing ``feature_64``, the requested ``feature_<i>``
            columns and a ``clusters`` column.
        feature_list: Feature indices to plot; 64 is skipped because
            feature_64 is the x-axis of every plot.
    """
    for feature_idx in feature_list:
        if feature_idx == 64:
            continue

        column = f'feature_{feature_idx}'
        label = f'Feature {feature_idx}'

        # Raw scatter of the feature against feature_64.
        _, ax = plt.subplots(1, figsize=(10, 5))
        ax.scatter(df['feature_64'], df[column], s=.2)
        ax.set_title(label)
        ax.set_ylabel(label)
        ax.set_xlabel('Time of Day')

        # Figure-level KDE of the same pair, one hue per cluster label.
        sns.displot(data=df, x="feature_64", y=column, kind='kde', hue='clusters')

        plt.show()

# Data for the feature 60-68 plots: the rows of train_0 with date == 0.
new = train_0[train_0['date'] == 0]


In [None]:
# Features 60-63; 64 is in the range but plotFeatureSplits skips it.
plotFeatureSplits(new, list(range(60, 65)))

In [None]:
# Features 65-68.
plotFeatureSplits(new, list(range(65, 69)))