In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import gc
import os

In [None]:
def convert_to_32bit(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit, in place.

    Columns whose dtype compares equal to ``'int64'`` become ``int32`` and
    columns whose dtype compares equal to ``'float64'`` become ``float32``;
    every other column is left untouched.

    Returns the same (mutated) DataFrame so the call can be used inline.
    """
    downcasts = (('int64', 'int32'), ('float64', 'float32'))
    for name in df.columns:
        for wide, narrow in downcasts:
            if df[name].dtype == wide:
                df[name] = df[name].astype(narrow)
    return df

In [None]:
# Widen the notebook display limits.
# The fully-qualified ``display.*`` keys are required: a bare name such as
# 'max_rows' is matched as a pattern and, on recent pandas, is ambiguous
# (it also matches the ``styler.render.*`` options), which raises OptionError.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', 300)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler




In [None]:
#data_dir=r'C:/91_data_science/optiver/full_input'

In [None]:
# Root folder of the competition data. No trailing slash: every consumer in
# this notebook joins with an explicit '/' (e.g. data_dir + '/train.csv').
data_dir = '../input/optiver-realized-volatility-prediction'

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` against ``y_true``.

    The error of each element is taken relative to ``y_true``, squared,
    averaged with ``np.mean`` and square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_RMSPE(preds, train_data):
    """Custom evaluation callback reporting RMSPE for ``lgb.train(feval=...)``.

    Labels are read from ``train_data.get_label()``. Returns the triple
    ``('RMSPE', score rounded to 5 decimals, False)``; the trailing ``False``
    is LightGBM's "higher is better" flag, i.e. lower scores are better.
    """
    score = rmspe(y_true=train_data.get_label(), y_pred=preds)
    return 'RMSPE', round(score, 5), False

In [None]:
def convert_to_32bit(df):
    """Shrink ``int64`` columns to ``int32`` and ``float64`` columns to ``float32``.

    The frame is modified in place and also returned. Note that this cell
    re-declares a function that is already defined earlier in the notebook;
    this later definition is the one in effect from here on.
    """
    for col in df.columns:
        current = df[col].dtype
        if current == 'int64':
            df[col] = df[col].astype('int32')
        elif current == 'float64':
            df[col] = df[col].astype('float32')
    return df

In [None]:
# Load the training labels, remember the distinct time_ids, and build the
# '<stock_id>-<time_id>' key that the feature tables are joined on.
train_target = pd.read_csv(data_dir + '/train.csv')
time_ids = np.unique(train_target['time_id'])
train_target = train_target.assign(
    row_id=train_target['stock_id'].astype(str) + '-' + train_target['time_id'].astype(str)
)[['row_id', 'target', 'stock_id']]
print(f" Time ids shape {time_ids.shape}")

In [None]:
# Add group column in target dataframe

In [None]:
# Load the test index and keep only the join key and the stock id.
test_target = pd.read_csv(data_dir + '/test.csv')
print(test_target.shape)  # shape with every column of test.csv
test_target = test_target.loc[:, ['row_id', 'stock_id']]
display(test_target.head(2))
print(test_target.shape)  # shape after the column selection

In [None]:
# Constants
#stock_id_filter=[0,1,10,120,122,123,124,125, 126,2,3,4,41,42,43,44,46,47,48,5,50,6,7,8, 81,82,83,84,85,86,87,88, 89,9,90]
#stock_id_filter=[0,102,3,112,103,12,116,118,126,22,5,55,78,83,87,88]
#stock_id_filter=[0,31,]
#test_target=test_target[test_target['stock_id'].isin(stock_id_filter)]
#train_target=train_target[train_target['stock_id'].isin(stock_id_filter)]

In [None]:
from numba import jit

@jit
def vola(x):
    """Return the square root of the sum of the squared entries of ``x``."""
    squared = x * x
    return np.sqrt(np.sum(squared))

@jit
def log_wap_max(x):
    """Return the square root of the largest squared entry of ``x``."""
    squared = x * x
    return np.sqrt(np.max(squared))


In [None]:
def calculate_diff(df, diff_cols, sort_cols=('time_id', 'seconds_in_bucket')):
    """Add within-``time_id`` first differences of selected columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_id``, every column in ``sort_cols`` and every key
        of ``diff_cols``. It is sorted and extended in place.
    diff_cols : dict
        Maps a source column name to the name of the difference column to
        create from it.
    sort_cols : sequence of str
        Columns to sort by before differencing. The default previously named
        a non-existent ``'seconds_in_buckets'`` column, so relying on the
        default raised ``KeyError``; it now names ``'seconds_in_bucket'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. The first row of each ``time_id`` has no
        predecessor, so its difference is filled with 0.
    """
    df.sort_values(by=list(sort_cols), inplace=True)

    for source_col, diff_col in diff_cols.items():
        df[diff_col] = df.groupby(["time_id"])[source_col].diff().fillna(0)

    return df

In [None]:
def calc_vola (df, by_cols=['time_id'], starts_seconds=0):
    """Aggregate ``return1_diff`` / ``return2_diff`` into ``vola_1`` / ``vola_2``.

    Only rows with ``seconds_in_bucket`` strictly greater than
    ``starts_seconds`` are used. Each group defined by ``by_cols`` is reduced
    with ``vola`` applied to the raw value array of the column.

    Returns a frame with the ``by_cols`` columns plus ``vola_1`` and ``vola_2``.
    """
    after_start = df.loc[df['seconds_in_bucket'] > starts_seconds]
    per_group = after_start.groupby(by_cols).agg(
        vola_1=('return1_diff', lambda x: vola(x.values)),
        vola_2=('return2_diff', lambda x: vola(x.values)),
    )
    return per_group.reset_index()

In [None]:
def group_quintiles(df, by_cols=['time_id'], col_names=['r1_diff','r2_diff'], prefix=None):
    """Per-group quantile features, pivoted to one wide row per ``time_id``.

    For every column in ``col_names`` the 10%, 50%, 70%, 98% and 100%
    quantiles are computed on the signed values (no absolute value is taken)
    within each ``by_cols`` group. The result is pivoted so that each
    (column, quantile) pair becomes a column named ``<column>_<percent>``,
    e.g. ``price_50``. When ``prefix`` is truthy, ``_<prefix>`` is appended to
    every column except ``time_id``.

    The pivot is keyed on ``time_id`` and on the ``level_1`` column that
    ``reset_index`` creates for the quantile level, so ``by_cols`` is expected
    to be ``['time_id']``.
    """
    quantile_levels = [.1, .5, .7, .98, 1.0]
    long_form = df.groupby(by_cols)[col_names].quantile(quantile_levels).reset_index()

    # One column per (source column, quantile level).
    wide = pd.pivot_table(long_form, index=["time_id"], columns=["level_1"], values=col_names)
    wide.columns = [f'{col}_{int(level * 100)}' for col, level in wide.columns]
    wide = wide.reset_index()

    if prefix:
        wide.columns = [
            col if col == 'time_id' else col + '_' + str(prefix)
            for col in wide.columns
        ]

    return wide

In [None]:
def trade_groups(df, by_cols=['time_id'], start_seconds=0, prefix='m0'):
    """Aggregate trade rows per group, restricted to a trailing time window.

    Parameters
    ----------
    df : pandas.DataFrame
        Trade rows with ``seconds_in_bucket``, ``price``, ``size``,
        ``order_count``, ``trade_vol_per_cnt`` and ``tv`` columns.
    by_cols : list of str
        Grouping columns.
    start_seconds : int
        Only rows with ``seconds_in_bucket >= start_seconds`` are aggregated.
        This argument used to be ignored, which made every window identical.
        The comparison is inclusive so that the default of 0 still aggregates
        every row, exactly as before.
    prefix : str
        Suffix appended (as ``_<prefix>``) to every output column except
        ``time_id``.

    Returns
    -------
    pandas.DataFrame
        One row per group that has at least one trade inside the window.
        Groups without such a trade are absent, so callers that left-merge
        the result get NaN for them.
    """
    in_window = df[df['seconds_in_bucket'] >= start_seconds]

    out = in_window.groupby(by_cols).agg(

                        price_mean = ('price', np.nanmean),
                        size_mean  = ('size', np.nanmean)  ,
                        cnt_mean   = ('order_count', np.nanmean) ,
                        trade_vol_per_cnt_mean = ( 'trade_vol_per_cnt', np.nanmean),

                        price_std = ('price', np.nanstd),
                        size_std  = ('size', np.nanstd)  ,
                        trade_vol_per_cnt_std = ( 'trade_vol_per_cnt', np.nanstd),

                        tv_sum = ('tv', np.sum),
                        cnt_sum   = ('order_count', np.sum) ,

        ).reset_index()

    # Tag every feature column with the window it was computed on.
    out.columns = [
        col if col == 'time_id' else col + '_' + str(prefix)
        for col in out.columns
    ]

    return out

In [None]:
%%time

def trade_processing(data_dir=data_dir, datatype='train', stock_id=0):
    """Build the per-``time_id`` trade feature table for one stock.

    Reads ``{data_dir}/trade_{datatype}.parquet/stock_id={stock_id}``, adds the
    traded value (``tv``) and the traded value per order
    (``trade_vol_per_cnt``), then left-merges on ``time_id``:

    * three ``trade_groups`` tables (prefixes ``t0``, ``t200``, ``t400``), and
    * the ``group_quintiles`` table for ``price`` and ``size``.

    ``row_id`` (``'<stock_id>-<time_id>'``) and ``stock_id`` are appended.
    """
    trades = convert_to_32bit(
        pd.read_parquet(f'{data_dir}/trade_{datatype}.parquet/stock_id={stock_id}')
    )
    trades['trade_vol_per_cnt'] = trades['price'] * trades['size'] / trades['order_count']
    trades['tv'] = trades['price'] * trades['size']

    windows = [
        trade_groups(trades, by_cols=['time_id'], start_seconds=start, prefix=tag)
        for start, tag in ((0, 't0'), (200, 't200'), (400, 't400'))
    ]
    quantiles = group_quintiles(trades, by_cols=['time_id'], col_names=['price','size'], prefix=None)

    # The t0 table is the left side of every merge.
    features = windows[0]
    for extra in windows[1:] + [quantiles]:
        features = features.merge(extra, on='time_id', how='left')

    features['row_id'] = features['time_id'].apply(lambda x: f'{stock_id}-{x}')
    features['stock_id'] = stock_id

    return features

In [None]:
%%time
# Smoke test: build the trade features for stock 0 and list the resulting columns.
tr1 = trade_processing(data_dir=data_dir, datatype='train', stock_id=0)
tr1.columns

In [None]:
# Preview the first rows of the stock-0 trade features.
tr1.head()

In [None]:
def book_groups(df, by_cols=['time_id'], start_seconds=0, prefix='m0'):
    """Aggregate order-book features per group over a trailing time window.

    Only rows with ``seconds_in_bucket`` strictly greater than
    ``start_seconds`` are used. For each spread / mid column the mean and
    standard deviation are produced; the return columns get the aggregates
    listed below. Every output column except ``time_id`` is suffixed with
    ``_<prefix>``.
    """
    named_aggs = {}
    for col in ('size_spread1', 'size_spread2',
                'price_spread1', 'price_spread2', 'price_spread3',
                'mid1', 'mid2'):
        named_aggs[f'{col}_mean'] = (col, np.mean)
        named_aggs[f'{col}_std'] = (col, np.std)

    # NOTE(review): these two outputs are named "*_max" but are computed with
    # np.std. Later cells consume the columns under these names, so the
    # behavior is kept as is - confirm which statistic was intended.
    named_aggs['return1_diff_max'] = ('return1_diff', np.std)
    named_aggs['return2_diff_max'] = ('return2_diff', np.std)

    named_aggs['log_return1'] = ('log_return1', np.max)
    named_aggs['log_return2'] = ('log_return2', np.max)

    named_aggs['log_return1_sum'] = ('log_return1', np.sum)
    named_aggs['log_return2_sum'] = ('log_return2', np.sum)

    in_window = df[df['seconds_in_bucket'] > start_seconds]
    out = in_window.groupby(by_cols).agg(**named_aggs).reset_index()

    out.columns = [
        col if col == 'time_id' else col + '_' + str(prefix)
        for col in out.columns
    ]

    return out

In [None]:
%%time

def stock_processing(data_dir=data_dir, datatype='train', stock_id=0):
    """Build the per-``time_id`` order-book feature table for one stock.

    Reads ``{data_dir}/book_{datatype}.parquet/stock_id={stock_id}`` and derives,
    for both book levels, the log of the size-weighted price, the size and
    price spreads and half the ask-bid distance. The log prices are
    differenced within each ``time_id`` and the result is aggregated with
    ``calc_vola``, ``book_groups`` (prefix ``m0``) and ``group_quintiles``.

    Parameters
    ----------
    data_dir : str
        Root folder of the data set.
    datatype : str
        Inserted into the parquet folder name (``'train'`` or ``'test'`` in
        this notebook).
    stock_id : int
        Partition to read; also written to the ``stock_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with ``vola_1``, ``vola_2``, the ``*_m0``
        aggregates, the quantile columns, ``row_id`` and ``stock_id``.
    """
    df = pd.read_parquet(f'{data_dir}/book_{datatype}.parquet/stock_id={stock_id}')
    df = convert_to_32bit(df)
    bpr1, bsz1, apr1, asz1 = (df[col].values for col in [ 'bid_price1','bid_size1','ask_price1','ask_size1' ])
    bpr2, bsz2, apr2, asz2 = (df[col].values for col in [ 'bid_price2','bid_size2','ask_price2','ask_size2'])

    df['log_return1'] = np.log(((bpr1 * asz1) + (apr1 * bsz1)) / (asz1 + bsz1))

    # Level-2 features exist only where there is level-2 bid size; other rows
    # are NaN. np.where keeps the values aligned row by row. Assigning the
    # full-length array through a boolean .loc mask (as was done before) raises
    # as soon as a single row fails the condition.
    has_level2 = df['bid_size2'].values > 0
    df['log_return2'] = np.where(
        has_level2, np.log(((bpr2 * asz2) + (apr2 * bsz2)) / (asz2 + bsz2)), np.nan)
    df['size_spread2'] = np.where(has_level2, (bsz2 - asz2) / (asz2 + bsz2), np.nan)
    df['price_spread2'] = np.where(has_level2, (apr2 / bpr2) - 1, np.nan)
    df['mid2'] = np.where(has_level2, (apr2 - bpr2) / 2, np.nan)

    df['size_spread1'] = (bsz1 - asz1) / (asz1 + bsz1)
    df['price_spread1'] = (apr1 / bpr1) - 1
    df['mid1'] = (apr1 - bpr1) / 2

    # Level-2 ask relative to the level-1 bid.
    df['price_spread3'] = np.where(has_level2, (apr2 / bpr1) - 1, np.nan)

    diff_cols = { 'log_return1' : 'return1_diff',
                 'log_return2' : 'return2_diff',
                }

    df = calculate_diff(df, diff_cols, sort_cols=['time_id','seconds_in_bucket'])

    # sqrt(x**2) is the absolute value of the per-step difference.
    df['return1_diff_sqr'] = np.sqrt(df['return1_diff']**2)
    df['return2_diff_sqr'] = np.sqrt(df['return2_diff']**2)

    df_vola = calc_vola (df, by_cols=['time_id'],starts_seconds=0)

    # Grouped book statistics. Only the full-window table is built: the 200s
    # and 400s windows were previously computed here but never merged into the
    # result, so that wasted work has been dropped.
    df_1 = book_groups(df, by_cols=['time_id'], start_seconds=0, prefix='m0')

    df_qnt = group_quintiles(df, by_cols=['time_id'], col_names=['return1_diff','price_spread1','return2_diff',
                                                                 'size_spread1','price_spread2', 'size_spread2',
                                                                 'return1_diff_sqr', 'return2_diff_sqr', 'price_spread3',
                                                                ], prefix=None)

    df_vola = df_vola.merge(df_1, on='time_id', how='left')
    df_vola = df_vola.merge(df_qnt, on='time_id', how='left')

    df_vola['row_id'] = df_vola['time_id'].apply(lambda x:f'{stock_id}-{x}')
    df_vola['stock_id'] = stock_id

    return df_vola

In [None]:
%%time
# Smoke test: build the order-book features for stock 0 and list the resulting columns.
b1 = stock_processing(data_dir=data_dir, datatype='train', stock_id=0)
b1.columns

In [None]:
# Preview the first three rows of the stock-0 book features.
b1.head(3)

In [None]:
def preprocessor(data_dir,list_stock_ids, datatype='train', book_trade='book'):
    """Build the feature table for many stocks in parallel and stack the results.

    Parameters
    ----------
    data_dir : str
        Root folder of the data set, forwarded to the per-stock function.
    list_stock_ids : iterable
        Stock ids to process; one joblib task is created per id.
    datatype : str
        Forwarded to the per-stock function (``'train'`` / ``'test'``).
    book_trade : str
        ``'book'`` selects ``stock_processing``; any other value selects
        ``trade_processing``.

    Returns
    -------
    pandas.DataFrame
        The per-stock tables concatenated with a fresh 0..n-1 index.
    """
    from joblib import Parallel, delayed # parallel computing to save time

    # Pick the worker once. Each task returns its own table; the earlier
    # version additionally concatenated every result with a shared empty
    # DataFrame, which only produced an extra copy.
    worker = stock_processing if book_trade == 'book' else trade_processing

    per_stock = Parallel(n_jobs=-1, verbose=1)(
        delayed(worker)(data_dir, datatype=datatype, stock_id=stock_id)
        for stock_id in list_stock_ids
    )

    return pd.concat(per_stock, ignore_index=True)


In [None]:
%%time
# Build the book and trade feature tables for every stock in the training labels.
train_ids = train_target['stock_id'].unique()
train_book_df, train_trade_df = (
    preprocessor(data_dir, train_ids, datatype='train', book_trade=source)
    for source in ('book', 'trade')
)


In [None]:
%%time
# Same feature build for the stocks listed in the test index.
test_ids = test_target['stock_id'].unique()
test_book_df, test_trade_df = (
    preprocessor(data_dir, test_ids, datatype='test', book_trade=source)
    for source in ('book', 'trade')
)


In [None]:
%%time
# Attach the trade features to the book features; rows without any trade keep NaN.
train_vola = pd.merge(train_book_df, train_trade_df, on=['time_id', 'row_id', 'stock_id'], how='left')
test_vola = pd.merge(test_book_df, test_trade_df, on=['time_id', 'row_id', 'stock_id'], how='left')


In [None]:
# Take log transformation of volatility

#train_vola['vola_1']=np.log1p(train_vola['vola_1'])
#test_vola['vola_2']=np.log1p(test_vola['vola_2'])


# Clustering 

In [None]:
# Work on copies so the merged feature tables stay untouched by the steps below.
train, test = train_vola.copy(), test_vola.copy()
(train.shape, test.shape)

In [None]:
# Preview the test feature table.
test.head()

In [None]:
# The test set does not contain every stock, so the training rows of one
# time_id (time_ids[5]) are appended to it. These rows carry a different
# time_id than the real test rows and their row_ids are not in test_target,
# so they should have little effect on the submitted predictions.
# NOTE(review): presumably this guarantees that every cluster is present when
# the test aggregates are pivoted further down - confirm.
random_test_data = train.loc[train['time_id'] == time_ids[5]].copy()

test = pd.concat([test, random_test_data])
test.head()

In [None]:
from sklearn.cluster import KMeans
# making agg features

# Cluster the stocks by how their targets co-move: pivot the training targets
# to a time_id x stock_id matrix, correlate the stock columns and run KMeans
# (7 clusters) on the rows of the correlation matrix.
train_p = pd.read_csv(data_dir + '/train.csv').pivot(index='time_id', columns='stock_id', values='target')
corr = train_p.corr()
ids = corr.index

kmeans = KMeans(n_clusters=7, random_state=0, max_iter=2000, n_init=5)
kmeans.fit(corr.values)
print(kmeans.labels_)
df = convert_to_32bit(
    pd.DataFrame({'stock_id': list(corr.columns), 'tgt_cluster': kmeans.labels_})
)

del train_p, corr, kmeans
_ = gc.collect()

# Attach the cluster id to every row. Re-running this cell merges the column
# a second time, so it is meant to be executed once per kernel session.
train, test = (frame.merge(df, on='stock_id', how='left') for frame in (train, test))

# Number of training rows per cluster
train.groupby('tgt_cluster')['time_id'].count()


In [None]:
# Second cluster feature: spread-based clustering of the stocks

In [None]:
from sklearn.cluster import KMeans
# making agg features

# Cluster the stocks by the correlation of their per-time_id
# 'price_spread2_std_m0' series (5 clusters).
train_p = train.pivot(index='time_id', columns='stock_id', values='price_spread2_std_m0')
corr = train_p.corr()
ids = corr.index

kmeans = KMeans(n_clusters=5, random_state=0, max_iter=2000, n_init=5)
kmeans.fit(corr.values)
print(kmeans.labels_)
df = convert_to_32bit(
    pd.DataFrame({'stock_id': list(corr.columns), 'bas_cluster': kmeans.labels_})
)

del train_p, corr, kmeans
_ = gc.collect()

# Attach the cluster id to every row (run once per kernel session).
train, test = (frame.merge(df, on='stock_id', how='left') for frame in (train, test))

# Number of training rows per cluster
train.groupby('bas_cluster')['time_id'].count()


In [None]:
from sklearn.cluster import KMeans
# making agg features

# Cluster the stocks by the correlation of their per-time_id traded value
# ('tv_sum_t0') series (4 clusters).
train_p = train.pivot(index='time_id', columns='stock_id', values='tv_sum_t0')
corr = train_p.corr()
ids = corr.index

kmeans = KMeans(n_clusters=4, random_state=0, max_iter=2000, n_init=5)
kmeans.fit(corr.values)
print(kmeans.labels_)
df = convert_to_32bit(
    pd.DataFrame({'stock_id': list(corr.columns), 'tvsize_cluster': kmeans.labels_})
)

del train_p, corr, kmeans
_ = gc.collect()

# Attach the cluster id to every row (run once per kernel session).
train, test = (frame.merge(df, on='stock_id', how='left') for frame in (train, test))

# Number of training rows per cluster
train.groupby('tvsize_cluster')['time_id'].count()


In [None]:
from sklearn.cluster import KMeans
# making agg features

# Cluster the stocks by the correlation of their per-time_id
# 'log_return1_sum_m0' series (5 clusters).
train_p = train.pivot(index='time_id', columns='stock_id', values='log_return1_sum_m0')
corr = train_p.corr()
ids = corr.index

kmeans = KMeans(n_clusters=5, random_state=0, max_iter=2000, n_init=5)
kmeans.fit(corr.values)
print(kmeans.labels_)
df = convert_to_32bit(
    pd.DataFrame({'stock_id': list(corr.columns), 'logr_cluster': kmeans.labels_})
)

del train_p, corr, kmeans
_ = gc.collect()

# Attach the cluster id to every row (run once per kernel session).
train, test = (frame.merge(df, on='stock_id', how='left') for frame in (train, test))

# Number of training rows per cluster
train.groupby('logr_cluster')['time_id'].count()

In [None]:
# First-level aggregation -- by time_id

In [None]:
#test.columns.to_list()

In [None]:
def group_kpis(group_vars=['time_id'], datatype='train'):
    """Average selected per-stock features over the groups in ``group_vars``.

    Reads the notebook-level ``train`` frame when ``datatype == 'train'`` and
    the notebook-level ``test`` frame otherwise. Every output column is the
    group mean of its source column. Prints the shape of the result and
    returns it with ``group_vars`` as regular columns.

    The two ``*_t300_mean`` outputs are sourced from the ``*_t400`` columns;
    the names are kept because later cells refer to them.
    """
    source = train if datatype == 'train' else test

    # output column -> source column (all aggregated with np.mean)
    kpi_sources = {
        't_vola1_mean': 'vola_1',
        't_vola2_mean': 'vola_2',

        't_price_std_t0_mean': 'price_std_t0',
        't_price_std_t300_mean': 'price_std_t400',

        't_tv_sum_t0_mean': 'tv_sum_t0',
        't_tv_sum_t300_mean': 'tv_sum_t400',

        't_return1_diff_max_m0_mean': 'return1_diff_max_m0',
        't_return2_diff_max_m0_mean': 'return2_diff_max_m0',

        't_cnt_mean_t0_mean': 'cnt_mean_t0',
        't_trade_vol_per_cnt_mean_t0_mean': 'trade_vol_per_cnt_mean_t0',

        't_price_spread1_std': 'price_spread1_std_m0',
        't_price_spread2_std': 'price_spread2_std_m0',
        't_price_spread3_std': 'price_spread3_std_m0',
    }
    named_aggs = {out_col: (src_col, np.mean) for out_col, src_col in kpi_sources.items()}

    kpis_at_time_level = source.groupby(group_vars).agg(**named_aggs).reset_index()

    print("Number of records in {} are {}".format(datatype, kpis_at_time_level.shape))
    return kpis_at_time_level

In [None]:
# Market-wide KPIs: one row per time_id, averaged over all stocks.
train_vola_by_time, test_vola_by_time = (
    group_kpis(group_vars=['time_id'], datatype=split) for split in ('train', 'test')
)

display(train_vola_by_time.head(2), test_vola_by_time.head(2))


In [None]:
# Second aggregation -- by time_id and tgt_cluster
train_vola_by_tgt_time, test_vola_by_tgt_time = (
    group_kpis(group_vars=['time_id', 'tgt_cluster'], datatype=split) for split in ('train', 'test')
)

# Third aggregation -- by time_id and bas_cluster
train_vola_by_bas_time, test_vola_by_bas_time = (
    group_kpis(group_vars=['time_id', 'bas_cluster'], datatype=split) for split in ('train', 'test')
)

# Fourth aggregation -- by time_id and tvsize_cluster
train_vola_by_tv_time, test_vola_by_tv_time = (
    group_kpis(group_vars=['time_id', 'tvsize_cluster'], datatype=split) for split in ('train', 'test')
)

# Fifth aggregation -- by time_id and logr_cluster
train_vola_by_logr_time, test_vola_by_logr_time = (
    group_kpis(group_vars=['time_id', 'logr_cluster'], datatype=split) for split in ('train', 'test')
)


In [None]:
# do the pivot 
def create_pivot(df,columns, pivot_cols):
    """Spread per-cluster KPIs into one wide row per ``time_id``.

    ``df`` is pivoted with ``time_id`` as index and the column named by
    ``columns`` (a single column name) as the pivot key, keeping only
    ``pivot_cols``. Each resulting column is named
    ``<kpi>_<cluster as int>_<columns>``, e.g. ``t_vola1_mean_3_tgt_cluster``.
    ``time_id`` is returned as a regular column.
    """
    wide = pd.pivot_table(df, index=["time_id"], columns=columns, values=pivot_cols)
    wide.columns = [f'{kpi}_{int(cluster)}_{columns}' for kpi, cluster in wide.columns]
    return wide.reset_index()


In [None]:
%%time

# Pivot every per-cluster KPI table to one row per time_id. Train and test
# use the same KPI list for a given cluster type; the logr pivot leaves out
# the two volatility means.
train_tvsize, test_tvsize = (
    create_pivot(by_cluster, columns='tvsize_cluster',
                 pivot_cols=['t_vola1_mean', 't_vola2_mean', 't_price_std_t0_mean',
                             't_tv_sum_t0_mean', 't_return1_diff_max_m0_mean', 't_return2_diff_max_m0_mean'])
    for by_cluster in (train_vola_by_tv_time, test_vola_by_tv_time)
)

train_bas, test_bas = (
    create_pivot(by_cluster, columns='bas_cluster',
                 pivot_cols=['t_vola1_mean', 't_vola2_mean', 't_price_std_t0_mean',
                             't_tv_sum_t0_mean', 't_return1_diff_max_m0_mean', 't_return2_diff_max_m0_mean'])
    for by_cluster in (train_vola_by_bas_time, test_vola_by_bas_time)
)

train_tgt, test_tgt = (
    create_pivot(by_cluster, columns='tgt_cluster',
                 pivot_cols=['t_vola1_mean', 't_vola2_mean', 't_price_std_t0_mean',
                             't_tv_sum_t0_mean', 't_return1_diff_max_m0_mean', 't_return2_diff_max_m0_mean'])
    for by_cluster in (train_vola_by_tgt_time, test_vola_by_tgt_time)
)

train_logr, test_logr = (
    create_pivot(by_cluster, columns='logr_cluster',
                 pivot_cols=['t_price_std_t0_mean', 't_tv_sum_t0_mean',
                             't_return1_diff_max_m0_mean', 't_return2_diff_max_m0_mean'])
    for by_cluster in (train_vola_by_logr_time, test_vola_by_logr_time)
)


In [None]:
# Append the pivoted cluster KPIs to the market-wide KPIs, filling gaps with 0
# after each merge. Re-running this cell merges the same columns again, so it
# is meant to be executed once per kernel session.
train_vola_by_time = (
    train_vola_by_time
    .merge(train_tvsize, on='time_id', how='left').fillna(0)
    .merge(train_bas, on='time_id', how='left').fillna(0)
    .merge(train_tgt, on='time_id', how='left').fillna(0)
    .merge(train_logr, on='time_id', how='left').fillna(0)
)

test_vola_by_time = (
    test_vola_by_time
    .merge(test_tvsize, on='time_id', how='left').fillna(0)
    .merge(test_bas, on='time_id', how='left').fillna(0)
    .merge(test_tgt, on='time_id', how='left').fillna(0)
    .merge(test_logr, on='time_id', how='left').fillna(0)
)

train_vola_by_time.head()



In [None]:
# Join the time-level KPIs onto the stock-level rows.
train_data_t = pd.merge(train, train_vola_by_time, on=["time_id"], how="left").reset_index(drop=True)
test_data_t = pd.merge(test, test_vola_by_time, on=["time_id"], how="left").reset_index(drop=True)

display(train_data_t.head(2), test_data_t.head(2))

(train_data_t.shape, test_data_t.shape)

In [None]:
# Start from the label / test index so every requested row_id is kept, attach
# the features and replace every missing value with 0.
train_data_with_target = pd.merge(
    train_target, train_data_t, on=["stock_id", "row_id"], how="left"
).fillna(0)
test_data_without_target = pd.merge(
    test_target, test_data_t, on=["stock_id", "row_id"], how="left"
).fillna(0)
display(train_data_with_target.head(2), test_data_without_target.head(2))

In [None]:
# Row / column counts of the final modelling tables.
train_data_with_target.shape,  test_data_without_target.shape

In [None]:

# Transform the target into a multiplication factor relative to vola_1 and
# train on log1p of that factor.
# NOTE(review): missing features were filled with 0 upstream, so vola_1 can be
# 0 here, which makes the factor infinite - confirm such rows do not occur or
# are handled.
train_data_with_target['multi_factor'] = np.log1p(
    train_data_with_target['target'] / train_data_with_target['vola_1']
)




In [None]:
# Candidate model inputs: every column except identifiers, targets,
# predictions and three of the cluster ids. This list is rebuilt in the
# training cell further down.
features_to_consider = [
    name
    for name in train_data_with_target.columns.tolist()
    if name not in {
        "time_id", "target", "row_id", "multi_factor",
        "pred_lgb1",
        'tgt_cluster', 'bas_cluster', 'tvsize_cluster',
    }
]

#features_to_consider

#features_to_consider



In [None]:
# Keep the rows of three stocks (0, 46, 31) as a labelled sanity-check sample;
# the true target is stored under 'orignal_target'.
sample_df = (
    train_data_with_target.loc[train_data_with_target['stock_id'].isin([0, 46, 31])]
    .drop(columns='multi_factor')
    .rename(columns={"target": "orignal_target"})
)
sample_df.head()

In [None]:
from sklearn import preprocessing, model_selection
import lightgbm as lgb

# 5-fold cross-validated LightGBM training on the log1p(multi_factor) target.
# Out-of-fold predictions are written to train['pred_lgb1']; predictions for
# `test` and `sample_df` are summed over the folds into their 'multi_factor'
# column and divided by n_folds in the cells that follow.
cats = ['stock_id']
#categorical_feature=cats,
# NOTE(review): 'logr_cluster' is not in the exclusion set although the other
# three cluster ids are - confirm whether that is intentional.
features_to_consider =   [col for col in train_data_with_target.columns if col not in {"time_id", "target", "row_id" , "multi_factor",
                                                                                       "pred_lgb1" , "orignal_target",
                                                                                       'tgt_cluster','bas_cluster','tvsize_cluster' }]


n_folds = 5
n_rounds = 3000

# `train` / `test` are rebound here as aliases (no copy), so the columns added
# below also appear on train_data_with_target / test_data_without_target.
train=train_data_with_target
test=test_data_without_target
# Accumulators for the fold-summed predictions.
test['multi_factor'] = 0
sample_df['multi_factor']=0

target_name = 'multi_factor'
scores_folds = {}

model_name = 'lgb1'
pred_name = 'pred_{}'.format(model_name)
print('We consider {} features'.format(len(features_to_consider)))

train[pred_name] = 0

# NOTE(review): 'categorical_column':[0] refers to the first feature column,
# presumably stock_id given the column order of the merged frame - confirm.
# The 'device': 'gpu' setting requires a GPU-enabled LightGBM build.
params_lgbm =  {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'max_bin':255,
    'min_data_in_leaf':750,
    'learning_rate': 0.05,
    'subsample': 0.72,
    'subsample_freq': 3,
    'feature_fraction': 0.5,
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    'categorical_column':[0],
    'seed':2021,
    'n_jobs':-1,
    'verbose': -1,
    'device': 'gpu',
    'num_gpu': 1,
    'gpu_platform_id':-1,
    'gpu_device_id':-1,
    'gpu_use_dp': False,
}


kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=2016)
scores_folds[model_name] = []
counter = 1
# KFold yields positional indices that are used as labels with .loc below;
# this assumes `train` carries a default 0..n-1 index (it comes straight from
# a merge) - TODO confirm if the upstream cells change.
for dev_index, val_index in kf.split(range(len(train))):
    print('CV {}/{}'.format(counter, n_folds))
    X_train = train.loc[dev_index, features_to_consider]
    y_train = train.loc[dev_index, target_name].values
    X_val = train.loc[val_index, features_to_consider]
    # vola_1 of the validation rows, needed to map the predicted factor back to a volatility.
    vola_1 = train.loc[val_index, 'vola_1']
    y_val = train.loc[val_index, target_name].values
   
    
    #############################################################################################
    #LGB
    #############################################################################################
    # Training rows are weighted by 1 / label**3.
    # NOTE(review): a label of 0 would give an infinite weight - confirm it cannot occur.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cats, weight=1/np.power(y_train, 3))
    val_data = lgb.Dataset(X_val, label=y_val,categorical_feature=cats)
    
    # feval_RMSPE is evaluated on the transformed labels (log1p of the factor);
    # the fold score printed below is computed on the back-transformed values.
    # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed from
    # lgb.train in LightGBM 4.0 in favour of callbacks - confirm the installed version.
    model = lgb.train(params_lgbm, 
                      train_data, 
                      n_rounds, 
                      valid_sets=val_data, 
                      feval=feval_RMSPE,
                      verbose_eval= 250,
                      early_stopping_rounds=1000
                     )
    preds = model.predict(train.loc[val_index, features_to_consider])
    # Undo the target transform: factor = expm1(prediction), volatility = factor * vola_1.
    preds =np.expm1(preds)*vola_1
    train.loc[val_index, pred_name] = preds
    score = round(rmspe(y_true = np.expm1(y_val)*vola_1, y_pred = preds),5)
    
    print('Fold {} {}: {}'.format(counter, model_name, score))
    scores_folds[model_name].append(score)
    counter += 1
    
    
    # Sum the raw (still transformed) predictions of this fold's model.
    test['multi_factor'] += model.predict(test[features_to_consider])
    sample_df['multi_factor'] += model.predict(sample_df[features_to_consider])
    
del train_data, val_data

In [None]:
# Gain-based feature importance of `model` (the model of the last fold),
# plotted for the 30 most important features.
importances = pd.DataFrame({
    'Feature': model.feature_name(),
    'Importance': model.feature_importance(importance_type='gain'),
}).sort_values(by='Importance')

importances2 = (
    importances
    .nlargest(30, 'Importance', keep='first')
    .sort_values(by='Importance', ascending=True)
)
importances2[['Importance', 'Feature']].plot(
    kind='barh', x='Feature', figsize=(8, 8), color='blue', fontsize=11
)
plt.ylabel('Feature', fontsize=12)

In [None]:
# Inspect one training row (row_id '31-4142'): inputs, true target and out-of-fold prediction.
train[['row_id', 'vola_1', 'return1_diff_100', 'target', 'pred_lgb1']].query("row_id=='31-4142'")

In [None]:
# Average the fold-summed prediction, undo log1p and scale by vola_1 to get the
# predicted target for the labelled sample; shown next to 'orignal_target'.
sample_df['target'] = np.expm1(sample_df[target_name]/n_folds )*sample_df['vola_1']
display(sample_df[['row_id',  'vola_1', 'return1_diff_100', 'orignal_target', 'multi_factor','target']].tail(4))

In [None]:
# Same back-transform for the test rows: fold average -> expm1 -> times vola_1.
test['target'] = np.expm1(test[target_name]/n_folds )*test['vola_1']
display(test[['row_id', 'target']].head(4))

In [None]:
# Write the submission file with the two columns row_id and target (no index column).
test[['row_id', 'target']].to_csv('submission.csv',index = False)