# Libraries

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import gc
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import StratifiedGroupKFold

# Configurations

In [2]:
class CONFIG:
    """Notebook-wide settings: target column, lag-column naming and split parameters."""

    # Responder that the integer `label` column is derived from.
    target_col = "responder_6"

    # Columns copied out of the training frame to build the lag table:
    # the two join keys followed by responder_0 .. responder_8.
    lag_cols_original = ["date_id", "symbol_id", *(f"responder_{k}" for k in range(9))]

    # responder_N -> responder_N_lag_1 mapping applied to the lag table.
    lag_cols_rename = dict(
        (f"responder_{k}", f"responder_{k}_lag_1") for k in range(9)
    )

    # Share of rows (taken from the end of the frame) held out for validation.
    valid_ratio = 0.05

    # Not referenced by any of the visible cells.
    start_dt = 1100

    # Lower date_id bound used when loading the data: the last partition
    # (presumably the first date_id of that partition -- TODO confirm).
    chg_date_3 = 1530

In [3]:
# Per-column centring constants for feature_00 .. feature_78, keyed by column name.
# Passed to normalize_dataframe(), which subtracts the matching entry from each column.
# NOTE(review): how these values were computed is not shown in this notebook -- confirm provenance.
means = {'feature_00': 0.640198826789856, 'feature_01': 0.03755598142743111, 'feature_02': 0.6368075609207153, 'feature_03': 0.6365063786506653, 'feature_04': 0.013741530478000641, 'feature_05': -0.02173694409430027, 'feature_06': -0.006415014620870352, 'feature_07': -0.010971736162900925, 'feature_08': -0.04653771221637726, 'feature_09': 32.596106194690265, 'feature_10': 4.95929203539823, 'feature_11': 167.6541592920354, 'feature_12': -0.13415881991386414, 'feature_13': -0.07573335617780685, 'feature_14': -0.12015637010335922, 'feature_15': -0.7470195889472961, 'feature_16': -0.6257441639900208, 'feature_17': -0.7294047474861145, 'feature_18': -0.042215555906295776, 'feature_19': -0.08798160403966904, 'feature_20': -0.15741558372974396, 'feature_21': 0.10528526455163956, 'feature_22': 0.018054703250527382, 'feature_23': 0.03165541961789131, 'feature_24': 2.733017921447754, 'feature_25': 0.39958420395851135, 'feature_26': -0.11045943945646286, 'feature_27': -0.5332594513893127, 'feature_28': -0.4522790312767029, 'feature_29': -0.5739678144454956, 'feature_30': -0.7905704975128174, 'feature_31': 0.10600688308477402, 'feature_32': 0.40044134855270386, 'feature_33': -0.021725023165345192, 'feature_34': 0.4226262867450714, 'feature_35': 0.42143046855926514, 'feature_36': -0.00023802756913937628, 'feature_37': 0.027961043640971184, 'feature_38': 0.010258913040161133, 'feature_39': 0.005768273025751114, 'feature_40': 0.017485467717051506, 'feature_41': 0.038347117602825165, 'feature_42': -0.06123563274741173, 'feature_43': -0.11644423753023148, 'feature_44': -0.12342483550310135, 'feature_45': -0.028769943863153458, 'feature_46': -0.015200662426650524, 'feature_47': 0.015717582777142525, 'feature_48': -0.0033910537604242563, 'feature_49': -0.0052393232472240925, 'feature_50': -0.2285808026790619, 'feature_51': -0.3548349440097809, 'feature_52': -0.358092725276947, 'feature_53': 0.2607136368751526, 'feature_54': 0.18796788156032562, 'feature_55': 0.3154229521751404, 
'feature_56': -0.1471923440694809, 'feature_57': 0.15730056166648865, 'feature_58': -0.021774644032120705, 'feature_59': -0.0037768862675875425, 'feature_60': -0.010220836848020554, 'feature_61': -0.03178725391626358, 'feature_62': -0.3769100308418274, 'feature_63': -0.3229374587535858, 'feature_64': -0.3718394339084625, 'feature_65': -0.10233989357948303, 'feature_66': -0.13688170909881592, 'feature_67': -0.14402112364768982, 'feature_68': -0.06875362992286682, 'feature_69': -0.11862917989492416, 'feature_70': -0.11789549142122269, 'feature_71': -0.06013699993491173, 'feature_72': -0.10766122490167618, 'feature_73': -0.09921672940254211, 'feature_74': -0.10233042389154434, 'feature_75': -0.05991339311003685, 'feature_76': -0.06349952518939972, 'feature_77': -0.07424316555261612, 'feature_78': -0.07759837061166763}
# Per-column scaling constants for feature_00 .. feature_78, keyed by column name.
# Passed to normalize_dataframe(), which divides each centred column by the matching entry.
# NOTE(review): how these values were computed is not shown in this notebook -- confirm provenance.
stds = {'feature_00': 1.027751088142395, 'feature_01': 1.0967519283294678, 'feature_02': 1.0156300067901611, 'feature_03': 1.0170334577560425, 'feature_04': 1.0726385116577148, 'feature_05': 0.9639211297035217, 'feature_06': 1.0963259935379028, 'feature_07': 1.0789952278137207, 'feature_08': 0.7962697148323059, 'feature_09': 23.72976726545254, 'feature_10': 3.1867162933797224, 'feature_11': 163.44513161352285, 'feature_12': 0.6700984835624695, 'feature_13': 0.5805172920227051, 'feature_14': 0.664044201374054, 'feature_15': 0.37517768144607544, 'feature_16': 0.3393096327781677, 'feature_17': 0.3603287935256958, 'feature_18': 0.9911752939224243, 'feature_19': 1.0550744533538818, 'feature_20': 0.6643751263618469, 'feature_21': 0.38239365816116333, 'feature_22': 0.950261116027832, 'feature_23': 0.8119344711303711, 'feature_24': 1.4362775087356567, 'feature_25': 1.0947270393371582, 'feature_26': 1.077124834060669, 'feature_27': 1.0645726919174194, 'feature_28': 1.0676648616790771, 'feature_29': 0.2640742361545563, 'feature_30': 0.19689509272575378, 'feature_31': 0.3815343976020813, 'feature_32': 1.2996565103530884, 'feature_33': 0.9989405870437622, 'feature_34': 1.3409572839736938, 'feature_35': 1.3365675210952759, 'feature_36': 0.8695492148399353, 'feature_37': 0.7334080934524536, 'feature_38': 0.698810338973999, 'feature_39': 0.7965824604034424, 'feature_40': 0.518515944480896, 'feature_41': 0.6384949088096619, 'feature_42': 0.8168442249298096, 'feature_43': 0.5228385925292969, 'feature_44': 0.6521403193473816, 'feature_45': 0.8666537404060364, 'feature_46': 0.9039222002029419, 'feature_47': 3.2711963653564453, 'feature_48': 0.6570901274681091, 'feature_49': 0.7083076238632202, 'feature_50': 1.0132617950439453, 'feature_51': 0.6081287860870361, 'feature_52': 0.9250587224960327, 'feature_53': 1.0421689748764038, 'feature_54': 0.5859629511833191, 'feature_55': 0.9191848039627075, 'feature_56': 0.9549097418785095, 'feature_57': 1.0204777717590332, 'feature_58': 
0.8327276110649109, 'feature_59': 0.8309783339500427, 'feature_60': 0.8389413356781006, 'feature_61': 1.192766547203064, 'feature_62': 1.388945460319519, 'feature_63': 0.09957146644592285, 'feature_64': 0.3396177291870117, 'feature_65': 1.01683509349823, 'feature_66': 1.0824761390686035, 'feature_67': 0.642227828502655, 'feature_68': 0.5312599539756775, 'feature_69': 0.6208390593528748, 'feature_70': 0.6724499464035034, 'feature_71': 0.5356909036636353, 'feature_72': 0.6534596681594849, 'feature_73': 1.0855497121810913, 'feature_74': 1.0880277156829834, 'feature_75': 1.2321789264678955, 'feature_76': 1.2345560789108276, 'feature_77': 1.0921478271484375, 'feature_78': 1.0924347639083862}

# Redefined based on the full dataset

def normalize_dataframe(df: pl.DataFrame, means: dict, stds: dict) -> pl.DataFrame:
    """Standardise the columns of ``df`` that have an entry in both ``means`` and ``stds``.

    Each such column becomes ``(column - means[column]) / stds[column]`` and keeps
    its original name. A column whose standard deviation is exactly 0 is only
    centred, so no division by zero takes place.

    Args:
        df: Frame holding the columns to normalise.
        means: Mapping from column name to the value subtracted from that column.
        stds: Mapping from column name to the value that column is divided by.

    Returns:
        A new frame containing ONLY the normalised columns, in the order they
        appear in ``df``. Columns missing from either mapping are dropped.
    """
    normalize_exprs = []

    for col in df.columns:
        # Skip anything that lacks one of the two statistics.
        if col not in means or col not in stds:
            continue

        expr = pl.col(col) - means[col]
        if stds[col] != 0:
            expr = expr / stds[col]

        # Alias the expression itself. The previous zero-std branch called
        # `.alias` on the result of `list.append` (None) and raised AttributeError.
        normalize_exprs.append(expr.alias(col))

    return df.select(normalize_exprs)

# Load training data & select

In [4]:
# Lazily load the training data, keeping only rows with date_id >= CONFIG.chg_date_3.

train = pl.scan_parquet(
    "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
).select(
    # Row number over the whole file; assigned before the filter below, so the
    # ids of the retained rows do not start at 0.
    pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"),
    pl.all(),
).with_columns(
    # Integer label: target * 2, then cast to Int32 (the fractional part is
    # dropped, e.g. 1.979 -> 3 and -2.59 -> -5).
    (pl.col(CONFIG.target_col)*2).cast(pl.Int32).alias("label"),
).filter(
    (pl.col("date_id") >= CONFIG.chg_date_3) # keep only the most recent dates
).fill_null(
    # NOTE(review): this forward fill runs over the whole frame in row order and is
    # not grouped by symbol_id, so a gap can be filled with the previous row's value
    # from a different symbol -- confirm that is intended.
    strategy="forward"
).fill_null(
    value=0 # whatever is still null after the forward fill is simply set to 0
)

In [5]:
# Run the lazy query, then hand the materialised frame over to pandas.
train = train.collect()
train = train.to_pandas()
train.head()

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,...,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label
0,40852762,1530,0,0,3.084694,1.153571,1.563784,0.697396,0.756759,2.580965,...,0.323897,0.601499,2.074103,0.746552,0.552013,3.071231,0.914794,0.997124,9,6
1,40852763,1530,0,1,2.232906,0.553354,1.730064,0.990195,0.61149,2.023031,...,-0.399384,-0.635306,2.092151,0.342582,0.757289,1.979042,0.967537,1.219739,9,3
2,40852764,1530,0,2,2.404948,1.532503,2.095852,0.919688,0.583715,2.330047,...,-0.006571,0.51887,-0.344441,0.641694,-0.64604,-0.50626,0.739797,-2.041514,9,-1
3,40852765,1530,0,3,1.986533,0.647099,1.68746,0.569406,1.061679,2.444131,...,-0.235901,-0.428956,-1.903627,-1.214619,-0.4695,-2.590589,-0.946317,-0.390001,9,-5
4,40852766,1530,0,4,2.742601,1.096778,1.551411,0.632113,0.368218,2.181873,...,-0.209282,-0.095182,-1.598217,0.968505,-0.705594,-1.579623,0.954296,-1.805623,9,-3


In [6]:
# Normalise the feature columns in polars and bring the result back as a pandas frame.
tmp_normed = normalize_dataframe(pl.DataFrame(train), means, stds).to_pandas()

In [7]:
# Preview the normalised frame; it holds only the columns that normalize_dataframe returned.
tmp_normed.head()

Unnamed: 0,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,...,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
0,0.499511,1.391589,0.059656,0.118239,2.393372,0.200274,1.03324,0.507069,0.130217,-0.910085,...,0.985952,-1.250955,2.510017,-0.256836,0.091398,0.094051,3.447852,3.021115,0.844592,0.985976
1,-0.0845,1.5432,0.347949,-0.024597,1.873222,0.353507,1.085249,0.531814,0.13102,-0.910085,...,0.245522,-1.25248,0.684429,-0.634124,0.091398,0.094051,0.972306,1.330793,0.232393,0.218051
2,0.86821,1.87672,0.278527,-0.051907,2.159446,0.372264,1.157184,0.469902,0.150821,2.039796,...,-0.055353,-1.638039,0.782851,-0.808737,0.091398,0.094051,0.410088,0.294533,0.028503,0.010836
3,0.006714,1.504355,-0.066364,0.418051,2.265805,0.17867,0.823622,0.664158,0.14955,-1.205073,...,-0.629897,-1.904379,0.544011,-0.746161,0.091398,0.094051,5.725927,5.829973,0.391163,0.552786
4,0.444251,1.380308,-0.004622,-0.263795,2.021307,0.245187,1.095154,0.252112,0.115493,-0.74152,...,-0.651171,-1.495421,0.337533,-0.769647,0.091398,0.094051,0.449206,0.505704,-1.152199,-0.812885


In [8]:
# Overwrite the raw feature columns of `train` with their normalised values.
#
# DataFrame.update writes into the existing columns. Columns whose dtype differs
# from the normalised dtype (the integer-typed features) are cast first; otherwise
# pandas emits "has dtype incompatible with int8/int16" FutureWarnings and will
# reject the write in a future release.
for col in tmp_normed.columns:
    if train[col].dtype != tmp_normed[col].dtype:
        train[col] = train[col].astype(tmp_normed[col].dtype)

train.update(tmp_normed)
train.head()

  0.73342033]' has dtype incompatible with int8, please explicitly cast to a compatible dtype first.
  train.update(tmp_normed)
 -1.24243631]' has dtype incompatible with int8, please explicitly cast to a compatible dtype first.
  train.update(tmp_normed)
  2.16798039]' has dtype incompatible with int16, please explicitly cast to a compatible dtype first.
  train.update(tmp_normed)


Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,...,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label
0,40852762,1530,0,0,3.084694,0.499511,1.391589,0.059656,0.118239,2.393372,...,0.323897,0.601499,2.074103,0.746552,0.552013,3.071231,0.914794,0.997124,9,6
1,40852763,1530,0,1,2.232906,-0.0845,1.5432,0.347949,-0.024597,1.873222,...,-0.399384,-0.635306,2.092151,0.342582,0.757289,1.979042,0.967537,1.219739,9,3
2,40852764,1530,0,2,2.404948,0.86821,1.87672,0.278527,-0.051907,2.159446,...,-0.006571,0.51887,-0.344441,0.641694,-0.64604,-0.50626,0.739797,-2.041514,9,-1
3,40852765,1530,0,3,1.986533,0.006714,1.504355,-0.066364,0.418051,2.265805,...,-0.235901,-0.428956,-1.903627,-1.214619,-0.4695,-2.590589,-0.946317,-0.390001,9,-5
4,40852766,1530,0,4,2.742601,0.444251,1.380308,-0.004622,-0.263795,2.021307,...,-0.209282,-0.095182,-1.598217,0.968505,-0.705594,-1.579623,0.954296,-1.805623,9,-3


In [9]:
# Convert the updated pandas frame back into a polars DataFrame for the lag and join steps.
train = pl.DataFrame(train)

# Create Lags data from training data

In [10]:
# Build the lag table: for every (date_id, symbol_id) keep the responders of the
# last record of that day, relabelled as belonging to the following day.
lags = (
    train.select(pl.col(CONFIG.lag_cols_original))
    .rename(CONFIG.lag_cols_rename)
    # Shift date_id forward by one so each row lines up with the next day.
    .with_columns((pl.col("date_id") + 1).alias("date_id"))
    # maintain_order keeps groups in order of first appearance; .last() takes
    # the final record of each (date_id, symbol_id) group.
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()
)
lags

date_id,symbol_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32
1531,0,-0.111643,-0.098582,-2.035532,0.772288,0.356617,-0.269562,-0.042934,0.009101,-0.106585
1531,1,1.982551,0.174802,2.422206,0.574457,0.197794,0.750299,-0.263879,-0.138759,-0.672006
1531,2,-0.658825,-0.717583,-0.137797,0.549591,0.308446,0.166129,-0.130733,-0.055674,-0.452719
1531,3,1.554449,0.562713,-0.514036,0.353216,0.235419,-0.11215,-0.087656,-0.010191,-0.223291
1531,4,0.500255,0.198204,2.345295,0.172461,0.09778,0.780662,-0.157494,-0.062953,-0.330664
…,…,…,…,…,…,…,…,…,…,…
1699,34,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461
1699,35,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063
1699,36,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517
1699,37,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395


# Merge training data and lags data

In [11]:
# Attach the lagged responders to every training row by (date_id, symbol_id).
# A left join keeps all training rows; rows without a matching lag entry (for
# example the first loaded date, which has no previous day in this frame) get
# nulls in the *_lag_1 columns.
# NOTE: `train` is rebound here, so re-running this cell joins the lag columns again.
train = train.join(
    lags,
    on=["date_id", "symbol_id"],
    how="left",
)
train

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32
40852762,1530,0,0,3.084694,0.499511,1.391589,0.059656,0.118239,2.393372,0.200274,1.03324,0.507069,0.130217,-0.910085,0.64038,-0.560764,-0.779183,3.765472,0.400486,1.991109,1.245036,2.024276,-1.206654,-2.091535,0.141268,-0.662939,1.792462,0.756125,-0.873898,0.117753,1.173064,0.938796,0.559958,0.102707,0.488256,-0.707417,…,-0.060013,-0.931456,-0.252875,-1.211234,-1.937454,-0.323892,6.011952,0.985952,-1.250955,2.510017,-0.256836,0.091398,0.094051,3.447852,3.021115,0.844592,0.985976,0.417462,0.323897,0.601499,2.074103,0.746552,0.552013,3.071231,0.914794,0.997124,9,6,,,,,,,,,
40852763,1530,0,1,2.232906,-0.0845,1.5432,0.347949,-0.024597,1.873222,0.353507,1.085249,0.531814,0.13102,-0.910085,0.64038,-0.560764,-1.3869,1.917889,-0.203604,1.991109,0.905414,2.024276,-1.436592,-1.964066,0.251065,-0.165011,0.317551,0.137221,-0.603025,0.766088,-1.742014,0.052924,0.806168,-0.438616,0.869968,-0.064461,…,0.092257,0.36992,-0.245319,-1.666959,-2.216311,-0.919993,3.016481,0.245522,-1.25248,0.684429,-0.634124,0.091398,0.094051,0.972306,1.330793,0.232393,0.218051,-0.318671,-0.399384,-0.635306,2.092151,0.342582,0.757289,1.979042,0.967537,1.219739,9,3,,,,,,,,,
40852764,1530,0,2,2.404948,0.86821,1.87672,0.278527,-0.051907,2.159446,0.372264,1.157184,0.469902,0.150821,2.039796,-0.928634,-0.664775,-1.295046,2.03526,-0.277496,1.991109,0.276652,2.024276,-1.270575,-1.447565,0.920898,-0.769085,-0.003088,-0.235416,-0.942683,0.187247,0.786031,0.991258,0.252159,-0.22286,-0.24872,-0.827074,…,0.058283,-0.63556,0.078729,-1.471262,-1.824139,-0.990994,1.727234,-0.055353,-1.638039,0.782851,-0.808737,0.091398,0.094051,0.410088,0.294533,0.028503,0.010836,0.200878,-0.006571,0.51887,-0.344441,0.641694,-0.64604,-0.50626,0.739797,-2.041514,9,-1,,,,,,,,,
40852765,1530,0,3,1.986533,0.006714,1.504355,-0.066364,0.418051,2.265805,0.17867,0.823622,0.664158,0.14955,-1.205073,-0.614831,-0.958451,-1.242276,0.846063,-0.615656,1.991109,2.386341,2.024276,-1.891906,-1.745729,-0.518283,-0.543042,-0.332258,-1.331192,-1.252893,-0.919064,0.066196,0.24545,-0.438845,-1.334981,0.038435,-0.633564,…,-0.013778,1.774841,0.422454,-1.927524,-1.187028,-1.101101,0.891364,-0.629897,-1.904379,0.544011,-0.746161,0.091398,0.094051,5.725927,5.829973,0.391163,0.552786,-0.349773,-0.235901,-0.428956,-1.903627,-1.214619,-0.4695,-2.590589,-0.946317,-0.390001,9,-5,,,,,,,,,
40852766,1530,0,4,2.742601,0.444251,1.380308,-0.004622,-0.263795,2.021307,0.245187,1.095154,0.252112,0.115493,-0.74152,-1.242436,-0.970688,-1.135423,0.212984,-0.848328,1.991109,-0.037126,2.024276,-1.529391,-1.057173,-0.223391,0.382956,0.215408,-1.150255,1.040125,0.761322,-2.00925,-0.500362,0.20893,-0.089326,-1.014103,0.177179,…,-0.049093,-0.251247,-0.28985,-1.649936,-1.989867,-1.246919,0.181925,-0.651171,-1.495421,0.337533,-0.769647,0.091398,0.094051,0.449206,0.505704,-1.152199,-0.812885,-0.373938,-0.209282,-0.095182,-1.598217,0.968505,-0.705594,-1.579623,0.954296,-1.805623,9,-3,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
47127333,1698,967,34,3.242493,1.834064,-0.692533,1.877867,1.810274,0.37647,0.837775,1.025435,2.04858,0.580161,0.396291,0.012774,-0.108013,1.400633,2.12395,1.734372,0.20212,0.875731,0.675141,1.788172,0.077538,0.235215,-0.832511,0.962434,1.645062,-2.068691,-0.997691,-0.009937,1.525549,1.776374,-0.384962,-1.131028,-0.914452,…,-0.005705,0.473224,-0.106592,2.487577,0.026287,1.376077,1.692126,0.781065,2.157998,2.119947,2.916669,-0.077589,-0.080781,0.238702,0.332623,0.027268,0.086536,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461,9,0,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
47127334,1698,967,35,1.079139,1.184827,-0.755141,2.076181,1.674842,0.775027,0.698301,1.082448,1.832585,0.462256,-0.320109,0.64038,0.167309,0.087845,-0.132628,-0.126518,0.868608,2.483938,1.306111,1.934262,0.272836,-0.062922,-0.603839,-1.076128,-0.10295,-1.588908,-0.356563,0.254926,-0.382045,-0.64734,0.961459,2.086064,-0.651148,…,0.02379,0.692436,0.27386,2.117333,0.071481,0.17848,-0.061415,-0.111349,-0.093604,-0.048466,-0.070002,-0.089227,-0.067058,0.897348,0.650624,0.098854,0.117589,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063,9,0,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
47127335,1698,967,36,1.033172,1.824691,-0.647233,1.627012,1.853514,0.224992,0.976874,1.074893,2.031026,0.370475,0.691279,0.64038,0.791372,1.732393,-0.036447,0.518739,0.583485,-0.233439,0.068235,2.375662,0.394305,0.074088,-0.603839,-1.014183,-0.339718,-1.75943,-0.69143,0.254926,-0.382045,-0.616739,-0.023759,1.980761,-0.651148,…,0.025141,0.741234,0.083004,2.314729,0.682482,1.83591,-0.403733,0.36382,1.454465,0.157479,0.738398,-0.112124,-0.054459,0.07522,0.081315,0.222636,0.210476,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517,9,0,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
47127336,1698,967,37,1.243116,1.968472,-0.84492,1.65055,2.423639,0.289672,0.664661,1.087339,1.492774,0.459966,0.059162,-0.301028,0.283556,1.333345,0.619775,0.809157,0.362347,0.330163,-0.449639,1.909002,0.468912,-2.183676,-0.936072,-0.304789,-0.1023,-1.834514,-0.962381,0.263791,0.485418,0.044742,-0.012475,0.300371,-0.845183,…,0.013538,1.818338,0.343466,2.548679,0.623139,1.753471,-0.168007,0.412375,0.886687,1.573349,1.054513,-0.0063,-0.007983,0.181613,0.188594,0.033585,0.044044,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395,9,0,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708


# Split training data and validation data

In [12]:
# Split by date: roughly the last CONFIG.valid_ratio share of rows becomes the
# validation set, with the cut moved to a date boundary so no date_id is shared
# between the two sets.
# NOTE(review): assumes `train` is ordered by date_id (the displayed head/tail are
# consistent with that) -- confirm.
len_train   = train.shape[0]
valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records
# Date of the LAST row in the training portion. Index `len_ofl_mdl` itself is the
# first validation row (and is out of bounds when valid_records == 0), so the
# lookup uses `len_ofl_mdl - 1`, clamped at 0.
last_tr_dt  = train.select(pl.col("date_id")).row(max(len_ofl_mdl - 1, 0))[0]

print(f"\n len_train = {len_train}")
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
print(f"\n---> Last offline train date = {last_tr_dt}\n")

# Everything up to and including that date is training data; later dates are validation.
training_data = train.filter(pl.col("date_id").le(last_tr_dt))
validation_data = train.filter(pl.col("date_id").gt(last_tr_dt))


 len_train = 6274576

 len_ofl_mdl = 5960848

---> Last offline train date = 1690



In [13]:
# Display the validation split (rows with date_id greater than the last training date).
validation_data

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32
46826290,1691,0,0,3.725687,2.174799,1.700422,1.777762,1.611958,-1.784994,0.136469,-0.894757,-0.451468,-0.197641,-0.910085,0.64038,-0.560764,-1.398536,4.603804,0.307726,0.422075,1.602574,-0.081417,-1.81812,-1.44895,1.936515,-0.582772,1.975919,1.261393,-0.69666,1.142125,1.073541,1.592272,0.816611,-1.488494,-0.631275,-0.541859,…,0.000368,0.230374,-0.096989,-1.431326,-1.395397,-1.083865,2.400614,-0.099187,-0.886815,6.595269,0.900937,-0.082144,-0.141269,3.244081,5.320797,3.177561,4.122289,-0.300887,0.024018,0.26636,-1.846393,-1.319574,-1.649363,-2.37614,-2.129162,-3.700761,9,-4,-0.220147,-0.263234,0.314635,0.003016,0.004875,0.164086,0.015148,0.024748,0.016193
46826291,1691,0,1,3.344007,1.926211,0.856727,1.87192,1.355163,-2.649967,0.177514,-0.708286,-0.464213,-0.240851,-0.910085,0.64038,-0.560764,-1.382643,1.560727,-0.423562,0.422075,1.974938,-0.081417,-2.787019,-1.439849,1.804608,-0.09051,1.624237,0.996693,-1.063224,0.076583,-1.059632,0.324185,1.11346,-0.106605,0.37774,-0.122139,…,0.07158,0.453076,0.337309,-1.504624,-1.604561,-0.975691,0.954613,-0.754664,-1.328763,2.22035,-0.139829,-0.082144,-0.141269,1.168534,2.221594,0.482529,0.402646,-0.138919,-0.002569,0.175624,-2.282696,-2.430525,-3.05395,-3.764602,-2.666663,-5.0,9,-7,-1.077176,-0.22724,0.145465,0.119212,0.0406,0.258023,0.146226,0.08656,0.273702
46826292,1691,0,2,2.97724,1.217192,1.166066,1.794446,1.498346,-2.317598,0.131342,-0.752495,-0.754113,-0.426893,2.039796,-0.928634,-0.664775,-0.684741,1.151628,-0.34019,0.422075,0.257135,-0.081417,-0.738676,-1.815353,0.122224,-0.674488,0.695731,0.942926,-1.53274,-0.409942,0.898842,1.14319,0.398204,0.531089,1.530534,-0.782129,…,0.075425,0.411855,0.214407,-0.968207,-2.574945,-1.085329,1.201904,-0.509749,-0.962329,0.939316,-0.589747,-0.082144,-0.141269,0.105927,0.091016,-0.151619,-0.093128,-0.246979,-0.112181,-0.920523,0.785028,1.981712,-2.451337,1.154909,3.141321,-0.85232,9,2,-1.273494,-0.254899,-0.731619,0.295771,0.182442,-0.137669,-0.097605,-0.00902,-0.20872
46826293,1691,0,3,1.978776,1.40807,0.843012,1.916022,1.768141,-2.472045,0.199619,-1.128111,-0.609235,-0.370962,-1.205073,-0.614831,-0.958451,-0.829923,3.86456,0.24509,0.422075,1.062493,-0.081417,-2.097163,-2.032649,0.212301,-0.425554,0.37655,0.7353,-1.640007,-0.623054,0.091123,0.09971,-0.085562,-0.218938,0.985046,-0.3913,…,0.233994,2.304885,0.879985,-1.945667,-3.240269,-0.971295,3.960341,0.144013,-0.903143,3.988668,0.434363,-0.082144,-0.141269,1.123385,1.559152,0.196944,0.202918,0.085344,-0.032906,0.097499,-0.741609,-0.503937,0.042153,-0.994788,-0.568593,-0.002262,9,-1,-1.151294,-0.786059,0.122825,-0.228032,-0.10892,0.266399,0.264185,0.120004,0.335621
46826294,1691,0,4,2.612476,1.826925,1.039475,1.952895,2.051906,-2.64141,0.082287,-0.60732,-0.341058,-0.120292,-0.74152,-1.242436,-0.970688,-1.29748,0.711712,-0.605696,0.422075,-0.387647,-0.081417,-1.4871,-2.63563,-1.240343,2.111321,0.930969,0.162271,1.142982,2.165401,-1.290798,-0.7571,-0.185446,0.018131,0.334925,1.572323,…,-0.00614,-0.093082,-0.034937,-1.071342,-1.616501,-1.283037,0.839491,-0.559032,-1.140646,-0.217996,-0.889073,-0.082144,-0.141269,0.407777,0.422105,-0.063591,-0.054679,0.115972,0.124892,-0.05685,0.266993,0.023727,-0.544508,0.337255,0.019037,-0.882814,9,0,0.688146,0.658165,-0.366572,-0.078787,-0.056613,-0.170136,-0.007079,0.023542,-0.032158
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
47127333,1698,967,34,3.242493,1.834064,-0.692533,1.877867,1.810274,0.37647,0.837775,1.025435,2.04858,0.580161,0.396291,0.012774,-0.108013,1.400633,2.12395,1.734372,0.20212,0.875731,0.675141,1.788172,0.077538,0.235215,-0.832511,0.962434,1.645062,-2.068691,-0.997691,-0.009937,1.525549,1.776374,-0.384962,-1.131028,-0.914452,…,-0.005705,0.473224,-0.106592,2.487577,0.026287,1.376077,1.692126,0.781065,2.157998,2.119947,2.916669,-0.077589,-0.080781,0.238702,0.332623,0.027268,0.086536,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461,9,0,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
47127334,1698,967,35,1.079139,1.184827,-0.755141,2.076181,1.674842,0.775027,0.698301,1.082448,1.832585,0.462256,-0.320109,0.64038,0.167309,0.087845,-0.132628,-0.126518,0.868608,2.483938,1.306111,1.934262,0.272836,-0.062922,-0.603839,-1.076128,-0.10295,-1.588908,-0.356563,0.254926,-0.382045,-0.64734,0.961459,2.086064,-0.651148,…,0.02379,0.692436,0.27386,2.117333,0.071481,0.17848,-0.061415,-0.111349,-0.093604,-0.048466,-0.070002,-0.089227,-0.067058,0.897348,0.650624,0.098854,0.117589,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063,9,0,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
47127335,1698,967,36,1.033172,1.824691,-0.647233,1.627012,1.853514,0.224992,0.976874,1.074893,2.031026,0.370475,0.691279,0.64038,0.791372,1.732393,-0.036447,0.518739,0.583485,-0.233439,0.068235,2.375662,0.394305,0.074088,-0.603839,-1.014183,-0.339718,-1.75943,-0.69143,0.254926,-0.382045,-0.616739,-0.023759,1.980761,-0.651148,…,0.025141,0.741234,0.083004,2.314729,0.682482,1.83591,-0.403733,0.36382,1.454465,0.157479,0.738398,-0.112124,-0.054459,0.07522,0.081315,0.222636,0.210476,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517,9,0,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
47127336,1698,967,37,1.243116,1.968472,-0.84492,1.65055,2.423639,0.289672,0.664661,1.087339,1.492774,0.459966,0.059162,-0.301028,0.283556,1.333345,0.619775,0.809157,0.362347,0.330163,-0.449639,1.909002,0.468912,-2.183676,-0.936072,-0.304789,-0.1023,-1.834514,-0.962381,0.263791,0.485418,0.044742,-0.012475,0.300371,-0.845183,…,0.013538,1.818338,0.343466,2.548679,0.623139,1.753471,-0.168007,0.412375,0.886687,1.573349,1.054513,-0.0063,-0.007983,0.181613,0.188594,0.033585,0.044044,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395,9,0,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708


# Save data as parquets

In [14]:
# Persist the training split to the working directory.
training_data.write_parquet("training_data.parquet")

In [15]:
# Persist the validation split to the working directory.
validation_data.write_parquet("validation_data.parquet")