# Libraries

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import gc
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import StratifiedGroupKFold

# Configurations

In [2]:
class CONFIG:
    target_col = "responder_6"
    lag_cols_original = ["date_id", "symbol_id"] + [f"responder_{idx}" for idx in range(9)]
    lag_cols_rename = { f"responder_{idx}" : f"responder_{idx}_lag_1" for idx in range(9)}
    valid_ratio = 0.05
    start_dt = 1100
    chg_date_1 = 484
    chg_date_2 = 677
    chg_date_3 = 1360

In [3]:
means = {'feature_00': 0.640198826789856, 'feature_01': 0.03755598142743111, 'feature_02': 0.6368075609207153, 'feature_03': 0.6365063786506653, 'feature_04': 0.013741530478000641, 'feature_05': -0.02173694409430027, 'feature_06': -0.006415014620870352, 'feature_07': -0.010971736162900925, 'feature_08': -0.04653771221637726, 'feature_09': 32.596106194690265, 'feature_10': 4.95929203539823, 'feature_11': 167.6541592920354, 'feature_12': -0.13415881991386414, 'feature_13': -0.07573335617780685, 'feature_14': -0.12015637010335922, 'feature_15': -0.7470195889472961, 'feature_16': -0.6257441639900208, 'feature_17': -0.7294047474861145, 'feature_18': -0.042215555906295776, 'feature_19': -0.08798160403966904, 'feature_20': -0.15741558372974396, 'feature_21': 0.10528526455163956, 'feature_22': 0.018054703250527382, 'feature_23': 0.03165541961789131, 'feature_24': 2.733017921447754, 'feature_25': 0.39958420395851135, 'feature_26': -0.11045943945646286, 'feature_27': -0.5332594513893127, 'feature_28': -0.4522790312767029, 'feature_29': -0.5739678144454956, 'feature_30': -0.7905704975128174, 'feature_31': 0.10600688308477402, 'feature_32': 0.40044134855270386, 'feature_33': -0.021725023165345192, 'feature_34': 0.4226262867450714, 'feature_35': 0.42143046855926514, 'feature_36': -0.00023802756913937628, 'feature_37': 0.027961043640971184, 'feature_38': 0.010258913040161133, 'feature_39': 0.005768273025751114, 'feature_40': 0.017485467717051506, 'feature_41': 0.038347117602825165, 'feature_42': -0.06123563274741173, 'feature_43': -0.11644423753023148, 'feature_44': -0.12342483550310135, 'feature_45': -0.028769943863153458, 'feature_46': -0.015200662426650524, 'feature_47': 0.015717582777142525, 'feature_48': -0.0033910537604242563, 'feature_49': -0.0052393232472240925, 'feature_50': -0.2285808026790619, 'feature_51': -0.3548349440097809, 'feature_52': -0.358092725276947, 'feature_53': 0.2607136368751526, 'feature_54': 0.18796788156032562, 'feature_55': 0.3154229521751404, 'feature_56': -0.1471923440694809, 'feature_57': 0.15730056166648865, 'feature_58': -0.021774644032120705, 'feature_59': -0.0037768862675875425, 'feature_60': -0.010220836848020554, 'feature_61': -0.03178725391626358, 'feature_62': -0.3769100308418274, 'feature_63': -0.3229374587535858, 'feature_64': -0.3718394339084625, 'feature_65': -0.10233989357948303, 'feature_66': -0.13688170909881592, 'feature_67': -0.14402112364768982, 'feature_68': -0.06875362992286682, 'feature_69': -0.11862917989492416, 'feature_70': -0.11789549142122269, 'feature_71': -0.06013699993491173, 'feature_72': -0.10766122490167618, 'feature_73': -0.09921672940254211, 'feature_74': -0.10233042389154434, 'feature_75': -0.05991339311003685, 'feature_76': -0.06349952518939972, 'feature_77': -0.07424316555261612, 'feature_78': -0.07759837061166763}
stds = {'feature_00': 1.027751088142395, 'feature_01': 1.0967519283294678, 'feature_02': 1.0156300067901611, 'feature_03': 1.0170334577560425, 'feature_04': 1.0726385116577148, 'feature_05': 0.9639211297035217, 'feature_06': 1.0963259935379028, 'feature_07': 1.0789952278137207, 'feature_08': 0.7962697148323059, 'feature_09': 23.72976726545254, 'feature_10': 3.1867162933797224, 'feature_11': 163.44513161352285, 'feature_12': 0.6700984835624695, 'feature_13': 0.5805172920227051, 'feature_14': 0.664044201374054, 'feature_15': 0.37517768144607544, 'feature_16': 0.3393096327781677, 'feature_17': 0.3603287935256958, 'feature_18': 0.9911752939224243, 'feature_19': 1.0550744533538818, 'feature_20': 0.6643751263618469, 'feature_21': 0.38239365816116333, 'feature_22': 0.950261116027832, 'feature_23': 0.8119344711303711, 'feature_24': 1.4362775087356567, 'feature_25': 1.0947270393371582, 'feature_26': 1.077124834060669, 'feature_27': 1.0645726919174194, 'feature_28': 1.0676648616790771, 'feature_29': 0.2640742361545563, 'feature_30': 0.19689509272575378, 'feature_31': 0.3815343976020813, 'feature_32': 1.2996565103530884, 'feature_33': 0.9989405870437622, 'feature_34': 1.3409572839736938, 'feature_35': 1.3365675210952759, 'feature_36': 0.8695492148399353, 'feature_37': 0.7334080934524536, 'feature_38': 0.698810338973999, 'feature_39': 0.7965824604034424, 'feature_40': 0.518515944480896, 'feature_41': 0.6384949088096619, 'feature_42': 0.8168442249298096, 'feature_43': 0.5228385925292969, 'feature_44': 0.6521403193473816, 'feature_45': 0.8666537404060364, 'feature_46': 0.9039222002029419, 'feature_47': 3.2711963653564453, 'feature_48': 0.6570901274681091, 'feature_49': 0.7083076238632202, 'feature_50': 1.0132617950439453, 'feature_51': 0.6081287860870361, 'feature_52': 0.9250587224960327, 'feature_53': 1.0421689748764038, 'feature_54': 0.5859629511833191, 'feature_55': 0.9191848039627075, 'feature_56': 0.9549097418785095, 'feature_57': 1.0204777717590332, 'feature_58': 0.8327276110649109, 'feature_59': 0.8309783339500427, 'feature_60': 0.8389413356781006, 'feature_61': 1.192766547203064, 'feature_62': 1.388945460319519, 'feature_63': 0.09957146644592285, 'feature_64': 0.3396177291870117, 'feature_65': 1.01683509349823, 'feature_66': 1.0824761390686035, 'feature_67': 0.642227828502655, 'feature_68': 0.5312599539756775, 'feature_69': 0.6208390593528748, 'feature_70': 0.6724499464035034, 'feature_71': 0.5356909036636353, 'feature_72': 0.6534596681594849, 'feature_73': 1.0855497121810913, 'feature_74': 1.0880277156829834, 'feature_75': 1.2321789264678955, 'feature_76': 1.2345560789108276, 'feature_77': 1.0921478271484375, 'feature_78': 1.0924347639083862}

def normalize_dataframe(df: pl.DataFrame, means: dict, stds: dict) -> pl.DataFrame:
    # 폴라 데이터프레임을 평균과 표준편차로 정규화
    normalize_exprs = []

    for col in df.columns:
        if col in means and col in stds: # 정규화 되어야 하는 열만 존재하도록
            if stds[col] != 0: # 0인 차원 적용 X
                # 정규화 하되 이름은 유지하도록
                normalize_exprs.append(
                    ((pl.col(col) - means[col]) / stds[col]).alias(col)
                )
            else:
                normalize_exprs.append(pl.col(col) - means[col]).alias(col)

    normalized_df = df.select(normalize_exprs) # 정규화된 데이터프레임 생성
    return normalized_df

# Load training data & select

In [4]:
# Use last 2 parquets

train = pl.scan_parquet(
    f"/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
).select(
    pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"),
    pl.all(),
).with_columns(
    (pl.col(CONFIG.target_col)*2).cast(pl.Int32).alias("label"),
).filter(
    (pl.col("date_id") >= CONFIG.chg_date_1 - 50) & (pl.col("date_id") <= CONFIG.chg_date_1 + 50)
    | (pl.col("date_id") >= CONFIG.chg_date_2 - 50) & (pl.col("date_id") <= CONFIG.chg_date_2 + 50)
    | (pl.col("date_id") >= CONFIG.chg_date_3) # 데이터를 초반 두 번의 변화 두 번 전후 50일씩, 그리고 마지막 2 parquet을 가져옴.
).fill_null(
    strategy="forward"
).fill_null(
    value=0
)

In [5]:
col_names = train.collect_schema().names()

# col_names를 사용하여 이후 처리
normalize_exprs = []
for col in col_names:
    if col in means and col in stds:
        if stds[col] != 0:
            normalize_exprs.append(
                ((pl.col(col) - means[col]) / stds[col]).alias(col)
            )
        else:
            normalize_exprs.append((pl.col(col) - means[col]).alias(col))

# LazyFrame에 정규화 적용
train = train.with_columns(normalize_exprs)

In [6]:
train = train.collect().to_pandas()
train.head()

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,...,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label
0,6316560,434,0,0,2.274729,0.350401,0.833223,-0.290182,0.164869,2.051046,...,0.27283,-0.365024,0.3572,0.438255,0.094859,0.500254,0.510357,0.463678,2,1
1,6316561,434,0,1,4.780962,0.441058,0.799324,-0.236335,-0.021088,1.812573,...,0.274102,1.325831,0.572652,0.448252,-0.229782,0.294333,0.445521,-1.11665,2,0
2,6316562,434,0,2,1.162832,0.063179,0.820028,0.426863,0.115897,2.174137,...,-0.18142,-0.155896,0.089658,0.646632,-0.111775,0.199509,0.66334,0.377528,2,0
3,6316563,434,0,3,1.045792,0.233065,1.123905,0.152825,0.395505,2.364723,...,-0.124641,-1.419458,-0.562762,-0.145655,-1.202561,-0.797129,-0.150784,-1.485704,2,-1
4,6316564,434,0,7,2.628148,-0.277403,1.166836,-0.150327,0.51965,2.000447,...,-0.076131,0.098204,-0.406867,-0.324423,-0.71359,-0.433388,-0.366702,-0.916464,2,0


In [7]:
tmp_normed = normalize_dataframe(pl.DataFrame(train), means, stds)
tmp_normed = tmp_normed.to_pandas()

In [8]:
tmp_normed.head()

Unnamed: 0,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,...,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
0,-0.281973,0.725476,-0.912723,-0.463738,1.899339,0.069323,0.700608,0.447055,0.17491,-1.41199,...,-0.871308,-1.979774,0.685526,-0.829686,0.175593,0.180493,-0.057315,-0.064112,-0.11984,-0.172663
1,-0.193763,0.694567,-0.859705,-0.646581,1.677015,0.078985,0.781937,0.64965,0.185389,-1.41199,...,0.118375,-0.943577,0.985183,-0.98296,0.175593,0.180493,-0.033572,-0.049455,-0.074194,-0.192486
2,-0.561439,0.713445,-0.206713,-0.51189,2.014095,0.116023,1.095043,0.771529,0.202319,-1.287678,...,-0.65802,-1.628942,8.621769,0.331158,0.175593,0.180493,0.491979,0.334167,0.040144,0.034893
3,-0.39614,0.990514,-0.476535,-0.236965,2.191775,0.11062,0.943477,0.427947,0.225898,-1.424421,...,0.219942,-0.620904,1.635924,-0.444471,0.175593,0.180493,3.538906,3.340486,5.984796,6.022639
4,-0.892825,1.029658,-0.775021,-0.114899,1.852167,0.08486,0.922487,0.481041,0.206733,-1.41199,...,-0.523144,-1.551417,1.27661,-0.932359,0.175593,0.180493,-0.041877,-0.084179,-0.109762,-0.039549


In [9]:
train.update(tmp_normed)
train.head()

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,...,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label
0,6316560,434,0,0,2.274729,-0.281973,0.725476,-0.912723,-0.463738,1.899339,...,0.27283,-0.365024,0.3572,0.438255,0.094859,0.500254,0.510357,0.463678,2,1
1,6316561,434,0,1,4.780962,-0.193763,0.694567,-0.859705,-0.646581,1.677015,...,0.274102,1.325831,0.572652,0.448252,-0.229782,0.294333,0.445521,-1.11665,2,0
2,6316562,434,0,2,1.162832,-0.561439,0.713445,-0.206713,-0.51189,2.014095,...,-0.18142,-0.155896,0.089658,0.646632,-0.111775,0.199509,0.66334,0.377528,2,0
3,6316563,434,0,3,1.045792,-0.39614,0.990514,-0.476535,-0.236965,2.191775,...,-0.124641,-1.419458,-0.562762,-0.145655,-1.202561,-0.797129,-0.150784,-1.485704,2,-1
4,6316564,434,0,7,2.628148,-0.892825,1.029658,-0.775021,-0.114899,1.852167,...,-0.076131,0.098204,-0.406867,-0.324423,-0.71359,-0.433388,-0.366702,-0.916464,2,0


In [10]:
train = pl.DataFrame(train)

# Create Lags data from training data

In [11]:
lags = train.select(pl.col(CONFIG.lag_cols_original))
lags = lags.rename(CONFIG.lag_cols_rename)
lags = lags.with_columns(
    date_id = pl.col('date_id') + 1,  # lagged by 1 day
    )
lags = lags.group_by(["date_id", "symbol_id"], maintain_order=True).last()  # pick up last record of previous date
lags

date_id,symbol_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32
435,0,1.167622,0.522281,-0.651009,0.753661,0.485425,1.312562,1.757024,1.218615,4.805431
435,1,-0.221519,-0.064056,2.153494,-0.356272,-0.212228,0.309967,-0.363238,-0.1246,-1.147682
435,2,0.115236,-0.134426,1.4096,0.136752,0.052765,0.723226,0.187152,0.101661,0.271861
435,3,-0.759539,1.242356,2.137401,1.220474,0.514464,0.374935,-0.237412,-0.079438,-0.269219
435,7,-4.726245,-0.974081,-3.272671,-0.01613,-0.005919,-0.88512,-0.111873,-0.033721,-0.217343
…,…,…,…,…,…,…,…,…,…,…
1699,34,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461
1699,35,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063
1699,36,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517
1699,37,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395


# Merge training data and lags data

In [12]:
train = train.join(lags, on=["date_id", "symbol_id"],  how="left")
train

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32
6316560,434,0,0,2.274729,-0.281973,0.725476,-0.912723,-0.463738,1.899339,0.069323,0.700608,0.447055,0.17491,-1.41199,-1.355286,-1.029183,-1.827064,0.422669,-0.880802,7.298218,2.46083,7.642132,-1.903719,-1.206695,6.459238,-0.995355,0.636786,-0.023217,-2.175171,0.161991,0.197758,0.971445,1.007937,8.910877,8.373815,-1.00607,…,0.413646,8.394182,2.74325,-1.121971,-1.16523,-1.331035,0.186402,-0.871308,-1.979774,0.685526,-0.829686,0.175593,0.180493,-0.057315,-0.064112,-0.11984,-0.172663,-0.253606,0.27283,-0.365024,0.3572,0.438255,0.094859,0.500254,0.510357,0.463678,2,1,,,,,,,,,
6316561,434,0,1,4.780962,-0.193763,0.694567,-0.859705,-0.646581,1.677015,0.078985,0.781937,0.64965,0.185389,-1.41199,-1.355286,-1.029183,-1.456445,2.691814,-0.460659,7.298218,4.666592,7.642132,-1.06728,-1.453145,5.708385,-0.995355,3.170527,2.007338,-2.197745,1.583821,0.197758,0.971445,1.856006,7.102427,14.445711,-1.00607,…,0.25931,10.638181,1.814321,-1.107595,-0.766132,-1.278714,4.250971,0.118375,-0.943577,0.985183,-0.98296,0.175593,0.180493,-0.033572,-0.049455,-0.074194,-0.192486,1.140099,0.274102,1.325831,0.572652,0.448252,-0.229782,0.294333,0.445521,-1.11665,2,0,,,,,,,,,
6316562,434,0,2,1.162832,-0.561439,0.713445,-0.206713,-0.51189,2.014095,0.116023,1.095043,0.771529,0.202319,-1.287678,-1.847647,-1.029819,-1.819456,5.556284,-0.096689,7.298218,4.827356,7.642132,-1.27502,-0.651436,-1.418087,-0.995355,-1.112607,-1.421615,-2.999998,-0.475475,0.197758,0.971445,0.492544,5.641418,15.031393,-1.00607,…,0.546534,49.945595,10.320224,-0.582587,-1.162074,-1.17472,1.963946,-0.65802,-1.628942,8.621769,0.331158,0.175593,0.180493,0.491979,0.334167,0.040144,0.034893,-0.05987,-0.18142,-0.155896,0.089658,0.646632,-0.111775,0.199509,0.66334,0.377528,2,0,,,,,,,,,
6316563,434,0,3,1.045792,-0.39614,0.990514,-0.476535,-0.236965,2.191775,0.11062,0.943477,0.427947,0.225898,-1.424421,-1.749175,-1.031616,-2.393446,3.267372,-0.259828,7.298218,3.306319,7.642132,-2.146226,-1.329383,0.873843,-0.995355,-1.178008,-2.870883,-2.485448,0.326723,0.197758,0.971445,-0.787151,2.621095,6.527111,-1.00607,…,0.659287,80.856636,7.098997,-0.582587,-1.162074,-1.203887,5.331989,0.219942,-0.620904,1.635924,-0.444471,0.175593,0.180493,3.538906,3.340486,5.984796,6.022639,0.187285,-0.124641,-1.419458,-0.562762,-0.145655,-1.202561,-0.797129,-0.150784,-1.485704,2,-1,,,,,,,,,
6316564,434,0,7,2.628148,-0.892825,1.029658,-0.775021,-0.114899,1.852167,0.08486,0.922487,0.481041,0.206733,-1.41199,-1.355286,-1.029183,-1.833571,0.221941,-0.877202,7.298218,10.68426,7.642132,-1.313558,-1.033293,5.441164,-0.995355,0.371822,1.270197,-2.351796,0.339238,0.197758,0.971445,0.101967,26.040363,53.267483,-1.00607,…,0.232971,-21.414984,1.884187,-2.00817,-1.783691,-1.507834,-0.63355,-0.523144,-1.551417,1.27661,-0.932359,0.175593,0.180493,-0.041877,-0.084179,-0.109762,-0.039549,-0.175432,-0.076131,0.098204,-0.406867,-0.324423,-0.71359,-0.433388,-0.366702,-0.916464,2,0,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
47127333,1698,967,34,3.242493,1.161628,-0.665683,1.22196,1.154109,0.338164,0.891683,0.941189,1.908768,0.787044,-1.356938,-1.55223,-1.026413,2.290397,3.789178,2.792779,2.529841,4.425087,3.897956,1.846684,0.156879,0.590978,-2.452438,0.99381,1.987114,-3.343163,-1.276368,0.093324,1.93393,2.087409,0.71573,-1.72913,-2.674619,…,0.267257,7.995877,0.781016,2.547037,0.150737,2.366914,3.314534,1.449159,3.38448,4.069668,4.628181,0.019924,0.019806,0.242348,0.320862,0.092946,0.150246,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461,9,0,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
47127334,1698,967,35,1.079139,0.529922,-0.722768,1.417222,1.020946,0.709732,0.746988,0.993193,1.708587,0.638972,-1.387128,-1.355286,-1.024728,0.3313,-0.098006,-0.00958,4.306299,9.164734,5.649051,1.994075,0.341983,0.14223,-1.854436,-1.151454,-0.165783,-3.009116,-0.690717,0.339223,0.142043,-0.182698,5.814374,14.609987,-1.984501,…,0.288492,10.197431,1.901252,2.182923,0.192487,0.50216,0.013813,0.011727,0.036124,0.021788,0.057631,0.009202,0.032418,0.776885,0.578446,0.158492,0.178672,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063,9,0,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
47127335,1698,967,36,1.033172,1.152509,-0.624379,0.974966,1.196625,0.196945,1.035988,0.986301,1.892499,0.523708,-1.344507,-1.355286,-1.02091,2.785489,0.067674,0.962128,3.546333,1.156188,2.213644,2.439405,0.457112,0.348453,-1.854436,-1.086267,-0.457393,-3.127841,-0.996608,0.339223,0.142043,-0.154037,2.083538,14.075168,-1.984501,…,0.289465,10.68751,1.339279,2.377051,0.756934,3.082911,-0.630537,0.777092,2.338257,0.406235,1.294739,-0.011891,0.043998,0.10967,0.117301,0.271831,0.263699,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517,9,0,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
47127336,1698,967,37,1.243116,1.292408,-0.804627,0.998141,1.757201,0.257244,0.712089,0.997654,1.393654,0.636096,-1.371145,-1.650702,-1.024017,2.189982,1.198084,1.399475,2.956911,2.817212,0.776417,1.96859,0.527824,-3.049874,-2.723259,-0.339742,-0.164983,-3.180119,-1.244114,0.347453,0.956888,0.465522,2.126269,5.540723,-2.493066,…,0.281111,21.504911,2.106209,2.607128,0.702114,2.954546,-0.186826,0.855301,1.493915,3.049307,1.778495,0.085594,0.086714,0.196016,0.204198,0.098731,0.11135,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395,9,0,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708


# Split training data and validation data

In [13]:
len_train   = train.select(pl.col("date_id")).shape[0]
valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records
last_tr_dt  = train.select(pl.col("date_id")).row(len_ofl_mdl)[0]

print(f"\n len_train = {len_train}")
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
print(f"\n---> Last offline train date = {last_tr_dt}\n")

training_data = train.filter(pl.col("date_id").le(last_tr_dt))
validation_data = train.filter(pl.col("date_id").gt(last_tr_dt))


 len_train = 17068838

 len_ofl_mdl = 16215397

---> Last offline train date = 1676



In [14]:
validation_data

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32
46300666,1677,0,0,2.873274,2.051845,-2.087524,2.128948,2.491857,2.637451,-1.587871,2.166003,0.947398,1.797012,-1.41199,-1.355286,-1.029183,-1.248793,11.645766,1.188871,1.908303,3.970276,3.216037,-0.849407,-0.969587,2.056125,-2.168102,1.789578,0.904839,-2.752288,-0.217181,1.091592,1.86991,1.271379,-0.29733,6.171846,-2.208922,…,0.186402,-1.613989,0.799266,-1.32266,-1.432696,-0.980803,11.07325,1.147051,-1.327311,9.655219,0.50019,0.041329,0.084309,1.361918,1.011515,0.373965,0.544464,-0.476772,-0.407066,-0.178412,0.651198,0.20705,-0.392802,1.096466,0.778626,-1.199324,9,2,-0.428585,-0.136251,1.566364,0.428843,0.23351,0.71017,0.007499,0.026758,-0.001246
46300667,1677,0,1,2.609207,2.884391,-2.057323,1.954708,2.253064,3.194304,-1.55187,2.356646,0.740778,1.720752,-1.41199,-1.355286,-1.029183,-1.244164,5.429528,0.395587,1.908303,7.580124,3.216037,-1.494669,-2.82874,2.708776,-1.459074,1.177254,0.706527,-2.644916,-0.264686,-1.045596,0.673719,1.693,0.625036,-4.596419,-1.363878,…,0.552515,7.9913,3.067132,-0.924866,-0.867734,-1.252565,10.660516,0.718726,-1.775867,4.808345,0.015539,0.041329,0.084309,1.262893,0.943764,0.401632,0.323128,-0.300104,-0.102044,-0.582812,0.399036,0.24965,0.589496,0.999029,0.337954,1.417968,9,1,-1.801191,-0.857617,0.480688,0.113079,0.061259,0.231233,0.221989,0.229291,0.457717
46300668,1677,0,2,2.283649,2.576077,-1.748822,1.637638,2.641702,2.768065,-2.165068,2.045094,0.675855,0.800727,-1.287678,-1.847647,-1.029819,-0.51008,25.694603,3.435648,1.908303,4.771524,3.216037,-0.837515,-1.442619,1.323822,-2.343809,0.680286,0.990703,-2.884496,-0.548675,0.745641,1.312172,0.644177,3.747975,5.480609,-2.944521,…,0.295382,6.562002,0.975869,-0.80762,-1.203791,0.216954,27.820011,7.363719,-1.553156,11.836189,0.889991,0.041329,0.084309,0.588648,0.544519,0.115381,0.036392,-0.174525,-0.242761,-0.528811,1.071503,0.880852,-2.011909,3.733616,1.047214,-3.185961,9,7,-0.628089,0.181011,-1.207484,1.384124,0.922712,-0.058739,0.198621,0.084671,0.338622
46300669,1677,0,3,1.90857,2.634571,-2.078476,2.306932,2.581763,2.65166,-1.577868,1.824954,0.84994,1.857093,-1.424421,-1.749175,-1.031616,-1.179679,17.189577,1.153447,1.908303,10.669032,3.216037,-0.917117,-2.545076,0.543072,-1.780067,0.31704,1.147671,-2.749204,-0.632405,0.183184,0.778162,0.189492,-0.014575,8.728395,-2.442986,…,0.354595,17.396265,2.24073,-0.380307,-1.433383,-0.594029,17.118586,2.418219,-1.2218,9.545094,0.358937,0.041329,0.084309,1.732601,1.28041,0.563479,0.576154,0.207932,-0.162053,0.516934,1.292413,-0.179215,0.916088,1.026692,-0.074256,1.460618,9,2,0.892754,0.094917,0.144263,0.315577,0.215864,0.864607,0.950294,0.473284,0.775277
46300670,1677,0,5,2.311023,2.865599,-1.99862,1.641862,2.576653,2.859766,-1.626988,1.939323,0.551119,1.917266,-1.427973,-1.05987,-1.025627,-1.169856,10.96752,0.725063,1.908303,5.571417,3.216037,-2.363008,-2.53181,2.551679,-1.930133,1.126099,0.225765,-2.674609,-0.079823,-0.344176,0.952108,1.340389,-1.789045,5.082495,-1.831453,…,0.308361,1.173485,2.76453,-1.537954,-0.768984,-1.217605,7.5347,0.51204,-1.430233,7.38337,0.960662,0.041329,0.084309,2.873025,1.870071,0.933466,0.914133,-0.111481,-0.075531,-0.202773,-0.752794,1.247104,-0.485876,-1.0209,1.181706,-1.000387,9,-2,-2.949066,-1.534304,0.350876,1.799247,0.62321,0.280726,0.15865,0.094266,0.232934
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
47127333,1698,967,34,3.242493,1.161628,-0.665683,1.22196,1.154109,0.338164,0.891683,0.941189,1.908768,0.787044,-1.356938,-1.55223,-1.026413,2.290397,3.789178,2.792779,2.529841,4.425087,3.897956,1.846684,0.156879,0.590978,-2.452438,0.99381,1.987114,-3.343163,-1.276368,0.093324,1.93393,2.087409,0.71573,-1.72913,-2.674619,…,0.267257,7.995877,0.781016,2.547037,0.150737,2.366914,3.314534,1.449159,3.38448,4.069668,4.628181,0.019924,0.019806,0.242348,0.320862,0.092946,0.150246,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461,9,0,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
47127334,1698,967,35,1.079139,0.529922,-0.722768,1.417222,1.020946,0.709732,0.746988,0.993193,1.708587,0.638972,-1.387128,-1.355286,-1.024728,0.3313,-0.098006,-0.00958,4.306299,9.164734,5.649051,1.994075,0.341983,0.14223,-1.854436,-1.151454,-0.165783,-3.009116,-0.690717,0.339223,0.142043,-0.182698,5.814374,14.609987,-1.984501,…,0.288492,10.197431,1.901252,2.182923,0.192487,0.50216,0.013813,0.011727,0.036124,0.021788,0.057631,0.009202,0.032418,0.776885,0.578446,0.158492,0.178672,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063,9,0,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
47127335,1698,967,36,1.033172,1.152509,-0.624379,0.974966,1.196625,0.196945,1.035988,0.986301,1.892499,0.523708,-1.344507,-1.355286,-1.02091,2.785489,0.067674,0.962128,3.546333,1.156188,2.213644,2.439405,0.457112,0.348453,-1.854436,-1.086267,-0.457393,-3.127841,-0.996608,0.339223,0.142043,-0.154037,2.083538,14.075168,-1.984501,…,0.289465,10.68751,1.339279,2.377051,0.756934,3.082911,-0.630537,0.777092,2.338257,0.406235,1.294739,-0.011891,0.043998,0.10967,0.117301,0.271831,0.263699,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517,9,0,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
47127336,1698,967,37,1.243116,1.292408,-0.804627,0.998141,1.757201,0.257244,0.712089,0.997654,1.393654,0.636096,-1.371145,-1.650702,-1.024017,2.189982,1.198084,1.399475,2.956911,2.817212,0.776417,1.96859,0.527824,-3.049874,-2.723259,-0.339742,-0.164983,-3.180119,-1.244114,0.347453,0.956888,0.465522,2.126269,5.540723,-2.493066,…,0.281111,21.504911,2.106209,2.607128,0.702114,2.954546,-0.186826,0.855301,1.493915,3.049307,1.778495,0.085594,0.086714,0.196016,0.204198,0.098731,0.11135,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395,9,0,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708


# Save data as parquets

In [15]:
training_data.\
write_parquet(
    f"training_data.parquet"
)

In [16]:
validation_data.\
write_parquet(
    f"validation_data.parquet"
)