# Libraries

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import gc
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import StratifiedGroupKFold

# Configurations

In [2]:
class CONFIG:
    target_col = "responder_6"
    lag_cols_original = ["date_id", "symbol_id"] + [f"responder_{idx}" for idx in range(9)]
    lag_cols_rename = { f"responder_{idx}" : f"responder_{idx}_lag_1" for idx in range(9)}
    valid_ratio = 0.05
    start_dt = 1100
    chg_date_1 = 484
    chg_date_2 = 677
    chg_date_3 = 1530 # 마지막 partiton

In [3]:
means = {'feature_00': 0.640198826789856, 'feature_01': 0.03755598142743111, 'feature_02': 0.6368075609207153, 'feature_03': 0.6365063786506653, 'feature_04': 0.013741530478000641, 'feature_05': -0.02173694409430027, 'feature_06': -0.006415014620870352, 'feature_07': -0.010971736162900925, 'feature_08': -0.04653771221637726, 'feature_09': 32.596106194690265, 'feature_10': 4.95929203539823, 'feature_11': 167.6541592920354, 'feature_12': -0.13415881991386414, 'feature_13': -0.07573335617780685, 'feature_14': -0.12015637010335922, 'feature_15': -0.7470195889472961, 'feature_16': -0.6257441639900208, 'feature_17': -0.7294047474861145, 'feature_18': -0.042215555906295776, 'feature_19': -0.08798160403966904, 'feature_20': -0.15741558372974396, 'feature_21': 0.10528526455163956, 'feature_22': 0.018054703250527382, 'feature_23': 0.03165541961789131, 'feature_24': 2.733017921447754, 'feature_25': 0.39958420395851135, 'feature_26': -0.11045943945646286, 'feature_27': -0.5332594513893127, 'feature_28': -0.4522790312767029, 'feature_29': -0.5739678144454956, 'feature_30': -0.7905704975128174, 'feature_31': 0.10600688308477402, 'feature_32': 0.40044134855270386, 'feature_33': -0.021725023165345192, 'feature_34': 0.4226262867450714, 'feature_35': 0.42143046855926514, 'feature_36': -0.00023802756913937628, 'feature_37': 0.027961043640971184, 'feature_38': 0.010258913040161133, 'feature_39': 0.005768273025751114, 'feature_40': 0.017485467717051506, 'feature_41': 0.038347117602825165, 'feature_42': -0.06123563274741173, 'feature_43': -0.11644423753023148, 'feature_44': -0.12342483550310135, 'feature_45': -0.028769943863153458, 'feature_46': -0.015200662426650524, 'feature_47': 0.015717582777142525, 'feature_48': -0.0033910537604242563, 'feature_49': -0.0052393232472240925, 'feature_50': -0.2285808026790619, 'feature_51': -0.3548349440097809, 'feature_52': -0.358092725276947, 'feature_53': 0.2607136368751526, 'feature_54': 0.18796788156032562, 'feature_55': 0.3154229521751404, 'feature_56': -0.1471923440694809, 'feature_57': 0.15730056166648865, 'feature_58': -0.021774644032120705, 'feature_59': -0.0037768862675875425, 'feature_60': -0.010220836848020554, 'feature_61': -0.03178725391626358, 'feature_62': -0.3769100308418274, 'feature_63': -0.3229374587535858, 'feature_64': -0.3718394339084625, 'feature_65': -0.10233989357948303, 'feature_66': -0.13688170909881592, 'feature_67': -0.14402112364768982, 'feature_68': -0.06875362992286682, 'feature_69': -0.11862917989492416, 'feature_70': -0.11789549142122269, 'feature_71': -0.06013699993491173, 'feature_72': -0.10766122490167618, 'feature_73': -0.09921672940254211, 'feature_74': -0.10233042389154434, 'feature_75': -0.05991339311003685, 'feature_76': -0.06349952518939972, 'feature_77': -0.07424316555261612, 'feature_78': -0.07759837061166763}
stds = {'feature_00': 1.027751088142395, 'feature_01': 1.0967519283294678, 'feature_02': 1.0156300067901611, 'feature_03': 1.0170334577560425, 'feature_04': 1.0726385116577148, 'feature_05': 0.9639211297035217, 'feature_06': 1.0963259935379028, 'feature_07': 1.0789952278137207, 'feature_08': 0.7962697148323059, 'feature_09': 23.72976726545254, 'feature_10': 3.1867162933797224, 'feature_11': 163.44513161352285, 'feature_12': 0.6700984835624695, 'feature_13': 0.5805172920227051, 'feature_14': 0.664044201374054, 'feature_15': 0.37517768144607544, 'feature_16': 0.3393096327781677, 'feature_17': 0.3603287935256958, 'feature_18': 0.9911752939224243, 'feature_19': 1.0550744533538818, 'feature_20': 0.6643751263618469, 'feature_21': 0.38239365816116333, 'feature_22': 0.950261116027832, 'feature_23': 0.8119344711303711, 'feature_24': 1.4362775087356567, 'feature_25': 1.0947270393371582, 'feature_26': 1.077124834060669, 'feature_27': 1.0645726919174194, 'feature_28': 1.0676648616790771, 'feature_29': 0.2640742361545563, 'feature_30': 0.19689509272575378, 'feature_31': 0.3815343976020813, 'feature_32': 1.2996565103530884, 'feature_33': 0.9989405870437622, 'feature_34': 1.3409572839736938, 'feature_35': 1.3365675210952759, 'feature_36': 0.8695492148399353, 'feature_37': 0.7334080934524536, 'feature_38': 0.698810338973999, 'feature_39': 0.7965824604034424, 'feature_40': 0.518515944480896, 'feature_41': 0.6384949088096619, 'feature_42': 0.8168442249298096, 'feature_43': 0.5228385925292969, 'feature_44': 0.6521403193473816, 'feature_45': 0.8666537404060364, 'feature_46': 0.9039222002029419, 'feature_47': 3.2711963653564453, 'feature_48': 0.6570901274681091, 'feature_49': 0.7083076238632202, 'feature_50': 1.0132617950439453, 'feature_51': 0.6081287860870361, 'feature_52': 0.9250587224960327, 'feature_53': 1.0421689748764038, 'feature_54': 0.5859629511833191, 'feature_55': 0.9191848039627075, 'feature_56': 0.9549097418785095, 'feature_57': 1.0204777717590332, 'feature_58': 0.8327276110649109, 'feature_59': 0.8309783339500427, 'feature_60': 0.8389413356781006, 'feature_61': 1.192766547203064, 'feature_62': 1.388945460319519, 'feature_63': 0.09957146644592285, 'feature_64': 0.3396177291870117, 'feature_65': 1.01683509349823, 'feature_66': 1.0824761390686035, 'feature_67': 0.642227828502655, 'feature_68': 0.5312599539756775, 'feature_69': 0.6208390593528748, 'feature_70': 0.6724499464035034, 'feature_71': 0.5356909036636353, 'feature_72': 0.6534596681594849, 'feature_73': 1.0855497121810913, 'feature_74': 1.0880277156829834, 'feature_75': 1.2321789264678955, 'feature_76': 1.2345560789108276, 'feature_77': 1.0921478271484375, 'feature_78': 1.0924347639083862}

# 전체 데이터 기준으로 재정의

def normalize_dataframe(df: pl.DataFrame, means: dict, stds: dict) -> pl.DataFrame:
    # 폴라 데이터프레임을 평균과 표준편차로 정규화
    normalize_exprs = []

    for col in df.columns:
        if col in means and col in stds: # 정규화 되어야 하는 열만 존재하도록
            if stds[col] != 0: # 0인 차원 적용 X
                # 정규화 하되 이름은 유지하도록
                normalize_exprs.append(
                    ((pl.col(col) - means[col]) / stds[col]).alias(col)
                )
            else:
                normalize_exprs.append(pl.col(col) - means[col]).alias(col)

    normalized_df = df.select(normalize_exprs) # 정규화된 데이터프레임 생성
    return normalized_df

# Load training data & select

In [4]:
# Use last 2 parquets

train = pl.scan_parquet(
    f"/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
).select(
    pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"),
    pl.all(),
).with_columns(
    (pl.col(CONFIG.target_col)*2).cast(pl.Int32).alias("label"),
).filter(
    (pl.col("date_id") >= CONFIG.chg_date_1 - 30) & (pl.col("date_id") <= CONFIG.chg_date_1 + 30)
    | (pl.col("date_id") >= CONFIG.chg_date_2 - 30) & (pl.col("date_id") <= CONFIG.chg_date_2 + 30)
    | (pl.col("date_id") >= CONFIG.chg_date_3) # 데이터를 초반 두 번의 변화 두 번 전후 한달씩, 그리고 마지막 9 parquet을 가져옴.
).fill_null(
    strategy="forward"
).fill_null(
    value=0 # 결측치는 일단 단순하게 0으로 처리 
)

In [5]:
train = train.collect().to_pandas()
train.head()

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,...,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label
0,6655311,454,0,0,2.469772,2.011155,-1.183452,2.44915,1.968074,1.073371,...,-0.91255,-0.551103,0.071475,0.213802,0.92071,0.945452,0.355292,1.365892,2,1
1,6655312,454,0,1,4.609505,1.923074,-1.297473,1.898652,2.59032,0.983711,...,-0.381795,-0.265064,0.488816,0.821543,0.539706,0.821673,0.975165,1.707223,2,1
2,6655313,454,0,2,1.313092,2.103599,-0.866588,2.051928,2.211389,0.847589,...,-0.169111,-0.49774,2.245259,0.355446,0.867455,2.395301,0.417111,1.836413,2,4
3,6655314,454,0,3,0.95309,1.613543,-0.876514,2.681358,2.772389,1.059146,...,0.185034,0.866747,0.925349,-0.60942,1.878252,1.123068,-0.699646,2.347015,2,2
4,6655315,454,0,7,1.674723,2.253614,-1.061369,2.49638,2.403782,1.15442,...,-0.02516,-0.026998,0.154214,-0.22548,-0.242701,-0.107649,-0.422369,-0.321855,2,0


In [6]:
tmp_normed = normalize_dataframe(pl.DataFrame(train), means, stds)
tmp_normed = tmp_normed.to_pandas()

In [7]:
tmp_normed.head()

Unnamed: 0,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,...,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
0,1.333938,-1.113294,1.784451,1.309266,0.987872,0.202065,0.703406,0.832457,-0.042096,-0.910085,...,-0.297654,-1.355273,0.420824,-0.71551,0.091398,0.094051,1.479111,1.985552,2.578956,2.782909
1,1.248235,-1.217257,1.242425,1.921091,0.904284,0.15178,0.919433,0.803433,-0.064288,-0.910085,...,-0.836947,-1.143275,1.431542,-0.639512,0.091398,0.094051,-0.00629,0.009189,-0.12033,-0.141355
2,1.423886,-0.824384,1.393342,1.548506,0.77738,0.206468,0.907292,0.413701,-0.166014,2.039796,...,-0.134096,-1.013648,-0.290621,-0.694798,0.091398,0.094051,-0.234672,-0.295525,-0.196816,-0.260476
3,0.947062,-0.833434,2.013085,2.100111,0.97461,0.204402,0.901388,0.504798,-0.094379,-1.205073,...,0.821026,-1.484724,4.479863,0.01306,0.091398,0.094051,0.036501,0.031306,-0.159729,-0.173449
4,1.56985,-1.001981,1.830955,1.737677,1.063433,0.172779,0.605629,0.968551,-0.11505,-0.910085,...,-0.992144,-1.177742,-0.574875,-1.105902,0.091398,0.094051,-0.272196,-0.214301,-0.190665,-0.242373


In [8]:
train.update(tmp_normed)
train.head()

  0.73342033]' has dtype incompatible with int8, please explicitly cast to a compatible dtype first.
  train.update(tmp_normed)
 -1.24243631]' has dtype incompatible with int8, please explicitly cast to a compatible dtype first.
  train.update(tmp_normed)
  2.16798039]' has dtype incompatible with int16, please explicitly cast to a compatible dtype first.
  train.update(tmp_normed)


Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,...,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label
0,6655311,454,0,0,2.469772,1.333938,-1.113294,1.784451,1.309266,0.987872,...,-0.91255,-0.551103,0.071475,0.213802,0.92071,0.945452,0.355292,1.365892,2,1
1,6655312,454,0,1,4.609505,1.248235,-1.217257,1.242425,1.921091,0.904284,...,-0.381795,-0.265064,0.488816,0.821543,0.539706,0.821673,0.975165,1.707223,2,1
2,6655313,454,0,2,1.313092,1.423886,-0.824384,1.393342,1.548506,0.77738,...,-0.169111,-0.49774,2.245259,0.355446,0.867455,2.395301,0.417111,1.836413,2,4
3,6655314,454,0,3,0.95309,0.947062,-0.833434,2.013085,2.100111,0.97461,...,0.185034,0.866747,0.925349,-0.60942,1.878252,1.123068,-0.699646,2.347015,2,2
4,6655315,454,0,7,1.674723,1.56985,-1.001981,1.830955,1.737677,1.063433,...,-0.02516,-0.026998,0.154214,-0.22548,-0.242701,-0.107649,-0.422369,-0.321855,2,0


In [9]:
train = pl.DataFrame(train)

# Create Lags data from training data

In [10]:
lags = train.select(pl.col(CONFIG.lag_cols_original))
lags = lags.rename(CONFIG.lag_cols_rename)
lags = lags.with_columns(
    date_id = pl.col('date_id') + 1,  # lagged by 1 day
    )
lags = lags.group_by(["date_id", "symbol_id"], maintain_order=True).last()  # pick up last record of previous date
lags

date_id,symbol_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32
455,0,-0.37962,0.622232,-0.443895,-0.16413,-0.064856,-0.706662,0.023419,0.043093,0.062117
455,1,0.460542,0.481335,-0.318708,0.534235,0.308263,-0.149519,-0.001026,0.034362,-0.013399
455,2,-0.250504,0.560345,-0.099183,0.205178,0.185717,-0.502219,-0.080229,-0.005963,-0.165517
455,3,5.0,3.922242,4.143309,2.798591,1.049106,1.343205,-0.327025,-0.11451,-0.591432
455,7,-0.832327,0.260436,1.770283,0.370914,0.11369,-0.845703,0.087104,0.058233,0.175273
…,…,…,…,…,…,…,…,…,…,…
1699,34,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461
1699,35,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063
1699,36,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517
1699,37,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395


# Merge training data and lags data

In [11]:
train = train.join(lags, on=["date_id", "symbol_id"],  how="left")
train

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32
6655311,454,0,0,2.469772,1.333938,-1.113294,1.784451,1.309266,0.987872,0.202065,0.703406,0.832457,-0.042096,-0.910085,0.64038,-0.560764,-0.925331,1.183186,-0.758088,1.991109,0.626821,2.024276,-1.422394,-0.918052,4.976132,-0.275332,0.589272,0.495053,-1.020504,0.897619,0.10255,0.500914,0.840541,1.308578,0.461486,-0.277844,…,0.156923,-0.040245,0.470043,-1.42158,-1.078765,-1.066483,1.892092,-0.297654,-1.355273,0.420824,-0.71551,0.091398,0.094051,1.479111,1.985552,2.578956,2.782909,-0.599413,-0.91255,-0.551103,0.071475,0.213802,0.92071,0.945452,0.355292,1.365892,2,1,,,,,,,,,
6655312,454,0,1,4.609505,1.248235,-1.217257,1.242425,1.921091,0.904284,0.15178,0.919433,0.803433,-0.064288,-0.910085,0.64038,-0.560764,-1.309152,0.570928,-0.588572,1.991109,0.971363,2.024276,-1.4589,-1.113657,3.501829,-0.275332,1.750378,1.861664,-0.74875,1.969799,0.10255,0.500914,1.340068,0.984422,1.098497,-0.277844,…,-0.056206,0.181105,-0.002852,-2.104105,-1.600309,-0.887269,-0.046739,-0.836947,-1.143275,1.431542,-0.639512,0.091398,0.094051,-0.00629,0.009189,-0.12033,-0.141355,-0.450519,-0.381795,-0.265064,0.488816,0.821543,0.539706,0.821673,0.975165,1.707223,2,1,,,,,,,,,
6655313,454,0,2,1.313092,1.423886,-0.824384,1.393342,1.548506,0.77738,0.206468,0.907292,0.413701,-0.166014,2.039796,-0.928634,-0.664775,-0.943022,1.343918,-0.652372,1.991109,1.232643,2.024276,-0.934744,-1.547153,-0.423173,-0.275332,-0.806463,-0.838197,-1.339765,-0.054692,0.10255,0.500914,0.613507,0.212927,0.296796,-0.277844,…,0.469152,30.476044,3.868371,-1.428847,-1.447871,-1.199585,2.857519,-0.134096,-1.013648,-0.290621,-0.694798,0.091398,0.094051,-0.234672,-0.295525,-0.196816,-0.260476,0.198349,-0.169111,-0.49774,2.245259,0.355446,0.867455,2.395301,0.417111,1.836413,2,4,,,,,,,,,
6655314,454,0,3,0.95309,0.947062,-0.833434,2.013085,2.100111,0.97461,0.204402,0.901388,0.504798,-0.094379,-1.205073,-0.614831,-0.958451,-0.894795,3.979377,0.404236,1.991109,0.37942,2.024276,-1.175126,-1.255709,2.177283,-0.275332,-1.223988,-2.086902,-1.073052,0.995321,0.10255,0.500914,-1.865123,0.071962,-0.020008,-0.277844,…,0.109298,0.719647,1.010212,-1.428847,-1.447871,-0.7661,5.58766,0.821026,-1.484724,4.479863,0.01306,0.091398,0.094051,0.036501,0.031306,-0.159729,-0.173449,0.095816,0.185034,0.866747,0.925349,-0.60942,1.878252,1.123068,-0.699646,2.347015,2,2,,,,,,,,,
6655315,454,0,7,1.674723,1.56985,-1.001981,1.830955,1.737677,1.063433,0.172779,0.605629,0.968551,-0.11505,-0.910085,0.64038,-0.560764,-1.448179,-0.279433,-0.694833,1.991109,1.705997,2.024276,-0.902561,-0.942952,1.385134,-0.275332,-0.170408,1.5743,-1.460435,-0.222743,0.10255,0.500914,-0.338325,12.920099,11.987921,-0.277844,…,0.056795,-2.265686,0.747685,-1.442343,-1.475546,-1.429906,0.004497,-0.992144,-1.177742,-0.574875,-1.105902,0.091398,0.094051,-0.272196,-0.214301,-0.190665,-0.242373,0.536572,-0.02516,-0.026998,0.154214,-0.22548,-0.242701,-0.107649,-0.422369,-0.321855,2,0,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
47127333,1698,967,34,3.242493,1.834064,-0.692533,1.877867,1.810274,0.37647,0.837775,1.025435,2.04858,0.580161,0.396291,0.012774,-0.108013,1.400633,2.12395,1.734372,0.20212,0.875731,0.675141,1.788172,0.077538,0.235215,-0.832511,0.962434,1.645062,-2.068691,-0.997691,-0.009937,1.525549,1.776374,-0.384962,-1.131028,-0.914452,…,-0.005705,0.473224,-0.106592,2.487577,0.026287,1.376077,1.692126,0.781065,2.157998,2.119947,2.916669,-0.077589,-0.080781,0.238702,0.332623,0.027268,0.086536,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461,9,0,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
47127334,1698,967,35,1.079139,1.184827,-0.755141,2.076181,1.674842,0.775027,0.698301,1.082448,1.832585,0.462256,-0.320109,0.64038,0.167309,0.087845,-0.132628,-0.126518,0.868608,2.483938,1.306111,1.934262,0.272836,-0.062922,-0.603839,-1.076128,-0.10295,-1.588908,-0.356563,0.254926,-0.382045,-0.64734,0.961459,2.086064,-0.651148,…,0.02379,0.692436,0.27386,2.117333,0.071481,0.17848,-0.061415,-0.111349,-0.093604,-0.048466,-0.070002,-0.089227,-0.067058,0.897348,0.650624,0.098854,0.117589,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063,9,0,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
47127335,1698,967,36,1.033172,1.824691,-0.647233,1.627012,1.853514,0.224992,0.976874,1.074893,2.031026,0.370475,0.691279,0.64038,0.791372,1.732393,-0.036447,0.518739,0.583485,-0.233439,0.068235,2.375662,0.394305,0.074088,-0.603839,-1.014183,-0.339718,-1.75943,-0.69143,0.254926,-0.382045,-0.616739,-0.023759,1.980761,-0.651148,…,0.025141,0.741234,0.083004,2.314729,0.682482,1.83591,-0.403733,0.36382,1.454465,0.157479,0.738398,-0.112124,-0.054459,0.07522,0.081315,0.222636,0.210476,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517,9,0,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
47127336,1698,967,37,1.243116,1.968472,-0.84492,1.65055,2.423639,0.289672,0.664661,1.087339,1.492774,0.459966,0.059162,-0.301028,0.283556,1.333345,0.619775,0.809157,0.362347,0.330163,-0.449639,1.909002,0.468912,-2.183676,-0.936072,-0.304789,-0.1023,-1.834514,-0.962381,0.263791,0.485418,0.044742,-0.012475,0.300371,-0.845183,…,0.013538,1.818338,0.343466,2.548679,0.623139,1.753471,-0.168007,0.412375,0.886687,1.573349,1.054513,-0.0063,-0.007983,0.181613,0.188594,0.033585,0.044044,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395,9,0,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708


# Split training data and validation data

In [12]:
len_train   = train.select(pl.col("date_id")).shape[0]
valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records
last_tr_dt  = train.select(pl.col("date_id")).row(len_ofl_mdl)[0]

print(f"\n len_train = {len_train}")
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
print(f"\n---> Last offline train date = {last_tr_dt}\n")

training_data = train.filter(pl.col("date_id").le(last_tr_dt))
validation_data = train.filter(pl.col("date_id").gt(last_tr_dt))


 len_train = 9101515

 len_ofl_mdl = 8646440

---> Last offline train date = 1686



In [13]:
'''
len_train   = train.select(pl.col("date_id")).collect().shape[0]
valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records
last_tr_dt  = train.select(pl.col("date_id")).collect().row(len_ofl_mdl)[0]

print(f"\n len_train = {len_train}")
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
print(f"\n---> Last offline train date = {last_tr_dt}\n")

training_data = train.filter(pl.col("date_id").le(last_tr_dt))
validation_data = train.filter(pl.col("date_id").gt(last_tr_dt))
'''

'\nlen_train   = train.select(pl.col("date_id")).collect().shape[0]\nvalid_records = int(len_train * CONFIG.valid_ratio)\nlen_ofl_mdl = len_train - valid_records\nlast_tr_dt  = train.select(pl.col("date_id")).collect().row(len_ofl_mdl)[0]\n\nprint(f"\n len_train = {len_train}")\nprint(f"\n len_ofl_mdl = {len_ofl_mdl}")\nprint(f"\n---> Last offline train date = {last_tr_dt}\n")\n\ntraining_data = train.filter(pl.col("date_id").le(last_tr_dt))\nvalidation_data = train.filter(pl.col("date_id").gt(last_tr_dt))\n'

In [14]:
validation_data

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,label,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32
46675282,1687,0,0,3.781251,2.636479,0.84442,2.870531,2.393151,1.209236,-0.324193,0.272615,-0.419251,0.161659,-0.910085,0.64038,-0.560764,-0.850141,2.376347,-0.414971,0.007815,0.615192,0.032255,-0.755278,-1.71008,1.15622,-0.73667,1.352865,1.410515,-0.368628,0.914911,0.798953,1.542142,1.068545,-0.350025,-0.576282,-0.761642,…,-0.085203,-0.863602,-0.392742,-0.659245,-2.49346,-1.374823,2.276955,-0.279922,-1.252557,0.718943,-0.736165,-0.149618,-0.147832,0.22785,0.266206,-0.096579,-0.041945,-0.388299,-0.033851,-0.340357,-0.077653,-0.971475,1.526969,0.01145,-1.353011,2.108581,9,0,-0.266812,-0.054878,-1.545247,0.385557,0.193606,-0.535236,0.058816,0.057375,0.083458
46675283,1687,0,1,2.606019,2.434783,1.100215,2.741738,2.371337,1.711844,-0.385608,0.299114,-0.474106,0.158555,-0.910085,0.64038,-0.560764,-1.288241,3.381258,0.049925,0.007815,0.78399,0.032255,-1.199512,-1.519921,2.105883,-0.504819,0.911222,0.991078,-0.813784,0.598055,-1.326329,0.411298,1.831198,0.599703,0.76043,-0.43,…,0.041658,-0.018689,-0.450416,-1.210543,-2.218475,-0.994741,6.66521,0.647209,-0.959355,-0.039306,-0.912898,-0.149618,-0.147832,0.337824,0.355619,-0.044912,-0.060303,-0.332037,-0.512074,-0.418187,0.813138,-3.348194,1.64164,1.138862,-2.781623,3.425127,9,2,-0.241919,0.078192,-1.161707,0.000655,0.002046,-0.984262,-0.197686,-0.045213,-0.391194
46675284,1687,0,2,1.981104,2.086829,1.077325,1.928942,2.073144,1.267122,-0.564042,0.278564,-0.50659,0.094577,2.039796,-0.928634,-0.664775,-0.602775,12.487664,1.76666,0.007815,1.241995,0.032255,-1.114504,-2.031931,0.32796,-0.7747,1.26202,0.917004,-1.487237,-0.800989,0.709437,1.11075,0.276917,-0.071472,-0.261531,-0.878782,…,0.140561,0.254594,0.154579,-1.377429,-1.430239,-0.212402,7.787453,1.375543,-1.068442,10.820049,1.60906,-0.149618,-0.147832,0.470091,0.451267,-0.008578,-0.023587,-0.343166,-0.26202,-0.408565,1.182349,2.643964,0.361312,1.431608,2.309818,0.698061,9,2,0.978288,0.76318,1.443481,-0.485256,-0.157901,0.084363,-0.164799,-0.109012,-0.762613
46675285,1687,0,3,2.110739,2.694819,0.754661,3.042663,2.903043,1.821212,-0.407918,0.312354,-0.575308,0.240287,-1.205073,-0.614831,-0.958451,-0.665564,5.540873,0.636631,0.007815,0.701606,0.032255,-1.845915,-1.077971,0.141042,-0.432471,0.567596,0.416976,-1.371953,-0.347908,0.093435,0.215531,-0.099886,1.478549,0.612884,-0.400713,…,0.004479,1.104882,0.328145,-1.71361,-1.207404,-1.303224,4.917625,0.291105,-0.990219,6.091198,0.598453,-0.149618,-0.147832,0.59147,0.776167,0.093198,0.114467,-0.05093,-0.297161,-0.508374,-0.777918,-0.197728,-0.214127,-0.74216,-0.065933,-0.046738,9,-1,-0.855925,-0.629784,0.388898,-0.357745,-0.217354,0.082131,0.055058,0.040775,0.113845
46675286,1687,0,4,3.387619,2.741558,0.960306,2.544225,2.540815,1.783475,-0.254627,0.116089,-0.260447,0.114165,-0.74152,-1.242436,-0.970688,-1.618372,0.126582,-0.536781,0.007815,0.188953,0.032255,-0.821465,-1.121688,-2.540149,2.333273,1.109487,0.432029,1.177271,1.99676,-0.779334,-0.676982,-0.393148,-1.23268,-1.390337,3.079002,…,-0.061308,1.347762,-0.596116,-0.838376,-1.449665,-1.27106,-0.055648,-0.847728,-0.756885,0.131568,-0.997315,-0.149618,-0.147832,0.307161,0.238864,-0.111096,-0.071119,-2.789933,-0.952092,-3.151275,-0.70305,-0.919101,-0.292212,-0.222295,-0.968092,0.372206,9,0,-1.050013,-0.673737,-0.359097,-0.360675,-0.185535,-0.410965,-0.330635,-0.070691,-0.544708
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
47127333,1698,967,34,3.242493,1.834064,-0.692533,1.877867,1.810274,0.37647,0.837775,1.025435,2.04858,0.580161,0.396291,0.012774,-0.108013,1.400633,2.12395,1.734372,0.20212,0.875731,0.675141,1.788172,0.077538,0.235215,-0.832511,0.962434,1.645062,-2.068691,-0.997691,-0.009937,1.525549,1.776374,-0.384962,-1.131028,-0.914452,…,-0.005705,0.473224,-0.106592,2.487577,0.026287,1.376077,1.692126,0.781065,2.157998,2.119947,2.916669,-0.077589,-0.080781,0.238702,0.332623,0.027268,0.086536,0.243475,0.166927,0.38494,-0.174297,-0.066046,-0.038767,-0.132337,-0.022426,-0.252461,9,0,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
47127334,1698,967,35,1.079139,1.184827,-0.755141,2.076181,1.674842,0.775027,0.698301,1.082448,1.832585,0.462256,-0.320109,0.64038,0.167309,0.087845,-0.132628,-0.126518,0.868608,2.483938,1.306111,1.934262,0.272836,-0.062922,-0.603839,-1.076128,-0.10295,-1.588908,-0.356563,0.254926,-0.382045,-0.64734,0.961459,2.086064,-0.651148,…,0.02379,0.692436,0.27386,2.117333,0.071481,0.17848,-0.061415,-0.111349,-0.093604,-0.048466,-0.070002,-0.089227,-0.067058,0.897348,0.650624,0.098854,0.117589,0.850152,0.909382,1.015314,0.235962,0.122539,0.099559,-0.249584,-0.123571,-0.46063,9,0,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
47127335,1698,967,36,1.033172,1.824691,-0.647233,1.627012,1.853514,0.224992,0.976874,1.074893,2.031026,0.370475,0.691279,0.64038,0.791372,1.732393,-0.036447,0.518739,0.583485,-0.233439,0.068235,2.375662,0.394305,0.074088,-0.603839,-1.014183,-0.339718,-1.75943,-0.69143,0.254926,-0.382045,-0.616739,-0.023759,1.980761,-0.651148,…,0.025141,0.741234,0.083004,2.314729,0.682482,1.83591,-0.403733,0.36382,1.454465,0.157479,0.738398,-0.112124,-0.054459,0.07522,0.081315,0.222636,0.210476,0.395684,-0.292574,-3.215846,-0.535129,-0.178484,-1.80815,-0.065355,-0.000367,-0.12517,9,0,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
47127336,1698,967,37,1.243116,1.968472,-0.84492,1.65055,2.423639,0.289672,0.664661,1.087339,1.492774,0.459966,0.059162,-0.301028,0.283556,1.333345,0.619775,0.809157,0.362347,0.330163,-0.449639,1.909002,0.468912,-2.183676,-0.936072,-0.304789,-0.1023,-1.834514,-0.962381,0.263791,0.485418,0.044742,-0.012475,0.300371,-0.845183,…,0.013538,1.818338,0.343466,2.548679,0.623139,1.753471,-0.168007,0.412375,0.886687,1.573349,1.054513,-0.0063,-0.007983,0.181613,0.188594,0.033585,0.044044,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395,9,0,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708


# Save data as parquets

In [15]:
training_data.\
write_parquet(
    f"training_data.parquet"
)

In [16]:
validation_data.\
write_parquet(
    f"validation_data.parquet"
)