In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import optuna
from lightgbm import LGBMRegressor
# Never truncate the column list when a DataFrame is displayed (None = no cap).
pd.options.display.max_columns = None

In [None]:
# Input files: the combined rows table (rows flagged by the boolean `test` column)
# and the sample submission that is later filled with predictions.
ALL_ROWS_PATH = '/kaggle/input/march-lgbm-lags/all_rows.parquet'
SAMPLE_SUBMISSION_PATH = '/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv'

# Name of the column the model predicts.
TARGET = 'congestion'

df = pd.read_parquet(ALL_ROWS_PATH)
ss = pd.read_csv(SAMPLE_SUBMISSION_PATH)

# Distinct `hm` values that occur on the test rows; used later to optionally
# restrict the training rows to those same values.
hm = [*df[df.test]['hm'].unique()]

In [None]:
# Columns stored with pandas' `category` dtype instead of their loaded dtype.
categorical_columns = ['xydir', 'xy', 'dhm', 'hm', 'direction', 'hm_xy', 'hm_xydir']
df = df.astype({column: 'category' for column in categorical_columns})

In [None]:
# Columns that are never offered as model inputs: the row id, the timestamp,
# the label itself and the train/test flag.
bad_features = ['row_id', 'time', TARGET, 'test']

# Every remaining column is a candidate feature (original column order is kept).
POSSIBLE_FEATURES = df.columns[~df.columns.isin(bad_features)].tolist()

In [None]:
# Hold-out window: the timestamps of the test rows shifted back by exactly one week.
val_times = df[df.test].time.unique() - pd.Timedelta(days=7)
val = df[df.time.isin(val_times)].reset_index(drop=True)
assert len(val) > 0, "no rows found one week before the test timestamps"

# Train strictly before the EARLIEST validation timestamp. min() is used instead of
# the first element so the cut-off does not depend on the row order of the test rows;
# a later cut-off would let validation-window rows leak into `train`.
train = df[df.time < val_times.min()].reset_index(drop=True)

In [None]:
def objective(trial):
    """Optuna objective: validation MAE of a default ``LGBMRegressor`` for one sampled setup.

    Each trial samples one boolean per column in ``POSSIBLE_FEATURES`` (keep or drop the
    column) plus two booleans that restrict which training rows are used:

    * ``only_test_day``   - keep only rows whose ``time`` falls on weekday 0 (Monday).
    * ``only_test_hours`` - keep only rows whose ``hm`` value is in the module-level ``hm`` list.

    Reads the module-level ``train``, ``val``, ``POSSIBLE_FEATURES``, ``TARGET`` and ``hm``.

    Args:
        trial: the Optuna trial used to sample the configuration.

    Returns:
        Mean absolute error of the fitted model on ``val`` (lower is better).

    Raises:
        optuna.TrialPruned: if the sampled configuration selects no feature or leaves no
            training row, since no model can be fitted in either case. Pruning keeps the
            study running instead of aborting it with a fit error.
    """
    # Feature subset: one independent keep/drop switch per candidate column.
    FEATURES = [
        feat for feat in POSSIBLE_FEATURES
        if trial.suggest_categorical(feat, [True, False])
    ]
    if not FEATURES:
        raise optuna.TrialPruned("no feature selected")

    model = LGBMRegressor()

    # Row mask, aligned on train's own index; starts as "keep everything".
    keep_rows = pd.Series(True, index=train.index)
    if trial.suggest_categorical('only_test_day', [True, False]):
        # NOTE(review): weekday 0 is Monday; presumably the weekday of the test rows - confirm.
        keep_rows &= train.time.dt.weekday.isin([0])
    if trial.suggest_categorical('only_test_hours', [True, False]):
        keep_rows &= train.hm.isin(hm)
    if not keep_rows.any():
        raise optuna.TrialPruned("row masks left no training data")

    model.fit(train.loc[keep_rows, FEATURES], train.loc[keep_rows, TARGET])

    # Validation score: mean absolute error on the hold-out window.
    val_preds = model.predict(val[FEATURES])
    return float(np.mean(np.abs(val[TARGET].values - val_preds)))

# Search settings. The sampler is seeded so that the sequence of proposed
# configurations is the same on every Restart & Run All.
SEED = 42
N_TRIALS = 500

study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))

print("Best trial:")
# Kept under the name `trial`: the final-training cell reads `trial.params`.
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Final fit on every labelled row. The full frame gets its own name on purpose:
# `objective` reads the module-level `train`, so rebinding it here would make any
# re-run of the study train on rows from the validation window.
train_full = df[df.test == False].reset_index(drop=True)
test = df[df.test].reset_index(drop=True)

# Best-trial parameters are one boolean per candidate feature plus the two row-mask switches.
MASK_PARAMS = ('only_test_hours', 'only_test_day')
FEATURES = [
    name for name, chosen in trial.params.items()
    if name not in MASK_PARAMS and chosen
]

# Rebuild the row mask chosen by the best trial, aligned on train_full's own index.
keep_rows = pd.Series(True, index=train_full.index)
if trial.params['only_test_day']:
    # NOTE(review): weekday 0 is Monday; presumably the weekday of the test rows - confirm.
    keep_rows &= train_full.time.dt.weekday.isin([0])
if trial.params['only_test_hours']:
    keep_rows &= train_full.hm.isin(hm)

model = LGBMRegressor()
model.fit(train_full.loc[keep_rows, FEATURES], train_full.loc[keep_rows, TARGET])

# NOTE(review): assumes `test` rows are in the same order as the sample submission - confirm.
ss[TARGET] = model.predict(test[FEATURES])

In [None]:
# Write the filled-in submission frame to disk without the index column.
SUBMISSION_FILE = 'sub.csv'
ss.to_csv(SUBMISSION_FILE, index=False)