In [None]:
# %%capture
# !pip install pycaret==2.3.6
# #!pip install scikit-learn==0.23.2
# !pip install -U setuptools
# from pycaret.regression import *

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly_express as plt
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RepeatedKFold

from xgboost import XGBRegressor
from optuna import create_study
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback

In [None]:
# Options shared by the train and test files: "row_id" becomes the index and
# the "time" column is parsed into datetimes.
csv_options = {
    "index_col": "row_id",
    "parse_dates": ["time"],
    "infer_datetime_format": True,
}

data = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv", **csv_options)
test = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv", **csv_options)

# The sample submission is read with pandas defaults (no index column).
sub = pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv")

print("train shape",data.shape,"   test shape",test.shape)

In [None]:
# Histogram of the target column to inspect its distribution.
data["congestion"].plot.hist()

In [None]:
def data_encode(data):
    """Derive calendar and road-identity features from a raw traffic frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-typed ``time`` column and ``x``, ``y`` and
        ``direction`` columns. Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``time``, ``x`` and ``y`` are removed and the
        columns ``month``, ``day``, ``weekday``, ``weekend``, ``hour``,
        ``minute``, ``afternoon``, ``moment`` and ``named_road`` are appended
        (in that order). The input frame is left untouched, so the function
        can be called repeatedly on the same raw frame.
    """
    timestamps = data["time"].dt

    encoded = data.assign(
        month=timestamps.month,
        day=timestamps.day,
        weekday=timestamps.weekday,
        # weekday is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
        weekend=timestamps.weekday >= 5,
        hour=timestamps.hour,
        minute=timestamps.minute,
        afternoon=timestamps.hour >= 12,
        # Index of the 20-minute slot within the day (three slots per hour).
        moment=timestamps.hour * 3 + timestamps.minute // 20,
        # Road identifier: x, y and direction concatenated as strings.
        # NOTE(review): there is no separator, so this is only unambiguous
        # while x and y are single-digit values - confirm against the data.
        named_road=(
            data["x"].astype("str")
            + data["y"].astype("str")
            + data["direction"].astype("str")
        ),
    )

    # drop() without inplace returns a new frame; the caller's data is not modified.
    return encoded.drop(columns=["time", "x", "y"])
# Run the same feature encoding over the training frame, then the test frame.
data, test = (data_encode(frame) for frame in (data, test))
#data.head()

In [None]:
# Congestion statistics per (road, weekday, hour, minute) slot, computed from
# the training frame and attached to both train and test as extra features.
# NOTE(review): these are statistics of the target computed over all of `data`,
# i.e. before any hold-out split - confirm that this leakage is acceptable.
slot_keys = ['named_road', 'weekday', 'hour', 'minute']
congestion_by_slot = data.groupby(slot_keys).congestion

# Each statistic is cast to int (the median is therefore truncated).
mins = congestion_by_slot.min().astype(int).rename('min').reset_index()
maxs = congestion_by_slot.max().astype(int).rename('max').reset_index()
medians = congestion_by_slot.median().astype(int).rename('median').reset_index()

# Left merges keep every row of data/test; slots absent from the statistics
# table end up with missing values.
for slot_stats in (mins, maxs, medians):
    data = data.merge(slot_stats, on=slot_keys, how='left')
    test = test.merge(slot_stats, on=slot_keys, how='left')

In [None]:
# Work on copies so the one-hot encoding below leaves `data` / `test` intact.
data_encoded, test_encoded = data.copy(), test.copy()

In [None]:
# One-hot encode the two categorical columns. Passing both names at once
# yields the same column order as encoding "named_road" and then "direction".
categorical_columns = ["named_road", "direction"]
data_encoded = pd.get_dummies(data_encoded, columns=categorical_columns)
test_encoded = pd.get_dummies(test_encoded, columns=categorical_columns)

# Train and test are encoded independently, so a category that occurs in only
# one of them would produce mismatched columns. Align the test frame to the
# training feature columns (everything except the target): dummy columns
# missing from test are added as all-zero, test-only columns are dropped, and
# the column order matches the training frame.
feature_columns = [column for column in data_encoded.columns if column != "congestion"]
test_encoded = test_encoded.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Hold out part of the training data (train_test_split's default proportion)
# with a fixed seed so the split is repeatable.
features = data_encoded.drop(columns=["congestion"])
target = data_encoded["congestion"]
xtrain, xtest, ytrain, ytest = train_test_split(features, target, random_state=42)

In [None]:
# Baseline: XGBoost regressor with fixed, hand-picked hyper-parameters.
# NOTE(review): scale_pos_weight is documented for imbalanced classification;
# confirm it is intended for this regression objective.
baseline_params = {
    "base_score": 0.5,
    "booster": 'gbtree',
    "colsample_bylevel": 1,
    "colsample_bynode": 1,
    "colsample_bytree": 0.7,
    "enable_categorical": False,
    "gamma": 0,
    "gpu_id": 0,
    "importance_type": None,
    "interaction_constraints": '',
    "learning_rate": 0.3,
    "max_delta_step": 0,
    "max_depth": 8,
    "min_child_weight": 2,
    "monotone_constraints": '()',
    "n_estimators": 170,
    "n_jobs": -1,
    "num_parallel_tree": 1,
    "objective": 'reg:squarederror',
    "predictor": 'auto',
    "random_state": 42,
    "reg_alpha": 10,
    "reg_lambda": 0.15,
    "scale_pos_weight": 33.0,
    "subsample": 0.9,
    "tree_method": 'gpu_hist',
    "validate_parameters": 1,
    "verbosity": 1,
}
model = XGBRegressor(**baseline_params)
model.fit(xtrain, ytrain)

# Mean absolute error on the hold-out split.
ypred = model.predict(xtest)
print(mean_absolute_error(ypred,ytest))

# Predict the competition test rows and write the first submission file.
pred = model.predict(test_encoded)
sub["congestion"] = pred
sub.to_csv(path_or_buf="first.csv", index=False)

In [None]:
# X = data
# clf = setup(X,target="congestion",session_id=42,use_gpu=True,categorical_features=None)
# best = compare_models(include=["xgboost","catboost","lightgbm","dt","rf"],sort="MAE",fold=5)
# tuned_model = tune_model(best,optimize="MAE")
# pred = predict_model(tuned_model,data=test)
# sub["congestion"] = pred["Label"]
# sub.to_csv("first.csv",index=False)

In [None]:
# Same seeded hold-out split as used for the baseline model, recreated here so
# the tuning cells below start from xtrain / xtest / ytrain / ytest.
tuning_features = data_encoded.drop(columns=["congestion"])
tuning_target = data_encoded["congestion"]
xtrain, xtest, ytrain, ytest = train_test_split(
    tuning_features, tuning_target, random_state=42
)

In [None]:
def objective(trial,X,y,random_state=42,n_splits=3,n_repeats=2,n_jobs=1,early_stopping_rounds=10):
    """Optuna objective: validation MAE of an XGBoost regressor for one trial.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyper-parameters below.
    X, y : features and target
        Split once into a fitting part and a validation part.
    random_state : int
        Seed for both the train/validation split and XGBoost.
    n_splits, n_repeats : int
        Currently unused; kept so existing callers keep working.
    n_jobs : int
        Number of threads handed to XGBoost.
    early_stopping_rounds : int
        Training stops after this many rounds without improvement of the
        validation metric.

    Returns
    -------
    float
        Mean absolute error on the validation part.
    """
    # XGBoost parameters
    params = {
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "objective": "reg:squarederror",
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        # Sampled under its own name: reusing "lambda" here would make Optuna
        # return the lambda value again and leave gamma out of best_params.
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
        "seed": random_state,
        "n_jobs": n_jobs,
        "tree_method":'gpu_hist',
        "gpu_id" : 0
    }

    # Split the data that was passed in (not notebook globals), with the
    # caller's seed.
    X_A, X_val, y_A, y_val = train_test_split(X, y, random_state=random_state)

    model = XGBRegressor(**params)
    # Pruning and early stopping both track validation RMSE, while the value
    # returned to Optuna is the validation MAE.
    pruning_callback = XGBoostPruningCallback(trial, "validation_0-rmse")

    model.fit(X_A, y_A,
              eval_set=[(X_val, y_val)],
              callbacks=[pruning_callback],
              early_stopping_rounds=early_stopping_rounds,
              eval_metric="rmse",
              verbose=0)

    ypred = model.predict(X_val)

    return mean_absolute_error(y_val, ypred)


In [None]:
%%time

# XGBoost
EARLY_STOPPING_ROUNDS = 5
N_TRIALS = 100
STUDY_SEED = 42

# create_study uses a TPE sampler by default; constructing it explicitly with
# a seed makes the sequence of sampled hyper-parameters repeatable between
# runs of this cell.
study = create_study(direction="minimize", sampler=TPESampler(seed=STUDY_SEED))
study.optimize(
    lambda trial: objective(
        trial,
        xtrain,
        ytrain,
        random_state=42,
        n_jobs=8, # number of parallel threads
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    ),
    n_trials=N_TRIALS,
    #n_jobs=10,
)

# display params
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

In [None]:
# Start from the best hyper-parameters found by the study and add the fixed
# (non-tuned) settings that were used during the search.
hp = study.best_params
hp["verbosity"] = 0
hp["seed"] = 42
hp["n_jobs"] = 8
hp["tree_method"]='gpu_hist'
hp["gpu_id"]=0


# Same seeded split of the training part that the objective uses.
X_A,X_val,y_A,y_val = train_test_split(xtrain,ytrain,random_state=42)
model = XGBRegressor(**hp)

model.fit(X_A,y_A,
          eval_set=[(X_val, y_val)])

# MAE on the validation part that was also passed as eval_set.
ypred = model.predict(X_val)
print("validation mae ",mean_absolute_error(y_val,ypred))

# MAE on the hold-out split (xtest / ytest), which the model never saw.
# It is labelled separately so the two numbers cannot be confused.
ypred_test = model.predict(xtest)
print("hold-out mae ",mean_absolute_error(ytest,ypred_test))

In [None]:
# MAE on the validation part of the training split.
ypred = model.predict(X_val)
print("validation mae ",mean_absolute_error(y_val,ypred))

# MAE on the hold-out split (xtest / ytest); labelled separately so it is not
# mistaken for the validation score.
ypred_test = model.predict(xtest)
print("hold-out mae ",mean_absolute_error(ytest,ypred_test))

In [None]:
# Score the competition test features with the current `model` and store the
# predictions in the submission frame.
pred = model.predict(test_encoded)
sub["congestion"] = pred

# Write the second submission file without the DataFrame index.
sub.to_csv(path_or_buf="second.csv", index=False)