In [None]:
import numpy as np 
import pandas as pd

# Load csv

In [None]:
# Raw training table, loaded here for inspection only.
df = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

# Data Processing

You can start feature engineering quickly with the function `feat_eng()`.

I'm inspired by the notebook [(Very Simple Using the Median)](https://www.kaggle.com/code/robertturro/very-simple-using-the-median)

inputs
* time
* month
* weekday
* hour
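* minute (currently disabled in `feat_eng()`)
* month_start (currently disabled in `feat_eng()`)
* month_end (currently disabled in `feat_eng()`)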
* weekend
* afternoon
* daytime_id
* road
* median

-----------
output
* congestion

In [None]:
from sklearn.preprocessing import LabelEncoder 
# Module-level label encoder shared by med() and feat_eng(); it turns the
# concatenated road string (x + y + direction) into an integer road id.
le = LabelEncoder()


def med():
    """Return the median training congestion per (road, daytime_id).

    Reads train.csv, labels each road by concatenating ``x``, ``y`` and
    ``direction`` and encoding the result with the module-level ``le``
    (which is refit here), and assigns every row a ``daytime_id``: the
    minutes since midnight divided by 20, truncated to an integer.

    Returns:
        pandas.Series named ``congestion``, indexed by
        ``(road, daytime_id)``, holding the group median cast to int.
    """
    train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
    stamp = pd.to_datetime(train['time'])
    train['time'] = stamp

    # 20-minute slot of the day
    minutes_since_midnight = stamp.dt.hour * 60 + stamp.dt.minute
    train['daytime_id'] = (minutes_since_midnight / 20).astype(int)

    # integer id per (x, y, direction) combination
    road_label = train['x'].astype(str) + train['y'].astype(str) + train['direction']
    train['road'] = le.fit_transform(road_label)

    # median congestion of each road in each slot, cast to int
    grouped = train.groupby(['road', 'daytime_id'])['congestion']
    return grouped.median().astype(int)
    
    
def feat_eng(train = True):
    """Build model inputs (and the target, for training data) from the CSVs.

    The same pipeline is applied to train.csv and test.csv:

    * time features: ``month``, ``weekday``, ``hour``, ``weekend``
      (Saturday or Sunday), ``afternoon`` (hour >= 12) and ``daytime_id``
      (minutes since midnight divided by 20, truncated to an integer);
    * ``road``: integer id of the ``x`` + ``y`` + ``direction`` string;
    * ``median``: training median congestion of that road in that slot,
      as computed by ``med()``.

    ``x``, ``y``, ``row_id`` and ``direction`` are dropped; ``time`` is kept.

    Args:
        train: if True read train.csv and return ``(x, y)``; otherwise read
            test.csv and return the feature frame only.

    Returns:
        ``(x, y)`` where ``y`` is the ``congestion`` column and ``x`` is
        every other feature, when ``train`` is True; otherwise the feature
        DataFrame for the test set, in the same row order as test.csv.

    Raises:
        ValueError: from ``le.transform`` if the file contains a road that
            is not present in the training data.
    """
    # med() reads train.csv and fits the shared encoder `le` on the training
    # roads, so the ids produced by le.transform() below always match the
    # `road` level of the median index. Naming the Series 'median' up front
    # gives the merged column the same name for train and test.
    median = med().rename('median')

    file_name = 'train.csv' if train else 'test.csv'
    df = pd.read_csv('../input/tabular-playground-series-mar-2022/' + file_name)

    # data processing about time
    df['time'] = pd.to_datetime(df['time'])
    df['month'] = df['time'].dt.month
    df['weekday'] = df['time'].dt.weekday
    df['hour'] = df['time'].dt.hour
    # disabled features, kept for reference:
    #df['minute'] = df['time'].dt.minute
    #df['month_start'] = df['time'].dt.is_month_start.astype('int')
    #df['month_end'] = df['time'].dt.is_month_end.astype('int')
    # dayofweek is Monday=0 ... Sunday=6, so the weekend is 5 and 6
    df['weekend'] = (df['time'].dt.dayofweek >= 5).astype('int')
    # 12:00-12:59 belongs to the afternoon as well
    df['afternoon'] = (df['time'].dt.hour >= 12).astype('int')
    df['daytime_id'] = ( ( df.time.dt.hour*60 + df.time.dt.minute ) /20 ).astype(int)

    # labeling road: reuse the encoder fitted on the training roads rather
    # than refitting it on whatever roads this file happens to contain
    df['road'] = df['x'].astype(str) + df['y'].astype(str) + df['direction']
    df['road'] = le.transform(df['road'])

    # add median; a left join keeps every row in its original order, so test
    # predictions stay aligned with the submission file
    df = df.merge(median, how='left', left_on=['road', 'daytime_id'], right_index=True)

    # drop unnecessary columns
    df = df.drop(['x','y','row_id', 'direction'], axis=1)

    if train:
        # return input(other), output(congestion)
        x = df.drop(['congestion'], axis=1)
        y = df['congestion']
        return x, y

    # return input
    return df

# Datasets

In [None]:
# Training features and target, followed by the test-set features
# (the test features are stored under the name `val_x`).
train_x, train_y = feat_eng(True)
val_x = feat_eng(False)

# Modeling

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold,TimeSeriesSplit
from catboost import CatBoostRegressor


# Fitted model and validation MAE of every fold
model_list = []
mae_list = []

# fold5
kf = KFold(n_splits = 5, shuffle = True, random_state = 70)

# The hyper-parameters are the same for every fold, so build them once.
params = {'logging_level': 'Silent',
          'depth': 12,
          'eval_metric': 'MAE',
          'loss_function': 'MAE',
          'n_estimators': 800,
          'task_type': 'GPU'
          }

# modeling and training
for fold, (tr_idx, va_idx) in enumerate(kf.split(train_x), start=1):
    print(f'--------fold:{fold}--------')
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    model = CatBoostRegressor(**params)
    # Training the model
    model.fit(tr_x,
              tr_y,
              eval_set=[(va_x, va_y)])

    val_pred = model.predict(va_x)
    mae = mean_absolute_error(va_y, val_pred)
    print(f' MAE: {mae}')

    # keep the per-fold results so they can be inspected or ensembled later
    model_list.append(model)
    mae_list.append(mae)

print(f' mean MAE: {sum(mae_list) / len(mae_list)}')
    

# Optuna

In [None]:
# from sklearn.metrics import mean_absolute_error
# from sklearn.model_selection import KFold,TimeSeriesSplit
# from sklearn.linear_model import LinearRegression
# import xgboost as xgb
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor

# from sklearn.model_selection import train_test_split
# from catboost import Pool
# import sklearn.metrics

# model_list = []
# mae_list = []

# # fold5
# kf = KFold(n_splits = 5, shuffle = True, random_state = 70)

# # modeling and training
# def objective(trial):
#     for fold, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
#         if fold ==1:
#             print(f'--------fold:{fold+1}--------')
#             fold+=1
#             tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
#             tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

#             params = {
#                 'logging_level': 'Silent',
#                 'depth': trial.suggest_int('depth', 5, 15),
#                 'eval_metric': 'MAE', 
#                 'loss_function': 'MAE', 
#                 'n_estimators': 800, 
#                 'task_type': 'GPU'
#              }

#             model = CatBoostRegressor(**params)
#             # Training the model

#             model.fit(tr_x,
#                       tr_y,
#                       eval_set=[(va_x, va_y)])

#             val_pred = model.predict(va_x)

#             print(f' MAE: {mean_absolute_error(va_y, val_pred)}')
#             return mean_absolute_error(va_y, val_pred)
#         else:
#             pass

In [None]:
# import optuna
# study = optuna.create_study()
# study.optimize(objective, n_trials=1)
# print(study.best_trial)


# Submission

In [None]:
# Predict the test set with `model` (the model fitted in the last CV fold)
# and write the predictions into the sample submission layout.
ans = model.predict(val_x)
submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/sample_submission.csv'
).assign(congestion=ans)
submission.to_csv('submission.csv', index=False)