In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
import gc
import holidays

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from catboost import CatBoostRegressor

#warnings.filterwarnings("ignore")
# Single random seed reused by the train/validation split, the CatBoost models and the Optuna sampler
seed = 256

In [None]:
# Read the competition's train and test CSV files from the read-only input directory
train_df = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
real_test_df = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')
# Preview the first rows of the training table
train_df.head()

In [None]:
# Show the dtype pandas inferred for each training column
train_df.dtypes

In [None]:
# Define Model Evaluation functions
def evaluate_model(model, x, y):
    """Score a fitted model on ``(x, y)`` with mean absolute error.

    Args:
        model: object exposing ``predict(x)``.
        x: features handed to ``model.predict``.
        y: true target values the predictions are compared against.

    Returns:
        Whatever ``mean_absolute_error(y, predictions)`` returns.
    """
    return mean_absolute_error(y, model.predict(x))

In [None]:
#Define data pre-processing functions
def label_encoder(df):
    """Encode the ``direction`` column as a coordinate pair plus an integer code.

    Works on a copy; the input frame is left untouched.

    Adds ``direction_coord_0`` and ``direction_coord_1`` (the two entries of the
    direction's pair in the table below) and replaces ``direction`` with an
    integer code.

    The integer code comes from a fixed table rather than from the frame's own
    order of first appearance, so frames encoded in separate calls (e.g. train
    and test) always assign the same code to the same direction.

    Args:
        df: DataFrame with a ``direction`` column holding the string codes
            listed in the table.

    Returns:
        A new DataFrame with the two coordinate columns added and ``direction``
        integer-encoded.

    Raises:
        KeyError: if ``direction`` contains a value missing from the table.
    """
    # Direction code -> coordinate pair (presumably [east-west, north-south] components)
    dir_mapper = {'EB': [1,0], 'NB': [0,1], 'SB': [0,-1], 'WB': [-1,0],
                  'NE': [1,1], 'SW': [-1,-1], 'NW': [-1,1], 'SE': [1,-1]}
    # Stable integer label per direction, following the table's key order
    direction_codes = {name: code for code, name in enumerate(dir_mapper)}

    df = df.copy()
    raw_direction = df['direction']
    df['direction_coord_0'] = raw_direction.map(lambda name: dir_mapper[name][0])
    df['direction_coord_1'] = raw_direction.map(lambda name: dir_mapper[name][1])
    # Every value is known at this point (unknown ones raised above), so the result is integer-typed
    df['direction'] = raw_direction.map(direction_codes)
    return df

def preprocess_dates(df):
    """Derive calendar features and concatenated interaction keys from ``time``.

    Works on a copy. The input needs the columns ``time``, ``x``, ``y`` and
    ``direction``; ``time`` is parsed with ``pd.to_datetime``.

    Added columns, in order:
      * ``minute``, ``hour``, ``weekday``, ``day_of_month``, ``month``
      * 0/1 flags ``is_month_start``, ``is_month_end``, ``is_weekend``
      * ``hour+minute`` (minutes since midnight)
      * ``is_afternoon`` (1 only for hours strictly greater than 12)
      * string keys built by concatenating, without separator, the string
        forms of ``hour`` / ``x`` / ``y`` / ``direction``

    Args:
        df: input DataFrame.

    Returns:
        A new DataFrame with the columns above appended.
    """
    df = df.copy()
    df['time'] = pd.to_datetime(df['time'])
    stamp = df['time'].dt

    # Plain calendar fields (week, quarter, year and day_of_year are deliberately left out)
    df['minute'] = stamp.minute
    df['hour'] = stamp.hour
    df['weekday'] = stamp.weekday
    df['day_of_month'] = stamp.day
    df['month'] = stamp.month

    # Binary flags and time-of-day features
    df['is_month_start'] = stamp.is_month_start.astype('int')
    df['is_month_end'] = stamp.is_month_end.astype('int')
    # weekday is 0..6, so values 5 and 6 mark the weekend
    df['is_weekend'] = (df['weekday'] >= 5).astype('int')
    df['hour+minute'] = stamp.hour * 60 + stamp.minute
    df['is_afternoon'] = (stamp.hour > 12).astype('int')

    # Interaction keys: string forms of the parts, glued together in the listed order
    as_text = {col: df[col].astype('str') for col in ('hour', 'x', 'y', 'direction')}
    interaction_keys = (
        ('x+y', ('x', 'y')),
        ('x+y+direction', ('x', 'y', 'direction')),
        ('hour+direction', ('hour', 'direction')),
        ('hour+x+y', ('hour', 'x', 'y')),
        ('hour+direction+x', ('hour', 'direction', 'x')),
        ('hour+direction+y', ('hour', 'direction', 'y')),
        ('hour+direction+x+y', ('hour', 'direction', 'x', 'y')),
        ('hour+x', ('hour', 'x')),
        ('hour+y', ('hour', 'y')),
    )
    for key_name, parts in interaction_keys:
        glued = as_text[parts[0]]
        for part in parts[1:]:
            glued = glued + as_text[part]
        df[key_name] = glued
    return df

def preprocess_holidays(df):
    """Add ``is_holiday``: 1 when the row's calendar date is a US holiday, else 0.

    Works on a copy. The check is made on the calendar date of ``time``, so
    every row of a holiday is flagged regardless of its time of day. Comparing
    the full timestamp against the holiday dates would only match rows at
    exactly midnight.

    Holidays are generated for every year present in ``time``.

    Args:
        df: DataFrame with a ``time`` column (timestamps or parseable strings).

    Returns:
        A new DataFrame with the integer column ``is_holiday`` added.
    """
    df = df.copy()
    # Accept both raw strings and already-parsed timestamps
    times = pd.to_datetime(df['time'])
    # Years actually covered by the data (plain ints for the holidays library)
    years = sorted({int(year) for year in times.dt.year.dropna().unique()})
    holidays_usa = holidays.CountryHoliday(country='US', years=years)
    holiday_dates = pd.to_datetime(list(holidays_usa.keys()))
    # Drop the time-of-day part before the membership test
    df['is_holiday'] = times.dt.normalize().isin(holiday_dates).astype('int')
    return df

def preprocess_timeseries(df):
    """Add sine/cosine cyclical encodings of the calendar fields.

    Each field is first turned into an angle ``2 * pi * value / period`` so that
    one full cycle of the field maps to one turn of the circle (e.g. minute 59
    sits next to minute 0). Taking ``sin``/``cos`` of the raw integer would
    treat it as radians and lose that periodicity.

    Periods used: minute 60, hour 24, weekday 7, day_of_month 31.

    Args:
        df: DataFrame with numeric ``minute``, ``hour``, ``weekday`` and
            ``day_of_month`` columns.

    Returns:
        A new DataFrame with ``sin_<field>`` columns followed by
        ``cos_<field>`` columns for the four fields.
    """
    df = df.copy()
    # Length of one full cycle for each calendar field
    periods = {'minute': 60, 'hour': 24, 'weekday': 7, 'day_of_month': 31}
    angles = {field: 2 * np.pi * df[field] / period for field, period in periods.items()}
    # Keep the column order: all sines first, then all cosines
    for field, angle in angles.items():
        df['sin_' + field] = np.sin(angle)
    for field, angle in angles.items():
        df['cos_' + field] = np.cos(angle)
    return df

In [None]:
# Run the four feature-engineering steps in order; each one returns a new frame
train_df = (
    train_df
    .pipe(label_encoder)
    .pipe(preprocess_dates)
    .pipe(preprocess_holidays)
    .pipe(preprocess_timeseries)
)
train_df

In [None]:
# Hold out the last 20% of the rows (no shuffling) as a validation split
target_name = 'congestion'

X_train = train_df.drop(columns=['row_id', 'time', target_name])
y_train = train_df[target_name]
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=seed,
    shuffle=False,
)

## Model: CatBoost Regressor
https://catboost.ai/en/docs/concepts/python-quickstart

In [None]:
# Baseline CatBoost regressor with hand-picked settings
params = {
    'iterations': 200,
    'depth': 8,
    'learning_rate': 0.1,
    'loss_function': 'RMSE',  # alternatives considered: 'MAE', 'Poisson'
}

model = CatBoostRegressor(random_seed=seed, verbose=0, **params)

model.fit(X_train, y_train)

In [None]:
# Score the baseline model on the held-out split (mean absolute error)
score = evaluate_model(model, X_test, y_test)
print(score)
# Presumably the value printed by an earlier run: 6.919078746957684

## Optuna Optimization

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost candidate and return its hold-out MAE.

    Samples ``iterations`` (100-800), ``depth`` (7-12) and ``learning_rate``
    (0.05-0.3 in steps of 0.01) from the trial, trains with the ``'MAE'`` loss
    on the notebook-level ``X_train`` / ``y_train`` and scores on
    ``X_test`` / ``y_test``.

    Args:
        trial: the Optuna trial that supplies the hyper-parameter suggestions.

    Returns:
        The score from ``evaluate_model`` for this candidate (lower is better).
    """
    # Suggestions are drawn in this fixed order: iterations, depth, learning_rate
    params = {
        'iterations': trial.suggest_int('iterations', 100, 800),
        'depth': trial.suggest_int('depth', 7, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, step = 0.01),
        'loss_function': 'MAE',
    }

    candidate = CatBoostRegressor(random_seed=seed, verbose=0, **params)
    candidate.fit(X_train, y_train)

    return evaluate_model(candidate, X_test, y_test)

In [None]:
# Silence all Python warnings from here on
warnings.filterwarnings("ignore")

# Seeded random search; the study minimizes the value returned by `objective`
random_sampler = optuna.samplers.RandomSampler(seed=seed)
study = optuna.create_study(sampler=random_sampler, direction="minimize")

# Fixed budget of trials (a time budget is the commented alternative)
study.optimize(objective, n_trials=125)
#study.optimize(objective, timeout = int(3600*9))    # an hour * X

In [None]:
# Plot the objective value of each trial over the course of the study
optuna.visualization.plot_optimization_history(study).show()

In [None]:
# Plot the estimated importance of each tuned hyper-parameter
optuna.visualization.plot_param_importances(study).show()

In [None]:
# Slice plot: objective value against each hyper-parameter
optuna.visualization.plot_slice(study).show()

In [None]:
# Report the best score found and the hyper-parameters that produced it
print('Best trial score:', study.best_value)
study.best_params

In [None]:
# Rebuild an (unfitted) CatBoost regressor from the best trial's hyper-parameters
tuned = study.best_trial.params
params = {name: tuned[name] for name in ('iterations', 'depth', 'learning_rate')}
params['loss_function'] = 'MAE'

best_model = CatBoostRegressor(random_seed=seed, verbose=0, **params)

## Submission

In [None]:
# Refit the tuned model on the complete training table (no hold-out this time).
# Note: this rebinds X_train / y_train to the full data.
X_train = train_df.drop(columns=['row_id', 'time', target_name])
y_train = train_df[target_name]

best_model.fit(X_train, y_train, verbose=False)

In [None]:
# Apply the same four feature-engineering steps to the competition test table
real_test_df = (
    real_test_df
    .pipe(label_encoder)
    .pipe(preprocess_dates)
    .pipe(preprocess_holidays)
    .pipe(preprocess_timeseries)
)
# Model input: every engineered column except the identifier and the raw timestamp
X_real_test = real_test_df.drop(columns=['row_id', 'time'])

In [None]:
# Predict on the test features and pair each prediction with its row_id
row_id = real_test_df['row_id'].values
prediction = best_model.predict(X_real_test).squeeze()
submission = pd.DataFrame({'row_id' : row_id, target_name : prediction})
submission.head()

In [None]:
# The target is integer-valued while the model outputs floats, so round to the nearest integer
rounded_target = submission[target_name].round().astype(int)
submission = submission.assign(**{target_name: rounded_target})
submission.head()

In [None]:
# Write the submission to the working directory, without the DataFrame index column
submission.to_csv('submission.csv', index=False)