In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# Training data. `time` is parsed to datetime on load so the `.dt` accessor
# can be used on it later in the notebook.
train = pd.read_csv(
    r'../input/tabular-playground-series-mar-2022/train.csv',
    parse_dates=['time'],
)
train.head()

In [None]:
# Test data, loaded with the same `time` parsing as the training file.
test = pd.read_csv(
    r'../input/tabular-playground-series-mar-2022/test.csv',
    parse_dates=['time'],
)
test.head()

In [None]:
# Submission template; its `congestion` column is overwritten with the model's
# predictions at the end of the notebook.
sub = pd.read_csv(
    r'../input/tabular-playground-series-mar-2022/sample_submission.csv'
)
sub.head()

### **DateTime features:**

In [None]:
# Unpack the parsed timestamp into hour / minute / weekday columns.
# Both frames go through the same loop so their feature sets cannot drift apart.
for frame in (train, test):
    stamp = frame['time'].dt
    frame['hour'] = stamp.hour
    frame['minute'] = stamp.minute
    frame['weekday'] = stamp.weekday

In [None]:
# Remove `row_id` (presumably a plain identifier -- confirm) and the raw `time`
# column, whose hour/minute/weekday parts were extracted in the previous step.
# NOTE: in-place, so re-running this cell on the same kernel raises KeyError.
for frame in (train, test):
    frame.drop(columns=['row_id', 'time'], inplace=True)

In [None]:
# Report the dimensions of each loaded frame.
for label, frame in (('train', train), ('test', test), ('sample_submission', sub)):
    n_rows, n_cols = frame.shape
    print(f'{label} set have {n_rows} rows and {n_cols} columns.')

In [None]:
# Missing values per column in the training frame.
train.isna().sum()

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Summary statistics, transposed so each feature is a row.
train.describe().transpose()

### **Data Visualization:**

In [None]:
# Share of training rows per `x` value.
# The counts are aggregated up front so the figure carries one (label, count)
# pair per distinct value rather than one label per training row; without
# `values`, plotly would do the same occurrence counting on the full column.
x_counts = train['x'].value_counts()
fig = go.Figure(data=[go.Pie(labels=x_counts.index, values=x_counts.values, hole=.3)])
fig.add_annotation(text='x',
                   x=0.5,y=0.5,showarrow=False,font_size=34,opacity=0.7,font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(line=dict(color='#000000', width=2)),
                  textposition='outside', textinfo='percent+label')
fig.update_layout(
    font_family='monospace',
    title=dict(text='x Unique Values',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.show()

In [None]:
# Share of training rows per `y` value.
# The counts are aggregated up front so the figure carries one (label, count)
# pair per distinct value rather than one label per training row; without
# `values`, plotly would do the same occurrence counting on the full column.
y_counts = train['y'].value_counts()
fig = go.Figure(data=[go.Pie(labels=y_counts.index, values=y_counts.values, hole=.3)])
fig.add_annotation(text='y',
                   x=0.5,y=0.5,showarrow=False,font_size=34,opacity=0.7,font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(line=dict(color='#000000', width=2)),
                  textposition='outside', textinfo='percent+label')
fig.update_layout(
    font_family='monospace',
    title=dict(text='y Unique Values',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.show()

In [None]:
# Share of training rows per `direction` value.
# The counts are aggregated up front so the figure carries one (label, count)
# pair per distinct value rather than one label per training row; without
# `values`, plotly would do the same occurrence counting on the full column.
direction_counts = train['direction'].value_counts()
fig = go.Figure(data=[go.Pie(labels=direction_counts.index,
                             values=direction_counts.values, hole=.3)])
fig.add_annotation(text='direction',
                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(line=dict(color='#000000', width=2)),
                  textposition='outside', textinfo='percent+label')
fig.update_layout(
    font_family='monospace',
    title=dict(text='direction Unique Values',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.show()

In [None]:
# Distribution of the target `congestion`, with a box plot in the top margin.
fig = px.histogram(
    train,
    x='congestion',
    nbins=100,
    histfunc='count',
    barmode='group',
    marginal='box',
    opacity=0.7,
    color_discrete_sequence=['#FECB52'],
    template='plotly_white',
)
fig.update_layout(
    font_family='monospace',
    title={'text': 'congestion Distribution', 'x': 0.53, 'y': 0.95},
    xaxis_title_text='congestion',
    yaxis_title_text='Count',
    bargap=0.3,
)
fig.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode `direction`.
# The encoder is fitted ONCE, on the training labels, and that same mapping is
# applied to the test set. Re-fitting on test (fit_transform) would build a
# second, independent label -> integer mapping; whenever the two frames do not
# contain exactly the same set of directions, the same integer would then mean
# different directions at train and inference time.
# `transform` raises ValueError on a label unseen during fit, which makes such a
# mismatch fail loudly rather than silently shifting codes.
le = LabelEncoder()
train['direction'] = le.fit_transform(train['direction'])
test['direction'] = le.transform(test['direction'])

In [None]:
# Preview the training frame after `direction` has been label-encoded.
train.head()

In [None]:
# Separate the target: `pop` returns the `congestion` column and removes it from
# `train` in place, leaving only feature columns behind.
y = train.pop('congestion')

### **CatBoost + Optuna:**

In [None]:
def fit_cat(trial, x_train, y_train, x_test, y_test):
    """Fit one GPU CatBoost regressor with hyper-parameters sampled from ``trial``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` and ``suggest_float`` -- presumably an
        ``optuna.trial.Trial`` (optuna itself is not imported in the visible cells).
    x_train, y_train
        Features and target the model is fitted on.
    x_test, y_test
        Hold-out features and target; passed to CatBoost as ``eval_set`` for early
        stopping and used for the validation score.

    Returns
    -------
    tuple
        ``(model, log)`` where ``model`` is the fitted ``CatBoostRegressor`` and
        ``log`` is a dict with keys ``"train mae"`` and ``"valid mae"``, both
        computed on predictions clipped from below at 0.1.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform`; passing log=True gives the log-uniform variant.
    # Names, bounds and sampling order are unchanged.
    params = {'iterations':trial.suggest_int("iterations", 1000, 20000),
              # NOTE(review): fit() below also passes early_stopping_rounds, which
              # configures the same overfitting detector -- confirm whether it
              # overrides this sampled od_wait (making it a dead search dimension).
              'od_wait':trial.suggest_int('od_wait', 500, 2000),
              'task_type':"GPU",
              'eval_metric':'MAE',
              'learning_rate' : trial.suggest_float('learning_rate', 0.03 , 0.04),
              'reg_lambda': trial.suggest_float('reg_lambda', 0.32 , 0.33, log=True),
              'subsample': trial.suggest_float('subsample',0.9,1.0),
              'random_strength': trial.suggest_float('random_strength',10,50),
              'depth': trial.suggest_int('depth',1,15),
              'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
              'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
               }

    model = CatBoostRegressor(**params, random_state=123, bootstrap_type='Poisson')
    model.fit(x_train, y_train,eval_set=[(x_test,y_test)], early_stopping_rounds=150, verbose=False)

    # Clip from below at 0.1 before scoring; the upper side is left unbounded.
    y_train_pred = np.clip(model.predict(x_train), 0.1, None)
    y_test_pred = np.clip(model.predict(x_test), 0.1, None)

    log = {
        "train mae": mean_absolute_error(y_train, y_train_pred),
        "valid mae": mean_absolute_error(y_test, y_test_pred)
    }

    return model, log

In [None]:
def objective(trial):
    """Score one hyper-parameter trial by validation MAE on a 70/30 hold-out split.

    Reads the notebook-level ``train`` (features) and ``y`` (target), fits a model
    through ``fit_cat`` and returns its ``"valid mae"`` (lower is better).

    Parameters
    ----------
    trial
        Forwarded unchanged to ``fit_cat``, which samples hyper-parameters from it.

    Returns
    -------
    float
        Mean absolute error on the held-out 30 %.
    """
    # Seeded split: every trial is scored on the same hold-out rows, so a change in
    # MAE reflects the sampled hyper-parameters rather than the luck of the split.
    x_train, x_test, y_train, y_test = train_test_split(
        train, y, test_size=0.30, random_state=123
    )
    _, log = fit_cat(trial, x_train, y_train, x_test, y_test)
    return log['valid mae']

In [None]:
# Fixed CatBoost configuration for the cross-validated fit
# (presumably the best trial of an earlier tuning run -- the search itself is not
# part of the visible cells).
params = dict(
    iterations=3989,
    od_wait=1354,
    learning_rate=0.03722075260573367,
    reg_lambda=0.32719505634165696,
    subsample=0.975493098309512,
    random_strength=28.138066726596414,
    depth=11,
    min_data_in_leaf=15,
    leaf_estimation_iterations=1,
    task_type="GPU",
    eval_metric='MAE',
    bootstrap_type='Poisson',
)

In [None]:
# 5-fold cross-validation of the fixed `params` configuration.
# random_state pins the shuffle, so fold membership -- and therefore every MAE
# printed below -- is identical on each Restart & Run All.
folds = KFold(n_splits=5, shuffle=True, random_state=123)
fold_maes = []

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print(f"Fold: {fold}")
    X_train, X_test = train.iloc[trn_idx], train.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]

    model = CatBoostRegressor(**params)

    # The validation fold doubles as the early-stopping eval set.
    model.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
                early_stopping_rounds=400,
                verbose=False)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    fold_maes.append(mae)

    print(f" mean_absolute_error: {mae}")
    print("-"*50)

# Summarise across folds so the run yields a single comparable score.
print(f"CV mean_absolute_error: {np.mean(fold_maes)} (std: {np.std(fold_maes)})")
# NOTE: `model` stays bound to the estimator fitted on the final fold only.

In [None]:
# Predict congestion for the test frame.
# `model` is whatever the cross-validation loop left bound, i.e. the estimator
# fitted on the last fold; the models from the earlier folds are not used here.
pred = model.predict(test)

In [None]:
# Round predictions to the nearest integer, store them in the submission
# template and write it out without the index column.
sub['congestion'] = np.rint(pred).astype(int)
sub.to_csv('cat.csv', index=False)