In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Read the competition's train and test CSVs with identical options:
# `row_id` becomes the index and `time` is parsed into datetimes.
train_set, test_set = (
    pd.read_csv(csv_path, index_col='row_id', parse_dates=['time'])
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2022/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2022/test.csv',
    )
)

In [None]:
# Sorted distinct values of the target column (shown as the cell output).
np.unique(train_set['congestion'])

# Visualization

In [None]:
# How often each congestion value occurs, ordered by congestion value.
plt.figure(figsize=(15, 6), dpi=80)
train_set['congestion'].value_counts().sort_index().plot(kind='bar', title='Congestion bar plot')
plt.show()

# Number of rows per direction, in value_counts' default order.
plt.figure(figsize=(15, 6), dpi=80)
train_set['direction'].value_counts().plot(kind='bar', title='Direction bar plot')
plt.show()

In [None]:
# One figure per direction: counts of each congestion value among the rows
# travelling in that direction.
for d in train_set['direction'].unique():
    plt.figure(figsize=(15, 6), dpi=80)
    congestion_for_direction = train_set.loc[train_set['direction'] == d, 'congestion']
    ax = congestion_for_direction.value_counts().sort_index().plot(
        kind='bar', title='Direction:'+ d
    )
    # Set the labels after plotting so they are the ones that end up on the axes.
    ax.set_xlabel('Congestion')
    ax.set_ylabel('Count')
    plt.show()

# Add features

In [None]:
def add_feature(df, drop_holidays=None, n_harmonics=0):
    """Build the model's feature frame from a raw frame.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column ``time`` and the columns ``x``, ``y``
        and ``direction``. Any other columns are passed through unchanged.
    drop_holidays : bool or None, default None
        Whether to remove rows dated May 27, July 4 and September 2 (labelled
        Memorial Day, July 4 and Labor Day in the original code). ``None``
        means "only when the frame has a ``congestion`` column", i.e. rows are
        removed from labelled frames but never from frames that still need a
        prediction for every row.
    n_harmonics : int, default 0
        Number of sin/cos harmonics to add for day-of-week, hour and minute.
        ``0`` adds none.

    Returns
    -------
    pandas.DataFrame
        The (possibly filtered) rows with ``dateofweek``, ``day_of_year``,
        ``hour`` and ``minute`` added, the optional harmonic columns, and
        ``direction`` and the combined ``x``/``y`` location replaced by
        one-hot columns prefixed ``D_`` and ``XY_``.
    """
    if drop_holidays is None:
        drop_holidays = 'congestion' in df.columns

    if drop_holidays:
        month = df['time'].dt.month
        day = df['time'].dt.day
        is_holiday = (
            ((month == 5) & (day == 27))    # Memorial Day
            | ((month == 7) & (day == 4))   # July 4
            | ((month == 9) & (day == 2))   # Labor Day
        )
        df = df[~is_holiday]

    # Work on an explicit copy: the caller's frame stays untouched and the
    # column assignments below never target a slice (no SettingWithCopyWarning).
    df = df.copy()

    df['dateofweek'] = df['time'].dt.dayofweek
    df['day_of_year'] = df['time'].dt.day_of_year
    df['hour'] = df['time'].dt.hour
    df['minute'] = df['time'].dt.minute

    # Optional cyclic encodings: (output prefix, source column, period).
    cyclic_specs = (
        ('dateofweek', 'dateofweek', 7),
        ('hour', 'hour', 24),
        ('min', 'minute', 60),
    )
    for prefix, column, period in cyclic_specs:
        for i in range(1, n_harmonics + 1):
            angle = 2 * np.pi * df[column] / period * i
            df[f'{prefix}_sin{i}'] = np.sin(angle)
            df[f'{prefix}_cos{i}'] = np.cos(angle)

    df = pd.get_dummies(df, prefix=['D'], columns=['direction'])
    # x and y are concatenated without a separator, so the key is only
    # unambiguous while both coordinates are single-digit.
    # NOTE(review): confirm this holds for the data.
    df['xy'] = df['x'].astype(str) + df['y'].astype(str)
    df = pd.get_dummies(df, prefix=['XY'], columns=['xy'])
    return df

# Training

In [None]:
# Training matrix: engineered features without the raw timestamp or target.
train_features = add_feature(train_set)
y = train_features['congestion']
X = train_features.drop(['time', 'congestion'], axis=1)

# Train and test are one-hot encoded independently, so their dummy columns
# can differ in presence or order. Reindexing to the training columns gives
# the model exactly the features it was fitted on: missing dummies become 0
# and any column unknown to the training matrix is dropped.
test_x = (
    add_feature(test_set)
    .drop(['time'], axis=1)
    .reindex(columns=X.columns, fill_value=0)
)

In [None]:
# Hold out 25% of the rows for validation. The fixed seed makes the shuffled
# split identical on every Restart & Run All.
X_train,X_val,y_train,y_val = train_test_split(X,y, test_size = .25, shuffle = True, random_state = 42)

In [None]:
# Booster hyper-parameters, kept in a dict so they can be inspected or reused.
params = dict(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=10,
)
# Regressor configured with the 'gpu_hist' tree method.
xgb_reg = xgb.XGBRegressor(tree_method='gpu_hist', **params)

In [None]:
# Fit on the training split only. Fitting on the full X would put the
# validation rows into the training data and make the X_val scores
# meaningless. To train a final model on all rows, refit separately once the
# hyper-parameters are chosen.
# The metrics are set on the estimator rather than passed to fit(), because
# fit(eval_metric=...) is deprecated/removed in newer xgboost releases.
xgb_reg.set_params(eval_metric=['mae', 'rmse'])
xgb_reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)])

# Submission

In [None]:
# Predict the test rows and write them into the sample submission's
# `congestion` column (values are matched by position).
pred_test = xgb_reg.predict(test_x)
output = pd.read_csv(
    '/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv'
).assign(congestion=pred_test)
output.to_csv('submission.csv', index=False)

In [None]:
# Attach the rounded predictions to a copy instead of writing into `test_set`.
# Mutating `test_set` here would change what the feature-building cell
# produces if it is run again afterwards.
test_pred = test_set.assign(congestion=pred_test.round().astype(int))

# For each direction, show the observed congestion counts (left, train data)
# next to the counts of the rounded predictions (right, test data).
for d in train_set.direction.unique():
    plt.figure(figsize=(15, 6), dpi=80)
    for panel, frame in enumerate((train_set, test_pred), start=1):
        plt.subplot(1, 2, panel)
        plt.title('Direction:'+ d)
        frame[frame.direction == d].congestion.value_counts().sort_index().plot.bar()
        plt.xlabel('Congestion')
        plt.ylabel('Count')
    plt.show()