In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train, test = map(pd.read_csv, (
    '../input/tabular-playground-series-mar-2022/train.csv',
    '../input/tabular-playground-series-mar-2022/test.csv',
))

In [None]:
# Use row_id as the index of both frames so it is not treated as a feature.
train, test = (frame.set_index('row_id') for frame in (train, test))

In [None]:
# Peek at the first ten training rows.
train.head(n=10)

In [None]:
# A lot of information is packed into the time column; let's unpack it.
def split_data(df, split_list, col_name, split_char):
    """Split a string column on a delimiter and store the pieces as columns.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to split. New columns are written into it.
    split_list : sequence of str
        Names for the resulting pieces, in order: the i-th name receives the
        i-th piece of the split. A name may equal ``col_name``, in which case
        the source column is overwritten with that piece.
    col_name : str
        Name of the string column to split.
    split_char : str
        Delimiter passed to ``Series.str.split``.

    Raises
    ------
    KeyError
        If ``split_list`` names more pieces than the split produces.
    """
    # Split once, up front. Re-splitting inside the loop repeats the same
    # work for every piece and, worse, reads from a column that an earlier
    # iteration may already have overwritten when a name in split_list
    # equals col_name.
    parts = df[col_name].str.split(split_char, expand=True)
    for position, element in enumerate(split_list):
        df[element] = parts[position]

In [None]:
# Unpack the timestamp step by step: "date time" -> date parts -> time parts.
for new_cols, source_col, delimiter in (
    (['date', 'time'], 'time', ' '),
    (['year', 'month', 'day'], 'date', '-'),
    (['hours', 'minutes', 'seconds'], 'time', ':'),
):
    split_data(train, new_cols, source_col, delimiter)

In [None]:
# Parse the date strings into datetimes and derive the weekday name from them.
parsed_dates = pd.to_datetime(train['date'])
train['date'] = parsed_dates
train['week_day'] = parsed_dates.dt.day_name()

In [None]:
# Dropping unnecessary columns.
train = train.drop(columns=['time', 'date', 'year', 'seconds'])

In [None]:
# Reorder the columns: time features first, then location/direction,
# with the target (congestion) last.
column_order = [
    'month', 'week_day', 'day', 'hours', 'minutes',
    'x', 'y', 'direction',
    'congestion',
]
train = train[column_order]

In [None]:
# Print every column name followed by the distinct values it contains.
for col, values in train.items():
    print(col)
    print(values.unique())

In [None]:
# Share of training rows taken by each distinct value of the target.
cong = train['congestion'].value_counts().div(len(train))
plt.figure(figsize=(10, 5))
plt.bar(cong.index, cong, color='red')

In [None]:
# Splitting the dataset into a training set and a cross-validation set,
# stratified on the target so both parts share its distribution.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=1200)
# n_splits=1, so the generator yields exactly one (train, cv) pair.
train_index, cv_index = next(split.split(train, train['congestion']))
# split() returns integer *positions*, not index labels. `train` is indexed
# by row_id, so the rows must be selected positionally with .iloc; .loc would
# only be correct if the row_id labels happened to be exactly 0..n-1.
train_ = train.iloc[train_index]
cv = train.iloc[cv_index]

In [None]:
# One-hot encode every feature column (all columns except the last, which is
# the target) and fit the encoder on the training part only.
from sklearn.preprocessing import OneHotEncoder
train_features = train_.iloc[:, :-1]
enc = OneHotEncoder()
x_train = enc.fit_transform(train_features).toarray()
y_train = train_[train_.columns[-1]]

In [None]:
# Encode the cross-validation features with the encoder fitted on the
# training part; the last column is the target.
cv_features = cv.iloc[:, :-1]
x_cv = enc.transform(cv_features).toarray()
y_cv = cv[cv.columns[-1]]

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [None]:
# Hyper-parameter grid explored by the grid search: tree depth x learning rate.
params = dict(
    max_depth=[6, 8, 16],
    learning_rate=[0.01, 0.1, 0.2],
)


In [None]:
# Best-parameter search: 3-fold grid search over `params`, scored by negative
# mean squared error, on an XGBoost regressor with fixed row/column subsampling.
base_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    seed=12,
    subsample=0.5,
    colsample_bytree=0.5,
)
grid_search = GridSearchCV(
    estimator=base_regressor,
    param_grid=params,
    scoring='neg_mean_squared_error',
    verbose=3,
    cv=3,
)

In [None]:
# Run the grid search on the encoded training part.
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Display the parameter combination the grid search selected.
# NOTE(review): this result is only displayed; the final XGBRegressor further
# down hard-codes learning_rate and max_depth instead of reading this dict.
grid_search.best_params_

In [None]:
# Give the test data the same structure as the train data by unpacking its
# timestamp in the same three steps.
for new_cols, source_col, delimiter in (
    (['date', 'time'], 'time', ' '),
    (['year', 'month', 'day'], 'date', '-'),
    (['hours', 'minutes', 'seconds'], 'time', ':'),
):
    split_data(test, new_cols, source_col, delimiter)

In [None]:
# Derive the weekday, drop the helper columns and put the features in the
# same order as the training features (the test set has no target column).
test['date'] = pd.to_datetime(test['date'])
test['week_day'] = test['date'].dt.day_name()
feature_order = ['month', 'week_day', 'day', 'hours', 'minutes', 'x', 'y', 'direction']
test = test.drop(columns=['time', 'date', 'year', 'seconds'])[feature_order]
# Keep the row ids: they are needed later to build the submission.
index = test.index

In [None]:
# Encode the test features with the encoder fitted on the training part.
# From here on `test` is a dense array, not a DataFrame; the row ids were
# saved in `index` beforehand. Re-running this cell on its own will fail
# because the name has been rebound.
encoded_test = enc.transform(test)
test = encoded_test.toarray()

In [None]:
# Setting the XGBoost parameters for the final model.
# NOTE(review): the grid search above also used colsample_bytree=0.5, which is
# not set here, and learning_rate/max_depth are hard-coded rather than taken
# from grid_search.best_params_ -- confirm this is intentional.
xgb_settings = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.2,
    'max_depth': 16,
    'reg_lambda': 1.0,
    'subsample': 0.5,
    'seed': 12,
}
xgbreg = xgb.XGBRegressor(**xgb_settings)

In [None]:
# Fit on the training part while monitoring RMSE on the cross-validation part;
# training stops once the metric has not improved for 5 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the estimator constructor rather than on fit() -- confirm
# against the installed version.
validation_sets = [(x_cv, y_cv)]
xgbreg.fit(
    x_train,
    y_train,
    eval_set=validation_sets,
    eval_metric='rmse',
    early_stopping_rounds=5,
)

In [None]:
# Refit the same estimator on the full training data (train + cv parts).
# NOTE(review): this second fit() call passes no eval_set or early stopping,
# so it presumably retrains from scratch with the estimator's configured
# number of rounds and does not reuse the early-stopped fit -- verify.
full_features = train.iloc[:, :-1]
x = enc.transform(full_features).toarray()
y = train[train.columns[-1]]
model = xgbreg.fit(x, y)

In [None]:
# Making predictions for the test data.
# When the cells are run in order, `test` is the one-hot encoded array here
# and `model` is the regressor refit on all training rows.
y_pred = model.predict(test)

In [None]:
# Assemble the submission table: one predicted congestion value per test row_id.
submission_columns = {'row_id': index, 'congestion': y_pred}
prediction = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file; the DataFrame's own index is not written
# because row_id is already a regular column.
prediction.to_csv(path_or_buf='submission.csv', index=False)