# Importing libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
%matplotlib inline 

In [None]:
import xgboost as xgb
from xgboost import plot_tree
from xgboost import plot_importance

from sklearn.ensemble import RandomForestRegressor

from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split

from sklearn import preprocessing    
# Single LabelEncoder instance, used further down to encode the combined direction/x/y key.
le = preprocessing.LabelEncoder()

# Data Extraction

In [None]:
# Read the competition train and test CSV files (paths are relative to the notebook's working directory).
train_df = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
test_df  = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')

In [None]:
# Preview the first six rows of the training data.
train_df.head(6)

In [None]:
# (rows, columns) of the training data.
train_df.shape

In [None]:
def missing_percent_of_column(train_set):
    """Summarise how much of each column is missing.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that contains at least one null value, indexed by
        column name. The single column '% of Misiing Values' holds the share
        of null entries in percent, rounded to one decimal and sorted from
        the highest to the lowest share.
    """
    # Share of null cells per column, expressed in percent.
    share_per_column = 100 * (train_set.isnull().sum() / len(train_set))
    # Keep only the columns that actually have gaps (filtering happens before rounding).
    columns_with_gaps = share_per_column[share_per_column > 0]
    ranked = columns_with_gaps.sort_values(ascending=False).round(1)
    return ranked.to_frame(name='% of Misiing Values')

In [None]:
# Build the missing-value table for the training data and display it.
miss = missing_percent_of_column(train_df)
miss

# Exploratory Data Analysis (EDA)

In [None]:
# Box plot of congestion for each direction; the figure size is set to 8x6 through seaborn's rc settings.
sns.set(rc={'figure.figsize':(8,6)})
sns.boxplot(x='direction', y='congestion', data=train_df)

In [None]:
# Number of training rows per direction.
sns.countplot(x='direction', data=train_df)

In [None]:
# Histogram of the congestion target with a KDE overlay.
# NOTE(review): both binwidth=5 and bins=20 are passed; seaborn documents binwidth as
# taking precedence over bins — confirm which of the two was intended.
sns.histplot(data=train_df, x="congestion", binwidth=5,multiple="stack", kde=True, bins=20)

# Data Preparation

In [None]:
# Parse the 'time' column into pandas datetimes so the .dt accessor can be used on it.
train_df['time']= pd.to_datetime(train_df['time'])
test_df['time']= pd.to_datetime(test_df['time'])

In [None]:
def add_time_features(frame):
    """Add calendar features derived from the datetime column 'time'.

    The same helper is applied to the train and the test frame so that both
    receive exactly the same columns in exactly the same order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a datetime-typed 'time' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the columns 'month', 'day', 'daytime' (minutes
        since midnight), 'weekday', 'hour' and 'dayofyear' added.
    """
    stamp = frame['time'].dt
    frame['month'] = stamp.month
    frame['day'] = stamp.day
    frame['daytime'] = stamp.hour * 60 + stamp.minute
    frame['weekday'] = stamp.weekday
    frame['hour'] = stamp.hour
    frame['dayofyear'] = stamp.dayofyear
    return frame


train_df = add_time_features(train_df)
test_df = add_time_features(test_df)


Removing a few outliers

In [None]:
# Drop every training row recorded on May 27, July 4 or September 2
# (the three calendar days this notebook treats as outliers).
is_outlier_day = (
    ((train_df['month'] == 5) & (train_df['day'] == 27))
    | ((train_df['month'] == 7) & (train_df['day'] == 4))
    | ((train_df['month'] == 9) & (train_df['day'] == 2))
)
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
train_df = train_df[~is_outlier_day].copy()

In [None]:
# Mean congestion for every (hour, direction, x, y, weekday) combination;
# dropna=False keeps groups whose key contains a missing value.
group_keys = ['hour', 'direction', 'x', 'y', 'weekday']
df = (
    train_df[group_keys + ['congestion']]
    .groupby(by=group_keys, dropna=False)
    .mean()
)

In [None]:
# Scatter plot of the grouped mean congestion against weekday, with marker size scaled by congestion.
sns.set(rc={'figure.figsize':(10,6)})
fg=sns.scatterplot(data=df, x='weekday',y='congestion',size="congestion")
# Anchor the legend at (1.15, 1) so it sits to the right of the plot.
fg.legend(bbox_to_anchor= (1.15,1))

In [None]:
def isWeekDay(date):
    """Return 1 when ``date`` is greater than 4, otherwise 0.

    NOTE(review): with pandas' weekday numbering (Monday=0 ... Sunday=6) a
    value above 4 means Saturday or Sunday, so despite its name the flag is 1
    on weekend days — confirm the intended meaning before renaming anything.
    """
    return 1 if date > 4 else 0

In [None]:
# The flag has to be derived from the day of the week ('weekday', Monday=0 ... Sunday=6),
# not from the day of the month: values above 4 then mark Saturday and Sunday.
train_df['isWeekDay'] = train_df['weekday'].apply(isWeekDay)
test_df['isWeekDay'] = test_df['weekday'].apply(isWeekDay)

In [None]:
# Build one key per roadway: "<direction>_<x>_<y>".
train_df['directionxy'] = train_df['direction'] + '_' + train_df['x'].astype(str) + '_' + train_df['y'].astype(str)
test_df['directionxy'] = test_df['direction'] + '_' + test_df['x'].astype(str) + '_' + test_df['y'].astype(str)

# Fit the encoder on the training keys only and reuse that mapping for the test keys.
# Re-fitting on the test set could assign different integers to the same roadway
# whenever the two sets do not contain exactly the same keys; transform() instead
# raises a ValueError if the test set holds a key never seen in training.
train_df['directionxy'] = le.fit_transform(train_df['directionxy'])
test_df['directionxy'] = le.transform(test_df['directionxy'])

In [None]:
# Integer code assigned to each direction label.
direction = {'EB':1, 'NB':2, 'SB':3, 'WB':4, 'NE':5, 'SW':6, 'NW':7,'SE':8}

# Replace the direction strings with their integer codes in both frames.
# NOTE(review): Series.map yields NaN for labels missing from the dict — confirm every label in the data is covered.
train_df['direction'] = train_df['direction'].map(direction)
test_df['direction'] = test_df['direction'].map(direction)

In [None]:
# Correlate numeric columns only. train_df still holds the datetime 'time' column here,
# and newer pandas versions no longer drop non-numeric columns from corr() by default.
corr = train_df.select_dtypes(include='number').corr()
# Hide the upper triangle (including the diagonal) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(12, 12))

# Draw the annotated heatmap on the axes created above, with the mask and a square aspect ratio.
g = sns.heatmap(
    corr, mask=mask, annot=True, cmap="YlGnBu", vmax=.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax,
)

In [None]:
# Preview the first two rows of the training data after feature engineering.
train_df.head(2)

In [None]:
# Keep the test row ids for the submission file before the column is dropped.
test_row_id = test_df['row_id']

# Columns that are not passed to the models.
unused_columns = ['row_id', 'hour', 'month', 'time']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# Features of the test set (a copy, so test_df itself stays untouched).
X_test = test_df.copy()

# Split the training frame into target and predictors.
Y = train_df["congestion"]
X = train_df.drop(columns="congestion")

# Hold out 20% of the training rows as a validation set (fixed seed for reproducibility).
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=0
)

X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape, X_test.shape

# Model Training & evaluation

**XGBoost Model**

In [None]:
# eval_metric is set on the estimator itself: passing it to fit() is deprecated in
# recent XGBoost releases and no longer accepted by the newest ones.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    booster='gbtree',
    verbosity=1,
    eval_metric='mae',
)
xgb_model.fit(X_train, Y_train)

# score() is evaluated on the data the model was fitted on, so it says nothing about generalisation.
print("Performance on train data:", xgb_model.score(X_train, Y_train))

In [None]:
# XGBoost predictions for the hold-out validation set.
y_pred_v = xgb_model.predict(X_valid)

In [None]:
# Mean absolute error of the current validation predictions.
error = mae(Y_valid, y_pred_v)

print(f"Mean absolute error : {error}")

**Random Forest**

In [None]:
# max_features=1.0 considers all features at every split, which is what 'auto' meant for
# regressors; the 'auto' alias is deprecated and rejected by newer scikit-learn releases.
regr = RandomForestRegressor(
    random_state=0,
    bootstrap=True,
    max_depth=20,
    max_features=1.0,
    min_samples_leaf=1,
    n_estimators=200,
)
regr.fit(X_train, Y_train)

In [None]:
# Random Forest predictions for the validation set (replaces the XGBoost predictions stored under the same name).
y_pred_v = regr.predict(X_valid)

In [None]:
# Mean absolute error of the current validation predictions.
error = mae(Y_valid, y_pred_v)
print(f"Mean absolute error : {error}")

# Prediction

The Random Forest Regressor performs better (lower MAE) than XGBoost on the hold-out validation set. However, when I submitted predictions for the actual test data, XGBoost scored better, so I went ahead with XGBoost.

In [None]:
# Predict the test set with the XGBoost model, the model chosen for the submission
# (the cell previously used the Random Forest, contradicting that choice).
y_pred_x = xgb_model.predict(X_test)

In [None]:
# Display the test-set predictions.
y_pred_x

# Submission

In [None]:
# Create a DataFrame pairing each test row_id with its predicted congestion.
submission = pd.DataFrame({'row_id':test_row_id,'congestion':y_pred_x})

In [None]:
# Frequency of each predicted congestion value, ordered by value.
submission.congestion.value_counts(sort=False).sort_index()

In [None]:
# Round to the nearest integer before casting: a bare astype(int) truncates toward zero,
# which systematically lowers every prediction by its fractional part.
submission['congestion'] = submission['congestion'].round().astype(int)

In [None]:
# Preview the first five rows of the submission.
submission.head(5)

In [None]:
# Write the submission without the DataFrame index and report the file name.
filename = 'submission.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')