In [None]:
# !pip install catboost

Hello. My name is Alexey.
I used several machine learning models in my work.
Best result: CatBoost Regressor, Kaggle MAE = 5.023.

# Traffic forecast in the metropolis

## Data preparation

For the March edition of the 2022 Tabular Playground Series you're challenged to forecast twelve-hours of traffic flow in a U.S. metropolis. The time series in this dataset are labelled with both location coordinates and a direction of travel -- a combination of features that will test your skill at spatio-temporal forecasting within a highly dynamic traffic network.

Which model will prevail? The venerable linear regression? The deservedly-popular ensemble of decision trees? Or maybe a cutting-edge graph neural-network? We can't wait to see!

In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from catboost import CatBoostRegressor

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Exploring Files

Files and Field Descriptions:  
`train.csv` - the training set, comprising measurements of traffic congestion across 65 roadways from April through September of 1991.  
- `row_id` - a unique identifier for this instance  
- `time` - the 20-minute period in which each measurement was taken  
- `x` - the east-west midpoint coordinate of the roadway  
- `y` - the north-south midpoint coordinate of the roadway  
- `direction` - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.  
- `congestion` - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.  

`test.csv` - the test set; you will make hourly predictions for roadways identified by a coordinate location and a direction of travel on the day of 1991-09-30.  

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
df_train = pd.read_csv(r"../input/tabular-playground-series-mar-2022/train.csv")

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
# Load the test set; the path is relative to the notebook's working directory.
df_test = pd.read_csv(r"../input/tabular-playground-series-mar-2022/test.csv")

In [None]:
df_test.head()

In [None]:
df_test.info()

### Feature engineering

In [None]:
def feature_engineering(data):
    """Derive calendar features from the ``time`` column.

    The ``time`` column is parsed to datetimes and three new columns are
    added: ``weekday``, ``hour`` and ``minute``.

    Note: the frame is modified IN PLACE; the same object is returned for
    convenience.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The very same ``data`` object, now with the extra columns.
    """
    stamps = pd.to_datetime(data['time'])
    data['time'] = stamps
    # Each name is both the new column and the ``.dt`` accessor attribute.
    for part in ('weekday', 'hour', 'minute'):
        data[part] = getattr(stamps.dt, part)
    return data

In [None]:
# Add the calendar features to the training frame.
# A single call is enough: iterating over a DataFrame yields its column
# labels, so a `for ... in df_train` loop would only repeat the very same
# whole-frame transformation once per column.
df_train = feature_engineering(df_train)

In [None]:
# Add the calendar features to the test frame.
# A single call is enough: iterating over a DataFrame yields its column
# labels, so a `for ... in df_test` loop would only repeat the very same
# whole-frame transformation once per column.
df_test = feature_engineering(df_test)

### Data preprocessing

In [None]:
# Remove the identifier and the raw timestamp (weekday/hour/minute were
# already extracted from it), then dummy-encode the remaining categorical
# columns; drop_first=True omits the first level of each.
df_train = pd.get_dummies(
    df_train.drop(columns=['row_id', 'time']),
    drop_first=True,
)

In [None]:
# Same preprocessing as for the training frame, except that `row_id` stays:
# it is needed later to build the submission file.
df_test = pd.get_dummies(df_test.drop(columns='time'), drop_first=True)

In [None]:
# Separate the target column from the predictors.
target_train = df_train['congestion']
features_train = df_train.drop(columns='congestion')

In [None]:
# Hold out 25% of the rows for validation.
# Features and target are split in ONE call so both are guaranteed to share
# the same row partition (two separate calls only stay aligned as long as
# the seed and the number of rows happen to match).
features_train, features_valid, target_train, target_valid = train_test_split(
    features_train, target_train, test_size=0.25, random_state=12345
)

# Test predictors: everything except the submission identifier.
features_test = df_test.drop(columns='row_id')

## Data analysis

### Train features

In [None]:
features_train.head()

In [None]:
features_valid.head()

In [None]:
features_test.head()

In [None]:
features_train['x'].value_counts()

In [None]:
features_train['y'].value_counts()

### Target feature

In [None]:
df_train['congestion'].value_counts()

In [None]:
# --- Histogram of the target with a KDE overlay ---
plt.figure(figsize=(15, 3))
sns.histplot(data=df_train, x='congestion', bins=30, kde=True, color='green')

plt.title('Distribution')
plt.xlabel('congestion')
plt.ylabel('Amount')
plt.grid(color='grey', axis='both', alpha=0.5)
plt.xticks(np.arange(0, 105, 10))
plt.yticks(np.arange(0, 75000, 5000))
plt.show()

# --- Horizontal boxplot of the same column ---
# The figure size is set through rcParams here, so it also becomes the
# default for figures created afterwards.
plt.rcParams['figure.figsize'] = (15, 3)
sns.boxplot(data=df_train['congestion'], orient='h')
plt.title('Boxplot')
plt.xlabel('congestion')
plt.grid(color='grey', axis='both', alpha=0.8)
plt.xticks(np.arange(0, 105, 10))
plt.show()

### Feature correlation 

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the
# preprocessed training frame; the colour scale is pinned to [-1, 1].
plt.figure(figsize=(16, 6))
correlations = df_train.corr()
heatmap = sns.heatmap(correlations, annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)

## Train and validate models

### Random forest regressor

In [None]:
# %%time


# rf = RandomForestRegressor(random_state=123)

# params = {'n_estimators': [100, 150, 250, 300],
#           'max_depth': [5, 7, 9, 15, 20],
#           'min_samples_leaf' : [1, 2],
#           'max_features':['auto'],
#           'min_samples_split': [2, 4],
#           }

# cv_rfc_model = GridSearchCV(estimator=rf, param_grid=params, n_jobs=-1, cv=5)

# cv_rfc_model.fit(features_train, target_train)

In [None]:
# cv_rfc_model.best_params_

In [None]:
# Random forest with fixed hyperparameters.
# NOTE(review): presumably these are the best_params_ of the commented-out
# grid search above -- confirm.
rfc_best_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)
rfc_best_model.fit(features_train, target_train)

In [None]:
# Random-forest predictions on the validation split.
# NOTE: the name `predictions_valid` is reused further down for the CatBoost
# predictions, so its content depends on which cell ran last.
predictions_valid = rfc_best_model.predict(features_valid)

In [None]:
# Validation MAE of whatever is currently stored in `predictions_valid`
# (the random forest when the notebook is run top to bottom).
mean_absolute_error(target_valid, predictions_valid)

### Extreme gradient boosting regressor

In [None]:
# %%time


# exg = xgb.XGBRegressor(verbosity=0, random_state=123)

# params = {'n_estimators': [100, 150, 200, 300]}

# cv_exg_model = GridSearchCV(estimator=exg, param_grid=params, n_jobs=10, cv=10)

# cv_exg_model.fit(features_train, target_train)

In [None]:
# cv_exg_model.best_params_

In [None]:
# predictions_valid_exg = cv_exg_model.predict(features_valid)

In [None]:
# mean_absolute_error(target_valid, predictions_valid_exg)

### Linear regression

In [None]:
# Linear-regression baseline with default settings, fitted on the training split.
lr_best_model = LinearRegression()
lr_best_model.fit(features_train, target_train)

In [None]:
# Linear-regression predictions on the validation split.
# NOTE: despite the `_exg` suffix (left over from the XGBoost section) these
# are the predictions of `lr_best_model`.
predictions_valid_exg = lr_best_model.predict(features_valid)

In [None]:
# Validation MAE of `predictions_valid_exg` (the linear-regression
# predictions when the notebook is run top to bottom).
mean_absolute_error(target_valid, predictions_valid_exg)

### Cat boost regressor

In [None]:
# CBR = CatBoostRegressor()

# parameters = {'verbose' : [25, 50, 100],
#               'early_stopping_rounds' : [5, 10],
#               'random_seed' : [1000, 2000],
#               'max_depth' : [5, 10 ,15],
#               'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05],
#               'iterations' :  [100, 250, 500],
#               'loss_function' : ['MAE'],
#               'eval_metric' : ['MAE'],
#              }

# Grid_CBC = GridSearchCV(estimator=CBR, param_grid=parameters, cv=5, n_jobs=-1)
# Grid_CBC.fit(features_train, target_train)

In [None]:
# Grid_CBC.best_params_

In [None]:
# CatBoost regressor trained and evaluated on MAE, with progress logged
# every 25 iterations.
# NOTE(review): presumably these are the best_params_ of the commented-out
# grid search above -- confirm.
# NOTE(review): no eval_set is passed to fit(), so early_stopping_rounds
# presumably has no effect here (the overfitting detector needs a validation
# set) -- confirm against the CatBoost documentation.
cbr_best_model = CatBoostRegressor(early_stopping_rounds=5, eval_metric='MAE', 
                                   iterations=500, learning_rate=0.05, loss_function='MAE', 
                                   max_depth=15, random_seed=2000, verbose=25
                                   )

cbr_best_model.fit(features_train, target_train)

In [None]:
# CatBoost predictions on the validation split.
# NOTE: this overwrites the random-forest predictions stored earlier under
# the same name `predictions_valid`.
predictions_valid = cbr_best_model.predict(features_valid)

In [None]:
# Validation MAE of whatever is currently stored in `predictions_valid`
# (CatBoost when the notebook is run top to bottom).
mean_absolute_error(target_valid, predictions_valid)

## Test of the best models

In [None]:
# Test predictors: every column of the test frame except the identifier.
features_test = df_test.drop(columns='row_id')

In [None]:
# Random-forest predictions for the test set, kept in their own variable.
# Writing them into df_test['congestion'] would be a dead store (the
# submission cell overwrites that column with the CatBoost predictions) and
# would make the cell that derives `features_test` from df_test pick up a
# stray `congestion` column if it were re-run.
rfc_test_predictions = rfc_best_model.predict(features_test)

### Random forest regressor prediction


In [None]:
# df_test['congestion'] = list(rfc_best_model.predict(features_test))
# df_test = df_test[['row_id', 'congestion']]
# df_test.to_csv(r"/kaggle/working/rfc_best_model.csv", index=False)
# df_test.head()

### Catboost regressor prediction

In [None]:
# Build the submission: attach the CatBoost predictions, keep only the
# identifier and the predicted target, write the CSV and preview it.
catboost_predictions = cbr_best_model.predict(features_test)
df_test['congestion'] = list(catboost_predictions)
df_test = df_test.loc[:, ['row_id', 'congestion']]
df_test.to_csv(r"/kaggle/working/submission.csv", index=False)
df_test.head()

## General conclusion

In [None]:
# Kaggle leaderboard MAE per submission (my local validation MAE in Colab in parentheses):
# MAE 1 score: 5.137 (Colab: 6.771626217607527) - XGBRegressor
# MAE 2 score: 5.096 (Colab: 6.728822236988457) - RandomForestRegressor
# MAE 3 score: 5.219 (Colab: 6.436506568386449) - XGBRegressor
# MAE 4 score: 5.198 (Colab: 6.312052246252551) - XGBRegressor
# MAE 5 score: 5.023 (Colab: 6.293241136731521) - CatBoostRegressor + GridSearch

**Results:**

1. `Cat boost regressor`:  
Mean absolute error = 6.293, Kaggle MAE = 5.023;

2. `Random forest regressor`:  
Mean absolute error = 6.336, Kaggle MAE = 5.096;

3. `Extreme gradient boosting regressor`:  
Mean absolute error = 6.436, Kaggle MAE = 5.198;

4. `Linear regression`:  
Mean absolute error = 12.166, Kaggle MAE = 14.456.