![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

# Overview

In this competition, you'll forecast twelve hours of traffic flow in a major U.S. metropolitan area. Time, space, and directional features give you the chance to model interactions across a network of roadways.

## Files and Field Descriptions
* `train.csv` - the training set, comprising measurements of traffic congestion across 65 roadways from April through September of 1991.
    * `row_id` - a unique identifier for this instance
    * `time` - the 20-minute period in which each measurement was taken
    * `x` - the east-west midpoint coordinate of the roadway
    * `y` - the north-south midpoint coordinate of the roadway
    * `direction` - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.
    * `congestion` - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.


* `test.csv` - the test set; you will make hourly predictions for roadways identified by a coordinate location and a direction of travel on the day of 1991-09-30.

* `sample_submission.csv` - a sample submission file in the correct format


# Setup

In [None]:
import numpy as np
import pandas as pd
import datetime

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm

from pandas import DatetimeIndex
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

import xgboost as xgb

In [None]:
# seaborn
# sns.set() resets matplotlib's rcParams (font sizes included), so it has to
# run before the explicit matplotlib overrides below or they are discarded.
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")

# matplotlib
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [None]:
# Input files (Kaggle dataset directory) and output files.
INPUT_DIR = '../input/tabular-playground-series-mar-2022'
TRAIN_PATH = f'{INPUT_DIR}/train.csv'
TEST_PATH = f'{INPUT_DIR}/test.csv'
SUBMISSION_FILE = 'submission.csv'
SIMPEL_SUBMISSION_FILE = 'simple-submission.csv'

# Settings for the train/validation split.
RANDOM_STATE = 2022
TEST_SIZE = 0.1

# Column names that are referenced throughout the notebook.
ID = 'row_id'
TARGET = 'congestion'
TIME = 'time'

# Explore data

## Read data

In [None]:
def load_data(fname):
    """Read a competition CSV into a typed DataFrame indexed by ``row_id``.

    Parameters
    ----------
    fname : str or path-like
        CSV file containing at least the ``row_id``, ``time``, ``x``, ``y``
        and ``direction`` columns.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ID`` with ``time`` parsed as datetime,
        ``direction`` as a categorical and ``x`` / ``y`` as ``int8``.
    """
    # parse_dates=True only asks pandas to parse the *index*, so the time
    # column has to be named explicitly to be parsed while reading.
    df = pd.read_csv(fname, parse_dates=[TIME], index_col=ID)
    return df.astype({'direction': 'category', 'x': np.int8, 'y': np.int8})

In [None]:
# Load the training set; the bare name on the last line displays the frame.
train_data = load_data(TRAIN_PATH)
train_data

* There are 848,835 rows in train data.
* There are 6 columns in train data: 
    * `row_id` - integer (index column),
    * `time` - datetime, starts with `1991-04-01 00:00:00` and ends with `1991-09-30 11:40:00`.
    * `x` - integer, range: 0, 1, 2
    * `y` - integer, range 0, 1, 2, 3
    * `direction` - categorical, the direction of travel (e.g. `EB`, `SW`)
    * `congestion` - integer, range 0 - 100

In [None]:
# Load the test set with the same loader; the bare name displays the frame.
test_data = load_data(TEST_PATH)
test_data

* There are 2340 rows in test data.
* Same columns as in the train dataset without the target `congestion`.
* The `time` feature starts with `1991-09-30 12:00:00` and ends with `1991-09-30 23:40:00`.

# Exploratory data analysis (EDA)

## Distribution of `direction`

In [None]:
# Side-by-side bar charts of how often each travel direction occurs.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 5))

direction_panels = [
    (ax1, train_data, 'Distribution of direction (train set)'),
    (ax2, test_data, 'Distribution of direction (test set)'),
]
for panel_ax, frame, title in direction_panels:
    sns.countplot(x='direction', data=frame, palette='Blues_r', ax=panel_ax)
    panel_ax.set_title(title)

plt.show()

## Distribution of `x` and `y` features

In [None]:
# 2x2 grid: one row per coordinate (x, y), one column per data set.
fig, axis = plt.subplots(2, 2, figsize=(12, 6))

datasets = [(train_data, 'train set'), (test_data, 'test set')]
for row, coord in enumerate(['x', 'y']):
    for col, (frame, label) in enumerate(datasets):
        sns.countplot(x=coord, data=frame, palette='Blues_r', ax=axis[row, col])
        axis[row, col].set_title(f'Distribution of {coord} ({label})')

fig.tight_layout()
# plt.show() (as in every other cell) instead of fig.show(), which is meant
# for GUI backends and only produces a warning with the inline backend.
plt.show()

## Target `congestion` histogram

In [None]:
# Left: congestion over the whole training set. Right: Mondays only.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

monday_rows = train_data[train_data[TIME].dt.day_name() == 'Monday']
histogram_panels = [
    (ax1, train_data, 'Histogram congestion'),
    (ax2, monday_rows, 'Histogram congestion of all Mondays'),
]

for panel_ax, frame, title in histogram_panels:
    sns.histplot(data=frame, x=TARGET, bins=50, kde=True, ax=panel_ax)
    panel_ax.set_title(title, fontsize=18)
    panel_ax.set_xlabel('Congestion', fontsize=16)
    panel_ax.set_ylabel('Count', fontsize=16)

plt.show()

## Histograms of each roadway

In [None]:
def roadway_has_data(data, x, y, direction):
    """Return the number of rows in ``data`` recorded for one roadway.

    A roadway is identified by its ``x`` / ``y`` coordinates and its
    ``direction``. The count is 0 (falsy) when the roadway has no rows,
    so the result can be used directly as a filter predicate.
    """
    same_x = data['x'] == x
    same_y = data['y'] == y
    same_direction = data['direction'] == direction
    return data.loc[same_x & same_y & same_direction].shape[0]

def data_by_roadway(data, x, y, direction):
    """Return the rows of ``data`` that belong to one roadway.

    The roadway is selected by matching the ``x``, ``y`` and ``direction``
    columns; index and columns of the result are those of ``data``.
    """
    at_location = (data['x'] == x) & (data['y'] == y)
    in_direction = data['direction'] == direction
    return data.loc[at_location & in_direction]

In [None]:
# Distinct values of each roadway attribute found in the training data.
xs = train_data['x'].unique()
ys = train_data['y'].unique()
dirs = train_data['direction'].unique()

# Keep only the (x, y, direction) combinations that actually have rows.
roadways = [
    (x, y, d)
    for x in xs
    for y in ys
    for d in dirs
    if roadway_has_data(train_data, x, y, d)
]

In [None]:
# One congestion histogram per roadway, laid out on a 9x8 grid.
fig, axis = plt.subplots(nrows=9, ncols=8, figsize=(30, 30))
panels = axis.flatten()

for (x, y, d), ax in zip(roadways, panels):
    sns.histplot(
        data=data_by_roadway(train_data, x, y, d),
        x=TARGET,
        bins=40,
        kde=True,
        ax=ax)

    ax.set_title(f'x={x}, y={y}, direction={d}', fontsize=12)
    ax.set_xlabel('')
    ax.set_ylabel('')

# The grid can hold more panels than there are roadways; hide the leftovers
# so they do not show up as empty frames.
for unused in panels[len(roadways):]:
    unused.set_axis_off()

plt.tight_layout()
plt.show()

## Boxplot grouped by `direction`

In [None]:
# Spread of congestion for every travel direction.
fig, ax = plt.subplots(figsize=(15, 7))

sns.boxplot(
    x='direction',
    y='congestion',
    data=train_data,
    palette='Blues_r',
    ax=ax)

ax.set_xlabel('Direction', fontsize=16)
ax.set_ylabel('Congestion', fontsize=16)
ax.set_title('Boxplot grouped by direction', fontsize=18)

plt.tight_layout()
plt.show()

## Boxplot grouped by `x` and `y`

In [None]:
# Spread of congestion per x coordinate (left) and per y coordinate (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

for panel_ax, coord, title in [
    (ax1, 'x', 'Boxplot grouped by x'),
    (ax2, 'y', 'Boxplot grouped by y'),
]:
    sns.boxplot(data=train_data, x=coord, y='congestion', palette='Blues_r', ax=panel_ax)
    panel_ax.set_title(title, fontsize=18)
    panel_ax.set_ylabel('Congestion', fontsize=16)

plt.tight_layout()
plt.show()

## Mean Roadway-Congestion relationship

In [None]:
def plot_direction(data, x, y, ax, linewidth=8):
    """Draw the roadways leaving location ``(x, y)``, coloured by mean congestion.

    For every direction that has rows at the location, a line segment is
    drawn from ``(x, y)`` a quarter unit towards that direction. Its colour
    is taken from the ``Blues`` colormap according to the mean congestion.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``x``, ``y``, ``direction`` and ``congestion`` columns.
    x, y : int
        Coordinates of the location to draw.
    ax : matplotlib.axes.Axes
        Axes to draw on; its axis decorations are switched off.
    linewidth : int, default 8
        Width of the drawn segments.
    """
    # Unit step (dx, dy) for every direction code.
    map_direction = {
        'EB': [1, 0],
        'NB': [0, 1],
        'SB': [0, -1],
        'WB': [-1, 0],
        'NE': [1, 1],
        'SW': [-1, -1],
        'NW': [-1, 1],
        'SE': [1, -1]
    }

    # NOTE(review): 72 is presumably an upper bound of the mean congestion
    # per roadway - confirm against the data.
    normalize = mcolors.Normalize(vmin=0, vmax=72)

    at_location = (data['x'] == x) & (data['y'] == y)
    # observed=True: `direction` may be categorical; without it the groupby
    # also returns a NaN row for every direction absent at this location.
    mean_congestion = (
        data.loc[at_location, ['direction', 'congestion']]
        .groupby('direction', observed=True)['congestion']
        .mean()
    )

    for d, c in mean_congestion.items():
        xx, yy = map_direction[d]

        ax.plot(
            [x, x + (xx / 4)],
            [y, y + (yy / 4)],
            linewidth=linewidth,
            color=cm.Blues(normalize(c)))

    ax.set_title(f'({x}, {y})')
    ax.set_axis_off()

In [None]:
def plot_roadways(data, ax, linewidth=8):
    """Draw every location of the 3x4 roadway grid on its own axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed on to ``plot_direction``.
    ax : array of matplotlib.axes.Axes
        Grid of axes as returned by ``plt.subplots`` (4 rows, 3 columns);
        it is filled row by row. For backward compatibility a single Axes
        is still accepted, in which case the module-level ``axis`` grid is
        used as before.
    linewidth : int, default 8
        Width of the drawn segments.
    """
    axes = np.asarray(ax, dtype=object)
    if axes.ndim == 0:
        # Legacy call with a single Axes: the function used to ignore its
        # argument and read the global `axis` grid instead.
        axes = np.asarray(axis, dtype=object)

    # The top row of the figure shows the largest y so the layout reads like
    # a map: y runs from 3 down to 0, x from 0 to 2 within each row.
    locations = [(x, y) for y in range(3, -1, -1) for x in range(0, 3)]

    for (x, y), cell in zip(locations, axes.flatten()):
        plot_direction(data, x, y, ax=cell, linewidth=linewidth)

In [None]:
# One panel per (x, y) location: rows follow y, columns follow x.
fig, axis = plt.subplots(nrows=len(ys), ncols=len(xs), figsize=(8, 8))
# Pass the grid created above; `ax` would be a stale Axes left over from an
# earlier cell.
plot_roadways(train_data, axis, linewidth=10)

plt.tight_layout()
plt.show()

## Mean hourly congestion per weekday

In [None]:
day_names = train_data[TIME].dt.day_name()
time_of_day = train_data[TIME].dt.time
week_days = day_names.unique()

fig, ax = plt.subplots(figsize=(20, 7))

# One line per weekday. Only the target column is averaged: taking the mean
# of the whole frame fails on non-numeric columns with recent pandas.
for wd in week_days:
    on_day = day_names == wd
    (train_data.loc[on_day, TARGET]
        .groupby(time_of_day[on_day])
        .mean()
        .plot(ax=ax, label=wd))

# Overall mean across all days, for reference.
(train_data[TARGET]
    .groupby(time_of_day)
    .mean()
    .plot(ax=ax, marker='o', ls='--', c='r', label='Mean'))

ax.set_title('Mean hourly congestion per weekday', fontsize=18)
ax.set_xlabel('Hour', fontsize=16)
ax.set_ylabel('Congestion', fontsize=16)
ax.legend()

plt.show()

## Average congestion on Mondays

In [None]:
def roadway_time_series(data, x, y, d):
    """Return the rows of one roadway, indexed by their ``time`` column.

    The roadway is selected by matching ``x``, ``y`` and the direction ``d``.
    """
    on_roadway = (data['direction'] == d) & (data['y'] == y) & (data['x'] == x)
    roadway_rows = data.loc[on_roadway]
    return roadway_rows.set_index('time')

In [None]:
# Average Monday congestion over the day, one panel per roadway.
fig, axis = plt.subplots(nrows=17, ncols=4, figsize=(25, 70))
panels = axis.flatten()

for (x, y, d), ax in zip(roadways, panels):
    ts = roadway_time_series(train_data, x, y, d)
    ts = ts[ts.index.day_name() == 'Monday']
    # Average the target only: taking the mean of the whole frame fails on
    # non-numeric columns with recent pandas.
    ts = ts[TARGET].groupby(ts.index.time).mean()

    ts.plot(ax=ax);
    ax.axhline(50, color='r', ls='--')

    ax.set_title(f'x={x}, y={y}, direction={d}', fontsize=12)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_ylim(0, 100)
    ax.legend(['Congestion', 'Busy'])

# The grid can hold more panels than there are roadways; hide the leftovers
# so they do not show up as empty frames.
for unused in panels[len(roadways):]:
    unused.set_axis_off()

plt.tight_layout()
plt.show()

# Simple model

In [None]:
def simple_model(train_data, test):
    """Predict congestion as the Monday average of the same roadway and time of day.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame with ``time``, ``x``, ``y``, ``direction`` and the target
        column; only its Monday rows are used.
    test : pandas.DataFrame
        Rows to predict, with ``time``, ``x``, ``y`` and ``direction``.

    Returns
    -------
    pandas.DataFrame
        One float ``TARGET`` column, indexed like ``test`` (index named ``ID``).

    Raises
    ------
    KeyError
        If a test row has no matching roadway / time of day among the
        Monday rows of ``train_data``.
    """
    keys = ['x', 'y', 'direction']
    time_of_day = 'time_of_day'
    join_keys = keys + [time_of_day]

    # Gather the average congestion of all Mondays in a single groupby. The
    # roadways are taken from `train_data` itself rather than from a
    # notebook-level list.
    is_monday = train_data[TIME].dt.day_name() == 'Monday'
    mondays = train_data.loc[is_monday, keys + [TIME, TARGET]]
    mondays = mondays.assign(**{time_of_day: mondays[TIME].dt.time})
    avg_mondays = (
        mondays
        .groupby(join_keys, observed=True)[TARGET]
        .mean()
        .reset_index()
    )

    # Look up the prediction for every test row with one left join; a left
    # join keeps the row order of `test`.
    lookup = test[keys].assign(**{time_of_day: test[TIME].dt.time})
    matched = lookup.merge(avg_mondays, how='left', on=join_keys, indicator=True)

    if (matched['_merge'] != 'both').any():
        raise KeyError('no Monday average available for some test rows')

    congestion = matched[TARGET].to_numpy(dtype=np.float64)
    return pd.DataFrame({TARGET: congestion}, index=pd.Index(test.index, name=ID))

In [None]:
# Hold out a random share of the Monday rows for validation.
monday_mask = train_data[TIME].dt.day_name() == 'Monday'
data = train_data[monday_mask]

input_columns = ['time', 'x', 'y', 'direction']
X_train, X_val, y_train, y_val = train_test_split(
    data[input_columns],
    data[TARGET],
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE)

In [None]:
# Build the Monday averages without the validation rows. Otherwise the
# averages contain the very values being predicted and the RMSE is too
# optimistic.
y_pred = simple_model(train_data.drop(index=X_val.index), X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred[TARGET]))

# Predicted vs. true congestion on the validation rows.
ax = pd.DataFrame({
    'true': y_val,
    'pred': y_pred[TARGET]
}).plot(x='true', y='pred', kind='scatter', figsize=(8, 8), alpha=0.4)

ax.set_title(f'RMSE: {rmse:.4f}', fontsize=16)
plt.show()

# Submission

In [None]:
# Baseline prediction for the test set; the bare name displays the frame.
pred_simple = simple_model(train_data, test_data)
pred_simple

In [None]:
# Save the baseline prediction as its own submission file (row_id index and
# congestion column).
pred_simple.to_csv(SIMPEL_SUBMISSION_FILE)

# Feature Engineering

In [None]:
def create_time_features(data):
    """Add ``hour``, ``minute`` and ``is_afternoon`` columns derived from ``TIME``.

    The columns are added to ``data`` in place; the same frame is returned
    for convenience.
    """
    hour = data[TIME].dt.hour

    data['hour'] = hour
    data['minute'] = data[TIME].dt.minute
    # NOTE(review): the strict `> 12` leaves the rows from 12:00 to 12:59
    # at 0 - confirm this is intended.
    data['is_afternoon'] = (hour > 12).astype('int')
    return data

def _monday_congestion_stat(data, stat):
    """Aggregate the target and ``is_afternoon`` of all Monday rows per roadway and time slot."""
    keys = ['x', 'y', 'direction', 'hour', 'minute']

    ts = data[data[TIME].dt.day_name() == 'Monday'].copy()
    ts = create_time_features(ts)
    # Aggregate explicit columns only: aggregating the whole frame fails on
    # (or drags along) the datetime column with recent pandas.
    # observed=True skips key combinations that never occur when
    # `direction` is categorical.
    return ts.groupby(by=keys, observed=True)[[TARGET, 'is_afternoon']].agg(stat)


def create_mean_congestion(data):
    """Mean Monday congestion per (x, y, direction, hour, minute).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``time``, ``x``, ``y``, ``direction`` and the target
        column. It is now used directly rather than the notebook-level
        ``train_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the five keys, with the ``TARGET`` and ``is_afternoon``
        columns holding the group means.
    """
    return _monday_congestion_stat(data, 'mean')


def create_std_congestion(data):
    """Standard deviation of Monday congestion per (x, y, direction, hour, minute).

    Takes the same input and returns the same layout as
    ``create_mean_congestion``, with standard deviations instead of means.
    """
    return _monday_congestion_stat(data, 'std')

In [None]:
# Columns of the engineered feature frame; ID becomes its index.
FEATURES = [
    ID, 
    'x', 
    'y', 
    'direction', 
    'hour', 
    'minute', 
    'mean_congestion', 
    'is_afternoon',
    'std_congestion'
]

def create_feature(data):
    """Build the model features for ``data``.

    Every row gets its ``hour`` and ``minute`` plus the mean and standard
    deviation of the Monday congestion (computed from the notebook-level
    ``train_data``) for the same roadway and time slot.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``time``, ``x``, ``y`` and ``direction`` columns. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ID`` with the remaining ``FEATURES`` as columns.
    """
    keys = ['x', 'y', 'direction', 'hour', 'minute']

    # Name the statistic columns up front so the result does not depend on
    # merge suffixes or on column positions.
    mean_df = (
        create_mean_congestion(train_data)[[TARGET, 'is_afternoon']]
        .rename(columns={TARGET: 'mean_congestion'})
        .reset_index()
    )
    std_df = (
        create_std_congestion(train_data)[[TARGET]]
        .rename(columns={TARGET: 'std_congestion'})
        .reset_index()
    )

    # assign() returns a new frame, so the caller's data is left untouched.
    ts = (
        data
        .assign(hour=data[TIME].dt.hour, minute=data[TIME].dt.minute)
        .rename_axis(ID)
        .reset_index()
        .merge(mean_df, how='left', on=keys)
        .merge(std_df, how='left', on=keys)
    )

    return ts[FEATURES].set_index(ID)

In [None]:
# Preview the engineered features of the test set; the result is displayed, not stored.
create_feature(test_data)

# Model building

## Splitting data

In [None]:
# Train/validation split on the Monday rows only.
data = train_data.loc[train_data[TIME].dt.day_name() == 'Monday']

model_inputs = data[['time', 'x', 'y', 'direction']]
model_target = data[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    model_inputs,
    model_target,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE)

## Pipeline

In [None]:
# Step 1: turn the raw (time, x, y, direction) rows into model features.
feature_engineering = Pipeline(steps=[('time', FunctionTransformer(create_feature))])

# Step 2: scale the numeric features and one-hot encode the categorical ones.
numeric_columns = ['mean_congestion', 'std_congestion', 'hour', 'minute']
categorical_columns = ['direction', 'x', 'y', 'is_afternoon']

num_transform = Pipeline(steps=[('scaler', StandardScaler())])
cat_transform = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_transform, numeric_columns),
    ("cat", cat_transform, categorical_columns),
])

# Step 3: XGBoost regressor using the linear booster.
xgb_model = xgb.XGBRegressor(
    booster='gblinear',
    n_estimators=200,
    gpu_id=0)

model = Pipeline(steps=[
    ('feature', feature_engineering),
    ('prep', preprocessor),
    ('model', xgb_model),
])

In [None]:
# Fit on the training split, then predict the validation split.
model.fit(X_train, y_train)
prediction = model.predict(X_val)

# Attach the validation row ids to the raw predictions.
y_pred = pd.DataFrame(
    {'pred': prediction},
    index=pd.Index(X_val.index, name=ID))

In [None]:
# Validation error of the pipeline, with predicted vs. true congestion.
predicted = y_pred['pred']
rmse = np.sqrt(mean_squared_error(y_val, predicted))

comparison = pd.DataFrame({'true': y_val, 'pred': predicted})
ax = comparison.plot(
    kind='scatter',
    x='true',
    y='pred',
    figsize=(8, 8),
    alpha=0.4)

ax.set_title(f'RMSE: {rmse:.4f}', fontsize=16)
plt.show()

In [None]:
# Distribution of the predicted congestion on the validation split.
ax = y_pred['pred'].plot.hist(bins=60, figsize=(8, 5))
ax.set_title('Prediction Distribution')

plt.show()

# Submission

In [None]:
# Weight of the pipeline prediction in the blend; the Monday-average
# baseline gets the remaining 1 - p.
p = 0.4
blend = p*model.predict(test_data) + (1-p)*pred_simple[TARGET]

# Congestion is normalized to 0..100. The linear model can leave that range,
# and an unclipped value above 127 would wrap around in the int8 cast below.
pred = np.floor(blend).clip(0, 100)

submission_data = pd.DataFrame({
    ID: test_data.index,
    TARGET: pred.astype(np.int8),
}).set_index(ID)

submission_data

In [None]:
# Save the blended prediction as the final submission file (row_id index and
# congestion column).
submission_data.to_csv(SUBMISSION_FILE)

Thank you for reading.