# Table of Contents
* [Import and First Glance](#import)
* [Feature Engineering](#feature_eng)
* [Features](#features)
* [Target](#target)
* [Time Series Plots](#time_series)
* [Target vs Features](#target_vs_features)

In [None]:
# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

<a id='import'></a>
# Import and First Glance

In [None]:
# read data
# All three CSV files are read from the same input directory.
DATA_DIR = '../input/tabular-playground-series-mar-2022'

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
t2 = time.perf_counter()
print('Elapsed time [s]:', np.round(t2-t1,4))

In [None]:
# preview of training data: display the first 10 rows
df_train.head(10)

In [None]:
# dimensions of training data as a (rows, columns) tuple
df_train.shape

In [None]:
# dimensions of test data as a (rows, columns) tuple
df_test.shape

In [None]:
# column names, dtypes, non-null counts and memory usage of the training data
df_train.info()

In [None]:
# column names, dtypes, non-null counts and memory usage of the test data
df_test.info()

<a id='feature_eng'></a>
# Feature Engineering

In [None]:
# parse the 'time' column of both frames into pandas datetimes
df_train['time'] = pd.to_datetime(df_train['time'])
df_test['time'] = pd.to_datetime(df_test['time'])

In [None]:
# extract time components
# Each name is both an attribute of the .dt accessor and the name of the new
# column; the same set is added to train and test, in the same order.
for _df in (df_train, df_test):
    for _part in ('year', 'month', 'day', 'hour', 'minute', 'weekday'):
        _df[_part] = getattr(_df['time'].dt, _part)

In [None]:
# join the x and y coordinates into a single 'x/y' string label per row
df_train['x_y'] = df_train['x'].map(str).str.cat(df_train['y'].map(str), sep='/')
df_test['x_y'] = df_test['x'].map(str).str.cat(df_test['y'].map(str), sep='/')

In [None]:
# numerical version of direction
# Build the category list once (from train) and apply it to both frames, so a
# given direction always maps to the same integer code in train and test.
# Inferring categories separately per frame would shift the codes whenever the
# two frames do not contain exactly the same set of directions.
# Categories are sorted, which matches what astype('category') infers on train.
# A direction that occurs only in test is encoded as -1.
direction_dtype = pd.CategoricalDtype(
    categories=sorted(df_train['direction'].dropna().unique())
)
df_train['direction_num'] = df_train['direction'].astype(direction_dtype).cat.codes
df_test['direction_num'] = df_test['direction'].astype(direction_dtype).cat.codes

In [None]:
# basic stats - training data
# `datetime_is_numeric` was removed in pandas 2.0, where datetime columns are
# always summarised numerically and passing the keyword raises TypeError.
# Try the older call first and fall back to the newer signature.
try:
    train_stats = df_train.describe(include='all', datetime_is_numeric=True)
except TypeError:
    train_stats = df_train.describe(include='all')
train_stats

In [None]:
# basic stats - test data
# `datetime_is_numeric` was removed in pandas 2.0, where datetime columns are
# always summarised numerically and passing the keyword raises TypeError.
# Try the older call first and fall back to the newer signature.
try:
    test_stats = df_test.describe(include='all', datetime_is_numeric=True)
except TypeError:
    test_stats = df_test.describe(include='all')
test_stats

### Observations:
* The test set covers only half of a single day: Monday, 30 September 1991, 12:00–23:40.
* The year is always 1991, so it carries no information and can be ignored.

<a id='features'></a>
# Features

In [None]:
# categorical features to explore ('year' is left out because it is constant)
features_cat = [
    'x',
    'y',
    'x_y',
    'direction',
    'month',
    'day',
    'hour',
    'minute',
    'weekday',
]

In [None]:
# value counts of every categorical feature, train (left) next to test (right)
for f in features_cat:
    fig, axes = plt.subplots(1, 2, figsize=(16,4), sharex=True)
    panels = [(df_train, 'blue', ' - train'), (df_test, 'green', ' - test')]
    for ax, (frame, bar_color, suffix) in zip(axes, panels):
        # sort by category value so both panels list categories in the same order
        counts = frame[f].value_counts().sort_index()
        ax.bar(x=counts.index, height=counts.values, color=bar_color)
        ax.set_title(f + suffix)
        ax.grid()

    plt.show()

### Look at distribution of locations (x,y):

In [None]:
# row counts per (y, x) grid cell, train and test side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5), sharex=True)
heatmap_style = dict(cmap='Blues', linewidths=1, linecolor='black', annot=True, fmt='d')

train_grid = pd.crosstab(df_train['y'], df_train['x'])
sns.heatmap(data=train_grid, ax=ax1, **heatmap_style)
ax1.set_title('Coordinate distribution - Train')

test_grid = pd.crosstab(df_test['y'], df_test['x'])
sns.heatmap(data=test_grid, ax=ax2, **heatmap_style)
ax2.set_title('Coordinate distribution - Test')
plt.show()

### Look at distribution of locations combined with direction:

In [None]:
# row counts per (location, direction) pair in the training data;
# a count of 0 means that combination never occurs
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(pd.crosstab(df_train['x_y'], df_train['direction']),
            cmap='Blues',
            linewidths=1, linecolor='black',
            annot=True, fmt='d',
            ax=ax)
# title the figure so it can be told apart from the matching test-set plot
ax.set_title('Location vs direction - Train')
plt.show()

#### Of the 12 × 8 = 96 potential location/direction combinations, only 65 actually occur in the training data. Let's check whether the same holds for the test set:

In [None]:
# row counts per (location, direction) pair in the test data;
# a count of 0 means that combination never occurs
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(pd.crosstab(df_test['x_y'], df_test['direction']),
            cmap='Blues',
            linewidths=1, linecolor='black',
            annot=True, fmt='d',
            ax=ax)
# title the figure so it can be told apart from the matching training-set plot
ax.set_title('Location vs direction - Test')
plt.show()

In [None]:
# location and direction joined into one 'x/y/direction' label, kept as a feature
df_train['x_y_d'] = df_train['x_y'].map(str).str.cat(df_train['direction'].map(str), sep='/')
df_test['x_y_d'] = df_test['x_y'].map(str).str.cat(df_test['direction'].map(str), sep='/')

<a id='target'></a>
# Target

In [None]:
# summary statistics of the target, with extra percentiles out to both tails
df_train['congestion'].describe(
    percentiles=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999]
)

In [None]:
# coarse histogram of the target (20 bins)
fig, ax = plt.subplots(figsize=(10,4))
df_train['congestion'].plot.hist(bins=20, color='blue', ax=ax)
ax.set_title('Target')
ax.grid()
plt.show()

In [None]:
# finer histogram of the target (100 bins)
fig, ax = plt.subplots(figsize=(10,4))
df_train['congestion'].plot.hist(bins=100, color='blue', ax=ax)
ax.set_title('Target')
ax.grid()
plt.show()

<a id='time_series'></a>
# Time Series Plots

In [None]:
# whole training history: one semi-transparent point per row
my_alpha = 0.1
fig, ax = plt.subplots(figsize=(16,4))
ax.scatter(x=df_train['time'], y=df_train['congestion'], color='blue', alpha=my_alpha)
# limit the number of x-axis ticks so the labels stay readable
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
plt.xticks(rotation=90)
ax.grid()
plt.show()

In [None]:
# zoom in on the last 5000 rows of the training data
my_alpha = 0.25
recent = df_train.iloc[-5000:]
fig, ax = plt.subplots(figsize=(16,4))
ax.scatter(x=recent['time'], y=recent['congestion'], color='blue', alpha=my_alpha)
# limit the number of x-axis ticks so the labels stay readable
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
plt.xticks(rotation=90)
ax.grid()
plt.show()

In [None]:
# same last-5000-row window, points coloured by the numeric direction code
my_alpha = 1
recent = df_train.iloc[-5000:]
fig, ax = plt.subplots(figsize=(16,4))
ax.scatter(x=recent['time'], y=recent['congestion'],
           c=recent['direction_num'], alpha=my_alpha)
# limit the number of x-axis ticks so the labels stay readable
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
plt.xticks(rotation=90)
ax.grid()
plt.show()

In [None]:
# zoom in further: only the last 500 rows of the training data
my_alpha = 0.25
recent = df_train.iloc[-500:]
fig, ax = plt.subplots(figsize=(16,4))
ax.scatter(x=recent['time'], y=recent['congestion'], color='blue', alpha=my_alpha)
# limit the number of x-axis ticks so the labels stay readable
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
plt.xticks(rotation=90)
ax.grid()
plt.show()

In [None]:
# same last-500-row window, points coloured by the numeric direction code
my_alpha = 1
recent = df_train.iloc[-500:]
fig, ax = plt.subplots(figsize=(16,4))
ax.scatter(x=recent['time'], y=recent['congestion'],
           c=recent['direction_num'], alpha=my_alpha)
# limit the number of x-axis ticks so the labels stay readable
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
plt.xticks(rotation=90)
ax.grid()
plt.show()

<a id='target_vs_features'></a>
# Target vs Features

In [None]:
# distribution of the target within each level of every categorical feature
for f in features_cat:
    fig, ax = plt.subplots(figsize=(12,4))
    sns.violinplot(x=f, y='congestion', data=df_train, ax=ax)
    ax.set_title('Target vs ' + f)
    ax.grid()
    plt.show()

### Explore mean target values:

In [None]:
# average congestion for each 'x/y' location
target_by_x_y = df_train.groupby('x_y')['congestion'].mean()
print(target_by_x_y)
# horizontal bar chart of those averages, one bar per location
fig, ax = plt.subplots(figsize=(12,8))
ax.barh(y=target_by_x_y.index, width=target_by_x_y.values)
ax.set_xlabel('congestion')
ax.set_title('mean target by location')
ax.grid()
plt.show()

In [None]:
# average congestion for each direction
target_by_d = df_train.groupby('direction')['congestion'].mean()
print(target_by_d)
# horizontal bar chart of those averages, one bar per direction
fig, ax = plt.subplots(figsize=(12,6))
ax.barh(y=target_by_d.index, width=target_by_d.values)
ax.set_xlabel('congestion')
ax.set_title('mean target by direction')
ax.grid()
plt.show()

In [None]:
# average congestion for each 'x/y/direction' combination
target_by_x_y_d = df_train.groupby('x_y_d')['congestion'].mean()
# horizontal bar chart of those averages; the figure is tall because there is
# one bar per combination
fig, ax = plt.subplots(figsize=(12,24))
ax.barh(y=target_by_x_y_d.index, width=target_by_x_y_d.values)
ax.set_xlabel('congestion')
ax.set_title('mean target by location+direction')
ax.grid()
plt.show()