In [None]:
# Importing useful packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))


In [None]:
# Read the competition's train and test CSV files into DataFrames
train_csv = r'../input/tabular-playground-series-mar-2022/train.csv'
test_csv = r'../input/tabular-playground-series-mar-2022/test.csv'
train, test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

### First overview of the datasets

In [None]:
# Preview the first rows of the training data as loaded from the CSV
train.head()

In [None]:
# Report the (rows, columns) size of both frames on two lines
n_train_rows, n_train_cols = train.shape
n_test_rows, n_test_cols = test.shape
print(
    f'Train dataset is composed by {n_train_rows} rows and {n_train_cols} columns, \n'
    f'while test dataset is composed by {n_test_rows} rows and {n_test_cols} columns'
)

We can immediately notice that the test set has very few rows: 2.3k vs 848k in the train set.

We have to decide how to treat this problem later.

Each feature is described in the competition overview:
- `row_id` - a unique identifier for this instance
- `time` - the 20-minute period in which each measurement was taken
- `x` - the east-west midpoint coordinate of the roadway
- `y` - the north-south midpoint coordinate of the roadway
- `direction` - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.
- `congestion` - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.

I think it is appropriate to drop the `row_id` column.

In [None]:
# Remove the `row_id` identifier column from both frames
# (reassignment instead of inplace mutation)
train = train.drop(columns='row_id')
test = test.drop(columns='row_id')

If `time` column is already a timestamp it's a good idea to use it as the dataset index. 

In [None]:
# Show column dtypes and non-null counts, e.g. to see whether `time` is already a datetime
train.info()

In [None]:
# Parse `time` into datetimes and use it as the index of the training frame.
# `unit` is intentionally not passed: it only describes numeric epoch values
# (and already defaults to 'ns'), while for timestamp strings it is either
# ignored or rejected depending on the pandas version.
# The guard keeps the cell re-runnable: once `time` is the index it is no
# longer a column, so the conversion is skipped.
if 'time' in train.columns:
    train = train.assign(time=pd.to_datetime(train['time'])).set_index('time')
train.head()

### Exploratory Data Analysis

**Target column**

The cardinality of `congestion` column:

In [None]:
# Label the target as continuous when it has more than 25 distinct values
# (missing values, if any, count as one distinct value), discrete otherwise
n_distinct = train['congestion'].nunique(dropna=False)
message = (
    'Congestion column is a continuous numerical feature'
    if n_distinct > 25
    else 'Congestion column is a discrete numerical feature'
)
print(message)

Let's see its distribution:

In [None]:
# Histogram of the target column
congestion_distribution = sns.histplot(data=train, x='congestion')
congestion_distribution.set(title='Target column distribution')
plt.show()

- It seems to follow a normal distribution, apart from three values on the left side that are more frequent than the others
- Congestion values are integers from 0 to 100

What about a boxplot to see the outliers?

In [None]:
# Horizontal boxplot of the target column to inspect outliers
sns.boxplot(data=train, x='congestion')

There are few outliers, and they are not much larger than the boxplot's upper whisker limit ($Q_3 + 1.5 \cdot IQR$).

**Relationship between predictors and target column**

In [None]:
# Average congestion for each x coordinate.
# Bar positions and heights are both taken from the same grouped Series:
# `unique()` returns values in order of appearance while `groupby` sorts by
# key, so mixing the two can attach a mean to the wrong coordinate.
avg_by_x = train.groupby('x')['congestion'].mean()
plt.bar(x=avg_by_x.index.to_numpy(), height=avg_by_x.to_numpy(), width=0.5)
plt.title('Congestion avg by x')
plt.show()


In [None]:
predictors = ['x', 'y', 'direction']
for col in predictors:
    # Take labels and heights from the same grouped Series so that each bar
    # shows the mean of its own category. `unique()` yields order of
    # appearance, whereas `groupby` sorts by key, so the two do not
    # necessarily line up (e.g. for the string-valued `direction`).
    avg_congestion = train.groupby(col)['congestion'].mean()
    plt.bar(x=avg_congestion.index.to_numpy(),
            height=avg_congestion.to_numpy(),
            width=0.5)
    plt.title(f'Congestion avg by {col}')
    plt.xlabel(col)
    plt.ylabel('Avg congestion')
    plt.show()
    

Each of the three columns seems to be meaningful for predicting the traffic congestion value.

**Predictors**

Distribution of the predictors in the train set:

In [None]:
for col in predictors:
    # `value_counts` already sorts by descending frequency; use its own index
    # for the labels so every proportion stays attached to its category
    # (pairing the sorted heights with `unique()` mislabels the bars).
    proportions = train[col].value_counts(normalize=True)
    plt.bar(x=proportions.index.to_numpy(),
            height=proportions.to_numpy(),
            width=0.5)
    plt.title(f'{col} proportion in the train set')
    plt.xlabel(col)
    plt.ylabel('Proportion')
    plt.show()

**Time series visualization**

Is the congestion influenced by the date?

First of all, let's extract from the date some columns that can be helpful.

In [None]:
# Derive year, month number and weekday name columns from the datetime index
timestamps = train.index
train = train.assign(
    year=timestamps.year,
    month=timestamps.month,
    weekday=timestamps.day_name(),
)
train.head()

For which years do we have the data?

In [None]:
# List the distinct years present in the training data
train['year'].unique()

We have traffic data for the year 1991 only, so this field won't help us predict the test congestion values.

How does the traffic congestion behave through the months?

In [None]:
# Average congestion per month.
# The means are kept in month order and plotted against their own index;
# sorting the heights while taking labels from `unique()` would assign the
# largest mean to whichever month happens to appear first in the data.
avg_by_month = train.groupby('month')['congestion'].mean()
plt.figure(figsize=(10, 8))
plt.bar(x=avg_by_month.index.to_numpy(), height=avg_by_month.to_numpy())
plt.xlabel('Month')
plt.ylabel('Avg congestion')
plt.title('Avg congestion through the months')
plt.show()

It seems that even month column is not very useful to predict the congestion values.

In [None]:
# Average congestion per day of the week, shown in calendar order.
# Labels and heights come from the same Series; sorting the heights
# independently of the labels would always draw a decreasing chart
# regardless of the real per-day means.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
avg_by_weekday = (
    train.groupby('weekday')['congestion']
    .mean()
    .reindex(weekday_order)  # groupby sorts names alphabetically
    .dropna()                # skip days that do not occur in the data
)
plt.figure(figsize=(10, 8))
plt.bar(x=avg_by_weekday.index.to_numpy(), height=avg_by_weekday.to_numpy())
plt.xlabel('Day of the week')
plt.ylabel('Avg congestion')
plt.title('Avg congestion by day of the week')
plt.show()

We can see that the average congestion value is stable from monday to thursday, then it gets a little lower every day from friday to sunday.

Let's go deeper with the respective boxplots.

In [None]:
# One congestion boxplot per month
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=train, x='month', y='congestion', ax=ax)
ax.set_title('Congestion distribution through months')

In [None]:
# One congestion boxplot per day of the week.
# The title now names weekdays (it previously repeated the months title) and
# the box order is pinned to calendar order instead of depending on the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(10, 7))
sns.boxplot(
    x=train['weekday'], y=train['congestion'], order=weekday_order
).set_title('Congestion distribution through weekdays')

Boxplots don't tell us much more than previous bar plots.

In [None]:
# List the distinct x coordinate values (used to pick the lines plotted below)
train['x'].unique()

Let's see if we can extract some insights if we add predictors values.

In [None]:
# Mean congestion for every (x, month) pair, drawn as one line per x value
df = train.groupby(['x', 'month'])['congestion'].mean().reset_index()
fig = plt.figure(figsize=(10, 7))

for x_value in (0, 1, 2):
    rows = df.loc[df['x'] == x_value]
    plt.plot(rows['month'], rows['congestion'], label=str(x_value))

plt.title('Avg Monthly Congestion Trend by coordinate x')
plt.ylabel('Average Congestion')
plt.xlabel('Month')
plt.legend()
plt.show()

- This trend is coherent with the bar plot we did earlier
- Months seemed not to discriminate traffic congestion column earlier, but combined with the coordinate x they can add value to our model in the model building section

Let's see if it's the same with other predictors!

In [None]:
# Mean congestion for every (y, month) pair, drawn as one line per y value
df = train.groupby(['y', 'month'])['congestion'].mean().reset_index()
fig = plt.figure(figsize=(10, 7))

for y_value in (0, 1, 2, 3):
    rows = df.loc[df['y'] == y_value]
    plt.plot(rows['month'], rows['congestion'], label=str(y_value))

plt.title('Avg Monthly Congestion Trend by coordinate y')
plt.ylabel('Average Congestion')
plt.xlabel('Month')
plt.legend()
plt.show()

- Months combined with y coordinate can add much value to our model too!

In [None]:
# List the distinct direction codes (used to pick the lines plotted below)
train['direction'].unique()

In [None]:
# Mean congestion for every (direction, month) pair, one line per direction
df = train.groupby(['direction', 'month'])['congestion'].mean().reset_index()
fig = plt.figure(figsize=(10, 7))

# Same directions and drawing order as the legend has always shown
for heading in ('EB', 'NB', 'SB', 'WB', 'NE', 'SW', 'NW', 'SE'):
    rows = df.loc[df['direction'] == heading]
    plt.plot(rows['month'], rows['congestion'], label=heading)

plt.title('Avg Monthly Congestion Trend by direction')
plt.ylabel('Average Congestion')
plt.xlabel('Month')
plt.legend()
plt.show()

Trends are pretty flat, but there is some value that a model can extract.

Let's repeat the same things for weekdays. I will skip barplots because avg trends embody their insight.

In [None]:
# Mean congestion for every (x, weekday) pair, drawn as one line per x value.
# `groupby` sorts weekday names alphabetically (Friday, Monday, Saturday, ...),
# which scrambles the x axis of a "trend" plot, so every line is re-ordered to
# calendar order before plotting.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
df = train.groupby(['x', 'weekday']).congestion.mean().reset_index()
fig = plt.figure(figsize=(10, 7))

for x_value in (0, 1, 2):
    daily_avg = (
        df.loc[df['x'] == x_value]
        .set_index('weekday')['congestion']
        .reindex(weekday_order)  # missing days become NaN (gap in the line)
    )
    plt.plot(weekday_order, daily_avg.to_numpy(), label=str(x_value))

plt.title('Avg Daily Congestion Trend by coordinate x')
plt.ylabel('Average Congestion')
plt.xlabel('Weekday')
plt.legend()
plt.show()

In [None]:
# Mean congestion for every (y, weekday) pair, drawn as one line per y value.
# `groupby` sorts weekday names alphabetically (Friday, Monday, Saturday, ...),
# which scrambles the x axis of a "trend" plot, so every line is re-ordered to
# calendar order before plotting.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
df = train.groupby(['y', 'weekday']).congestion.mean().reset_index()
fig = plt.figure(figsize=(10, 7))

for y_value in (0, 1, 2, 3):
    daily_avg = (
        df.loc[df['y'] == y_value]
        .set_index('weekday')['congestion']
        .reindex(weekday_order)  # missing days become NaN (gap in the line)
    )
    plt.plot(weekday_order, daily_avg.to_numpy(), label=str(y_value))

plt.title('Avg Daily Congestion Trend by coordinate y')
plt.ylabel('Average Congestion')
plt.xlabel('Weekday')
plt.legend()
plt.show()

In [None]:
# Mean congestion for every (direction, weekday) pair, one line per direction.
# `groupby` sorts weekday names alphabetically (Friday, Monday, Saturday, ...),
# which scrambles the x axis of a "trend" plot, so every line is re-ordered to
# calendar order before plotting.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
df = train.groupby(['direction', 'weekday']).congestion.mean().reset_index()
fig = plt.figure(figsize=(10, 7))

# Same directions and legend order as before
for heading in ('EB', 'NB', 'SB', 'WB', 'NE', 'SW', 'NW', 'SE'):
    daily_avg = (
        df.loc[df['direction'] == heading]
        .set_index('weekday')['congestion']
        .reindex(weekday_order)  # missing days become NaN (gap in the line)
    )
    plt.plot(weekday_order, daily_avg.to_numpy(), label=heading)

plt.title('Avg Daily Congestion Trend by direction')
plt.ylabel('Average Congestion')
plt.xlabel('Weekday')
plt.legend()
plt.show()

What about traffic congestion through daily hours?

In [None]:
# Average congestion for each hour of the day.
# Hours and means are read from the same grouped Series so they are aligned
# by construction; `index.hour.unique()` is in order of appearance and only
# matches the sorted groupby output if the data happens to start at hour 0.
avg_by_hour = train.groupby(train.index.hour)['congestion'].mean()
plt.plot(avg_by_hour.index.to_numpy(), avg_by_hour.to_numpy())
plt.title('Traffic congestion through daily hours')
plt.xlabel('Hour')
plt.ylabel('Congestion')
plt.show()

### Data preprocessing and modelling coming soon

If you leave a meaningful comment I'll appreciate it! Keep in touch