In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training and test tables and print each shape as a sanity check.
train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')
for frame in (train, test):
    print(frame.shape)
# Submission template; its 'congestion' column is filled in at the end.
sub = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')

In [None]:
# Show the first 65 rows of the training table.
train.iloc[:65]

- Every 20-minute snapshot contains 65 (x, y, direction) combinations

In [None]:
# Count the rows recorded at each timestamp, then list the distinct counts.
rows_per_timestamp = train.groupby('time')['row_id'].size()
rows_per_timestamp.unique()

- Within every 20-minute snapshot, each (x, y) location has 3, 4, 6 or 8 directions

In [None]:
# Rows per (time, x, y) group, then how often each group size occurs.
rows_per_location = train.groupby(['time', 'x', 'y'])['row_id'].size()
rows_per_location.value_counts()

In [None]:
# Parse the 'time' column into datetimes; the later cells use the .dt accessor
# and resample on it.
train['time'] = pd.to_datetime(train['time'])

In [None]:
# Minimum, maximum and mean congestion for every (x, y) location.
congestion_by_location = train.groupby(['x', 'y'])['congestion']
congestion_by_location.agg(['min', 'max', 'mean'])

In [None]:
# One congestion histogram per (x, y) location, laid out on a 4 x 3 grid.
fig = plt.figure(figsize=(15, 10))
for position, (location, group) in enumerate(train.groupby(['x', 'y']), start=1):
    panel = fig.add_subplot(4, 3, position)
    panel.hist(group['congestion'], bins=50, alpha=0.9)
    panel.set_title(location)

fig.suptitle('histogram of congestion')
fig.tight_layout()
plt.show()

In [None]:
# Monthly total congestion for every (x, y) location, on a 4 x 3 grid.
#
# The datetime index needed by resample() is built on each group's copy rather
# than in place on `train`. Setting it in place with drop=False leaves 'time'
# as both an index level and a column, and pandas then rejects
# groupby('time', ...) on `train` as ambiguous.
plt.figure(figsize=(15, 10))
for i, (combi, df) in enumerate(train.groupby(['x', 'y'])):
    ax = plt.subplot(4, 3, i + 1)
    rs = df.set_index('time')['congestion'].resample('MS').sum()
    # Bars are placed by month position (0, 1, 2, ...), not by date.
    ax.bar(range(len(rs)), rs)
    ax.set_title(combi)

plt.suptitle('Monthly time series of congestion')
plt.tight_layout()
plt.show()

In [None]:
# Daily mean congestion for every (x, y) location, on a 4 x 3 grid.
#
# The datetime index needed by resample() is built on each group's copy rather
# than in place on `train`. Setting it in place with drop=False leaves 'time'
# as both an index level and a column, and pandas then rejects
# groupby('time', ...) on `train` as ambiguous.
plt.figure(figsize=(15, 10))
for i, (combi, df) in enumerate(train.groupby(['x', 'y'])):
    ax = plt.subplot(4, 3, i + 1)
    rs = df.set_index('time')['congestion'].resample('D').mean()
    ax.plot(rs)
    ax.set_title(combi)

plt.suptitle('daily time series of congestion')
plt.tight_layout()
plt.show()

# Learn EDA from Ambrose 
https://www.kaggle.com/ambrosm/tpsmar22-eda-which-makes-sense

# Congestion and its special values

In [None]:
# Distinct congestion values that occur in the training data.
train['congestion'].unique()

In [None]:
# Histogram of all congestion values, using 100 bins.
train['congestion'].hist(bins=100)
plt.show()

In [None]:
# Value counts of congestion for the four roadways with x == 2.
# The congestion values listed below are drawn in red, all others in blue.
highlighted = [15, 20, 21, 29, 34]
plt.subplots(2, 2, sharex=True, sharey=True, figsize=(16, 12))
for y in range(4):
    plt.subplot(2, 2, y+1)
    roadway = train[(train.x == 2) & (train.y == y)]
    counts = roadway.congestion.value_counts().sort_index()
    bar_colors = ['r' if value in highlighted else 'b' for value in counts.index]
    plt.bar(counts.index, counts, width=1, color=bar_colors)
    plt.title(f"(x = {2}) & (y = {y})")
    plt.xlabel('Congestion')
    plt.ylabel('Count')
plt.show()

Obviously, congestions 15, 29 and 34 are a particularity of the roadway with x = 2 and y = 1. We'll drill down further, into the eight directions of this roadway, plotting all congestion values on a time axis, just to see that some of the directions have these special congestion values on every day of the six months of training data.

In [None]:
# Drill into roadway (x = 2, y = 1): for each direction, draw the value counts
# of congestion (left) and every congestion value over time (right).
# The congestion values listed below are drawn in red, all others in blue.
highlighted = [15, 20, 21, 29, 34]


def _point_colors(values):
    """Return 'r' for highlighted congestion values and 'b' otherwise."""
    return ['r' if con in highlighted else 'b' for con in values]


for direction in train.direction.unique():
    on_roadway = (train.x == 2) & (train.y == 1)
    temp = train[on_roadway & (train.direction == direction)]
    counts = temp.congestion.value_counts().sort_index()

    plt.subplots(1, 2, figsize=(18, 4))

    # Left panel: how often each congestion value occurs.
    plt.subplot(1, 2, 1)
    plt.bar(counts.index, counts, width=1, color=_point_colors(counts.index), alpha=0.6)
    plt.title(f"(x = {2}) & (y = {1}) & (direction = {direction})")
    plt.xlabel('Congestion')
    plt.ylabel('Count')

    # Right panel: every observation plotted against its timestamp.
    plt.subplot(1, 2, 2)
    plt.scatter(temp.time, temp.congestion, s=1, color=_point_colors(temp.congestion))
    plt.title(f"(x = {2}) & (y = {1}) & (direction = {direction})")
    plt.ylabel('Congestion')
    plt.show()


Insight: At this level of detail, the dataset no longer looks like something which a linear regression can fit. Decision trees may be the better choice for these data.

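- The timestamps are not continuous: the cell below lists every timestamp whose gap to the previous one differs from the first gap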

In [None]:
# Find the timestamps that follow an irregular gap.
unique_time = train.time.unique()
gaps = np.diff(unique_time)
# The first gap is taken as the regular step; keep the timestamps whose
# distance to their predecessor differs from it.
unique_time[1:][gaps != gaps[0]]

# Dependence on time and date

We start by looking for a weekly pattern. As was to be expected, there is less traffic on weekends.



In [None]:
# Mean congestion per day of the week (0 = Monday ... 6 = Sunday).
day_of_week = train.time.dt.dayofweek
weekday_mean = train.groupby(day_of_week).congestion.mean()
plt.figure(figsize=(18, 6))
plt.bar(weekday_mean.index, weekday_mean, color='b')
plt.xticks(ticks=weekday_mean.index, labels='MTWTFSS')
plt.title('Days of the week')
plt.show()

In [None]:
# Mean congestion per time of day, expressed as a fractional hour
# (e.g. 13:20 becomes 13.33).
fractional_hour = train.time.dt.hour + train.time.dt.minute/60
time_of_day_mean = train.groupby(fractional_hour).congestion.mean()
plt.figure(figsize=(18, 6))
plt.bar(time_of_day_mean.index, time_of_day_mean, color='b', width=0.34)
plt.xticks(range(24))
plt.title('Time of the day')
plt.ylabel('Congestion')
plt.xlabel('Time of day')
plt.show()

In [None]:
# Mean congestion per ISO week of the year.
# Series.dt.week is deprecated (removed in pandas 2.0); isocalendar().week is
# its replacement. The cast to int gives plain integer group keys for plotting.
week_of_year = train.time.dt.isocalendar().week.astype(int)
temp = train.groupby(week_of_year).congestion.mean()
plt.figure(figsize=(18, 6))
plt.title('Week of the year')
plt.bar(temp.index, temp, color='b')
# One tick per week present in the data (previously hour ticks 0-23, copied
# from the time-of-day plot).
plt.xticks(temp.index)
plt.xlabel('week of year')
plt.ylabel('Congestion')
plt.show()

- Drilling down we see that every road has its own daily pattern:



In [None]:
# Mean congestion per time of day for each of the four roadways with x == 2.
plt.subplots(2, 2, sharex=True, sharey=True, figsize=(16, 12))
for y in range(4):
    plt.subplot(2, 2, y+1)
    roadway = train[(train.x == 2) & (train.y == y)]
    # Time of day as a fractional hour (e.g. 13:20 becomes 13.33).
    fractional_hour = roadway.time.dt.hour + roadway.time.dt.minute/60
    profile = roadway.groupby(fractional_hour).congestion.mean()
    plt.bar(profile.index, profile, color='b', width=0.34)
    plt.xticks(range(24))
    plt.title(f"(x = {2}) & (y = {y})")
    plt.ylabel('Congestion')
    plt.xlabel('Time of day')
plt.show()

If we plot the daily values for all days of the summer of 1991, we see more fluctuations:

* The last week of April (the fifth week of the diagram) has exceptionally high traffic.
* May 27 (Memorial Day) is a long weekend with little traffic on Monday.
* The fourth of July (which was a Thursday) has exceptionally low traffic.
* Labor day (September 2, the first Monday of September) has exceptionally low traffic.
* The last two weekends in September look strange. Friday September 27 has very high traffic.
* The rightmost bar of the diagram (Monday September 30) is quite low. This can probably be explained because the training data contains only the morning of this day (the afternoon is the test data).


Overall, there seem to be weeks with high traffic and weeks with low traffic.
(I used this [calendar](https://www.timeanddate.com/calendar/?year=1991&country=1)  for looking up the holidays.)

Insight:

* We will have to deal with outliers. A simple approach is dropping all the holidays before training.
* If morning and afternoon of the same day are correlated, the morning of September 30 will play a special role in predicting the test afternoon.
* We may need to find suitable external data to explain the high and low traffic weeks.
* The worst case for our predictions will be if September 30 is a holiday or there is a big event which changes the traffic patterns.
* A good validation strategy will be important. Perhaps we can use a few Monday afternoons as validation set.

In [None]:
import datetime

# Holidays to annotate on the chart, as (label, month, day) in 1991.
holidays = [('Memorial Day', 5, 27), ('Fourth of July', 7, 4), ('Labor Day', 9, 2)]

# Mean congestion per calendar day, drawn as one bar per day.
daily_mean = train.groupby(train.time.dt.date).congestion.mean()
plt.figure(figsize=(18, 6))
plt.title('Daily congestion')
plt.bar(daily_mean.index, daily_mean, color='g')
plt.ylim(40, 52)
for holiday_name, month, day in holidays:
    date = datetime.date(1991, month, day)
    x_position = np.datetime64(date)
    bar_height = daily_mean.loc[date]
    # Arrow points at the top of the holiday's bar; the label sits 2 units lower.
    plt.annotate(holiday_name,
                 (x_position, bar_height),
                 xytext=(x_position, bar_height-2),
                 arrowprops={'arrowstyle': '->'},
                 weight='bold',
                 color='k')
plt.show()

# Trend

We can fit a linear regression to the daily averages to see whether there is any significant growth during the six months. The diagram shows that the congestion grows by less than 0.2 over the whole period. I cannot yet tell whether this growth is significant.

Insight:

Considering that the total growth amounts to 0.2 and we are predicting integers, we may neglect growth for the beginning.
Before we include the trend as a feature in a model, we should test its significance.

In [None]:
# Fit a straight line (degree-1 polynomial) to the daily mean congestion,
# using the day's position in the series as x, and plot the fitted line.
daily_mean = train.groupby(train.time.dt.date).congestion.mean()
day_number = range(len(daily_mean.index))
trend = np.polynomial.polynomial.Polynomial.fit(day_number, daily_mean, deg=1)
plt.figure(figsize=(18, 6))
plt.plot(daily_mean.index, trend(day_number))
plt.ylim(47, 48)
plt.title('Trend')
plt.show()

- 'NB' is busier than 'NW'

In [None]:
# Mean congestion per timestamp and direction, pivoted to one column per
# direction, then plotted for the 'NW' and 'NB' directions.
#
# reset_index(drop=True) discards whatever index `train` carries. If 'time'
# has been set as the index while also remaining a column, grouping by the
# label 'time' is ambiguous and pandas raises ValueError; with a plain index
# the label refers unambiguously to the column.
tmp = (
    train.reset_index(drop=True)
    .groupby(['time', 'direction'])['congestion']
    .mean()
    .unstack()
)
tmp[['NW', 'NB']].plot(figsize=(20, 9))
plt.show()

# Forecast with yesterday's congestion at the same time

In [None]:
# Select the rows from the 29th of a month numbered 9 or later, from 12:00
# onwards; these serve as the "yesterday" values for the forecast.
timestamps = train['time']
in_month_9_or_later = timestamps.dt.month >= 9
on_day_29 = timestamps.dt.day == 29
from_noon = timestamps.dt.hour >= 12
yesterday = train[in_month_9_or_later & on_day_29 & from_noon]
yesterday

In [None]:
# Copy yesterday's congestion values into the submission by position.
# NOTE(review): this assumes `yesterday` has the same number of rows as `sub`
# and in the same order - confirm against the test set.
sub['congestion'] = yesterday['congestion'].to_numpy()
sub

In [None]:
# Write the submission to disk without the DataFrame index.
sub.to_csv('submission.csv', index = False)