In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt

## Version history

* v.1: 
    * Started
* v.2: 
    * Changed graph edges to arcs so as to distinguish between roads in opposite directions
    * Added a new section on correlations between roadways
* v.3: 
    * Added assertions to check that all time series for roadways have the same time points

# Introduction

This short notebook has the following objectives.

1. It produces a simple visualization of the network of roadways. It may be helpful in graph modeling or obtaining insights about the interactions of different roadways.
2. It takes a casual look at how congestion data correlate between different roadways.

In [None]:
# Load the training table; the `time` column is parsed into datetimes while reading.
train_csv_path = '../input/tabular-playground-series-mar-2022/train.csv'
train_data = pd.read_csv(train_csv_path, parse_dates=['time'])

In [None]:
# Show the loaded table (the cell's last expression is rendered as its output).
train_data

In [None]:
# One row per distinct roadway, identified by its (x, y) location and direction.
roadway_key_columns = ['x', 'y', 'direction']
roadways_df = train_data.loc[:, roadway_key_columns].drop_duplicates()
# Number of distinct roadways.
roadways_df.shape[0]

# A geographical graph

An obvious graph is a graph connecting different roadways based on their geographical locations. In this graph, the endpoints (not the midpoints) of roadways are the nodes of the graph. We assume that the endpoints coincide if they meet. This is an assumption because the problem description only gives the midpoints but does not provide details on how long each roadway is.

In [None]:
# Half-unit displacement for each direction letter; 'B' (as in 'EB', 'NB', ...)
# contributes nothing, so two-letter codes can simply be summed letter by letter.
base_vectors = {
    'E': np.array([0.5,0]),
    'S': np.array([0,-0.5]),
    'W': np.array([-0.5,0]),
    'N': np.array([0,0.5]),
    'B': np.zeros((2,)),
}

def get_vector(direction):
    """Return the 2-D displacement for a two-letter direction code.

    The result is the sum of the base vectors of the first and second
    characters of `direction` (e.g. 'NE' -> [0.5, 0.5], 'EB' -> [0.5, 0]).
    A character that is not a key of `base_vectors` raises KeyError.
    """
    first_letter = direction[0]
    second_letter = direction[1]
    return base_vectors[first_letter] + base_vectors[second_letter]

In [None]:
import networkx as nx

def create_graph(roadways_df):
    """Build a directed graph with one edge per roadway.

    Parameters
    ----------
    roadways_df : pandas.DataFrame
        Each row must hold exactly three values, in the order
        (x, y, direction); (x, y) is used as the center of the roadway.

    Returns
    -------
    networkx.DiGraph
        Nodes are 2-tuples of coordinates. For every row an edge runs from
        ``center - get_vector(direction)`` to ``center + get_vector(direction)``
        and carries the attribute ``id=(x, y, direction)``.
    """
    G = nx.DiGraph()
    # itertuples yields plain (x, y, direction) tuples directly, avoiding the
    # construction of a Series for every row that positional .iloc[i] incurs.
    for x, y, direction in roadways_df.itertuples(index=False, name=None):
        center = np.array([x,y])
        vector = get_vector(direction)
        G.add_edge(tuple(center-vector),tuple(center+vector),id=(x,y,direction))
    return G

In [None]:
# Build the directed roadway graph: one edge per row of roadways_df.
G = create_graph(roadways_df)

We render the roadways as (curved) arcs so as to distinguish roadways traveling in opposite directions. The actual roadway should pass through the center (red dot). 

In [None]:
plt.figure(figsize=(10,10))
# Nodes are keyed by their own coordinates, so every node is its own position.
node_positions = {node: node for node in G.nodes}
nx.draw_networkx(
    G,
    pos=node_positions,
    with_labels=False,
    node_size=0,
    connectionstyle='arc3,rad=0.15',
    edge_color='b',
    width=3,
    arrowstyle='->',
    arrowsize=15,
)
# Mark every grid point (x in 0..2, y in 0..3) with a red dot and label it.
grid_points = [(gx, gy) for gx in range(3) for gy in range(4)]
for gx, gy in grid_points:
    plt.plot(gx, gy, 'ro')
    plt.annotate(f'({gx},{gy})', xy=(gx, gy), xytext=(gx+0.05, gy+0.05))

# Correlations between roadways

Next, we'll investigate whether and how the congestion numbers correlate between roadways.

In [None]:
def get_time_series(X,x,y,direction):
    """Return the time-ordered congestion series of a single roadway.

    Keeps the rows of `X` whose `x`, `y` and `direction` columns equal the
    given values and returns their `time` and `congestion` columns sorted by
    `time`. The original row index labels are preserved.
    """
    is_roadway = (X['x'] == x) & (X['y'] == y) & (X['direction'] == direction)
    roadway_rows = X.loc[is_roadway, ['time', 'congestion']]
    return roadway_rows.sort_values(by=['time'])

In [None]:
# For every roadway (edge of G, in G.edges order) collect its congestion
# values, its time stamps and its (x, y, direction) identifier.
congestion_array = []
road_id = []
time_array = []
for e in G.edges:
    x,y,direction = G[e[0]][e[1]]['id']
    ts = get_time_series(train_data,x,y,direction)
    congestion_array.append(ts.congestion.to_numpy())
    time_array.append(ts.time.to_numpy())
    road_id.append((x,y,direction))

# Check that the time series for each roadway have the same time points.
# np.array_equal also returns False when the lengths differ, so a mismatch
# always surfaces as this AssertionError rather than as a broadcasting error
# from an element-wise `==` between arrays of different shapes.
for i, times in enumerate(time_array):
    assert np.array_equal(time_array[0], times), (
        f'time points of roadway {road_id[i]} differ from those of roadway {road_id[0]}')

# One row per roadway, one column per time point.
congestion_array = np.array(congestion_array)

We note in passing that while the time series for the roadways all have the same time points, 81 time points are missing from the full sampling range (at a 20-minute frequency).

In [None]:
# Number of time points missing from the data: size of the complete 20-minute
# grid over the sampling range minus the number of time points actually present.
# 'min' is used for the frequency because the 'T' alias is deprecated in recent pandas.
expected_times = pd.date_range(start=np.datetime64('1991-04-01 00:00:00'), end=np.datetime64('1991-09-30 11:40:00'), freq='20min')
len(expected_times) - len(time_array[0])

## Correlation matrix

In [None]:
# Correlation coefficients between the rows of congestion_array (one row per roadway).
corr_m = np.corrcoef(congestion_array)

plt.figure(figsize=(8,8))
# fignum=0 asks matshow to reuse the current figure instead of opening a new one.
plt.matshow(corr_m, fignum=0, interpolation='none')
plt.colorbar()
plt.show()

We see that there is some correlation but it is not strong. Let's pick a couple of roadways (one with more positive correlations and one with more negative correlations) to try to visualize if nearby roadways are correlated.

In [None]:
def visualize_roadway_correlation(i, ax=None):
    """Draw the roadway graph with edges colored by their correlation with roadway `i`.

    Parameters
    ----------
    i : int
        Row of `corr_m` to visualize; entry ``corr_m[i, k]`` colors the k-th
        edge of `G` (in ``G.edges`` order).
    ax : matplotlib.axes.Axes, optional
        Axes to draw into. Defaults to the current Axes, which matches the
        behavior of calling this function without the argument.
    """
    if ax is None:
        ax = plt.gca()
    colors = corr_m[i,:]
    # Use one normalization and one colormap for both the edges and the
    # colorbar so that the colorbar is guaranteed to describe the edge colors.
    norm = mpl.colors.Normalize(colors.min(),colors.max())
    cmap = plt.get_cmap()
    # The ScalarMappable is not attached to any artist, so the Axes to take
    # space from must be passed explicitly; without `ax` recent Matplotlib
    # versions raise a ValueError here.
    ax.figure.colorbar(mpl.cm.ScalarMappable(norm, cmap), ax=ax)
    nx.draw_networkx(G,pos={x:x for x in G.nodes},with_labels=False, node_size=0,
                     connectionstyle='arc3,rad=0.15',edge_color=colors,
                     edge_cmap=cmap,edge_vmin=norm.vmin,edge_vmax=norm.vmax,
                     arrowsize=15,arrowstyle='->',width=3,ax=ax)

In [None]:
# Indices (into road_id / corr_m) of the two roadways chosen for a closer look.
road_index_0 = 35
road_index_1 = 53

fig = plt.figure(figsize=(16,8))
# One panel per focused roadway, side by side.
for panel, road_index in enumerate((road_index_0, road_index_1), start=1):
    fig.add_subplot(1, 2, panel)
    visualize_roadway_correlation(road_index)
    plt.title('x={0}, y={1}, direction={2}'.format(*road_id[road_index]))
plt.show()

In each figure, the yellow roadway is the one being focused on. It is not obvious that roadways directly connected to the focused roadway are more strongly (positively or negatively) correlated. It seems that defining a neighborhood based on geographical closeness may not be useful, although the correlation between roadways may still be exploited to improve predictions.