In [39]:
# measure execution time
%load_ext autotime

# disable warnings
import warnings
warnings.filterwarnings('ignore')

time: 0 ns (started: 2023-07-14 16:48:36 +03:00)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mlp
import seaborn as sns

In [2]:
# read data and preprocess them
data = pd.read_csv('data.csv')
data.rename(columns={'Unnamed: 0': 'Time Column'}, inplace=True)
data['Time Column'] = pd.to_datetime(data['Time Column'])
data.sort_values(by=['Path','Time Column'],inplace=True)
data.reset_index(drop=True,inplace=True)

In [3]:
# convert path column from categorical to numerical
from sklearn.preprocessing import LabelEncoder

# create an instance of LabelEncoder
label_encoder = LabelEncoder()

# fit label encoder on the 'Path' column
data['#Path'] = label_encoder.fit_transform(data['Path'])

In [94]:
data.head()

Unnamed: 0,Time Column,Taxi ID,Traj ID,Path,Length,Traffic Flow,temp,humidity,windspeed,sealevelpressure,visibility,conditions_Clear,conditions_Overcast,conditions_Partially cloudy,#Path
0,2008-05-18 00:00:00,279,254,"[100400941, 100400941, 100400941, 100400941, 1...",11,1,12.4,87.29,16.1,1017.7,16.0,0,0,1,0
1,2008-05-18 00:15:00,279,254,"[100400941, 100400941, 100400941, 100400941, 1...",11,3,12.4,87.29,16.1,1017.7,16.0,0,0,1,0
2,2008-05-18 00:30:00,279,254,"[100400941, 100400941, 100400941, 100400941, 1...",11,1,12.4,87.29,16.1,1017.7,16.0,0,0,1,0
3,2008-05-18 00:45:00,279,254,"[100400941, 100400941, 100400941, 100400941, 1...",11,0,12.4,87.29,16.1,1017.7,16.0,0,0,1,0
4,2008-05-18 01:00:00,279,254,"[100400941, 100400941, 100400941, 100400941, 1...",11,2,12.0,89.79,16.2,1017.7,16.0,0,0,1,0


time: 31 ms (started: 2023-07-14 19:16:48 +03:00)


In [5]:
import ast
import networkx as nx

In [21]:
# Extract unique paths from the dataset
unique_paths = data['Path'].unique()

# Create a directed graph using NetworkX
graph = nx.DiGraph()

# Add nodes to the graph for each unique path
for path in unique_paths:
    node_id = data.loc[data['Path']==path,'#Path'].iloc[0]
    graph.add_node(node_id)
    
# Connect nodes in the graph based on the last and first ID values
for path1 in unique_paths:
    node_ids1 = eval(path1)
    last_id1 = node_ids1[-1]
    first_id1 = node_ids1[0]
    for path2 in unique_paths:
        if path1 == path2:
            continue
            
        node_ids2 = eval(path2)
        first_id2 = node_ids2[0]
        last_id2 = node_ids2[-1]
        
        if last_id1 == first_id2:
            ed1 = data.loc[data['Path'] == path1,'#Path'].iloc[0]
            ed2 = data.loc[data['Path'] == path2,'#Path'].iloc[0]
            graph.add_edge(ed1,ed2)

In [25]:
# Build the adjacency matrix
adjacency_matrix = nx.adjacency_matrix(graph)

# Convert the adjacency matrix to a NumPy array
adjacency_matrix = np.array(adjacency_matrix)

# Print the adjacency matrix
print(adjacency_matrix)

  (0, 740)	1
  (0, 741)	1
  (1, 119)	1
  (1, 120)	1
  (1, 121)	1
  (1, 122)	1
  (1, 123)	1
  (2, 119)	1
  (2, 120)	1
  (2, 121)	1
  (2, 122)	1
  (2, 123)	1
  (3, 119)	1
  (3, 120)	1
  (3, 121)	1
  (3, 122)	1
  (3, 123)	1
  (6, 712)	1
  (8, 965)	1
  (8, 966)	1
  (9, 875)	1
  (9, 876)	1
  (10, 959)	1
  (13, 965)	1
  (13, 966)	1
  :	:
  (943, 370)	1
  (943, 371)	1
  (944, 945)	1
  (946, 173)	1
  (947, 725)	1
  (949, 545)	1
  (957, 340)	1
  (962, 576)	1
  (964, 887)	1
  (966, 967)	1
  (969, 970)	1
  (970, 868)	1
  (975, 811)	1
  (978, 255)	1
  (979, 478)	1
  (979, 479)	1
  (980, 255)	1
  (981, 255)	1
  (982, 959)	1
  (984, 940)	1
  (988, 680)	1
  (989, 681)	1
  (990, 680)	1
  (992, 69)	1
  (993, 179)	1


In [36]:
# define train data
train = data.loc[data['Time Column'] < '2008-05-24']

# define test data
test = data.loc[data['Time Column'] >= '2008-05-24']

# print the results
print("Train set shape: ",train.shape)
print("Test set shape: ",test.shape)

Train set shape:  (576000, 15)
Test set shape:  (96000, 15)


In [57]:
def define_timestamps_for_sliding_window(dataset):
    '''
    This function slides over the timestamps and creates
    the time information contained in each window. The time
    information slides one step ahead each time
    
    Return a list with 5 timestamps that each window will have.
    '''
    
    # sort values by Time Column and Path
    dataset.sort_values(by=['Path','Time Column'])
    
    result = [] # list in which 5-window timestamps will be added
    unique_timestamps = dataset['Time Column'].unique()
    
    for i in range(len(unique_timestamps)-4):
        
        # define list to save timestamps for current window
        current_window_timestamps = []
        
        # append five consequtive timestamps
        current_window_timestamps.append(unique_timestamps[i])
        current_window_timestamps.append(unique_timestamps[i+1])
        current_window_timestamps.append(unique_timestamps[i+2])
        current_window_timestamps.append(unique_timestamps[i+3])
        current_window_timestamps.append(unique_timestamps[i+4])
        
        result.append(current_window_timestamps)
    
    return result

time: 0 ns (started: 2023-07-14 17:19:17 +03:00)


In [58]:
timestamps_train = define_timestamps_for_sliding_window(train)
timestamps_test = define_timestamps_for_sliding_window(test)

time: 484 ms (started: 2023-07-14 17:19:50 +03:00)


In [120]:
def create_inputs_and_output(dataset,window_timestamps):
    '''
    This function is used to create the input and the output to the ML algorithm.
    The input is a graph with node features of traffic flow. The output is the 
    future traffic flow in each node.
    '''
    graphs = [] # list of graphs (one per window)
    label_vectors = [] # list of output vector (one per window)
    
    for time in window_timestamps:
        #### CREATE FEATURE GRAPH ####

        # select only the data at current timestep
        needed_data = dataset[(dataset['Time Column']>= time[0]) & (dataset['Time Column']<= time[3])]
        needed_data = needed_data.sort_values(by=['Path','Time Column'])

        features_for_graph = []
        for i in range (1000):
            features = np.array(needed_data[needed_data['#Path'] == i]['Traffic Flow'])
            features_for_graph.append(features)

        for node in graph.nodes():
            # update the feature attribute of each node
            graph.nodes[node]['features'] = features_for_graph[i]

        #### CREATE LABEL OUTPUT VECTOR ####

        # select only the data at current timestep
        needed_data = dataset[(dataset['Time Column'] == time[4])]
        needed_data = needed_data.sort_values(by=['Path','Time Column'])

        label_vector = []
        for i in range (1000):
            output = np.array(needed[needed['#Path'] == i]['Traffic Flow'])
            label_vector.append(output)

        graphs.append(graph)
        label_vectors.append(label_vector)
        break
    return graphs, label_vectors

time: 16 ms (started: 2023-07-14 19:26:17 +03:00)


In [121]:
input,output = create_inputs_and_output(train,timestamps_train)

time: 1.09 s (started: 2023-07-14 19:26:18 +03:00)


In [None]:
X_train,y_train = create_inputs_and_output(train,timestamps_train)
X_test,y_test = create_inputs_and_output(test,timestamps_test)

In [124]:
c=0
for item in timestamps_train:
    c = c+1
    print(item)

[numpy.datetime64('2008-05-18T00:00:00.000000000'), numpy.datetime64('2008-05-18T00:15:00.000000000'), numpy.datetime64('2008-05-18T00:30:00.000000000'), numpy.datetime64('2008-05-18T00:45:00.000000000'), numpy.datetime64('2008-05-18T01:00:00.000000000')]
[numpy.datetime64('2008-05-18T00:15:00.000000000'), numpy.datetime64('2008-05-18T00:30:00.000000000'), numpy.datetime64('2008-05-18T00:45:00.000000000'), numpy.datetime64('2008-05-18T01:00:00.000000000'), numpy.datetime64('2008-05-18T01:15:00.000000000')]
[numpy.datetime64('2008-05-18T00:30:00.000000000'), numpy.datetime64('2008-05-18T00:45:00.000000000'), numpy.datetime64('2008-05-18T01:00:00.000000000'), numpy.datetime64('2008-05-18T01:15:00.000000000'), numpy.datetime64('2008-05-18T01:30:00.000000000')]
[numpy.datetime64('2008-05-18T00:45:00.000000000'), numpy.datetime64('2008-05-18T01:00:00.000000000'), numpy.datetime64('2008-05-18T01:15:00.000000000'), numpy.datetime64('2008-05-18T01:30:00.000000000'), numpy.datetime64('2008-05-1

In [123]:
len(timestamps_train)

572

time: 15 ms (started: 2023-07-14 19:27:15 +03:00)


In [125]:
c

572

time: 15 ms (started: 2023-07-14 19:27:45 +03:00)


In [1]:
nvidia-smi

NameError: name 'nvidia' is not defined

In [2]:
subprocess

NameError: name 'subprocess' is not defined