In [1]:
#@title Convert XML file to CSV file (SKIP THIS CELL)

import xml.etree.ElementTree as ET
import csv

# Attributes copied from every <vehicle> element, in CSV column order.
# Driving the row from this list (instead of one variable per attribute) keeps
# names such as `id` and `type` from being rebound in the notebook's global
# namespace, where they would shadow the Python built-ins for all later cells.
vehicle_fields = ["id", "x", "y", "angle", "type", "speed", "pos", "lane", "slope"]

tree = ET.parse('OpenStreetMap Trace for a Sparse Traffic.xml')  # replace with your file path
root = tree.getroot()

# Write one CSV row per (timestep, vehicle) pair.
with open('sparse.csv', 'w', newline='') as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["time"] + vehicle_fields)

    for timestep in root.findall('timestep'):
        timestamp = timestep.get('time')
        for vehicle in timestep.findall('vehicle'):
            # Element.get returns None for a missing attribute, which the csv
            # writer emits as an empty field.
            writer.writerow([timestamp] + [vehicle.get(field) for field in vehicle_fields])

In [3]:
#@title Read the input data (Decompress sparse.zip if you haven't already) - (SKIP THIS CELL IF YOU ARE WORKING WITH preprocessed_sparse.csv)

import pandas as pd

# from google.colab import drive
# drive.mount('/content/drive')

# Load the vehicle trace from the CSV written by the XML-to-CSV conversion cell.
data = pd.read_csv('sparse.csv')
# data = pd.read_csv('/content/drive/MyDrive/2024-05- 7088CEM (ANN Module)/Assignments/handover_prediction/sparse.csv')

# A notebook cell only renders its final expression, so the preview has to be
# displayed explicitly; `display` is provided by the IPython/Colab kernel.
display(data.head())
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1483385 entries, 0 to 1483384
Data columns (total 10 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   time    1483385 non-null  float64
 1   id      1483385 non-null  object 
 2   x       1483385 non-null  float64
 3   y       1483385 non-null  float64
 4   angle   1483385 non-null  float64
 5   type    1483385 non-null  object 
 6   speed   1483385 non-null  float64
 7   pos     1483385 non-null  float64
 8   lane    1483385 non-null  object 
 9   slope   1483385 non-null  float64
dtypes: float64(7), object(3)
memory usage: 113.2+ MB


Unnamed: 0,time,x,y,angle,speed,pos,slope
count,1483385.0,1483385.0,1483385.0,1483385.0,1483385.0,1483385.0,1483385.0
mean,3154.547,542.5987,1616.521,162.4481,2.809118,55.10633,0.0
std,1609.442,162.9608,208.9379,107.3253,5.907708,80.76256,0.0
min,0.0,2.14,820.68,0.0,0.0,0.0,0.0
25%,2000.0,461.12,1559.0,90.28,0.0,6.8,0.0
50%,3121.0,478.07,1683.82,178.02,0.0,22.32,0.0
75%,4001.0,569.9,1697.19,270.96,0.4,59.22,0.0
max,8219.0,1420.58,2212.71,360.0,37.38,417.1,0.0


In [4]:
#@title Preprocess data (getting rid of unnecessary columns) (SKIP THIS CELL IF YOU ARE WORKING WITH preprocessed_sparse.csv (You can decompress the zip file with the same name))
# Convert the 'time' column to float and then to integer
data['time'] = data['time'].astype(float).astype(int)

# Remove 'veh' from 'id' column and convert to integer.
# Going through astype(str) keeps the cell re-runnable: after the first run the
# column is already integer and would no longer support the .str accessor.
data['id'] = data['id'].astype(str).str.replace('veh', '', regex=False).astype(int)

# Remove specified columns. errors='ignore' lets the cell be executed again
# after the columns have already been dropped.
columns_to_remove = ['type', 'lane', 'slope', 'pos']
data = data.drop(columns=columns_to_remove, errors='ignore')

# Show the processed data. Only a cell's final expression is rendered
# automatically, so the tables are passed to `display` (provided by the
# IPython/Colab kernel) explicitly.
display(data.head(20))
data.info()
display(data.describe())

# Save the preprocessed DataFrame to a CSV file for future use
# output_path = '/content/drive/MyDrive/2024-05-7088CEM (ANN Module)/Assignments/handover_prediction/preprocessed_sparse.csv'
output_path = 'preprocessed_sparse.csv'
data.to_csv(output_path, index=False)

print(f"Preprocessed data saved to {output_path}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1483385 entries, 0 to 1483384
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   time    1483385 non-null  int64  
 1   id      1483385 non-null  int64  
 2   x       1483385 non-null  float64
 3   y       1483385 non-null  float64
 4   angle   1483385 non-null  float64
 5   speed   1483385 non-null  float64
dtypes: float64(4), int64(2)
memory usage: 67.9 MB
Preprocessed data saved to preprocessed_sparse.csv


In [5]:
#@title Preprocessing: Preparing train and test datasets (Save sequenced data) (YOU CAN SKIP THIS BY DECOMPRESSING sequences_vehicle_ids.zip)
import pickle

# Number of consecutive rows (time steps) of one vehicle that make up a single window.
sequence_length = 5

def create_sequences(data, sequence_length):
    """Build sliding-window samples from every vehicle's time-ordered trace.

    Rows are grouped by 'id' and each group is sorted by 'time'. For a group of
    n rows, windows start at offsets 0 .. n - sequence_length - 1, so a group
    with at most `sequence_length` rows contributes nothing.

    Args:
        data: DataFrame with the columns 'id', 'time', 'x', 'y', 'speed' and 'angle'.
        sequence_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        A pair (sequences, vehicle_ids) of equally long lists. Each entry of
        sequences is a tuple (window, target): window is an array of shape
        (sequence_length, 4) holding the x, y, speed and angle columns, and
        target is the (x, y) of the window's last row. vehicle_ids[k] is the
        group key of the vehicle that sequences[k] was cut from.

    Raises:
        ValueError: If sequence_length is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    sequences = []  # one (window, target) tuple per window
    vehicle_ids = []
    for veh_id, group in data.groupby('id'):
        ordered = group.sort_values(by='time')
        # Pull the arrays out once per vehicle; slicing the DataFrame with
        # iloc and re-selecting columns for every window is loop-invariant
        # work that dominates the runtime on large traces.
        features = ordered[['x', 'y', 'speed', 'angle']].values
        positions = ordered[['x', 'y']].values
        # NOTE(review): the target is the last row of the window itself, so it
        # is also part of the input features, while the loop bound stops one
        # window early as if the target were the row after the window
        # (index start + sequence_length). Confirm which one is intended
        # before changing it; behavior is kept as-is here.
        for start in range(len(features) - sequence_length):
            stop = start + sequence_length
            # Copies keep overlapping windows independent of each other.
            window = features[start:stop].copy()
            target = positions[stop - 1].copy()
            sequences.append((window, target))
            vehicle_ids.append(veh_id)
    return sequences, vehicle_ids

# Cut the windows for every vehicle in `data`.
sequences, vehicle_ids = create_sequences(data, sequence_length)

# Store both lists together in a single pickle file.
# with open('/content/drive/MyDrive/2024-05- 7088CEM (ANN Module)/Assignments/handover_prediction/sequences_vehicle_ids.pkl', 'wb') as f:
with open('sequences_vehicle_ids.pkl', 'wb') as pkl_out:
    pickle.dump((sequences, vehicle_ids), pkl_out)

# To restore them later:
# with open('sequences_vehicle_ids.pkl', 'rb') as f:
#     sequences, vehicle_ids = pickle.load(f)