In [1]:
import pandas as pd
import sys
sys.path.append("../scripts/")
from extractor import DataExtractor

In [2]:
# Project helper imported from the `extractor` module on the ../scripts/ path;
# its extract_data method is called in a later cell.
data_extractor=DataExtractor()

In [3]:
# Read the raw CSV as plain text lines so each row can be split by hand.
with open('../data/20181024_d1_0830_0900.csv','r') as f:
    lines=list(f)

# The first line holds the ';'-separated column names; every later line is a data row.
header_line,data=lines[0],lines[1:]
columns=header_line.replace('\n','').split(';')

In [4]:
# First four header fields (the trajectory columns).
columns[:4]

['track_id', ' type', ' traveled_d', ' avg_speed']

In [5]:
# Remaining header fields (the time-dependent columns).
columns[4:]

[' lat', ' lon', ' speed', ' lon_acc', ' lat_acc', ' time']

As mentioned in the data source:
- the first 4 columns are trajectory data
- the next 6 columns are time-dependent vehicle data (this group of 6 repeats for every recorded time step)

In [6]:
def chunk_list(list,chunk_size,default_first_val=None):
    """Split ``list`` into consecutive chunks of at most ``chunk_size`` items.

    Args:
        list: Sliceable sequence to split. The name shadows the builtin; it is
            kept so that existing keyword callers keep working.
        chunk_size: Number of items taken from ``list`` per chunk. The last
            chunk is shorter when the length is not a multiple of it.
        default_first_val: Optional value placed at the front of every chunk.
            Only ``None`` disables the prefix, so falsy values such as ``0``
            or ``''`` are prepended as well.

    Returns:
        The chunks in input order. With a prefix every chunk is a new list
        ``[default_first_val, *items]``; without one each chunk is the plain
        slice of ``list``.

    Raises:
        ValueError: If ``chunk_size`` is 0 (raised by ``range``).
    """
    chunked_list=[]
    for i in range(0, len(list), chunk_size):
        chunk=list[i:i+chunk_size]
        # Compare against None explicitly: a truthiness test would silently
        # drop legitimate prefixes like 0 or an empty string.
        if default_first_val is not None:
            chunk=[default_first_val, *chunk]
        chunked_list.append(chunk)

    return chunked_list

In [7]:
# Column layouts of the two tables: the first four header fields describe the
# trajectory, the rest describe one timed sample and are keyed by track_id.
trajectory_cols=columns[:4]
timed_vehicle_cols=['track_id']+columns[4:]

trajectory_rows=[]
timed_vehicle_rows=[]

for line in data:
    fields=line.replace('\n','').split(';')
    track_id,samples=fields[0],fields[4:]
    trajectory_rows.append(fields[:4])
    # Every group of 6 values becomes its own row, prefixed with the track id.
    timed_vehicle_rows+=chunk_list(samples,6,track_id)


In [8]:
# Build one DataFrame per table from the parsed rows.
trajectory_data=pd.DataFrame(trajectory_rows,columns=trajectory_cols)
timed_vehicle_data=pd.DataFrame(timed_vehicle_rows,columns=timed_vehicle_cols)

In [9]:
# Preview the trajectory table.
trajectory_data.head()

Unnamed: 0,track_id,type,traveled_d,avg_speed
0,1,Car,48.85,9.770344
1,2,Motorcycle,98.09,19.839417
2,3,Motorcycle,63.8,18.228752
3,4,Motorcycle,145.72,26.229014
4,5,Motorcycle,138.01,24.841425


In [10]:
# Preview the timed vehicle table.
timed_vehicle_data.head()

Unnamed: 0,track_id,lat,lon,speed,lon_acc,lat_acc,time
0,1,37.977391,23.737688,4.9178,0.0518,-0.0299,0.0
1,1,37.977391,23.737688,4.9207,-0.0124,-0.0354,0.04
2,1,37.977391,23.737688,4.916,-0.0519,-0.0413,0.08
3,1,37.97739,23.737688,4.9057,-0.0914,-0.0478,0.12
4,1,37.97739,23.737689,4.8871,-0.1679,-0.055,0.16


In [11]:
# Sanity check: the number of distinct track ids in the timed data must equal
# the number of rows in the trajectory table.

len(timed_vehicle_data['track_id'].unique())==len(trajectory_data)

True

In [12]:
# The parsing steps above were moved into a script module so they can be reused.
# Check that the script works as expected on the same file.
# NOTE: despite its name, `loaded_df` is unpacked into `trajectory` and `vehicle` in the next cell.

loaded_df=data_extractor.extract_data(file_name='20181024_d1_0830_0900.csv')

In [13]:
# Unpack the extractor result and preview both frames.
trajectory,vehicle=loaded_df
display(trajectory.head(),vehicle.head())

Unnamed: 0,track_id,type,traveled_d,avg_speed
0,20181024_d1_0830_0900_1,Car,48.85,9.770344
1,20181024_d1_0830_0900_2,Motorcycle,98.09,19.839417
2,20181024_d1_0830_0900_3,Motorcycle,63.8,18.228752
3,20181024_d1_0830_0900_4,Motorcycle,145.72,26.229014
4,20181024_d1_0830_0900_5,Motorcycle,138.01,24.841425


Unnamed: 0,track_id,lat,lon,speed,lon_acc,lat_acc,time
0,20181024_d1_0830_0900_1,37.977391,23.737688,4.9178,0.0518,-0.0299,0.0
1,20181024_d1_0830_0900_1,37.977391,23.737688,4.9207,-0.0124,-0.0354,0.04
2,20181024_d1_0830_0900_1,37.977391,23.737688,4.916,-0.0519,-0.0413,0.08
3,20181024_d1_0830_0900_1,37.97739,23.737688,4.9057,-0.0914,-0.0478,0.12
4,20181024_d1_0830_0900_1,37.97739,23.737689,4.8871,-0.1679,-0.055,0.16


Working well 👌

In [15]:
# Write the first five trajectory rows to a JSON file as a list of records.
trajectory.head().to_json('../temp_storage/h.json',orient='records')

In [16]:
import json

# Read the JSON file back and rebuild a DataFrame from its records.
# The parsed content gets its own name so the raw CSV rows held in `data`
# are not overwritten; re-running the parsing cell therefore still works.
with open('../temp_storage/h.json','r') as file:
    records=json.load(file)

df=pd.DataFrame.from_dict(records)

In [17]:
# Show the frame rebuilt from the JSON file.
df

Unnamed: 0,track_id,type,traveled_d,avg_speed
0,20181024_d1_0830_0900_1,Car,48.85,9.770344
1,20181024_d1_0830_0900_2,Motorcycle,98.09,19.839417
2,20181024_d1_0830_0900_3,Motorcycle,63.8,18.228752
3,20181024_d1_0830_0900_4,Motorcycle,145.72,26.229014
4,20181024_d1_0830_0900_5,Motorcycle,138.01,24.841425


In [18]:
# Inspect the column names; some still carry a leading space from the CSV header.
df.columns

Index(['track_id', ' type', ' traveled_d', ' avg_speed'], dtype='object')

In [21]:
# Remove the spaces from the column names. Note that this drops every space
# character in a name, not only the leading one.
df.columns=df.columns.str.replace(' ','')

In [22]:
# Confirm the column names no longer contain spaces.
df.columns

Index(['track_id', 'type', 'traveled_d', 'avg_speed'], dtype='object')