In [7]:
#--------------------------------------------------------------------------------------------------------------------------------------#
# Title: Import single JSON file BY Vehicle 
# Project: STIB - Data Mining 
# Author: E.O
#--------------------------------------------------------------------------------------------------------------------------------------#

# 0. -----------------  MODULES -------------------------------------------------------------------------------------------------------#
import json
import pandas as pd
import ipdb 
import datetime

# 1. ---------------- FUNCTIONS --------------------------------------------------------------------------------------------------------#

# ipdb.set_trace()  # for trace

# Function 1: Generate  pandas df for each lineId 
def create_df_lineid(df_single_line):
    """"create pandas df for a line ID"""
    
    # Initialize list
    append_data = []

    # Convert each row in df line
    for index, row in df_single_line.iterrows():
        append_data.append(pd.DataFrame(row['vehiclePositions'],index=[0]))

    # Concatenate the new data
    df_vehicle_pos = pd.concat(append_data)
    df_line_id = pd.DataFrame(df_single_line['lineId'])
    return df_vehicle_pos.join(df_line_id)


# Function 2: Preparation data from a single json file converted in pandas df.
def prepare_data(df):  
    """"Prepare the data for computation"""
    list_df = [] # initialize df
    for i in range(round(len(df)/1000) + 10):  
          
          df_data = pd.DataFrame.from_dict(df.loc[i, "data"])
          print('*******************************************************************')
          print('PROCESSING DATA: ' + str(i) + ' of ' + str(round(len(df)/1000) + 10)) 
          print('*******************************************************************')
          
          for ind in range(len(df_data)):
             df_lines = pd.DataFrame.from_dict(df_data.loc[ind, "Responses"])
             df_lines = df_lines.join(df_data['time'])
             df_lines.dropna(subset = ["time"], inplace=True) #remove nan from ts colum
        
             for j in range(len(df_lines)):
                df_lineId = pd.DataFrame.from_dict(df_lines.loc[j, 'lines'])
                df_vehicle_line_id = create_df_lineid(df_single_line = df_lineId) # create df for each vehicle line with features 
                df_time = pd.DataFrame(df_lines['time'].unique())
                df_time.columns = ['time']
                df_time = pd.to_datetime(df_time['time'], unit='ms')
                df_final = pd.concat([df_time, df_vehicle_line_id], axis = 1)
                list_df.append(df_final.head()) # list of dataframes
                #print('-------------------------------------------------------------------')
    return pd.concat(list_df)
       
                
# 2. ---------------- READING & PROCESSING ------------------------------------------------------------------------------------------------#
        
# 2.1 Opening single JSON file 
f = open('0-data/vehiclePosition02.json')
data = json.load(f)

# 2.2 Converting dictionary to Pandas DF
df = pd.DataFrame.from_dict(data)
df.head()

# Selecting just one li

# 2.3 Processing Testset 
testset = prepare_data(df)

*******************************************************************
PROCESSING DATA: 0 of 15
*******************************************************************
*******************************************************************
PROCESSING DATA: 1 of 15
*******************************************************************
*******************************************************************
PROCESSING DATA: 2 of 15
*******************************************************************
*******************************************************************
PROCESSING DATA: 3 of 15
*******************************************************************
*******************************************************************
PROCESSING DATA: 4 of 15
*******************************************************************
*******************************************************************
PROCESSING DATA: 5 of 15
*******************************************************************
**********************************

In [10]:
testset[(testset['lineId']=='1')]

Unnamed: 0,time,directionId,distanceFromPoint,pointId,lineId
0,2021-09-07 07:14:54.287,8731,0,8101,1
0,2021-09-07 07:14:54.287,8731,0,8011,1
0,2021-09-07 07:14:54.287,8731,0,8141,1
0,2021-09-07 07:14:54.287,8162,0,8072,1
0,2021-09-07 07:14:54.287,8731,1,8071,1
...,...,...,...,...,...
0,2021-09-07 07:22:22.909,8731,0,8051,1
0,2021-09-07 07:22:22.909,8731,0,8741,1
0,2021-09-07 07:22:22.909,8731,0,8081,1
0,2021-09-07 07:22:22.909,8162,1,8122,1


In [4]:
testset[(testset['lineId']==1) & (testset['directionId']=='8731')]

Unnamed: 0,time,directionId,distanceFromPoint,pointId,lineId


Unnamed: 0,data
0,"{'time': '1630998894287', 'Responses': [{'line..."
1,"{'time': '1630998928695', 'Responses': [{'line..."
2,"{'time': '1630998959313', 'Responses': [{'line..."
3,"{'time': '1630998991365', 'Responses': [{'line..."
4,"{'time': '1630999023417', 'Responses': [{'line..."
