In [None]:
import pandas as pd  # for csv reader
import datetime 
import numpy as np  # for checking NaN values
import json  # for writing file (temp)

In [None]:
# Load the flight CSV, keeping only the first 32 columns. The four HHMM time
# columns are forced to str so they can be parsed with an explicit format.
data = pd.read_csv(
    './2018_1.csv',
    usecols=list(range(0, 32)),
    dtype=dict.fromkeys(
        ['CRS_DEP_TIME', 'DEP_TIME', 'CRS_ARR_TIME', 'ARR_TIME'], str),
)

# Parse the HHMM strings into datetimes.
# - Scheduled (CRS) times are parsed strictly: any bad value raises.
# - Actual times use errors='coerce', so missing values become NaT.
#   NOTE(review): coerce also turns any value that does not match %H%M into
#   NaT silently -- confirm that this is intended.
data = data.assign(
    CRS_DEP_TIME=pd.to_datetime(data['CRS_DEP_TIME'], format='%H%M'),
    CRS_ARR_TIME=pd.to_datetime(data['CRS_ARR_TIME'], format='%H%M'),
    DEP_TIME=pd.to_datetime(data['DEP_TIME'], errors='coerce', format='%H%M'),
    ARR_TIME=pd.to_datetime(data['ARR_TIME'], errors='coerce', format='%H%M'),
)

data.head()

In [None]:
def is_NaN(val):
  """Return True when a scalar value should be treated as missing.

  Args:
    val: a scalar cell value (string, number, Timestamp, None, NaN, NaT...).

  Returns:
    True if ``val`` is an empty string or is null according to
    ``pd.isnull`` (None, NaN, NaT); False otherwise.
  """
  # Strings are never null for pandas, so the only "missing" string is ''.
  if isinstance(val, str):
    return val == ''
  # Numbers and dates: NaN / NaT / None are reported by pd.isnull.
  # bool() keeps the return type a plain Python bool.
  return bool(pd.isnull(val))

In [None]:
def json_single_flight(fligth_data):
  """Build the JSON-ready dictionary describing a single flight.

  Only valid values are stored: optional fields whose value is missing
  (NaN / NaT) are left out of the dictionary instead of being written.

  Args:
    fligth_data: one flight row exposing the columns as attributes (the
      caller passes rows produced by ``DataFrame.itertuples()``).

  Returns:
    dict with the always-present keys ``airline``, ``crs_dep_time``,
    ``crs_arr_time``, ``cancelled`` and ``diverted``, plus every optional
    time / delay / duration field that is not missing. Times are formatted
    as "HH:MM" strings, numeric fields are stored as int.
  """
  time_dic = {}

  def put_time(key, value):
    # store an actual time as "HH:MM", skipping NaT
    if not is_NaN(value):
      time_dic[key] = value.strftime("%H:%M")

  def put_int(key, value):
    # store a numeric field as int; the cast is safe because NaN is skipped
    if not is_NaN(value):
      time_dic[key] = int(value)

  # airline's IATA code
  time_dic['airline'] = fligth_data.OP_CARRIER

  # scheduled departure time (parsed strictly upstream, so never NaT)
  # TODO: also try storing it with .time() or .to_pydatetime()
  time_dic['crs_dep_time'] = fligth_data.CRS_DEP_TIME.strftime("%H:%M")
  put_time('dep_time', fligth_data.DEP_TIME)
  put_int('dep_delay', fligth_data.DEP_DELAY)
  put_int('dep_delay_group', fligth_data.DEP_DELAY_GROUP)

  # scheduled arrival time (parsed strictly upstream, so never NaT)
  time_dic['crs_arr_time'] = fligth_data.CRS_ARR_TIME.strftime("%H:%M")
  put_time('arr_time', fligth_data.ARR_TIME)
  put_int('arr_delay', fligth_data.ARR_DELAY)
  put_int('arr_delay_group', fligth_data.ARR_DELAY_GROUP)

  # cancellation flag; bool() keeps the value JSON-serializable
  time_dic['cancelled'] = bool(fligth_data.CANCELLED == 1.0)
  # the code is only meaningful for cancelled flights and is skipped if missing
  if time_dic['cancelled'] and not is_NaN(fligth_data.CANCELLATION_CODE):
    time_dic['cancellation_code'] = fligth_data.CANCELLATION_CODE
  # diversion flag
  time_dic['diverted'] = bool(fligth_data.DIVERTED == 1.0)

  # durations of the flight
  put_int('crs_elapsed_time', fligth_data.CRS_ELAPSED_TIME)
  put_int('actual_elapsed_time', fligth_data.ACTUAL_ELAPSED_TIME)
  put_int('air_time', fligth_data.AIR_TIME)

  # delay breakdown by cause
  put_int('carrier_delay', fligth_data.CARRIER_DELAY)
  put_int('weather_delay', fligth_data.WEATHER_DELAY)
  put_int('nas_delay', fligth_data.NAS_DELAY)
  put_int('security_delay', fligth_data.SECURITY_DELAY)
  put_int('late_aircraft_delay', fligth_data.LATE_AIRCRAFT_DELAY)

  return time_dic

In [None]:
# one group per (day of week, origin airport, destination airport)
data_groups = data.groupby(by=['DAY_OF_WEEK', 'ORIGIN', 'DEST'])

In [None]:
# Build one document per day-origin-destination group and dump it to a json
# file, to be uploaded to the database later (remember: each input file holds
# a single month of a single year).
for _group_key, group_frame in data_groups:
  # every flight of this group, in row order
  flights = []
  document = {}
  for flight in group_frame.itertuples():
    # the group-level fields are taken from the first flight only
    if not document:
      document.update({
          'year': flight.YEAR,
          'month': flight.MONTH,
          'day_of_month': flight.DAY_OF_MONTH,
          'day_of_week': flight.DAY_OF_WEEK,
          'origin': flight.ORIGIN,
          'destination': flight.DEST,
          'distance': int(flight.DISTANCE),
          'distance_group': flight.DISTANCE_GROUP,
      })
    # per-flight details
    flights.append(json_single_flight(flight))

  # attach the flight list and write the document to disk
  document['flights'] = flights
  with open('result.json', 'w') as out_file:
    json.dump(document, out_file)
  #print(document) # upload document to mongo!

  # temporary: stop after the first group
  break