# Assignment 3

Import libraries and define common helper functions

In [6]:
import os
import sys
import gzip
import json
from pathlib import Path
import csv

import pandas as pd
# import s3fs
import pyarrow as pa

import pyarrow.parquet as pq

import fastavro
import pygeohash
import snappy
import jsonschema
from jsonschema.exceptions import ValidationError

Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz 

In [7]:

current_dir = Path(os.getcwd()).absolute()
schema_dir = current_dir.joinpath('schemas')
results_dir = current_dir.joinpath('results')
results_dir.mkdir(parents=True, exist_ok=True)

src_data_dir = current_dir.parent.parent.parent.joinpath('data\processed\openflights\\routes.jsonl.gz')

print(current_dir)
print(schema_dir)
print(src_data_dir)

c:\Users\saman\git_repos\dsc650\dsc650\assignments\assignment03
c:\Users\saman\git_repos\dsc650\dsc650\assignments\assignment03\schemas
c:\Users\saman\git_repos\dsc650\data\processed\openflights\routes.jsonl.gz


In [8]:
def read_jsonl_data():
    with gzip.open(src_data_dir, 'rb') as f:
            records = [json.loads(line) for line in f.readlines()]
    return records

In [15]:
myRec = read_jsonl_data()
df_myRec = pd.DataFrame(myRec)
# Create separate dataframes for each of the nested dicts
df_airline = pd.json_normalize(df_myRec['airline'])
df_src_airport = pd.json_normalize(df_myRec['src_airport'])
df_dst_airport = pd.json_normalize(df_myRec['dst_airport'])
df_final = pd.concat([df_airline,df_src_airport,df_dst_airport,df_myRec['codeshare'],df_myRec['equipment']], axis=1)
df_final.describe


<bound method NDFrame.describe of        airline_id              name                   alias iata icao  \
0             410        Aerocondor  ANA All Nippon Airways   2B  ARD   
1             410        Aerocondor  ANA All Nippon Airways   2B  ARD   
2             410        Aerocondor  ANA All Nippon Airways   2B  ARD   
3             410        Aerocondor  ANA All Nippon Airways   2B  ARD   
4             410        Aerocondor  ANA All Nippon Airways   2B  ARD   
...           ...               ...                     ...  ...  ...   
67658        4178  Regional Express          Qantas Airways   ZL  RXA   
67659       19016        Apache Air                  Apache   ZM  IWA   
67660       19016        Apache Air                  Apache   ZM  IWA   
67661       19016        Apache Air                  Apache   ZM  IWA   
67662       19016        Apache Air                  Apache   ZM  IWA   

         callsign        country  active  airport_id  \
0      AEROCONDOR       Portugal 

In [19]:
myRec[0]['airline']

{'airline_id': 410,
 'name': 'Aerocondor',
 'alias': 'ANA All Nippon Airways',
 'iata': '2B',
 'icao': 'ARD',
 'callsign': 'AEROCONDOR',
 'country': 'Portugal',
 'active': True}

In [23]:
df_myRec.head()

Unnamed: 0,airline,src_airport,dst_airport,codeshare,equipment
0,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2965, 'name': 'Sochi Internatio...","{'airport_id': 2990, 'name': 'Kazan Internatio...",False,[CR2]
1,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2966, 'name': 'Astrakhan Airpor...","{'airport_id': 2990, 'name': 'Kazan Internatio...",False,[CR2]
2,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2966, 'name': 'Astrakhan Airpor...","{'airport_id': 2962, 'name': 'Mineralnyye Vody...",False,[CR2]
3,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2968, 'name': 'Chelyabinsk Bala...","{'airport_id': 2990, 'name': 'Kazan Internatio...",False,[CR2]
4,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2968, 'name': 'Chelyabinsk Bala...","{'airport_id': 4078, 'name': 'Tolmachevo Airpo...",False,[CR2]


### 3.1.b Avro

In [24]:
from fastavro import writer, reader, schema
from rec_avro import to_rec_avro_destructive, from_rec_avro_destructive, rec_avro_schema

def create_avro_dataset():
    schema_path = schema_dir.joinpath('routes.avsc')
    data_path = results_dir.joinpath('routes.avro')

    with open(schema_path) as f:
        myschema = json.load(f)

    avro_objects = (to_rec_avro_destructive(read_jsonl_data()))
    with open(data_path, 'wb') as f_out:
            writer(f_out, schema.parse_schema(myschema), avro_objects)

create_avro_dataset()

ValueError: "records" argument should be an iterable, not dict

### 3.1.c Parquet

In [11]:
def create_parquet_dataset():
        parquet_output_path = results_dir.joinpath('routes.parquet')

        with gzip.open(src_data_dir, 'rb') as f:
                records = [json.loads(line) for line in f.readlines()]
        df = pd.DataFrame(records)        
        # df.head()
        table = pa.Table.from_pandas(df)
        pq.write_table(table,parquet_output_path, compression=None)

create_parquet_dataset()

NotImplementedError: struct<active: bool, airline_id: int64, alias: string, callsign: string, country: string, iata: string, icao: string, name: string>

### 3.1.d Protocol Buffers

In [None]:
sys.path.insert(0, os.path.abspath('routes_pb2'))

import routes_pb2

def _airport_to_proto_obj(airport):
    obj = routes_pb2.Airport()
    if airport is None:
        return None
    if airport.get('airport_id') is None:
        return None

    obj.airport_id = airport.get('airport_id')
    if airport.get('name'):
        obj.name = airport.get('name')
    if airport.get('city'):
        obj.city = airport.get('city')
    if airport.get('iata'):
        obj.iata = airport.get('iata')
    if airport.get('icao'):
        obj.icao = airport.get('icao')
    if airport.get('altitude'):
        obj.altitude = airport.get('altitude')
    if airport.get('timezone'):
        obj.timezone = airport.get('timezone')
    if airport.get('dst'):
        obj.dst = airport.get('dst')
    if airport.get('tz_id'):
        obj.tz_id = airport.get('tz_id')
    if airport.get('type'):
        obj.type = airport.get('type')
    if airport.get('source'):
        obj.source = airport.get('source')

    obj.latitude = airport.get('latitude')
    obj.longitude = airport.get('longitude')

    return obj


def _airline_to_proto_obj(airline):
    obj = routes_pb2.Airline()
    ## TODO: Create an Airline obj using Protocol Buffers API
    if airline is None:
        return None
    if airline.get('airline_id') is None:
        return None

    obj.airline_id = airline.get('airline_id')
    if airline.get('name'):
        obj.name = airline.get('name')
    if airline.get('alias'):
	    obj.name = airline.get('alias')
    if airline.get('iata'):
        obj.name = airline.get('iata')
    if airline.get('icao'):
	    obj.name = airline.get('icao')
    if airline.get('callsign'):
        obj.name = airline.get('callsign')
    if airline.get('country'):
        obj.name = airline.get('country')
    if airline.get('active'):
        obj.name = airline.get('active')

    return obj


def create_protobuf_dataset(records):
    routes = routes_pb2.Routes()
    for record in records:
        route = routes_pb2.Route()
        ## TODO: Implement the code to create the Protocol Buffers Dataset

        routes.route.append(route)

    data_path = results_dir.joinpath('routes.pb')

    with open(data_path, 'wb') as f:
        f.write(routes.SerializeToString())
        
    compressed_path = results_dir.joinpath('routes.pb.snappy')
    
    with open(compressed_path, 'wb') as f:
        f.write(snappy.compress(routes.SerializeToString()))
        
create_protobuf_dataset(records)

In [5]:
all_recs = read_jsonl_data
df = pd.DataFrame(all_recs)
df.head()

ValueError: DataFrame constructor not properly called!

In [None]:
# Open the compressed zip file
with open(src_data_dir, 'rb') as fread:
	# Now open the file to write to
	json_out = results_dir.joinpath('routes.jsonl')
	with open(json_out, 'wb') as fwrite:
		fwrite.write(gzip.decompress(fread.read()))