#### a. JSON Schema

In [2]:
import json
import jsonschema
import gzip

# Load the JSON schema that every route record is checked against.
with open('C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/schemas/routes-schema.json', encoding='utf-8') as f:
    schema = json.load(f)

# Build the validator once, up front. jsonschema.validate() re-checks the
# schema and constructs a fresh validator on every call, which is wasted
# work when the same schema is applied to every record.
validator_cls = jsonschema.validators.validator_for(schema)
validator_cls.check_schema(schema)
validator = validator_cls(schema)

valid_routes = []
invalid_routes = []

# Stream the compressed JSONL file one record at a time. The encoding is
# explicit because text mode otherwise falls back to the locale encoding,
# which is not UTF-8 on a default Windows setup.
with gzip.open('C:/Users/chris/DSC650-T301/dsc650/data/processed/openflights/routes.jsonl.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        # Blank lines (e.g. a trailing newline) carry no record.
        if not line.strip():
            continue
        route = json.loads(line)
        # Partition the records by whether they satisfy the schema.
        if validator.is_valid(route):
            valid_routes.append(route)
        else:
            invalid_routes.append(route)

# Write the valid routes to a new JSON file
with open('C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/valid_routes.json', 'w', encoding='utf-8') as f:
    json.dump(valid_routes, f)

# Write the invalid routes to a new JSON file
with open('C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/invalid_routes.json', 'w', encoding='utf-8') as f:
    json.dump(invalid_routes, f)


#### b. Avro

In [21]:
import fastavro
import json
import gzip

# Load every route record from the gzipped JSONL file into memory.
# The encoding is explicit because text mode otherwise uses the locale
# encoding, which is not UTF-8 on a default Windows setup; blank lines
# (e.g. a trailing newline) are skipped instead of crashing json.loads.
with gzip.open('C:/Users/chris/DSC650-T301/dsc650/data/processed/openflights/routes.jsonl.gz', 'rt', encoding='utf-8') as f:
    routes = [json.loads(line) for line in f if line.strip()]

# Load the Avro schema (an .avsc file is plain JSON).
schema_path = 'C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/schemas/routes.avsc'
with open(schema_path, encoding='utf-8') as f:
    schema = json.load(f)

# Write the records to an Avro container file using that schema.
avro_path = 'C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/results/routes.avro'
with open(avro_path, 'wb') as out:
    fastavro.writer(out, schema, routes)


#### c. Parquet

In [24]:
import fastavro
# The cell below uses pd, pa and pq; import them here so the cell does not
# depend on names that happen to be left over from another session.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

avro_path = 'C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/results/routes.avro'

# Read every record back out of the Avro file written in the previous step.
with open(avro_path, 'rb') as f:
    avro_data = list(fastavro.reader(f))

# Convert the data to a Pandas DataFrame
df = pd.DataFrame.from_records(avro_data)

# Convert the DataFrame to an Arrow Table
table = pa.Table.from_pandas(df)

# Write the Arrow Table to a Parquet file
parquet_path = 'C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/results/routes.parquet'
pq.write_table(table, parquet_path)

#### d. Protocol Buffers

In [31]:
import os
import json
import gzip
import snappy
import routes_pb2

def load_records(file_path):
    """Yield one parsed JSON value per non-blank line of a gzipped JSONL file.

    Args:
        file_path: Path to the gzip-compressed JSON Lines file.

    Yields:
        The object decoded from each non-blank line by ``json.loads``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding, text mode uses the
    # locale's preferred encoding, which varies by platform.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Blank lines (commonly a trailing newline) carry no record.
            if line.strip():
                yield json.loads(line)

def airport_to_proto_obj(airport):
    """Convert an airport dict into a ``routes_pb2.Airport`` message.

    Args:
        airport: Mapping of airport fields, or a falsy value (``None``/``{}``)
            when the record has no airport.

    Returns:
        A populated ``routes_pb2.Airport``, or ``None`` when ``airport`` is
        falsy or has no truthy ``airport_id``.
    """
    # Validate before allocating a message that would just be thrown away.
    if not airport or not airport.get('airport_id'):
        return None

    # (field name, fallback) pairs, in assignment order. The fallback is used
    # when the key is missing or its value is null.
    # NOTE(review): the fallback types are taken from the original code and
    # have not been checked against the .proto definition (e.g. '' for
    # timezone) - confirm they match the declared field types.
    field_defaults = (
        ('name', ''),
        ('city', ''),
        ('iata', ''),
        ('icao', ''),
        ('altitude', 0),
        ('timezone', ''),
        ('dst', ''),
        ('tz_id', ''),
        ('type', ''),
        ('source', ''),
        ('latitude', 0),
        ('longitude', 0),
    )

    obj = routes_pb2.Airport()
    obj.airport_id = airport['airport_id']
    for field, fallback in field_defaults:
        value = airport.get(field)
        # dict.get(key, default) still returns None for an explicit JSON
        # null, and protobuf scalar fields reject None, so substitute the
        # fallback for None as well as for a missing key.
        setattr(obj, field, fallback if value is None else value)
    return obj

def airline_to_proto_obj(airline):
    """Convert an airline dict into a ``routes_pb2.Airline`` message.

    Args:
        airline: Mapping of airline fields, or a falsy value (``None``/``{}``)
            when the record has no airline.

    Returns:
        A populated ``routes_pb2.Airline``, or ``None`` when ``airline`` is
        falsy or has no truthy ``airline_id``.
    """
    # Validate before allocating a message that would just be thrown away.
    if not airline or not airline.get('airline_id'):
        return None

    # (field name, fallback) pairs, in assignment order. The fallback is used
    # when the key is missing or its value is null.
    field_defaults = (
        ('name', ''),
        ('alias', ''),
        ('iata', ''),
        ('icao', ''),
        ('callsign', ''),
        ('country', ''),
        ('active', False),
    )

    obj = routes_pb2.Airline()
    obj.airline_id = airline['airline_id']
    for field, fallback in field_defaults:
        value = airline.get(field)
        # dict.get(key, default) still returns None for an explicit JSON
        # null, and protobuf scalar fields reject None, so substitute the
        # fallback for None as well as for a missing key.
        setattr(obj, field, fallback if value is None else value)
    return obj

def create_route_proto_obj(record):
    """Build a ``routes_pb2.Route`` message from one route record.

    Args:
        record: Mapping with optional ``airline``, ``src_airport``,
            ``dst_airport``, ``codeshare``, ``stops`` and ``equipment`` keys.

    Returns:
        A ``routes_pb2.Route``. Nested airline/airport messages are copied in
        only when the corresponding converter returns a message.
    """
    route = routes_pb2.Route()

    # Nested message fields paired with the converter that builds them.
    nested_fields = (
        ('airline', airline_to_proto_obj),
        ('src_airport', airport_to_proto_obj),
        ('dst_airport', airport_to_proto_obj),
    )
    for field, to_proto in nested_fields:
        # The converters return None for a missing/null/empty sub-record.
        message = to_proto(record.get(field))
        if message:
            getattr(route, field).CopyFrom(message)

    # A JSON null comes back from dict.get() as None even when a default is
    # supplied, and protobuf rejects None, so map None to the default here.
    codeshare = record.get('codeshare')
    route.codeshare = False if codeshare is None else codeshare
    stops = record.get('stops')
    route.stops = 0 if stops is None else stops
    route.equipment.extend(record.get('equipment') or [])
    return route

def create_protobuf_dataset(records, data_path=None, compressed_path=None):
    """Serialize route records to a protobuf file and a Snappy-compressed copy.

    Args:
        records: Iterable of route record mappings; consumed exactly once.
        data_path: Destination for the raw serialized ``Routes`` message.
            Defaults to the assignment's ``results/routes.pb``.
        compressed_path: Destination for the Snappy-compressed bytes.
            Defaults to the assignment's ``results/routes.pb.snappy``.
    """
    if data_path is None:
        data_path = 'C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/results/routes.pb'
    if compressed_path is None:
        compressed_path = 'C:/Users/chris/DSC650-T301/dsc650/dsc650/assignments/assignment03/results/routes.pb.snappy'

    routes = routes_pb2.Routes()
    for record in records:
        routes.route.append(create_route_proto_obj(record))

    # Serialize once and reuse the bytes for both outputs instead of
    # encoding the whole dataset a second time for the compressed copy.
    payload = routes.SerializeToString()

    with open(data_path, 'wb') as f:
        f.write(payload)

    with open(compressed_path, 'wb') as f:
        f.write(snappy.compress(payload))

if __name__ == '__main__':
    # Feed the lazily-read route records straight into the protobuf writer.
    file_path = 'C:/Users/chris/DSC650-T301/dsc650/data/processed/openflights/routes.jsonl.gz'
    create_protobuf_dataset(load_records(file_path))
