# In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict


# Directory that receives the generated flight JSON files; created up front
# so the generator can write into it immediately.
BASE_DIR = "/tmp/flights/"
os.makedirs(BASE_DIR, exist_ok=True)

# Size of the synthetic data set.
NUM_FILES = 5000
MIN_RECORDS = 50  # fewest records per file (inclusive)
MAX_RECORDS = 100  # most records per file (inclusive)
NUM_CITIES = random.randint(100, 200)
# Chance that a generated record gets one field nulled, drawn once per run
# from 0.1%-0.5%. Bounds are passed low-to-high so the range reads correctly.
NULL_PROB = random.uniform(0.001, 0.005)

cities = [f"City_{i}" for i in range(NUM_CITIES)]
def generate_random_date():
    """Return a random date from the past year formatted as ``YYYY-MM-DD``.

    The result lies between 365 days and 1 day before today, inclusive.
    """
    # One draw in [0, 364] moves the date forward from "365 days ago".
    days_back = 365 - random.randint(0, 364)
    picked = datetime.today() - timedelta(days=days_back)
    return picked.strftime("%Y-%m-%d")
def generate_flight_record():
    """Build one synthetic flight record, occasionally with a nulled field.

    Returns:
        A dict with the keys ``date``, ``origin_city``, ``destination_city``,
        ``flight_duration_secs`` (1800-14400) and ``num_passengers``
        (10-300). With probability ``NULL_PROB`` exactly one randomly chosen
        field is replaced by ``None`` to simulate a dirty record.
    """
    # Draw two distinct cities so a flight never departs from and lands in
    # the same city. With fewer than two cities configured that is
    # impossible, so fall back to independent picks.
    if len(cities) >= 2:
        origin, destination = random.sample(cities, 2)
    else:
        origin = random.choice(cities)
        destination = random.choice(cities)

    record = {
        "date": generate_random_date(),
        "origin_city": origin,
        "destination_city": destination,
        "flight_duration_secs": random.randint(1800, 14400),
        "num_passengers": random.randint(10, 300),
    }

    if random.random() < NULL_PROB:
        record[random.choice(list(record))] = None
    return record
def generate_flight_data():
    """Write ``NUM_FILES`` JSON files of synthetic flight records to ``BASE_DIR``.

    Each file holds between ``MIN_RECORDS`` and ``MAX_RECORDS`` records
    (inclusive) as a JSON array. Files are named
    ``<MM-YYYY>-<origin city>-<index>-flights.json``: the month is the
    current month, the city is the origin of the first record that still has
    one, and the zero-padded index is the file's sequence number.

    The index keeps the names unique. The month prefix is constant and there
    are only a few hundred cities, so without it most files would overwrite
    an earlier one and far fewer than ``NUM_FILES`` would remain on disk.

    Files left over from earlier runs are not removed.
    """
    # Identical for every file, so compute it once.
    sample_date = datetime.today().strftime("%m-%Y")
    index_width = len(str(max(NUM_FILES - 1, 0)))

    for i in range(NUM_FILES):
        num_records = random.randint(MIN_RECORDS, MAX_RECORDS)
        records = [generate_flight_record() for _ in range(num_records)]

        # The origin field may have been nulled; take the first usable one so
        # the file name never contains the text "None".
        sample_origin = next(
            (r["origin_city"] for r in records if r["origin_city"] is not None),
            "unknown",
        )
        file_name = f"{sample_date}-{sample_origin}-{i:0{index_width}d}-flights.json"
        file_path = os.path.join(BASE_DIR, file_name)

        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2)
def analyze_flight_data(base_dir=None):
    """Print summary statistics for the flight JSON files in a directory.

    Every regular ``*.json`` file in the directory is loaded as a list of
    flight records. A record containing any ``None`` value is counted as
    dirty and excluded from all other statistics.

    Passenger figures are net flows per city (passengers arriving minus
    passengers departing): the "arrived" city is the one with the largest
    net inflow and the "left" city the one with the largest net outflow.
    NOTE(review): the printed labels read like gross totals; confirm that
    net flow is the intended metric.

    The top 25 destination cities are ranked by the total flight duration
    into each city; the average and 95th-percentile duration are printed
    for each of them.

    Args:
        base_dir: Directory to scan. Defaults to the module-level
            ``BASE_DIR`` when omitted.

    Returns:
        None. All results are written to standard output.
    """
    if base_dir is None:
        base_dir = BASE_DIR

    total_records = 0
    dirty_records = 0
    total_duration = 0
    city_passenger_count = defaultdict(int)
    city_flight_durations = defaultdict(list)

    # Sorted so ties between cities resolve the same way on every run;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(base_dir)):
        file_path = os.path.join(base_dir, filename)
        # Skip sub-directories and stray non-JSON files instead of crashing.
        if not filename.endswith(".json") or not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            flights = json.load(f)

        for flight in flights:
            total_records += 1
            if any(value is None for value in flight.values()):
                dirty_records += 1
                continue

            duration = flight["flight_duration_secs"]
            passengers = flight["num_passengers"]
            destination = flight["destination_city"]

            total_duration += duration
            city_passenger_count[destination] += passengers
            city_passenger_count[flight["origin_city"]] -= passengers
            city_flight_durations[destination].append(duration)

    destination_city_sums = {
        city: sum(durations) for city, durations in city_flight_durations.items()
    }
    top_25_destinations = sorted(
        destination_city_sums, key=destination_city_sums.get, reverse=True
    )[:25]

    print("### Analysis Summary ###")
    print(f"Total records processed: {total_records}")
    print(f"Total dirty records: {dirty_records}")
    print(f"Total flight duration (seconds): {total_duration}")
    if city_passenger_count:
        max_arrived_city = max(city_passenger_count, key=city_passenger_count.get)
        max_departed_city = min(city_passenger_count, key=city_passenger_count.get)
        print(f"City with MAX passengers arrived: {max_arrived_city} ({city_passenger_count[max_arrived_city]})")
        print(f"City with MAX passengers left: {max_departed_city} ({-city_passenger_count[max_departed_city]})\n")
    else:
        # No clean records at all: max()/min() would raise on an empty dict.
        print("City with MAX passengers arrived: n/a")
        print("City with MAX passengers left: n/a\n")

    print("### Top 25 Destination Cities (Flight Duration Stats) ###")
    for city in top_25_destinations:
        durations = city_flight_durations[city]
        avg_duration = np.mean(durations)
        p95_duration = np.percentile(durations, 95)
        print(f"{city}: AVG={avg_duration:.2f} secs, P95={p95_duration:.2f} secs")

# Generate the synthetic flight files, then print the analysis of whatever
# is stored in BASE_DIR.
generate_flight_data()
analyze_flight_data()