## Importing

In [6]:
%pip install rdflib
%pip install meteostat

import pandas as pd
import csv
from pathlib import Path
from urllib.parse import quote
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import XSD, RDFS
from datetime import datetime, timedelta
from meteostat import Point, Hourly
import warnings






Note: you may need to restart the kernel to use updated packages.




## Settings

In [3]:

# Define paths
# NOTE(review): BASE_PATH is an absolute, machine-specific location. The literal
# is a raw string, so each "\\" stays as two backslashes; pathlib is presumably
# tolerant of the doubled separators on Windows -- confirm before reusing the
# notebook on another machine.
BASE_PATH = Path(r"D:\\OneDrive - Università degli Studi di Padova\\Lezioni\\Magistrale\\Terzo semestre\\Graph DB\\Homework\\flydata")

# Input CSV files
CSV_DIR = BASE_PATH / "DataCollection" / "CSVData"
AIRPORTS_PATH = CSV_DIR / "airports.csv"
CITIES_PATH = CSV_DIR / "cities.csv"
FLIGHTS_PATH = CSV_DIR / "flights.csv"

# Output Turtle files, one per graph
TTL_DIR = BASE_PATH / "Serialization" / "ttl"
OUTPUT_PATH_AIRPORT = TTL_DIR / "airport.ttl"
OUTPUT_PATH_CITY = TTL_DIR / "city.ttl"
OUTPUT_PATH_WEATHER = TTL_DIR / "weather.ttl"
OUTPUT_PATH_FLIGHTS = TTL_DIR / "flights.ttl"
OUTPUT_PATH_ROUTES = TTL_DIR / "routes.ttl"


# Define namespaces
FDO = Namespace("http://www.semanticweb.org/nele/ontologies/2024/10/flydata/")

# Define the time interval for the weather data.
# `end` carries an explicit time of day: a bare date parses to midnight, which
# would stop the hourly series at 2024-08-31 00:00 and drop the rest of that day.
start = pd.to_datetime('2024-8-1')
end = pd.to_datetime('2024-08-31 23:59')

## Cities

In [3]:
g_city = Graph()

g_city.bind("fdo", FDO)
g_city.bind("xsd", XSD)


def _population_text(value):
    """Return the population as text, using "0" when the value is missing.

    Whole numbers are rendered without a fractional part: a numeric column that
    contains missing values is parsed as float, so a plain ``str()`` would emit
    values such as "12345.0".
    """
    if pd.isna(value):
        return "0"
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Not numeric: keep the text exactly as it was read
        return str(value)
    return str(int(number)) if number.is_integer() else str(value)


# Maps city_ascii -> {'name', 'population'}; also used later to link airports to cities
cities = {}
try:
    # Load the CSV file in memory using pandas
    cities_df = pd.read_csv(CITIES_PATH)
    for _, row in cities_df.iterrows():
        # Skip rows that have no ASCII name: it is the key of the dictionary
        if pd.isna(row['city_ascii']):
            continue

        city_ascii = str(row['city_ascii'])
        cities[city_ascii] = {
            # Fall back to the ASCII name rather than emitting a "nan" literal
            'name': row['city'] if pd.notna(row['city']) else city_ascii,
            'population': _population_text(row['population'])
        }

except Exception as e:
    print(f"Error reading file: {str(e)}")
    raise

# Add cities to the city graph
for city_ascii, data in cities.items():
    # Drop non-ASCII characters, apostrophes and commas, replace spaces with
    # underscores, then URL encode what is left
    city_id = quote(city_ascii.encode('ascii', 'ignore').decode().replace(" ", "_").replace("'", "").replace(",", ""))
    city_uri = URIRef(str(FDO) + city_id)

    # Add city triples
    g_city.add((city_uri, RDF.type, FDO.City))
    g_city.add((city_uri, FDO.name, Literal(data['name'], datatype=XSD.string)))
    g_city.add((city_uri, FDO.population, Literal(data['population'], datatype=XSD.string)))

g_city.serialize(destination=str(OUTPUT_PATH_CITY), format='turtle')

<Graph identifier=Nc4786862cd424edba65fe071f71449b6 (<class 'rdflib.graph.Graph'>)>

## Airports

In [4]:
g_airport = Graph()

g_airport.bind("fdo", FDO)
g_airport.bind("xsd", XSD)

# Define the isLocatedInCity property in the airport graph
g_airport.add((FDO.isLocatedInCity, RDF.type, RDF.Property))
g_airport.add((FDO.isLocatedInCity, RDFS.domain, FDO.Airport))
g_airport.add((FDO.isLocatedInCity, RDFS.range, FDO.City))

# One dict per airport with the keys 'name', 'city', 'iata', 'lat', 'long';
# the weather cell reads 'lat', 'long' and 'iata' from this list
airports = []
try:
    # newline='' is what the csv module documentation asks for, so that the
    # reader does its own newline handling (matters for quoted multi-line fields)
    with open(AIRPORTS_PATH, 'r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            # Only include airports with IATA code
            if not row['IATA']:
                continue
            airports.append({
                'name': row['Name'].strip('"'),
                'city': row['City'].strip('"'),
                'iata': row['IATA'].strip('"'),
                'lat': row['LAT'].strip('"'),
                'long': row['LONG'].strip('"')
            })
except Exception as e:
    print(f"Error reading airports file: {str(e)}")
    raise

# Add airports and their relationships to the airport graph
for airport in airports:
    # Use IATA code directly in the URI
    airport_uri = URIRef(str(FDO) + airport['iata'])

    # Add airport triples
    g_airport.add((airport_uri, RDF.type, FDO.Airport))
    g_airport.add((airport_uri, FDO.name, Literal(airport['name'], datatype=XSD.string)))

    # Only add isLocatedInCity if the city exists in our cities dataset
    # (`cities` is keyed by the ASCII city name)
    if airport['city'] in cities:
        # Same identifier clean-up as the one used when the city nodes were created
        city_id = quote(airport['city'].encode('ascii', 'ignore').decode().replace(" ", "_").replace("'", "").replace(",", ""))
        city_uri = URIRef(str(FDO) + city_id)
        g_airport.add((airport_uri, FDO.isLocatedInCity, city_uri))

# Serialize the airport graph to its TTL file
g_airport.serialize(destination=str(OUTPUT_PATH_AIRPORT), format='turtle')

<Graph identifier=N9c1148d918d1402ea24f170e1e7b3e3a (<class 'rdflib.graph.Graph'>)>

## Weather

In [None]:
# Suppress FutureWarning messages (the meteostat calls below are noisy)
warnings.simplefilter(action='ignore', category=FutureWarning)

g_weather = Graph()

g_weather.bind("fdo", FDO)
g_weather.bind("xsd", XSD)

# For each airport, fetch the weather rows and add one Weather node per row
index = 0
for airport in airports:
    # Best effort: a failure for one airport is reported and the loop goes on
    try:
        # Get the weather data for the airport coordinates
        point = Point(float(airport['lat']), float(airport['long']), 0)
        data = Hourly(point, start, end)
        data = data.fetch()

        airport_ref = FDO[airport['iata']]

        # Create the nodes to add to the Graph; the index label is the row timestamp
        for observed_at, row in data.iterrows():
            # The node URI is namespace + airport IATA + a key derived from the timestamp.
            # NOTE(review): timestamp() is in seconds, so "/1000" gives a coarser key;
            # rows that are an hour apart still get different keys. The expression
            # is kept unchanged so that existing node URIs stay the same.
            weather_uri = URIRef(str(FDO) + airport['iata'] + str(int(observed_at.timestamp()/1000)))

            # Add weather triples
            g_weather.add((weather_uri, RDF.type, FDO.Weather))
            # An xsd:dateTime literal needs an ISO 8601 lexical form, not an integer
            # epoch; passing a datetime lets rdflib produce it
            g_weather.add((weather_uri, FDO['weatherDate'], Literal(observed_at.to_pydatetime(), datatype=XSD.dateTime)))
            # A missing condition code is stored as 0
            coco_standardized = 0 if pd.isna(row.coco) else int(row.coco)
            g_weather.add((weather_uri, FDO['weatherType'], Literal(coco_standardized, datatype=XSD.nonNegativeInteger)))
            g_weather.add((weather_uri, FDO['hasAirport'], airport_ref))
    except Exception as e:
        print(f"Error weather ({airport['iata']}): {str(e)}")

    # Count the airport before reporting, so the last report reads 100 %
    index += 1
    print(f"{int(index/len(airports)*100)} % complete\r", end="\r")

print("serialization")
g_weather.serialize(destination=str(OUTPUT_PATH_WEATHER), format='turtle')
# Reported only once the file has actually been written
print("finished")

## Flights and routes

In [None]:
g_flights = Graph()
g_routes = Graph()

g_flights.bind("fdo", FDO)
g_flights.bind("xsd", XSD)
g_routes.bind("fdo", FDO)
g_routes.bind("xsd", XSD)

# CSV column -> ontology property, for columns holding a clock time as an HHMM number
timefieldsontology = {
        'Scheduled departure time as shown in Official Airline Guide(OAG)': 'ScheduledDepartureOAGTime',
        'Scheduled departure time as shown in CRS(selected by the Carrier)': 'ScheduledDepartureCRSTime',
        'Gate departure time (actual)': 'ActualGateDepartureTime',
        'Scheduled arrival time per OAG': 'ScheduledArrivalTimePerOAG',
        'Scheduled arrival time per CRS': 'ScheduledArrivalTimePerCRS',
        'Gate arrival time (actual)': 'ActualGateArrivalTime',
        'Wheels-off time (actual)': 'ActualWheels-offTime',
        'Wheels-on time (actual)': 'ActualWheels-onTime'
    }
# CSV column -> ontology property, for columns holding a number of minutes
minutesfieldsontology = {
    'Scheduled elapsed time per CRS': 'ScheduledElapsedTimePerCRS',
    'Actual gate-to-gate time': 'ActualGate-to-gateTime',
    'Departure delay time (actual minutes)': 'ActualDepartureDelayTime',
    'Arrival delay time (actual minutes)': 'ActualArrivalDelayTime',
    'Elapsed time difference (actual minutes)': 'ActualElapsedTimeDifference',
    'Minutes late for Delay Code E - Carrier Caused': 'LateE',
    'Minutes late for Delay Code F - Weather': 'LateF',
    'Minutes late for Delay Code G - National Aviation System (NAS)': 'LateG',
    'Minutes late for Delay Code H - Security': 'LateH',
    'Minutes late for Delay Code I - Late Arriving Flight (Initial)': 'LateI'
}


def _hhmm_to_datetime(value, flight_date, start_time):
    """Turn an HHMM clock value into a datetime on (or the day after) flight_date.

    Args:
        value: clock time as a number such as 930 or 1430.0; NaN and 0 are
            both treated as "no value".
        flight_date: datetime of the day the flight was operated.
        start_time: HHMM value of the scheduled OAG departure, the reference
            for the next-day check.

    Returns:
        A datetime, or None when there is no value.
    """
    # NOTE(review): 0 is treated as missing, so a time of exactly 00:00 encoded
    # as 0 is dropped as well -- confirm against the dataset documentation.
    if pd.isna(value) or value == 0:
        return None
    # divmod works for 1 to 4 digit values and for floats such as 1430.0;
    # slicing the string form gave hour 0 for both "930" and "1430.0"
    hours, minutes = divmod(int(value), 100)
    # A time more than two hours "before" the scheduled departure is taken to
    # belong to the next day (a NaN start_time makes the comparison False)
    day_offset = 1 if value < start_time - 200 else 0
    return flight_date + timedelta(days=day_offset, hours=hours, minutes=minutes)


try:
    # Load the CSV file in memory using pandas
    flights_df = pd.read_csv(FLIGHTS_PATH, dtype={"Scheduled Operating Carrier Code": "string", "Date of flight operation": "string"})
    # Fill NaN values in "Actual Operating Carrier Flight Number" with 0 and convert to int
    flights_df["Actual Operating Carrier Flight Number"] = flights_df["Actual Operating Carrier Flight Number"].fillna(0).astype(int)

except Exception as e:
    print(f"Error reading file: {str(e)}")
    raise

total_flights = len(flights_df)
last_percent = -1
index = 0
# Add flights to the flights graph
for _, row in flights_df.iterrows():

    route_id = quote(str(row['Departure airport code']) + str(row['Arrival airport code']))
    route_uri = URIRef(str(FDO) + route_id)

    # Add route triples only the first time a route is seen
    if (route_uri, RDF.type, FDO.Route) not in g_routes:
        g_routes.add((route_uri, RDF.type, FDO.Route))
        g_routes.add((route_uri, FDO['hasDepartureAirport'], URIRef(FDO[row['Departure airport code']])))
        g_routes.add((route_uri, FDO['hasArrivalAirport'], URIRef(FDO[row['Arrival airport code']])))

    # The flight URI is carrier code + flight number + date (YYYYMMDD)
    flight_date = datetime.strptime(row['Date of flight operation'], "%m/%d/%Y")
    flight_id = quote(str(row['Actual Operating Carrier Code']) + str(row['Actual Operating Carrier Flight Number']) + flight_date.strftime("%Y%m%d"))
    flight_uri = URIRef(str(FDO) + flight_id)

    # Convert every clock-time column; results go into a local dict instead of
    # being written back into the row, which is only a copy of the DataFrame data
    start_time = row['Scheduled departure time as shown in Official Airline Guide(OAG)']
    flight_times = {
        column: _hhmm_to_datetime(row[column], flight_date, start_time)
        for column in timefieldsontology
    }

    # Add flights triples
    g_flights.add((flight_uri, RDF.type, FDO.Flight))
    # Skip the aircraft link when the tail number is missing, otherwise the
    # flight would point at a meaningless "nan" aircraft node
    tail_number = row['Aircraft tail number']
    if not pd.isna(tail_number):
        g_flights.add((flight_uri, FDO['hasAircraft'], URIRef(FDO[str(tail_number)])))
    g_flights.add((flight_uri, FDO['hasRoute'], route_uri))
    g_flights.add((flight_uri, FDO['isOperatedBy'], URIRef(FDO[str(row['Actual Operating Carrier Code'])])))
    if not pd.isna(row['Cancellation code']):
        g_flights.add((flight_uri, FDO['CancellationCode'], Literal(row['Cancellation code'], datatype=XSD.string)))
    g_flights.add((flight_uri, FDO['flightDate'], Literal(flight_date, datatype=XSD.dateTime)))

    # time fields
    for original, ontology in timefieldsontology.items():
        if flight_times[original] is not None:
            g_flights.add((flight_uri, FDO[ontology], Literal(flight_times[original], datatype=XSD.dateTime)))
    # minutes fields
    for original, ontology in minutesfieldsontology.items():
        if not pd.isna(row[original]):
            g_flights.add((flight_uri, FDO[ontology], Literal(int(row[original]), datatype=XSD.integer)))

    # Report progress only when the percentage changes, not once per row
    index += 1
    percent = int(index/total_flights*100)
    if percent != last_percent:
        print(f"{percent} % complete", end="\r")
        last_percent = percent

print("serialization\r")
g_flights.serialize(destination=str(OUTPUT_PATH_FLIGHTS), format='turtle')
g_routes.serialize(destination=str(OUTPUT_PATH_ROUTES), format='turtle')