## Importing

In [10]:
import pandas as pd
import csv
from pathlib import Path
from urllib.parse import quote
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import XSD, RDFS
from meteostat import Point, Hourly
import warnings

## Settings

In [17]:

# --- Paths -----------------------------------------------------------------
# Base directory; every input and output path below is resolved against it.
BASE_PATH = Path(r"D:\\OneDrive - Università degli Studi di Padova\\Lezioni\\Magistrale\\Terzo semestre\\Graph DB\\Homework\\flydata")

# Input CSV files
AIRPORTS_PATH = BASE_PATH.joinpath("DataCollection", "CSVData", "airports.csv")
CITIES_PATH = BASE_PATH.joinpath("DataCollection", "CSVData", "cities.csv")
FLIGHTS_PATH = BASE_PATH.joinpath("DataCollection", "CSVData", "flights.csv")

# Turtle output files, one per graph
OUTPUT_PATH_AIRPORT = BASE_PATH.joinpath("Serialization", "ttl", "airport.ttl")
OUTPUT_PATH_CITY = BASE_PATH.joinpath("Serialization", "ttl", "city.ttl")
OUTPUT_PATH_WEATHER = BASE_PATH.joinpath("Serialization", "ttl", "weather.ttl")
OUTPUT_PATH_FLIGHTS = BASE_PATH.joinpath("Serialization", "ttl", "flights.ttl")

# --- Namespaces ------------------------------------------------------------
# Namespace under which every class, property and individual URI is minted.
FDO = Namespace("http://www.semanticweb.org/nele/ontologies/2024/10/flydata/")

# --- Weather time window ---------------------------------------------------
# Bounds handed to the hourly weather query.
# NOTE(review): `end` is a bare date with no time-of-day component; confirm
# whether the hours of 31 August after midnight are meant to be included.
start = pd.to_datetime("2024-8-1")
end = pd.to_datetime("2024-8-31")

## Cities

In [18]:
# Build the city graph: one fdo:City node per distinct `city_ascii` value of
# the cities CSV, carrying the city's name and population.
g_city = Graph()

g_city.bind("fdo", FDO)
g_city.bind("xsd", XSD)


def _population_text(value):
    """Return the population as the text stored in the fdo:population literal.

    Missing values become "0". Whole-number floats are rendered without a
    trailing ".0": pandas reads an integer column that contains missing values
    as float64, so a plain str() would emit "1234.0" instead of "1234".
    Any other value is passed through str() unchanged.
    """
    if pd.isna(value):
        return "0"
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Maps city_ascii -> {'name', 'population'}. Rows sharing the same city_ascii
# overwrite each other, so the last row of the file wins.
cities = {}
try:
    # Load the CSV file in memory using pandas
    cities_df = pd.read_csv(CITIES_PATH)
    for _, row in cities_df.iterrows():
        # Rows without an ASCII name cannot be keyed or turned into a URI
        if pd.isna(row['city_ascii']):
            continue

        city_ascii = str(row['city_ascii'])
        cities[city_ascii] = {
            'name': row['city'],
            'population': _population_text(row['population'])
        }

except Exception as e:
    # Report the failure, then re-raise so the cell still stops with an error
    print(f"Error reading file: {str(e)}")
    raise

# Add cities to the city graph
for city_ascii, data in cities.items():
    # Drop non-ASCII characters, apostrophes and commas, turn spaces into
    # underscores, then percent-encode whatever is left
    city_id = quote(city_ascii.encode('ascii', 'ignore').decode().replace(" ", "_").replace("'", "").replace(",", ""))
    city_uri = URIRef(str(FDO) + city_id)

    # Add city triples
    g_city.add((city_uri, RDF.type, FDO.City))
    g_city.add((city_uri, FDO.name, Literal(data['name'], datatype=XSD.string)))
    g_city.add((city_uri, FDO.population, Literal(data['population'], datatype=XSD.string)))

g_city.serialize(destination=str(OUTPUT_PATH_CITY), format='turtle')

<Graph identifier=N4ddb4f560ec1498aa0b277ffee1e06f5 (<class 'rdflib.graph.Graph'>)>

## Airports

In [19]:
# Build the airport graph: one fdo:Airport node per CSV row that has an IATA
# code, linked to its city when that city is a key of `cities`.
g_airport = Graph()

g_airport.bind("fdo", FDO)
g_airport.bind("xsd", XSD)

# Define the isLocatedInCity property in the airport graph
g_airport.add((FDO.isLocatedInCity, RDF.type, RDF.Property))
g_airport.add((FDO.isLocatedInCity, RDFS.domain, FDO.Airport))
g_airport.add((FDO.isLocatedInCity, RDFS.range, FDO.City))

# One dict per airport. 'lat' and 'long' are not written to this graph; they
# stay in the list because the weather cell reads them from it.
airports = []
try:
    # newline='' is what the csv module requires for files it reads: it lets
    # the reader handle line endings itself, so quoted fields that contain
    # line breaks are parsed correctly.
    with open(AIRPORTS_PATH, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            if row['IATA']:  # Only include airports with IATA code
                # Strip any literal double quotes left around the values
                airports.append({
                    'name': row['Name'].strip('"'),
                    'city': row['City'].strip('"'),
                    'iata': row['IATA'].strip('"'),
                    'lat': row['LAT'].strip('"'),
                    'long': row['LONG'].strip('"')
                })
except Exception as e:
    # Report the failure, then re-raise so the cell still stops with an error
    print(f"Error reading airports file: {str(e)}")
    raise

# Add airports and their relationships to the airport graph
for airport in airports:
    # Use IATA code directly in the URI
    airport_uri = URIRef(str(FDO) + airport['iata'])

    # Add airport triples
    g_airport.add((airport_uri, RDF.type, FDO.Airport))
    g_airport.add((airport_uri, FDO.name, Literal(airport['name'], datatype=XSD.string)))

    # Only add isLocatedInCity if the city exists in our cities dataset
    if airport['city'] in cities:
        # Same normalisation as the one used to mint the city URIs, so the
        # link points at the node created in the city graph
        city_id = quote(airport['city'].encode('ascii', 'ignore').decode().replace(" ", "_").replace("'", "").replace(",", ""))
        city_uri = URIRef(str(FDO) + city_id)
        g_airport.add((airport_uri, FDO.isLocatedInCity, city_uri))

# Serialize the airport graph to its own TTL file
g_airport.serialize(destination=str(OUTPUT_PATH_AIRPORT), format='turtle')

<Graph identifier=N8ef450fb457c42f09101060ff7e79cc2 (<class 'rdflib.graph.Graph'>)>

## Weather

In [24]:
# Silence the FutureWarnings raised while fetching the weather data.
# The filter is process-wide: it stays active for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Build the weather graph: one fdo:Weather node per airport and per hourly
# observation returned for the [start, end] window.
g_weather = Graph()

g_weather.bind("fdo", FDO)
g_weather.bind("xsd", XSD)

# Counting from 1 lets the progress line reach 100 % on the last airport
for index, airport in enumerate(airports, start=1):
    # A failure for one airport is reported and skipped (best effort), so the
    # remaining airports are still processed
    try:
        # Get the weather data
        # NOTE(review): the third Point argument (0) is presumably the
        # altitude; confirm against the meteostat documentation.
        point = Point(float(airport['lat']), float(airport['long']), 0)
        hourly_df = Hourly(point, start, end).fetch()

        # Create the node to add to the Graph
        for observed_at, row in hourly_df.iterrows():
            # The node URI is namespace + airport IATA + epoch seconds / 1000
            # (truncated). The scheme is kept exactly as it was, because other
            # cells may rebuild the same URIs to link to these nodes.
            weather_uri = URIRef(str(FDO) + airport['iata'] + str(int(observed_at.timestamp()/1000)))

            # Add weather triples
            g_weather.add((weather_uri, RDF.type, FDO.Weather))
            # xsd:dateTime needs an ISO 8601 lexical form; an integer epoch is
            # not a valid xsd:dateTime literal
            g_weather.add((weather_uri, FDO['weatherDate'], Literal(observed_at.isoformat(), datatype=XSD.dateTime)))
            # A missing weather condition code is stored as 0
            coco_standardized = 0 if pd.isna(row.coco) else int(row.coco)
            g_weather.add((weather_uri, FDO['weatherType'], Literal(coco_standardized, datatype=XSD.nonNegativeInteger)))
            g_weather.add((weather_uri, FDO['hasAirport'], FDO[airport['iata']]))
    except Exception as e:
        print(f"Error weather ({airport['iata']}): {str(e)}")

    print(f"{int(index/len(airports)*100)} % complete\r", end="")

g_weather.serialize(destination=str(OUTPUT_PATH_WEATHER), format='turtle')

99 % complete

<Graph identifier=N9618c1849cd647fa9d4e67ae15793236 (<class 'rdflib.graph.Graph'>)>