# Pipeline

1. Maak SQL scripts voor schema's: RAW, ARCHIVED, CLEANSED
2. Importeer source data in RAW
3. Data cleaning => RAW naar ARCHIVED en CLEANSED
4. Maak SQL scripts voor Data Warehouse / sterschema
5. Import van CLEANSED naar DWH
6. Prep Data lake: export tabellen naar Parquet files
7. Upload Parquet files naar S3 (eerst bucket aanmaken)
8. Maak Athena tables
9. Gebruik Athena in BI tool naar keuze

In [54]:
%pip install -q pandas sqlalchemy psycopg2-binary

Note: you may need to restart the kernel to use updated packages.


## Stap 1: SQL scripts

In [55]:
import psycopg2

# Connection settings for the PostgreSQL instance that holds the pipeline schemas.
# NOTE(review): the credentials are hard-coded in the notebook; consider reading
# them from the environment before this file is shared.
host = "192.168.56.1"
dbname = "postgres"
user = "postgres"
password = "Newpassword"
port = "5433"  # not the PostgreSQL default (5432); this instance listens on 5433

# Open the connection. `conn` and `cur` are intentionally left open when the
# cell finishes, exactly as before, so that they can still be used afterwards.
conn = psycopg2.connect(
    host=host,
    dbname=dbname,
    user=user,
    password=password,
    port=port
)

# Cursor used to run the schema scripts
cur = conn.cursor()

# Schema-creation scripts, run in this order and committed together below.
schema_scripts = [
    './sql_scripts/raw.sql',
    './sql_scripts/archived.sql',
    './sql_scripts/cleansed.sql',
]

try:
    for script_path in schema_scripts:
        # Read the whole script and send it to the server as one statement batch
        with open(script_path, 'r') as file:
            sql_script = file.read()
        cur.execute(sql_script)
except Exception:
    # A failed statement (or an unreadable script) would otherwise leave the
    # connection stuck in an aborted transaction; roll back so `conn` stays
    # usable, then surface the original error.
    conn.rollback()
    raise
else:
    # Persist the changes made by the scripts
    conn.commit()



## Stap 2: raw importeren

In [56]:
from sqlalchemy import create_engine, types as sqlalchemytypes
import pandas as pd

# SQLAlchemy engine pointing at the PostgreSQL database used by the pipeline
engine = create_engine('postgresql://postgres:Newpassword@192.168.56.1:5433/postgres')

# Every raw table is fed from one export file named
# ./source_data/export_<table>.<extension>; only the extension differs.
_source_dir = './source_data'
_export_extensions = {
    'aankomst': 'txt',
    'banen': 'csv',
    'klant': 'csv',
    'luchthavens': 'txt',
    'maatschappijen': 'txt',
    'planning': 'txt',
    'vertrek': 'txt',
    'vliegtuig': 'txt',
    'vliegtuigtype': 'csv',
    'vlucht': 'txt',
    'weer': 'txt',
}

# Table name -> path of the export file to load into that table
tables = {
    name: f'{_source_dir}/export_{name}.{extension}'
    for name, extension in _export_extensions.items()
}

# Column names per raw table, in the order they are declared.
# NOTE(review): 'luchthavens' lists both "TZ" and "Tz"; they are distinct keys
# that differ only by case - confirm both columns exist in the export.
_raw_columns = {
    'aankomst': [
        "Vluchtid", "Vliegtuigcode", "Terminal", "Gate",
        "Baan", "Bezetting", "Vracht", "Aankomsttijd",
    ],
    'banen': ["Baannummer", "Code", "Naam", "Lengte"],
    'klant': ["Vluchtid", "Operatie", "Faciliteiten", "Shops"],
    'luchthavens': [
        "Airport", "City", "Country", "IATA", "ICAO",
        "Lat", "Lon", "Alt", "TZ", "DST", "Tz",
    ],
    'maatschappijen': ["Name", "IATA", "ICAO"],
    'planning': [
        "Vluchtnr", "Airlinecode", "Destcode",
        "Planterminal", "Plangate", "Plantijd",
    ],
    'vertrek': [
        "Vluchtid", "Vliegtuigcode", "Terminal", "Gate",
        "Baan", "Bezetting", "Vracht", "Vertrektijd",
    ],
    'vliegtuig': ["Airlinecode", "Vliegtuigcode", "Vliegtuigtype", "Bouwjaar"],
    'vliegtuigtype': [
        "IATA", "ICAO", "Merk", "Type",
        "Wake", "Cat", "Capaciteit", "Vracht",
    ],
    'vlucht': [
        "Vluchtid", "Vluchtnr", "Airlinecode",
        "Destcode", "Vliegtuigcode", "Datum",
    ],
    'weer': [
        "Datum", "DDVEC", "FHVEC", "FG", "FHX", "FHXH", "FHN", "FHNH",
        "FXX", "FXXH", "TG", "TN", "TNH", "TX", "TXH", "T10N", "T10NH",
        "SQ", "SP", "Q", "DR", "RH", "RHX", "RHXH", "PG", "PX", "PXH",
        "PN", "PNH", "VVN", "VVNH", "VVX", "VVXH", "NG", "UG", "UX",
        "UXH", "UN", "UNH", "EV2",
    ],
}

# SQL type to use for each column when writing a table: every column is
# declared as a plain string type.
column_types = {
    table_name: dict.fromkeys(column_names, sqlalchemytypes.String)
    for table_name, column_names in _raw_columns.items()
}

# Tables whose export file is semicolon-separated; all others are tab-separated.
semicolon_tables = {'banen', 'klant', 'vliegtuigtype'}

# Read every export file as text and append its rows to the matching raw table.
for table, file_path in tables.items():
    delimiter = ';' if table in semicolon_tables else '\t'

    # Try UTF-8 first; fall back to Latin-1 when the file is not valid UTF-8.
    try:
        df = pd.read_csv(file_path, sep=delimiter, dtype=str, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, sep=delimiter, dtype=str, encoding='latin1')

    df.to_sql(
        table,
        con=engine,
        schema='raw',
        if_exists='append',
        index=False,
        dtype=column_types[table],
    )

print("Data import complete.")


Data import complete.


## Stap 3: Data cleaning

In [58]:
from sqlalchemy import create_engine, inspect
import pandas as pd

# SQLAlchemy engine pointing at the PostgreSQL database used by the pipeline
engine = create_engine('postgresql://postgres:Newpassword@192.168.56.1:5433/postgres')

# Ask the database which tables currently exist in the 'raw' schema
inspector = inspect(engine)
tables = inspector.get_table_names(schema='raw')

# Read each raw table into a DataFrame and publish it as a notebook-level
# variable named '<table>_df' (e.g. the table 'weer' becomes `weer_df`).
for table in tables:
    df = pd.read_sql_table(table, con=engine, schema='raw')
    globals()[f'{table}_df'] = df

# Keep the raw table names in their own list
tables_raw = list(tables)

# Report the name of every variable created above
for table in tables:
    var_name = f'{table}_df'
    print(f"Variable created: {var_name}")

Variable created: aankomst_df
Variable created: banen_df
Variable created: klant_df
Variable created: luchthavens_df
Variable created: maatschappijen_df
Variable created: planning_df
Variable created: vertrek_df
Variable created: vliegtuig_df
Variable created: vliegtuigtype_df
Variable created: vlucht_df
Variable created: weer_df
