In [3]:
import pandas as pd
from geopy.geocoders import Nominatim
from time import sleep
from geopy.exc import GeocoderTimedOut

# Location of the semicolon-separated passenger spreadsheet export
file_path = r'C:\Users\omniq\Documents\GitHub\TIL6022-LabAssignments\Flytax-Project-Group24\CO2-effect\Data\avia_par_nl__custom_13210883_spreadsheet (csvformat).csv'

# Read the raw export; fields are delimited by ';'
df = pd.read_csv(file_path, sep=';')

# The first column holds the route labels; expose it under the name "Flight Route"
first_column = df.iloc[:, 0].rename("Flight Route")

# Frame that collects the geocoding results (Flight Route, Coordinates, Land Code)
geo_df = pd.DataFrame({'Flight Route': first_column})

# Departure-airport prefixes that are stripped (literal, case-sensitive match) from the labels.
# NOTE(review): later cells still handle labels such as "Rotterdam Airport - ..." and
# "Eindhoven Airport - ...", which suggests only the Schiphol prefix actually matches the
# casing used in the data — confirm before relying on the other four entries.
origin_prefixes = (
    'AMSTERDAM/SCHIPHOL airport - ',
    'Maastricht/Aachen airport - ',
    'Eindhoven airport - ',
    'Rotterdam airport - ',
    'Groningen Eelde airport - ',
)

# Normalise the labels: force to text, drop the prefixes, then title-case and trim whitespace
cleaned_routes = geo_df['Flight Route'].astype(str)
for origin_prefix in origin_prefixes:
    cleaned_routes = cleaned_routes.str.replace(origin_prefix, '', regex=False)
geo_df['Flight Route'] = cleaned_routes.str.title().str.strip()

# Geocoder client; a user agent is supplied to avoid request issues
geolocator = Nominatim(user_agent="airport_geolocator")

# Function to get the coordinates of the airport
def get_airport_coordinates(airport_name):
    """Look up the latitude/longitude of an airport by name.

    The name is geocoded as given through the module-level ``geolocator``;
    if that yields nothing, the lookup is retried once with " Airport"
    appended to the name.

    Args:
        airport_name: Free-text name to geocode.

    Returns:
        ``(latitude, longitude)`` of the first query that produces a hit, or
        ``(None, None)`` when neither query finds anything or the lookup
        raises an error.
    """
    try:
        for query in (airport_name, f"{airport_name} Airport"):
            # Same explicit timeout as the reverse lookup in get_land_code
            location = geolocator.geocode(query, timeout=10)
            if location:
                return (location.latitude, location.longitude)
    except Exception as exc:
        # Best-effort lookup: one failing name must not abort the whole run.
        # Catching Exception (instead of a bare except) still lets
        # KeyboardInterrupt / SystemExit stop a long geocoding loop, and the
        # message makes failures distinguishable from "no result".
        print(f"Geocoding failed for {airport_name!r}: {exc}")
        return (None, None)
    return (None, None)

# Add a delay between requests to avoid overwhelming the geocoding service
def get_coordinates_with_delay(airport_name, delay=1.0):
    """Geocode one airport name, then pause before the caller's next request.

    Args:
        airport_name: Name forwarded to ``get_airport_coordinates``.
        delay: Seconds to sleep after the lookup. The default of one second
            keeps a sequential run at roughly one lookup per second, the rate
            the public Nominatim service asks clients not to exceed (the
            previous fixed 0.05 s pause allowed ~20 requests per second).

    Returns:
        Whatever ``get_airport_coordinates`` returns for ``airport_name``.
    """
    coords = get_airport_coordinates(airport_name)
    # The pause follows every lookup, including failed ones, because a request
    # may have been sent either way.
    sleep(delay)
    return coords

# Function to get the land code (country code) based on coordinates
def get_land_code(lat, lon):
    """Reverse-geocode a coordinate pair to an upper-case country code.

    Args:
        lat: Latitude; a missing value (None/NaN) short-circuits to ``None``.
        lon: Longitude; a missing value (None/NaN) short-circuits to ``None``.

    Returns:
        The ``country_code`` entry of the reverse-geocoding address in upper
        case, or ``None`` when a coordinate is missing, the request times out,
        nothing is found, or the address carries no country code.
    """
    # Nothing to look up without a complete coordinate pair
    if pd.isna(lat) or pd.isna(lon):
        return None

    try:
        place = geolocator.reverse((lat, lon), language='en', timeout=10)
    except GeocoderTimedOut:
        return None

    if not place or 'country_code' not in place.raw['address']:
        return None
    return place.raw['address']['country_code'].upper()

# Apply the function to the 'Flight Route' column to get coordinates and land codes.
# Every distinct route label is geocoded once and the result is reused for all rows
# that share the label, so repeated labels do not trigger repeated requests.
coords_by_route = {
    route: get_coordinates_with_delay(route)
    for route in geo_df['Flight Route'].drop_duplicates()
}
geo_df['Coordinates'] = geo_df['Flight Route'].apply(coords_by_route.get)
geo_df[['Latitude', 'Longitude']] = pd.DataFrame(geo_df['Coordinates'].tolist(), index=geo_df.index)

# Pause (seconds) after each reverse lookup; previously these requests were sent
# back-to-back with no throttling at all.
REVERSE_GEOCODE_DELAY_SECONDS = 1.0


def _land_code_with_pause(row):
    """Return the land code for one row, pausing only when a request was actually made."""
    code = get_land_code(row['Latitude'], row['Longitude'])
    # get_land_code skips the request when a coordinate is missing, so no pause is needed then
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        sleep(REVERSE_GEOCODE_DELAY_SECONDS)
    return code


geo_df['Land Code'] = geo_df.apply(_land_code_with_pause, axis=1)

# Drop the original 'Coordinates' column for clarity
geo_df = geo_df.drop(columns=['Coordinates'])

# Remove duplicate rows
geo_df = geo_df.drop_duplicates()

# Display the DataFrame with Flight Route, Coordinates, and Land Code
print(geo_df[['Flight Route', 'Latitude', 'Longitude', 'Land Code']].to_string(index=False))

# Save this geo_df as a CSV or continue using it in memory
geo_df.to_csv('geo_data.csv', index=False)  # Optional: Save the geo data for future use



                                                           Flight Route   Latitude   Longitude Land Code
                                                       Airp_Pr (Labels)        NaN         NaN      None
                                        Abu Dhabi International Airport  24.431899   54.641977        AE
                                            Dubai International Airport  25.252129   55.365716        AE
                                               Bonaire/Flamingo Airport  12.131027  -68.264918        NL
                                        Curacao/Aeropuerto Hato Airport  12.189053  -68.962162        NL
                                   St. Maarten/Princess Juliana Airport  18.040874  -63.111852        NL
                                 Ezeiza Ministro Pistarini (Ba) Airport        NaN         NaN      None
                                                      Innsbruck Airport  47.259726   11.341871        AT
                                                 Wien-S

In [8]:
import pandas as pd
from geopy.distance import geodesic

# geo_df is expected to be in memory already (built by the geocoding cell).
# geo_df = pd.read_csv('geo_data.csv')  # Uncomment to reload it from the saved file instead

# Passenger counts: the last column of the raw export, named "2024 Q2 Total Passengers"
last_column = df.iloc[:, -1].rename("2024 Q2 Total Passengers")

# Pair every route label with its passenger count (the two Series are aligned on their index)
route_passengers = pd.DataFrame({
    'Flight Route': geo_df['Flight Route'],
    '2024 Q2 Total Passengers': last_column
})

# Attach coordinates and land codes, keeping every row of the passenger table
geo_columns = ['Flight Route', 'Latitude', 'Longitude', 'Land Code']
final_df = route_passengers.merge(geo_df[geo_columns], how='left', on='Flight Route')

# Reference point used for all distances: Amersfoort, taken as the centre of the Netherlands
central_netherlands_coords = (52.1561, 5.3878)

# Function to calculate the distance from the central point in the Netherlands to each airport
def calculate_distance_to_netherlands(lat, lon):
    """Geodesic distance in kilometres from ``central_netherlands_coords`` to a point.

    Args:
        lat: Latitude of the destination.
        lon: Longitude of the destination.

    Returns:
        The distance in kilometres, or ``None`` when either coordinate is
        missing (None/NaN).
    """
    if pd.isna(lat) or pd.isna(lon):
        return None
    return geodesic(central_netherlands_coords, (lat, lon)).kilometers

# Distance from the reference point for every row (None where coordinates are missing)
def _distance_for_row(row):
    """Compute the distance for one row from its Latitude/Longitude columns."""
    return calculate_distance_to_netherlands(row['Latitude'], row['Longitude'])


final_df['Distance to Netherlands (km)'] = final_df.apply(_distance_for_row, axis=1)

# Hand-entered distances (km) to Amersfoort, Netherlands, keyed by the exact route label.
# They are only used for rows whose distance could not be computed.
known_distances = {
    "Montreal/Pierre Elliot Trudeau Intl, Qc Airport": 5560,
    "Toronto/Lester B. Pearson Intl, On Airport": 6000,
    "Hannover Uir Airport": 300,
    "Zagreb/Franjo Tudjman Airport": 1100,
    "Bangalore International Airport, Devenahalli, Bangalore Airport": 7800,
    "Tehran/Imam Khomaini Intl Airport": 4400,
    "Nairobi Acc,Fic,Rcc,Com Airport": 6500,
    "Luftfartstilsynet Civil Aviation Authority Airport": 1100,
    "Enfidha Zine El Abidine Ben Ali Airport": 1900,
    "Taibei City/Taibei Intl Ap Airport": 9500,
    "Kilimanjaro App, Twr, Ais, Met, Civil Airlines Airport": 7000,
    "Boston/General Edward Lawrence Logan International, Ma. Airport": 5600,
    "Washington Dulles International, Dc. Airport": 6200,
    "Maiquetia, Intl, Simon Bolivar, Vargas Airport": 7600,
    "Maastricht/Aachen Airport - Kos/Ippokratis Airport": 2100,
    "Maastricht/Aachen Airport - Zakinthos/Dionisios Solomos Airport": 2300,
    "Eindhoven Airport - Athinai/Eleftherios Venizelos Airport": 2200,
    "Eindhoven Airport - Kos/Ippokratis Airport": 2100,
    "Eindhoven Airport - Zagreb/Franjo Tudjman Airport": 1100,
    "Eindhoven Airport - Oujda/Angads Airport": 1900,
    "Eindhoven Airport - Nador/El Aroui Airport": 1900,
    "Groningen/Eelde Airport - Kos/Ippokratis Airport": 2100,
    "Rotterdam Airport - Kos/Ippokratis Airport": 2100,
    "Rotterdam Airport - Bergerac-Roumaniere Airport": 800,
    "Rotterdam Airport - Nador/El Aroui Airport": 1900,
}


def _distance_with_fallback(row):
    """Keep a computed distance; fall back to the hand-entered value only when it is missing."""
    computed = row['Distance to Netherlands (km)']
    if pd.isna(computed) and row['Flight Route'] in known_distances:
        return known_distances[row['Flight Route']]
    return computed


# Fill the gaps from the lookup table; rows without an entry stay NaN
final_df['Distance to Netherlands (km)'] = final_df.apply(_distance_with_fallback, axis=1)

# Remove duplicate rows from the DataFrame
final_df = final_df.drop_duplicates()

# Clean '2024 Q2 Total Passengers' column: strip commas, then convert to numbers
# (anything that still cannot be parsed becomes NaN)
final_df['2024 Q2 Total Passengers'] = final_df['2024 Q2 Total Passengers'].replace(',', '', regex=True)
final_df['2024 Q2 Total Passengers'] = pd.to_numeric(final_df['2024 Q2 Total Passengers'], errors='coerce')

# Passengers multiplied by distance, computed column-wise.
# Missing inputs propagate as NaN, so the column stays numeric instead of mixing
# floats with the string 'N/A' (which had to be coerced back to NaN further down).
final_df['Passenger Distance Product'] = (
    final_df['2024 Q2 Total Passengers']
    * pd.to_numeric(final_df['Distance to Netherlands (km)'], errors='coerce')
)

# Create a new column for distance categories based on the distance to the Netherlands
def categorize_distance(distance):
    """Classify a distance in kilometres as 'Short', 'Middle' or 'Long'.

    Args:
        distance: Distance in km; may be missing (None/NaN).

    Returns:
        'N/A' for a missing distance, 'Short' below 1000 km, 'Middle' from
        1000 km up to and including 5000 km, and 'Long' above that.
    """
    if pd.isna(distance):
        return 'N/A'
    if distance < 1000:
        return 'Short'
    # From here on the distance is at least 1000 km
    return 'Middle' if distance <= 5000 else 'Long'

# Bucket every row by its distance ('Short' / 'Middle' / 'Long' / 'N/A')
final_df['distancecategory'] = final_df['Distance to Netherlands (km)'].apply(categorize_distance)

# Read the emission-factor table
emission_factors_path = r'C:\Users\omniq\Documents\GitHub\TIL6022-LabAssignments\Flytax-Project-Group24\CO2-effect\Data\carbon-footprint-travel-mode.csv'
emission_df = pd.read_csv(emission_factors_path)

# The factors are picked by row position from the last column of the table.
# NOTE(review): this depends on the row order of the CSV staying exactly as it is — confirm.
factor_column = emission_df.iloc[:, -1]
short_flight_factor = factor_column.iloc[-2]   # penultimate row: short flights
middle_flight_factor = factor_column.iloc[5]   # 6th row: middle flights
long_flight_factor = factor_column.iloc[10]    # 11th row: long flights

# Look up the factor that belongs to each row's category ('N/A' has no entry and maps to NaN)
emission_factors = {'Short': short_flight_factor, 'Middle': middle_flight_factor, 'Long': long_flight_factor}
final_df['Emission Factor (g CO2/pax-km)'] = final_df['distancecategory'].map(emission_factors)

# Make sure both inputs of the emission calculation are numeric (unparseable values become NaN)
for numeric_column in ('Passenger Distance Product', 'Emission Factor (g CO2/pax-km)'):
    final_df[numeric_column] = pd.to_numeric(final_df[numeric_column], errors='coerce')


def _total_co2_tonnes(row):
    """Total CO2 in tonnes for one row, or the string 'N/A' when an input is missing."""
    pax_km = row['Passenger Distance Product']
    factor = row['Emission Factor (g CO2/pax-km)']
    if pd.notna(pax_km) and pd.notna(factor):
        # grams -> tonnes
        return pax_km * factor / 1_000_000
    return 'N/A'


# Total CO2 emissions for all passengers, in tonnes
final_df['Total CO2 Emissions (tonnes, all passengers)'] = final_df.apply(_total_co2_tonnes, axis=1)

# Function to calculate flight time
def calculate_flight_time(distance, avg_speed, pre_flight_time):
    """Estimate total travel time as time in the air plus a fixed pre-flight allowance.

    Args:
        distance: Distance to cover; may be missing (None/NaN).
        avg_speed: Average speed, in distance units per hour.
        pre_flight_time: Hours added on top of the time in the air.

    Returns:
        ``distance / avg_speed + pre_flight_time``, or ``None`` when the
        distance is missing or not positive, or the speed is not positive.
    """
    if pd.isna(distance):
        return None
    if not (distance > 0 and avg_speed > 0):
        return None
    airborne_hours = distance / avg_speed
    return airborne_hours + pre_flight_time

# Assumed (example) cruise speed and fixed time spent before the flight
avg_speed = 800  # km/h
pre_flight_time = 1.5  # hours

# Travel time per row; the speed and the pre-flight allowance are passed through as extra arguments
final_df['Flight Time (hours)'] = final_df['Distance to Netherlands (km)'].apply(
    calculate_flight_time, args=(avg_speed, pre_flight_time)
)

# Print the final table with all relevant columns
report_columns = [
    'Flight Route',
    '2024 Q2 Total Passengers',
    'Latitude',
    'Longitude',
    'Distance to Netherlands (km)',
    'Passenger Distance Product',
    'distancecategory',
    'Emission Factor (g CO2/pax-km)',
    'Total CO2 Emissions (tonnes, all passengers)',
    'Flight Time (hours)',
]
print(final_df[report_columns].to_string(index=False))



                                                           Flight Route  2024 Q2 Total Passengers   Latitude   Longitude  Distance to Netherlands (km)  Passenger Distance Product distancecategory  Emission Factor (g CO2/pax-km) Total CO2 Emissions (tonnes, all passengers)  Flight Time (hours)
                                                       Airp_Pr (Labels)                       NaN        NaN         NaN                           NaN                         NaN              N/A                             NaN                                          N/A                  NaN
                                        Abu Dhabi International Airport                   53133.0  24.431899   54.641977                   5150.168866                2.736439e+08             Long                          113.55                                 31072.267386             7.937711
                                            Dubai International Airport                  224942.0  25.252129   55.3657

In [86]:
import pandas as pd

# Location of the previously exported results table
file_path = r'C:\Users\omniq\Documents\GitHub\TIL6022-LabAssignments\Flytax-Project-Group24\CO2-effect\final_data.csv'

# Read the whole file; no rows are skipped at load time
df_final = pd.read_csv(file_path)

# Keep only the first column (route label) and the last column (distance), under explicit names
df_extracted = pd.DataFrame({
    'Flight Route': df_final.iloc[:, 0],
    'Distance to Netherlands (km)': df_final.iloc[:, -1],
})

# Trim 4 rows from the top and 2 rows from the bottom, then renumber from 0.
# NOTE(review): the printed result still ends with the rows 'Nan' and 'Special Value' —
# confirm that the trailing cut removes everything it is meant to.
leading_rows_to_skip = 4
trailing_rows_to_skip = 2
df_extracted_cut = df_extracted.iloc[leading_rows_to_skip:-trailing_rows_to_skip].reset_index(drop=True)

# Show the trimmed table
print(df_extracted_cut)





                                    Flight Route  Distance to Netherlands (km)
0                Curacao/Aeropuerto Hato Airport                   7879.480192
1           St. Maarten/Princess Juliana Airport                   6980.201827
2         Ezeiza Ministro Pistarini (Ba) Airport                           NaN
3                              Innsbruck Airport                    693.064730
4                         Wien-Schwechat Airport                    917.280610
..                                           ...                           ...
494  Rotterdam Airport - London Heathrow Airport                           NaN
495        Rotterdam Airport - Edinburgh Airport                           NaN
496  Rotterdam Airport - London Stansted Airport                           NaN
497                                          Nan                    768.876795
498                                Special Value                   8570.221611

[499 rows x 2 columns]


In [76]:
import pandas as pd

# Input files: the exported results table and the yearly passenger spreadsheet
file_final_data = r'C:\Users\omniq\Documents\GitHub\TIL6022-LabAssignments\Flytax-Project-Group24\CO2-effect\final_data.csv'
file_path = r'C:\Users\omniq\Documents\GitHub\TIL6022-LabAssignments\Flytax-Project-Group24\CO2-effect\Data\Flight Netherlands\avia_par_nl__custom_13280147_spreadsheet.csv'

# Results table: route label and distance
df_final = pd.read_csv(file_final_data)

# First column is the route label, last column the distance; name them explicitly
df_extracted = pd.DataFrame({
    'Flight Route': df_final.iloc[:, 0],
    'Distance to Netherlands (km)': df_final.iloc[:, -1],
})

# Yearly spreadsheet: ';'-separated, with the first 10 rows skipped
df = pd.read_csv(file_path, sep=';', skiprows=10)

# Column labels: the route, then for each year 2010-2023 the year itself followed by a
# placeholder ('Unnamed_2', 'Unnamed_4', ..., 'Unnamed_28') for the column next to it
df.columns = ['Flight Route'] + [
    label
    for position, year in enumerate(range(2010, 2024), start=1)
    for label in (str(year), f'Unnamed_{2 * position}')
]

# Work on a copy that holds only the route and the 2022 figures
df_2022 = df[['Flight Route', '2022']].copy()

# Strip commas, convert to numbers (unparseable -> NaN), then mark the gaps with 'N/A'
passengers_2022 = df_2022['2022'].str.replace(',', '', regex=False)
passengers_2022 = pd.to_numeric(passengers_2022, errors='coerce')
df_2022['2022'] = passengers_2022.fillna('N/A')

# Drop the first 2 and the last 5 rows and renumber from 0
df_2022_cut = df_2022.iloc[2:-5].reset_index(drop=True)

# Show the trimmed table
print(df_2022_cut)



                                          Flight Route      2022
0    AMSTERDAM/SCHIPHOL airport - CURACAO/AEROPUERT...  671781.0
1    AMSTERDAM/SCHIPHOL airport - ST. MAARTEN/PRINC...   80203.0
2    AMSTERDAM/SCHIPHOL airport - EZEIZA MINISTRO P...  108080.0
3       AMSTERDAM/SCHIPHOL airport - INNSBRUCK airport   78207.0
4    AMSTERDAM/SCHIPHOL airport - WIEN-SCHWECHAT ai...  611758.0
..                                                 ...       ...
494         ROTTERDAM airport - LONDON GATWICK airport       N/A
495            ROTTERDAM airport - LONDON/CITY airport  112276.0
496        ROTTERDAM airport - LONDON HEATHROW airport       N/A
497              ROTTERDAM airport - EDINBURGH airport   27460.0
498        ROTTERDAM airport - LONDON STANSTED airport       N/A

[499 rows x 2 columns]


In [85]:
import pandas as pd

# Requires df_extracted_cut (route label + distance) and df_2022_cut (route label + 2022
# passengers) from the cells that build them.

# Renumber df_2022_cut from 0 (kept from the original cell)
df_2022_cut = df_2022_cut.reset_index(drop=True)

# The two tables are matched on the route label, not on row position. Their printed outputs
# show that the rows do not line up (row 494 is London Heathrow in df_extracted_cut but
# London Gatwick in df_2022_cut), so a positional combine attaches passenger numbers to the
# wrong route.
#
# Bring the raw labels of df_2022_cut into the form used in df_extracted_cut by applying the
# label cleaning of the geocoding cell: literal prefix removal, title case, trimmed whitespace.
# NOTE(review): assumes the labels in final_data.csv were produced by exactly that cleaning;
# routes that find no match get NaN passengers instead of a value from an unrelated row.
route_keys_2022 = df_2022_cut.iloc[:, 0].astype(str)
for origin_prefix in (
    'AMSTERDAM/SCHIPHOL airport - ',
    'Maastricht/Aachen airport - ',
    'Eindhoven airport - ',
    'Rotterdam airport - ',
    'Groningen Eelde airport - ',
):
    route_keys_2022 = route_keys_2022.str.replace(origin_prefix, '', regex=False)
route_keys_2022 = route_keys_2022.str.title().str.strip()

# Lookup table: cleaned route label -> 2022 passengers (second column of df_2022_cut).
# Only the first occurrence of a repeated label is kept because the lookup needs unique keys.
passengers_by_route = pd.Series(df_2022_cut.iloc[:, 1].to_numpy(), index=route_keys_2022)
passengers_by_route = passengers_by_route[~passengers_by_route.index.duplicated(keep='first')]

# Combine the two dataframes; rows and their order follow df_extracted_cut
df_combined = pd.DataFrame({
    'Flight Route': df_extracted_cut['Flight Route'],
    'Distance to Netherlands (km)': df_extracted_cut['Distance to Netherlands (km)'],
    'Passengers 2022': df_extracted_cut['Flight Route'].map(passengers_by_route)
})

# Display the resulting DataFrame
print(df_combined)


                                    Flight Route  \
0                Curacao/Aeropuerto Hato Airport   
1           St. Maarten/Princess Juliana Airport   
2         Ezeiza Ministro Pistarini (Ba) Airport   
3                              Innsbruck Airport   
4                         Wien-Schwechat Airport   
..                                           ...   
494  Rotterdam Airport - London Heathrow Airport   
495        Rotterdam Airport - Edinburgh Airport   
496  Rotterdam Airport - London Stansted Airport   
497                                          Nan   
498                                Special Value   

     Distance to Netherlands (km) Passengers 2022  
0                     7879.480192        671781.0  
1                     6980.201827         80203.0  
2                             NaN        108080.0  
3                      693.064730         78207.0  
4                      917.280610        611758.0  
..                            ...             ...  
494        