In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from time import sleep

# Path to the CSV file
file_path = r'C:\Users\omniq\Documents\GitHub\PractiseGit\CO2-effect\avia_par_nl__custom_13210883_spreadsheet (csvformat).csv'

# Load the CSV file into a DataFrame using ';' as the separator
df = pd.read_csv(file_path, sep=';')

# Extract the first column and rename it to "Flight Route"
first_column = df.iloc[:, 0].rename("Flight Route")

# Extract the last column and rename it to "2024 Q2 Total Passengers"
last_column = df.iloc[:, -1].rename("2024 Q2 Total Passengers")

# Create a new DataFrame with the desired columns
new_df = pd.DataFrame({
    'Flight Route': first_column,
    '2024 Q2 Total Passengers': last_column
})

# Clean the 'Flight Route' column by removing unwanted prefixes and formatting the text
new_df['Flight Route'] = new_df['Flight Route'].astype(str)
new_df['Flight Route'] = new_df['Flight Route'].str.replace('AMSTERDAM/SCHIPHOL airport - ', '', regex=False)
new_df['Flight Route'] = new_df['Flight Route'].str.replace('Maastricht/Aachen airport - ', '', regex=False)
new_df['Flight Route'] = new_df['Flight Route'].str.replace('Eindhoven airport - ', '', regex=False)
new_df['Flight Route'] = new_df['Flight Route'].str.replace('Rotterdam airport - ', '', regex=False)
new_df['Flight Route'] = new_df['Flight Route'].str.replace('Groningen Eelde airport - ', '', regex=False)
new_df['Flight Route'] = new_df['Flight Route'].str.title().str.strip()

# Handle non-numeric values and clean up the '2024 Q2 Total Passengers' column
new_df['2024 Q2 Total Passengers'] = new_df['2024 Q2 Total Passengers'].replace(',', '', regex=True)
new_df['2024 Q2 Total Passengers'] = pd.to_numeric(new_df['2024 Q2 Total Passengers'], errors='coerce')

# Initialize the geolocator with a user agent to avoid request issues
geolocator = Nominatim(user_agent="airport_geolocator")

# Function to get the coordinates of the airport
def get_airport_coordinates(airport_name):
    try:
        location = geolocator.geocode(airport_name)
        if location:
            return (location.latitude, location.longitude)
        else:
            retry_query = f"{airport_name} Airport"
            location = geolocator.geocode(retry_query)
            if location:
                return (location.latitude, location.longitude)
            else:
                return (None, None)
    except:
        return (None, None)

# Add a delay between requests to avoid overwhelming the geocoding service
def get_coordinates_with_delay(airport_name):
    coords = get_airport_coordinates(airport_name)
    sleep(0.05)
    return coords

# Apply the function to the 'Flight Route' column to get coordinates
new_df['Coordinates'] = new_df['Flight Route'].apply(get_coordinates_with_delay)

# Split coordinates into separate latitude and longitude columns
new_df[['Latitude', 'Longitude']] = pd.DataFrame(new_df['Coordinates'].tolist(), index=new_df.index)

# Drop the original 'Coordinates' column for clarity
new_df = new_df.drop(columns=['Coordinates'])

# Define the coordinates for the most central point in the Netherlands (Amersfoort)
central_netherlands_coords = (52.1561, 5.3878)

# Function to calculate the distance from the central point in the Netherlands to each airport
def calculate_distance_to_netherlands(lat, lon):
    if pd.notna(lat) and pd.notna(lon):
        return geodesic(central_netherlands_coords, (lat, lon)).kilometers
    else:
        return None

# Calculate the distance to the Netherlands for each airport
new_df['Distance to Netherlands (km)'] = new_df.apply(
    lambda row: calculate_distance_to_netherlands(row['Latitude'], row['Longitude']), axis=1
)

# Dictionary of known distances to Amersfoort, Netherlands for specific airports
known_distances = {
    "Montreal/Pierre Elliot Trudeau Intl, Qc Airport": 5560,
    "Toronto/Lester B. Pearson Intl, On Airport": 6000,
    "Hannover Uir Airport": 300,
    "Zagreb/Franjo Tudjman Airport": 1100,
    "Bangalore International Airport, Devenahalli, Bangalore Airport": 7800,
    "Tehran/Imam Khomaini Intl Airport": 4400,
    "Nairobi Acc,Fic,Rcc,Com Airport": 6500,
    "Luftfartstilsynet Civil Aviation Authority Airport": 1100,
    "Enfidha Zine El Abidine Ben Ali Airport": 1900,
    "Taibei City/Taibei Intl Ap Airport": 9500,
    "Kilimanjaro App, Twr, Ais, Met, Civil Airlines Airport": 7000,
    "Boston/General Edward Lawrence Logan International, Ma. Airport": 5600,
    "Washington Dulles International, Dc. Airport": 6200,
    "Maiquetia, Intl, Simon Bolivar, Vargas Airport": 7600,
    "Maastricht/Aachen Airport - Kos/Ippokratis Airport": 2100,
    "Maastricht/Aachen Airport - Zakinthos/Dionisios Solomos Airport": 2300,
    "Eindhoven Airport - Athinai/Eleftherios Venizelos Airport": 2200,
    "Eindhoven Airport - Kos/Ippokratis Airport": 2100,
    "Eindhoven Airport - Zagreb/Franjo Tudjman Airport": 1100,
    "Eindhoven Airport - Oujda/Angads Airport": 1900,
    "Eindhoven Airport - Nador/El Aroui Airport": 1900,
    "Groningen/Eelde Airport - Kos/Ippokratis Airport": 2100,
    "Rotterdam Airport - Kos/Ippokratis Airport": 2100,
    "Rotterdam Airport - Bergerac-Roumaniere Airport": 800,
    "Rotterdam Airport - Nador/El Aroui Airport": 1900,
}

# Function to update the distance in the DataFrame if it is NaN and has a known value in the dictionary
def update_distance(row):
    if pd.isna(row['Distance to Netherlands (km)']) and row['Flight Route'] in known_distances:
        return known_distances[row['Flight Route']]
    else:
        return row['Distance to Netherlands (km)']

# Apply the update function to the DataFrame to fill in the missing distances
new_df['Distance to Netherlands (km)'] = new_df.apply(update_distance, axis=1)

# Remove duplicate rows from the DataFrame
new_df = new_df.drop_duplicates()

# Display the ultimate DataFrame with all relevant columns, including NaNs where applicable
print(new_df[['Flight Route', '2024 Q2 Total Passengers', 'Latitude', 'Longitude', 'Distance to Netherlands (km)']].to_string(index=False))

print(f"Number of rows after processing: {len(new_df)}")


                                                           Flight Route  2024 Q2 Total Passengers   Latitude   Longitude  Distance to Netherlands (km)
                                                       Airp_Pr (Labels)                       NaN        NaN         NaN                           NaN
                                        Abu Dhabi International Airport                   53133.0  24.431899   54.641977                   5150.168866
                                            Dubai International Airport                  224942.0  25.252129   55.365716                   5128.537550
                                               Bonaire/Flamingo Airport                   44039.0  12.131027  -68.264918                   7836.943565
                                        Curacao/Aeropuerto Hato Airport                  167798.0  12.189053  -68.962162                   7879.480192
                                   St. Maarten/Princess Juliana Airport                       

In [2]:
# Create a new column that multiplies total passengers by distance to the Netherlands
new_df['Passenger Distance Product'] = new_df.apply(
    lambda row: row['2024 Q2 Total Passengers'] * row['Distance to Netherlands (km)'] 
    if pd.notna(row['2024 Q2 Total Passengers']) and pd.notna(row['Distance to Netherlands (km)']) 
    else 'N/A',
    axis=1
)

# Display the updated DataFrame with the new column
print(new_df[['Flight Route', '2024 Q2 Total Passengers', 'Latitude', 'Longitude', 'Distance to Netherlands (km)', 'Passenger Distance Product']].to_string(index=False))
print(f"Number of rows after processing: {len(new_df)}")


                                                           Flight Route  2024 Q2 Total Passengers   Latitude   Longitude  Distance to Netherlands (km) Passenger Distance Product
                                                       Airp_Pr (Labels)                       NaN        NaN         NaN                           NaN                        N/A
                                        Abu Dhabi International Airport                   53133.0  24.431899   54.641977                   5150.168866           273643922.374492
                                            Dubai International Airport                  224942.0  25.252129   55.365716                   5128.537550          1153623493.588541
                                               Bonaire/Flamingo Airport                   44039.0  12.131027  -68.264918                   7836.943565           345131157.667056
                                        Curacao/Aeropuerto Hato Airport                  167798.0  12.189053  

In [3]:
# Create a new column for distance categories based on the distance to the Netherlands
def categorize_distance(distance):
    if pd.isna(distance):
        return 'N/A'
    elif distance < 1000:
        return 'Short'
    elif 1000 <= distance <= 5000:
        return 'Middle'
    else:
        return 'Long'

# Apply the function to the 'Distance to Netherlands (km)' column
new_df['distancecategory'] = new_df['Distance to Netherlands (km)'].apply(categorize_distance)

# Display the updated DataFrame with the new distance category column
print(new_df[['Flight Route', '2024 Q2 Total Passengers', 'Latitude', 'Longitude', 'Distance to Netherlands (km)', 'Passenger Distance Product', 'distancecategory']].to_string(index=False))


                                                           Flight Route  2024 Q2 Total Passengers   Latitude   Longitude  Distance to Netherlands (km) Passenger Distance Product distancecategory
                                                       Airp_Pr (Labels)                       NaN        NaN         NaN                           NaN                        N/A              N/A
                                        Abu Dhabi International Airport                   53133.0  24.431899   54.641977                   5150.168866           273643922.374492             Long
                                            Dubai International Airport                  224942.0  25.252129   55.365716                   5128.537550          1153623493.588541             Long
                                               Bonaire/Flamingo Airport                   44039.0  12.131027  -68.264918                   7836.943565           345131157.667056             Long
                         

In [4]:
# Load the emission factors from the CSV file
emission_factors_path = r'C:\Users\omniq\Documents\GitHub\PractiseGit\CO2-effect\carbon-footprint-travel-mode.csv'  # Update this to the correct CSV file path

# Load the emission factors DataFrame
emission_df = pd.read_csv(emission_factors_path)

# Extract the emission factors based on the specified rows
# Assuming the rows are as follows:
short_flight_factor = emission_df.iloc[-2, -1]  # Penultimate row for short flights
middle_flight_factor = emission_df.iloc[5, -1]  # 6th row for middle flights
long_flight_factor = emission_df.iloc[10, -1]  # 11th row for long flights

# Define the emission factors for each flight category
emission_factors = {
    'Short': short_flight_factor,
    'Middle': middle_flight_factor,
    'Long': long_flight_factor
}

# Create a new column for the emission factor based on the distance category
new_df['Emission Factor (g CO2/pax-km)'] = new_df['distancecategory'].map(emission_factors)

# Display the updated DataFrame with the new emission factor column
print(new_df[['Flight Route', '2024 Q2 Total Passengers', 'Latitude', 'Longitude', 'Distance to Netherlands (km)', 'Passenger Distance Product', 'distancecategory', 'Emission Factor (g CO2/pax-km)']].to_string(index=False))


                                                           Flight Route  2024 Q2 Total Passengers   Latitude   Longitude  Distance to Netherlands (km) Passenger Distance Product distancecategory  Emission Factor (g CO2/pax-km)
                                                       Airp_Pr (Labels)                       NaN        NaN         NaN                           NaN                        N/A              N/A                             NaN
                                        Abu Dhabi International Airport                   53133.0  24.431899   54.641977                   5150.168866           273643922.374492             Long                          113.55
                                            Dubai International Airport                  224942.0  25.252129   55.365716                   5128.537550          1153623493.588541             Long                          113.55
                                               Bonaire/Flamingo Airport                   44

In [6]:
# Convert the columns to numeric, ensuring any non-convertible values become NaN
new_df['Passenger Distance Product'] = pd.to_numeric(new_df['Passenger Distance Product'], errors='coerce')
new_df['Emission Factor (g CO2/pax-km)'] = pd.to_numeric(new_df['Emission Factor (g CO2/pax-km)'], errors='coerce')

# Create a new column for total CO2 emissions for all passengers in tonnes
new_df['Total CO2 Emissions (tonnes, all passengers)'] = new_df.apply(
    lambda row: (row['Passenger Distance Product'] * row['Emission Factor (g CO2/pax-km)'] / 1_000_000) 
    if pd.notna(row['Passenger Distance Product']) and pd.notna(row['Emission Factor (g CO2/pax-km)']) 
    else 'N/A', 
    axis=1
)

# Display the updated DataFrame with the new total CO2 emissions column
print(new_df[['Flight Route', '2024 Q2 Total Passengers', 'Latitude', 'Longitude', 'Distance to Netherlands (km)', 'Passenger Distance Product', 'distancecategory', 'Emission Factor (g CO2/pax-km)', 'Total CO2 Emissions (tonnes, all passengers)']].to_string(index=False))

# Print the number of rows after processing
print(f"Number of rows after processing: {len(new_df)}")


                                                           Flight Route  2024 Q2 Total Passengers   Latitude   Longitude  Distance to Netherlands (km)  Passenger Distance Product distancecategory  Emission Factor (g CO2/pax-km) Total CO2 Emissions (tonnes, all passengers)
                                                       Airp_Pr (Labels)                       NaN        NaN         NaN                           NaN                         NaN              N/A                             NaN                                          N/A
                                        Abu Dhabi International Airport                   53133.0  24.431899   54.641977                   5150.168866                2.736439e+08             Long                          113.55                                 31072.267386
                                            Dubai International Airport                  224942.0  25.252129   55.365716                   5128.537550                1.153623e+09   