In [2]:
# import string
import math

import pandas as pd
import seaborn as sns
from rapidfuzz import process, fuzz

def split_route(route):
    """Split a free-text route into an ``(origin, destination)`` pair.

    The route is converted to ``str``, lower-cased and stripped. Everything
    from the first ``' via '`` onwards is discarded, and the remainder is
    split on ``' to '``: the first piece is the origin (departing location)
    and the last piece is the destination (arriving location).

    Args:
        route: Route description, e.g. ``"London to Paris via Brussels"``.

    Returns:
        ``(origin, destination)`` as stripped lower-case strings, or
        ``(None, None)`` when the text contains no ``' to '`` separator.
    """
    text = str(route).lower().strip()
    if ' to ' not in text:
        return None, None

    # split(' via ')[0] is the whole string when there is no ' via ' part,
    # so a single expression covers both the direct and the "via" case.
    legs = text.split(' via ')[0].split(' to ')
    return legs[0].strip(), legs[-1].strip()

def get_unique_col_values(df, col_names):
    """Return the unique normalised values found across several text columns.

    Each column named in ``col_names`` is lower-cased and stripped of
    surrounding whitespace; the columns are then stacked and de-duplicated.

    Args:
        df: DataFrame holding the columns.
        col_names: Iterable of column names to read from ``df``.

    Returns:
        The result of ``Series.unique()`` over the stacked, cleaned values,
        in order of first appearance.
    """
    cleaned_columns = []
    for name in col_names:
        cleaned_columns.append(df[name].str.lower().str.strip())
    stacked = pd.concat(cleaned_columns)
    return stacked.unique()
    
def closest_match(query, choices):
    """Find the entry of ``choices`` that best matches ``query``.

    Uses ``process.extractOne`` with the ``fuzz.WRatio`` scorer and returns
    the first two items of its result.

    Args:
        query: Value to look up.
        choices: Candidates to compare against.

    Returns:
        ``(match, score)``; ``(None, 0)`` if the lookup raises or its result
        cannot be indexed (for example when nothing was returned).
    """
    try:
        best = process.extractOne(query, choices, scorer=fuzz.WRatio)
        candidate, similarity = best[0], best[1]
    except Exception:
        # Best-effort helper: any failure is reported as "no match".
        return (None, 0)
    return candidate, similarity

def try_map(word, mapper):
    """Look up ``word`` in ``mapper``, returning ``None`` when it is absent.

    Args:
        word: Key to look up.
        mapper: Any object supporting ``mapper[word]`` (typically a dict).

    Returns:
        ``mapper[word]``, or ``None`` if the key is missing, is unhashable,
        or ``mapper`` does not support indexing.
    """
    try:
        return mapper[word]
    except (LookupError, TypeError):
        # Only lookup failures mean "no mapping"; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit.
        return None

def calculate_distance(origin, destination):
    """Haversine (great-circle) distance between two ``"lat, lon"`` strings.

    Args:
        origin: Coordinates in decimal degrees formatted as ``"lat, lon"``
            (comma followed by a space).
        destination: Coordinates in the same format.

    Returns:
        The distance computed with a sphere radius of 6371 (kilometres for
        the mean Earth radius), or ``None`` when either argument is not a
        string in the expected format or the values are outside the domain
        of the formula.
    """
    radius = 6371
    try:
        origin_lat, origin_lon = [math.radians(float(x)) for x in origin.split(', ')]
        destination_lat, destination_lon = [math.radians(float(x)) for x in destination.split(', ')]

        dlon = destination_lon - origin_lon
        dlat = destination_lat - origin_lat
        a = (math.sin(dlat / 2) ** 2
             + math.cos(origin_lat) * math.cos(destination_lat) * math.sin(dlon / 2) ** 2)
        # Rounding can push ``a`` marginally above 1 for (near-)antipodal
        # points, which would make sqrt/asin raise a domain error and drop a
        # valid pair. A plain comparison is used (not min()) so NaN inputs
        # still propagate as NaN.
        if a > 1.0:
            a = 1.0
        c = 2 * math.asin(math.sqrt(a))
        return c * radius
    except (AttributeError, TypeError, ValueError):
        # Non-string input, wrong number of parts, non-numeric parts or a
        # math domain error: treated as "distance unknown". Interrupts and
        # other unexpected errors are deliberately not swallowed.
        return None

In [3]:
# Read the review data, skipping the first CSV column, and keep only the
# rows that actually have a Route value.
df = pd.read_csv('data/Airline_review.csv').iloc[:, 1:]
df = df.dropna(subset='Route')

# split_route yields one (origin, destination) pair per row; zip(*...)
# transposes those pairs into two column-length sequences.
df['origin'], df['destination'] = zip(*df['Route'].apply(split_route))

In [31]:
# The airport file is read with explicit column names (passed via `names=`).
column_names = [
    "big_code", "code", "airport", "city", "country",
    "Latitude Degree", "Latitude Minute", "Latitude Second", "Latitude Direction",
    "Longitude Degree", "Longitude Minute", "Longitude Second", "Longitude Direction",
    "Altitude", "latitude", "longitude"
]

# Read the colon-delimited airport database and keep the identifying columns
# plus the decimal latitude/longitude.
codes = pd.read_csv(
    'data/GlobalAirportDatabase.txt',
    names=column_names,
    delimiter=':',
)[['big_code', 'code', 'airport', 'city', 'country', 'latitude', 'longitude']]

# Lower-case every text column that is later used as a matching key.
codes = codes.assign(
    **{name: codes[name].str.lower() for name in ('code', 'airport', 'city', 'country')}
)

# Collapse latitude/longitude into one "lat, lon" string, then drop the
# separate columns and any row without an airport name.
codes['coordinates'] = codes['latitude'].astype(str) + ', ' + codes['longitude'].astype(str)
codes = codes.drop(columns=['latitude', 'longitude']).dropna(subset='airport')

# Preview the cleaned table.
codes.head()

Unnamed: 0,big_code,code,airport,city,country,coordinates
0,AYGA,gka,goroka,goroka,papua new guinea,"-6.082, 145.392"
2,AYMD,mag,madang,madang,papua new guinea,"-5.207, 145.789"
3,AYMH,hgu,mount hagen,mount hagen,papua new guinea,"-5.826, 144.296"
4,AYNZ,lae,nadzab,nadzab,papua new guinea,"-6.57, 146.726"
5,AYPY,pom,port moresby jacksons international,port moresby,papua new guinea,"-9.443, 147.22"


In [45]:
# Candidate pools for fuzzy matching: airport codes and city names from the
# airport table ...
unique_codes, unique_cities = (
    get_unique_col_values(codes, [column]) for column in ('code', 'city')
)
# ... and every distinct origin/destination string found in the reviews.
unique_originals = get_unique_col_values(df, ['origin', 'destination'])

In [46]:
# Check which container type get_unique_col_values returned.
type(unique_originals)

numpy.ndarray

In [49]:
# Eyeball the distinct origin/destination strings extracted from the reviews.
unique_originals

array(['moroni', 'anjouan', 'frankfurt', ..., 'bhairahawa', 'bhadrapur',
       'japan'], dtype=object)

In [63]:
# Finding the closest match to help merge dataframes
MIN_MATCH_SCORE = 90  # matches scoring below this are discarded

matches = []
scores = []
for word in unique_originals:
    if isinstance(word, str):
        # Three-character tokens are compared against airport codes,
        # everything else against city names.
        choices = unique_codes if len(word) == 3 else unique_cities
        match, score = closest_match(word, choices)
    else:
        # Missing origins/destinations (None/NaN) cannot be matched. This
        # replaces a bare `except:` that also swallowed KeyboardInterrupt,
        # which made this long-running loop impossible to interrupt.
        match, score = None, 0
    matches.append(match)
    scores.append(score)

score_df = pd.DataFrame({'word': unique_originals, 'match': matches, 'score': scores})
# .copy() so that columns added to score_df later are not written to a slice.
score_df = score_df.loc[score_df['score'] >= MIN_MATCH_SCORE].copy()

In [None]:
# Creating a dictionary to map coordinates onto matched values.
# Uses `codes`, the airport table defined in this notebook; the previously
# referenced `codes_df` is never defined here and raises NameError on a
# fresh kernel.
melted_df = codes.melt(id_vars='coordinates', value_vars=['code', 'airport', 'city'], value_name='Key').drop('variable', axis=1)
# When a key occurs more than once (e.g. a city with several airports),
# dict() keeps the coordinates of the last occurrence.
result_dict = dict(zip(melted_df['Key'], melted_df['coordinates']))

# Adding coordinates to score_df. assign() builds a new frame instead of
# writing into the filtered score_df, so the cell can be re-run safely.
score_df = score_df.assign(
    coordinates=score_df['match'].apply(lambda x: try_map(x, result_dict))
)

In [None]:
# Reviews with a complete route, rating and recommendation.
df_new = df[['Route', 'origin', 'destination', 'Overall_Rating', 'Recommended']].copy().dropna()

# Attach the matched city and its coordinates for the origin ...
df_origins = (
    df_new
    .merge(score_df[['word', 'match', 'coordinates']], how='left', left_on='origin', right_on='word')
    .rename(columns={'match': 'origin_city', 'coordinates': 'origin_coordinates'})
    .drop(columns='word')
)

# ... and for the destination.
df_dest = (
    df_origins
    .merge(score_df[['word', 'match', 'coordinates']], how='left', left_on='destination', right_on='word')
    .rename(columns={'match': 'destination_city', 'coordinates': 'destination_coordinates'})
    .drop(columns='word')
)

# Keep only routes where both ends were resolved to coordinates.
df_final = df_dest[['Route', 'origin', 'origin_city', 'origin_coordinates', 'destination',
                    'destination_city', 'destination_coordinates']].dropna().copy()

# Row-wise distance between the two coordinate strings.
df_final['distance'] = df_final.apply(
    lambda route_row: calculate_distance(route_row['origin_coordinates'],
                                         route_row['destination_coordinates']),
    axis=1,
)

In [None]:
# Stricter variant of score_df: only rows whose match score is exactly 100.
score_df2 = score_df[score_df['score'].eq(100)]

In [None]:
# Same pipeline as for the score >= 90 matches, but driven by score_df2
# (score == 100 only).
df_new = df[['Route', 'origin', 'destination', 'Overall_Rating', 'Recommended']].copy().dropna()

# Matched city and coordinates for the origin.
df_origins = (
    df_new
    .merge(score_df2[['word', 'match', 'coordinates']], how='left', left_on='origin', right_on='word')
    .rename(columns={'match': 'origin_city', 'coordinates': 'origin_coordinates'})
    .drop(columns='word')
)

# Matched city and coordinates for the destination.
df_dest = (
    df_origins
    .merge(score_df2[['word', 'match', 'coordinates']], how='left', left_on='destination', right_on='word')
    .rename(columns={'match': 'destination_city', 'coordinates': 'destination_coordinates'})
    .drop(columns='word')
)

# Only fully resolved routes survive the dropna().
df_final2 = df_dest[['Route', 'origin', 'origin_city', 'origin_coordinates', 'destination',
                    'destination_city', 'destination_coordinates']].dropna().copy()

# Row-wise distance between the two coordinate strings.
df_final2['distance'] = df_final2.apply(
    lambda route_row: calculate_distance(route_row['origin_coordinates'],
                                         route_row['destination_coordinates']),
    axis=1,
)

In [None]:
# Distribution of the computed route distances for the score == 100 matches.
# (seaborn is imported as `sns` with the other imports in the first cell.)
ax = sns.histplot(df_final2['distance'])
# Label the figure so it can be read on its own; the trailing `;` hides the
# repr of the value returned by set().
ax.set(title='Route distances (score == 100 matches)', xlabel='Distance (km)');

In [None]:
# Scratch note kept from the original cell (a coordinate pair). It was bare
# text inside the code cell, which is a SyntaxError; it is now a comment.
#   37.7749° N, 122.4194° W

# Inspect routes longer than 17,500. With the radius of 6371 used by
# calculate_distance the largest possible value is pi * 6371 (about 20,015),
# so these rows are near that limit and worth checking for bad matches.
df_final2.loc[df_final2['distance'] > 17500]

In [None]:
# Look up the airport row(s) stored under one specific coordinate string.
# Uses `codes`, the airport table defined in this notebook; the previously
# referenced `codes_df` is never defined here and raises NameError on a
# fresh kernel.
# NOTE(review): the literal has six decimals while the previewed
# `codes['coordinates']` values have three. Confirm it matches this table.
codes.loc[codes['coordinates'] == '-38.7, -60.016667']