In [1]:
import pandas as pd 
import numpy as np
import geopandas as gpd
import geodatasets as gds
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import gradio as gr



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
data_airports = pd.read_csv('airports.csv', header = None, na_values = ['\\N'])
data_routes = pd.read_csv('routes.csv', header = None, na_values = ['\\N'])
data_airports.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10.0,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10.0,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10.0,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10.0,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10.0,U,Pacific/Port_Moresby,airport,OurAirports


В датасетах нет заголовков у колонок, добавим их, затем проверим данные на ошибки. Начнем с пропусков, затем проверим дубликаты и форматы данных. 

In [3]:
column_names_airports = [
    'id', 'full_name', 'city', 'country', 'IATA', 'ICAO', 'latitude', 'longtitude',
    'aptitude', 'timezone', 'summer_time', 'db_timezone', 'type', 'source'
]
column_mapping = dict(enumerate(column_names_airports))
data_airports = data_airports.rename(columns=column_mapping)
data_airports.head()

Unnamed: 0,id,full_name,city,country,IATA,ICAO,latitude,longtitude,aptitude,timezone,summer_time,db_timezone,type,source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10.0,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10.0,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10.0,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10.0,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10.0,U,Pacific/Port_Moresby,airport,OurAirports


In [4]:
data_airports.isna().sum()

id                0
full_name         0
city             49
country           0
IATA           1626
ICAO              1
latitude          0
longtitude        0
aptitude          0
timezone        353
summer_time     353
db_timezone    1021
type              0
source            0
dtype: int64

Заметим, что в данных есть пропуски в коде IATA, что может повлиять на дальшнейшее решение, так как смэтчить аэропорты мы сможем только по нему, когда будем считать количество полетов. Если смэтчить какой то крупный аэропорт не получится, то будем искать альтернативный метод решения. Пока оставляем все как есть и не будем обрабатывать эти пропуски.

In [5]:
column_names_routes = [
    'airline', 'airline_id', 'source_airport', 'source_airport_id', 'destination_airport', 'destination_airport_id', 'codeshare', 'stops', 'equipment'
]
column_mapping = dict(enumerate(column_names_routes))
data_routes = data_routes.rename(columns=column_mapping)
data_routes.head()

Unnamed: 0,airline,airline_id,source_airport,source_airport_id,destination_airport,destination_airport_id,codeshare,stops,equipment
0,2B,410.0,AER,2965.0,KZN,2990.0,,0,CR2
1,2B,410.0,ASF,2966.0,KZN,2990.0,,0,CR2
2,2B,410.0,ASF,2966.0,MRV,2962.0,,0,CR2
3,2B,410.0,CEK,2968.0,KZN,2990.0,,0,CR2
4,2B,410.0,CEK,2968.0,OVB,4078.0,,0,CR2


In [6]:
data_routes.isna().sum()

airline                       0
airline_id                  479
source_airport                0
source_airport_id           220
destination_airport           0
destination_airport_id      221
codeshare                 53066
stops                         0
equipment                    18
dtype: int64

Здесь есть пропуски в нескольких колонках. Данные колонки не будут влиять на решение задачи. 2 самые важные колонки source_airport и destination_airport не имееют пропусков. 

In [7]:
data_airports.nunique()

id             7698
full_name      7658
city           6955
country         237
IATA           6072
ICAO           7697
latitude       7683
longtitude     7686
aptitude       2522
timezone         40
summer_time       7
db_timezone     307
type              1
source            1
dtype: int64

Видим, что нет дубликатов по id, есть дубликаты по названиям аэропортов, по ICAO, это ошибка в данных.

In [8]:
data_routes.nunique()

airline                    568
airline_id                 547
source_airport            3409
source_airport_id         3320
destination_airport       3418
destination_airport_id    3326
codeshare                    1
stops                        2
equipment                 3945
dtype: int64

Здесь нет дубликатов. Проверим ошибки в форматах данных. IATA - 3 заглавные буквы, ICAO - 4 заглавные буквы. 

In [9]:

iata = data_airports['IATA']
icao = data_airports['ICAO']

mask_iata = (iata.notna() & (~iata.str.match(r'^[A-Z]{3}$', na=False)))

mask_icao = (icao.notna() & (~icao.str.match(r'^[A-Z]{4}$', na=False)))

format_errors = pd.concat([data_airports[mask_iata], data_airports[mask_icao]]).drop_duplicates()
print(len(format_errors))

151


151 ошибочное значение, проверим выход координат за диапазон в датасете аэропорты

In [10]:
lat_lon = data_airports[
    ((data_airports['latitude'] < -90) | 
     (data_airports['latitude'] > 90) | 
     (data_airports['longtitude'] < -180) | 
     (data_airports['longtitude'] > 180))
]
print(len(lat_lon))

0


Таких ошибок нет, проверим несоответствие маршрутов.

In [11]:
airport_IATA = set(data_airports['IATA'])
source_na = data_routes[~data_routes['source_airport'].isin(airport_IATA)]
destination_na = data_routes[~data_routes['destination_airport'].isin(airport_IATA)]
print(len(source_na))
print(len(destination_na))

406
416


Несоответствия маршрутов существуют, это тоже ошибка в данных. Такую ошибку обработать сложно, поэтому пока просто перейдем к решению основной задачи. Найдем все маршруты, по кодам IATA, по вылетам и прилетам, затем сметчим с аэропортами по кодам, и выведем самые крупные. 

In [12]:
departures = data_routes.source_airport.value_counts().reset_index()
departures.columns = ['IATA', 'departures']

arrivals = data_routes.destination_airport.value_counts().reset_index()
arrivals.columns = ['IATA', 'arrivals']

airports_final = pd.merge(departures, arrivals, on = 'IATA', how = 'outer')
airports_final['sum'] = airports_final['departures'] + airports_final['arrivals']
airports_final = airports_final.sort_values('sum', ascending=False)
top_airports = airports_final.head(10)
top_airports

Unnamed: 0,IATA,departures,arrivals,sum
165,ATL,915.0,911.0,1826.0
2128,ORD,558.0,550.0,1108.0
2200,PEK,535.0,534.0,1069.0
1626,LHR,527.0,524.0,1051.0
463,CDG,524.0,517.0,1041.0
890,FRA,497.0,493.0,990.0
1573,LAX,492.0,498.0,990.0
667,DFW,469.0,467.0,936.0
1293,JFK,456.0,455.0,911.0
110,AMS,453.0,450.0,903.0


In [13]:
merge_result = pd.merge(
    top_airports[['IATA']],  
    data_airports[['IATA', 'country', 'full_name']], 
    on="IATA",
    how="inner"  
)
countries = merge_result.loc[:,'country']
names = merge_result.loc[:,'full_name']
top_airports.reset_index(drop=True, inplace=True)
merge_result.reset_index(drop=True, inplace=True)
top_airports['country'] = countries
top_airports['full_name'] = names
top_airports = top_airports.drop('IATA', axis = 1)
top_airports = top_airports.set_index(['country', 'full_name'])
top_airports

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_airports['country'] = countries
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_airports['full_name'] = names


Unnamed: 0_level_0,Unnamed: 1_level_0,departures,arrivals,sum
country,full_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
United States,Hartsfield Jackson Atlanta International Airport,915.0,911.0,1826.0
United States,Chicago O'Hare International Airport,558.0,550.0,1108.0
China,Beijing Capital International Airport,535.0,534.0,1069.0
United Kingdom,London Heathrow Airport,527.0,524.0,1051.0
France,Charles de Gaulle International Airport,524.0,517.0,1041.0
Germany,Frankfurt am Main Airport,497.0,493.0,990.0
United States,Los Angeles International Airport,492.0,498.0,990.0
United States,Dallas Fort Worth International Airport,469.0,467.0,936.0
United States,John F Kennedy International Airport,456.0,455.0,911.0
Netherlands,Amsterdam Airport Schiphol,453.0,450.0,903.0


In [14]:
def normalize_country_name(country_name: str) -> str:
    if country_name == 'United States':
        country_name = 'United States of America'
        return country_name
    else:
        return country_name
data_airports["country"] = data_airports["country"].apply(normalize_country_name)

Здесь мы преобразовывам название штатов, потому что они неправильно считываются геопандасом. его используем чтобы получить континенты 

In [None]:
path_to_file = 'ne_10m_admin_0_countries'

world = gpd.read_file(path_to_file)

print("Доступные столбцы:", world.columns)


Доступные столбцы: Index(['featurecla', 'scalerank', 'LABELRANK', 'SOVEREIGNT', 'SOV_A3',
       'ADM0_DIF', 'LEVEL', 'TYPE', 'TLC', 'ADMIN',
       ...
       'FCLASS_TR', 'FCLASS_ID', 'FCLASS_PL', 'FCLASS_GR', 'FCLASS_IT',
       'FCLASS_NL', 'FCLASS_SE', 'FCLASS_BD', 'FCLASS_UA', 'geometry'],
      dtype='object', length=169)


In [16]:
world = world[['ADMIN', 'CONTINENT']]

world.columns = ["country", "continent"]
merged_data = pd.merge(
    data_airports,
    world,
    on="country",
    how="left"
)
data_airports = merged_data


In [17]:
airports_final.reset_index(drop=True, inplace=True)
data_airports.reset_index(drop=True, inplace=True)
airports_final = pd.merge(
    airports_final[['IATA', 'sum']],  
    data_airports[['IATA', 'continent', 'latitude', 'longtitude']], 
    on="IATA",
    how="inner"  
)




In [18]:
continent_to_scope = {
    'Africa': 'africa',
    'Asia': 'asia',
    'Europe': 'europe',
    'North America': 'north america',
    'South America': 'south america',
    'Oceania': 'world',
    'Antarctica': 'world'
}


Делим на континенты, затем строим с помощью plotly график, по координатам аэропортов. Антарктиду и Океанию строим через world. 

In [19]:
continents = [
    c for c in data_airports['continent'].unique() 
    if pd.notna(c) and c not in ['Seven seas (open ocean)']
]

for continent in continents:

    continent_df = data_airports[data_airports['continent'] == continent].copy()
    

    continent_df['dummy_color'] = 1 
    fig = px.choropleth(
        continent_df,
        locations='country',
        locationmode='country names',
        color='dummy_color', 
        scope=continent_to_scope.get(continent, 'world'),
        title=f'{continent} - Airport Locations',
        color_continuous_scale=px.colors.sequential.Blues,
        hover_data=['full_name']  
    )
    fig.add_trace(
        go.Scattergeo(
            lon=continent_df['longtitude'],
            lat=continent_df['latitude'],
            text=continent_df['full_name'],
            mode='markers',
            showlegend=False,
            marker=dict(
                size=4, 
                color='#E74C3C',
                opacity=0.9,
                line=dict(width=0.5, color='black')
            ),
            name='Airports'
        )
    )
    fig.show()


Здесь можно заметить, что некоторые аэропорты попали не в те континенты, возможно, потому что у них неверно указана страна. Есть аэропорт Los Alamitos.

In [20]:
data_airports[data_airports['full_name'] == 'Los Alamitos Army Air Field']

Unnamed: 0,id,full_name,city,country,IATA,ICAO,latitude,longtitude,aptitude,timezone,summer_time,db_timezone,type,source,continent
4316,5613,Los Alamitos Army Air Field,Solwesi,Zambia,,KSLI,33.790001,-118.052002,32,-8.0,U,America/Los_Angeles,airport,OurAirports,Africa


Видим, что действительно, страна Замбия, хотя аэропорт не там. Ошибка в данных

In [21]:
grouped_by_continent = (airports_final.groupby('continent', group_keys=False)
                        .apply(lambda x: x.nlargest(5, 'sum'))
                        .reset_index(drop=True))

airport_IATA = set(grouped_by_continent['IATA'])


filtered_routes = data_routes[
    (data_routes['source_airport'].isin(airport_IATA)) & 
    (data_routes['destination_airport'].isin(airport_IATA))
]


connections = []
for _, row in filtered_routes.iterrows():
    source = row['source_airport']
    dest = row['destination_airport']
    if source == dest:
        continue
    
    reverse_exists = ((filtered_routes['source_airport'] == dest) & 
                    (filtered_routes['destination_airport'] == source)).any()
    
    connection_type = 'two-way' if reverse_exists else 'one-way'
    
    connections.append({
        'airport_a': source,
        'airport_b': dest,
        'type': connection_type
    })

connections_df = pd.DataFrame(connections)

def get_connections(airport_IATA):
    conns = connections_df[
        (connections_df['airport_a'] == airport_IATA) | 
        (connections_df['airport_b'] == airport_IATA)
    ]
    formatted = []
    for _, row in conns.iterrows():
        other = row['airport_b'] if row['airport_a'] == airport_IATA else row['airport_a']
        formatted.append(f"{other} ({row['type']})")
    return ', '.join(formatted) if formatted else 'No connections'

grouped_by_continent['connections'] = grouped_by_continent['IATA'].apply(get_connections)


print(grouped_by_continent)





   IATA     sum                continent   latitude  longtitude  \
0   JNB   321.0                   Africa -26.139200   28.246000   
1   CAI   303.0                   Africa  30.121901   31.405600   
2   NBO   258.0                   Africa  -1.319240   36.927799   
3   CMN   250.0                   Africa  33.367500   -7.589970   
4   ADD   214.0                   Africa   8.977890   38.799301   
5   PEK  1069.0                     Asia  40.080101  116.584999   
6   PVG   825.0                     Asia  31.143400  121.805000   
7   SIN   820.0                     Asia   1.350190  103.994003   
8   ICN   740.0                     Asia  37.469101  126.450996   
9   IST   719.0                     Asia  41.275278   28.751944   
10  LHR  1051.0                   Europe  51.470600   -0.461941   
11  CDG  1041.0                   Europe  49.012798    2.550000   
12  FRA   990.0                   Europe  50.033333    8.570556   
13  AMS   903.0                   Europe  52.308601    4.76389

Здесь мы создаем список из самых крупных аэропортов и их соединений по континентам. Для начала группируем аэропорты по континентам, выбираем самые крупные. Затем создаем из них новый датафрейм, где будем циклом считывать и искать соединения, отмечаем как односторонние и двухсторонние

In [22]:

fig = px.scatter_geo(
    grouped_by_continent,
    lat='latitude',
    lon='longtitude', 
    hover_name='IATA',
    text='IATA',
    scope='world',
    title='Top Airports Connections'
)

for _, conn in connections_df.iterrows():
    a = grouped_by_continent[grouped_by_continent['IATA'] == conn['airport_a']]
    b = grouped_by_continent[grouped_by_continent['IATA'] == conn['airport_b']]
    a_coords = a[['latitude', 'longtitude']].values[0]
    b_coords = b[['latitude', 'longtitude']].values[0]
    
    if conn['type'] == 'two-way':
        fig.add_trace(go.Scattergeo(
            lon=[a_coords[1], b_coords[1]],
            lat=[a_coords[0], b_coords[0]],
            mode='lines',
            line=dict(width=1, color='red'),
            hoverinfo='none',
            showlegend=False
        ))
    else:
        fig.add_trace(go.Scattergeo(
            lon=[a_coords[1], b_coords[1]],
            lat=[a_coords[0], b_coords[0]],
            mode='lines+markers',
            line=dict(width=1, color='green'),
            marker=dict(
                symbol='arrow',
                size=10,
                angleref='previous',
                color='green'
            ),
            hoverinfo='none',
            showlegend=False,
        ))

fig.update_geos(
    projection_type="orthographic", 
    landcolor="lightgray",
    showcountries=True,
    coastlinecolor="gray",
    showocean=True,
    oceancolor="LightBlue"
)

fig.update_layout(
    height=800,
    width=1200,
    title_text="Топ-5 аэропортов по континентам и их связи",
    margin={"r":0,"t":40,"l":0,"b":0}
)

fig.show()

Строим непосредственно график используя gradio и plotly.
