In [13]:
import getpass
from io import StringIO

import numpy as np
import pandas as pd
import requests
from arcgis.gis import GIS
from bs4 import BeautifulSoup

In [26]:

# Collect the ArcGIS login interactively so no credentials are stored in the
# notebook; getpass reads the password without echoing it.
username = input("Enter your ArcGIS username: ")
password = getpass.getpass("Enter your ArcGIS password: ")

# Open an authenticated connection to ArcGIS Online
gis = GIS(url="https://www.arcgis.com", username=username, password=password)

In [27]:
# Look up the airports item (Living Atlas) by its item id, then take the
# first feature layer it exposes.
airports_item = gis.content.get('e90996158f0a464189098881379597a6')
airports_lyr = airports_item.layers[0]
airports_lyr

<FeatureLayer url:"https://services2.arcgis.com/jUpNdisbWqRpMo35/arcgis/rest/services/Airports28062017/FeatureServer/0">

In [41]:
# Scrape settings.
# NOTE(review): the original loop was range(1, 200), i.e. pages 1..199, while the
# export file is named "trips_pages_200" — confirm whether page 200 was intended.
LAST_PAGE = 199
IDS_PER_PAGE = 15      # the loop assumes IDs are numbered consecutively, 15 per page
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection hangs the cell

data_rows = []
with requests.Session() as session:  # one pooled connection instead of a new one per request
    for page in range(1, LAST_PAGE + 1):  # Loop through pages
        start_id = (page - 1) * IDS_PER_PAGE + 1
        end_id = page * IDS_PER_PAGE
        for i in range(start_id, end_id + 1):  # Loop through IDs for the current page
            URL = f'https://www.maxgumby.com/combos?id={i}&page={page}'
            print(f'Processing ID {i} on page {page} from URL: {URL}')

            # Keep the scrape best-effort: a single failed or non-2xx request is
            # reported and skipped instead of aborting the whole run (or parsing
            # an error page as if it were data).
            try:
                response = session.get(URL, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f'Request failed for ID {i} on page {page}: {exc}')
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            tables = soup.find_all('table', class_='table-sm')

            # Both tables are required; otherwise there is nothing to merge
            if len(tables) < 2:
                print("There are less than two tables with the class 'table-sm'.")
                continue

            # First table: drop its first row and its first column.
            # read_html is given a file-like object because passing literal HTML
            # strings is deprecated in recent pandas releases.
            page_rows = pd.read_html(StringIO(str(tables[0])))[0]
            page_rows = page_rows.drop(0).reset_index(drop=True)
            page_rows = page_rows.drop(page_rows.columns[0], axis=1)

            # Second table: drop its last row
            detail_rows = pd.read_html(StringIO(str(tables[1])))[0]
            detail_rows = detail_rows.drop(detail_rows.index[-1]).reset_index(drop=True)

            # Split the 'DPT ARV' column into 'DPT_A' and 'ARV_A', then discard
            # the two source columns that are no longer needed
            detail_rows[['DPT_A', 'ARV_A']] = detail_rows['DPT ARV'].str.split(' ', expand=True)
            detail_rows = detail_rows.drop(columns=['DPTR ARVL', 'DPT ARV'])

            # The row for ID i is located by its position within the page; a page
            # with fewer rows than expected would otherwise raise IndexError.
            row_pos = i - start_id
            if row_pos >= len(page_rows):
                print(f'No row at position {row_pos} in the first table for ID {i}; skipping.')
                continue

            # Repeat that row once per row of the second table
            row_to_repeat = page_rows.iloc[[row_pos]]
            repeated_row = pd.DataFrame(
                np.repeat(row_to_repeat.values, len(detail_rows), axis=0),
                columns=page_rows.columns,
            )

            # Both frames carry a fresh 0..n-1 index, so they line up side by side
            merged_df = pd.concat([repeated_row, detail_rows], axis=1)
            data_rows.append(merged_df)

# Combine all merged DataFrames into a single DataFrame
raw_df = pd.concat(data_rows, ignore_index=True)

https://www.maxgumby.com/combos?id=1&page=1
https://www.maxgumby.com/combos?id=2&page=1
https://www.maxgumby.com/combos?id=3&page=1
https://www.maxgumby.com/combos?id=4&page=1
https://www.maxgumby.com/combos?id=5&page=1
https://www.maxgumby.com/combos?id=6&page=1
https://www.maxgumby.com/combos?id=7&page=1
https://www.maxgumby.com/combos?id=8&page=1
https://www.maxgumby.com/combos?id=9&page=1
https://www.maxgumby.com/combos?id=10&page=1
https://www.maxgumby.com/combos?id=11&page=1
https://www.maxgumby.com/combos?id=12&page=1
https://www.maxgumby.com/combos?id=13&page=1
https://www.maxgumby.com/combos?id=14&page=1
https://www.maxgumby.com/combos?id=15&page=1
https://www.maxgumby.com/combos?id=16&page=2
https://www.maxgumby.com/combos?id=17&page=2
https://www.maxgumby.com/combos?id=18&page=2
https://www.maxgumby.com/combos?id=19&page=2
https://www.maxgumby.com/combos?id=20&page=2
https://www.maxgumby.com/combos?id=21&page=2
https://www.maxgumby.com/combos?id=22&page=2
https://www.maxgumb

In [42]:
def getLatLong(row):
    """Return a copy of ``row`` with departure/arrival airport details added.

    The codes in ``row['DPT_A']`` and ``row['ARV_A']`` are each looked up in
    ``airports_lyr`` by ``iata_code`` (results requested in spatial reference
    4326), and the first matching feature supplies the values.

    Added keys: ``depart_name``, ``depart_x``, ``depart_y``, ``arrive_name``,
    ``arrive_x``, ``arrive_y``. When a code is missing or matches no feature,
    the name is ``None`` and the coordinates are NaN instead of raising
    ``IndexError``.

    Note: this issues up to two layer queries per row.
    """
    def _first_match(code):
        # A missing code cannot match anything; skip the remote query.
        if pd.isna(code):
            return None
        # The code comes from scraped text, so double any single quote before
        # embedding it in the where clause.
        escaped = str(code).replace("'", "''")
        matches = airports_lyr.query(where=f"iata_code = '{escaped}'", out_sr=4326).features
        return matches[0] if matches else None

    # Work on a copy: mutating the object handed in by DataFrame.apply is not
    # supported by pandas.
    enriched = row.copy()
    for prefix, code_col in (('depart', 'DPT_A'), ('arrive', 'ARV_A')):
        feat = _first_match(row[code_col])
        if feat is None:
            enriched[f'{prefix}_name'] = None
            enriched[f'{prefix}_x'] = float('nan')
            enriched[f'{prefix}_y'] = float('nan')
        else:
            enriched[f'{prefix}_name'] = feat.attributes['name']
            enriched[f'{prefix}_x'] = feat.geometry['x']
            enriched[f'{prefix}_y'] = feat.geometry['y']
    return enriched

# Row-wise enrichment: getLatLong is called once for every row of raw_df
df_enriched = raw_df.apply(func=getLatLong, axis="columns")

In [72]:
# Fetch every airport feature with one query call (where="1=1" matches all
# rows), with geometry returned in spatial reference 4326
all_airports = airports_lyr.query(where="1=1", out_sr=4326).features

# Index the features by IATA code; features whose iata_code is empty/None are
# left out, and a later feature replaces an earlier one with the same code
airport_dict = dict(
    (feat.attributes['iata_code'], feat)
    for feat in all_airports
    if feat.attributes['iata_code']
)


In [93]:
# Build a one-row-per-IATA-code lookup table from the airport features.
# Duplicate codes are dropped (first occurrence kept): a repeated key would make
# the left merges below emit extra rows, and the side-by-side concat at the end
# would then pair trips with the wrong airports. Explicit columns keep the frame
# usable even if no feature has a code.
airport_df = pd.DataFrame(
    [
        {
            'iata_code': feat.attributes['iata_code'],
            'name': feat.attributes['name'],
            'x': feat.geometry['x'],
            'y': feat.geometry['y'],
        }
        for feat in all_airports
        if feat.attributes['iata_code']
    ],
    columns=['iata_code', 'name', 'x', 'y'],
).drop_duplicates(subset='iata_code', keep='first')


def _airport_columns(codes, prefix):
    """Look up each code in ``airport_df`` and return prefixed name/x/y columns.

    ``codes`` is a named Series of IATA codes. The result has one row per input
    row, in the same order and with the same index; codes without a match get
    missing values. Columns are ``<prefix>_name``, ``<prefix>_x``, ``<prefix>_y``.
    """
    code_col = codes.name
    looked_up = codes.to_frame().merge(
        airport_df,
        left_on=code_col,
        right_on='iata_code',
        how='left',
        validate='many_to_one',  # fail loudly if the lookup ever has repeated keys
    )
    # A left merge keeps the left rows in order but resets the index; restore it
    # so the concat below aligns row-for-row with raw_df.
    looked_up.index = codes.index
    return looked_up.rename(columns={
        'name': f'{prefix}_name',
        'x': f'{prefix}_x',
        'y': f'{prefix}_y',
    }).drop(columns=['iata_code', code_col])


# Departure and arrival airport information
depart_df = _airport_columns(raw_df['DPT_A'], 'depart')
arrive_df = _airport_columns(raw_df['ARV_A'], 'arrive')

# Concatenate the original DataFrame with the departure and arrival information
assert len(depart_df) == len(raw_df) == len(arrive_df), "airport lookup changed the row count"
df_enriched = pd.concat([raw_df, depart_df, arrive_df], axis=1)
df_enriched.head()

Unnamed: 0,Trip,Full,Date,Base,Fleet,Seat,Days,Dptr,Arvl,Legs,...,IND,D/C,DPT_A,ARV_A,depart_name,depart_x,depart_y,arrive_name,arrive_x,arrive_y
0,V5001,-,Sun 6/30/24,CLE,737.0,,1.0,745.0,1348.0,2.0,...,,,CLE,MCO,Cleveland Hopkins International Airport,-81.8498,41.411701,Orlando International Airport,-81.308998,28.429399
1,V5001,-,Sun 6/30/24,CLE,737.0,,1.0,745.0,1348.0,2.0,...,,0.0,MCO,CLE,Orlando International Airport,-81.308998,28.429399,Cleveland Hopkins International Airport,-81.8498,41.411701
2,V5001,-,Mon 7/1/24,CLE,737.0,,1.0,745.0,1348.0,2.0,...,,,CLE,MCO,Cleveland Hopkins International Airport,-81.8498,41.411701,Orlando International Airport,-81.308998,28.429399
3,V5001,-,Mon 7/1/24,CLE,737.0,,1.0,745.0,1348.0,2.0,...,,0.0,MCO,CLE,Orlando International Airport,-81.308998,28.429399,Cleveland Hopkins International Airport,-81.8498,41.411701
4,V5001,-,Tue 7/2/24,CLE,737.0,,1.0,745.0,1348.0,2.0,...,,,CLE,MCO,Cleveland Hopkins International Airport,-81.8498,41.411701,Orlando International Airport,-81.308998,28.429399


In [94]:
# Persist the enriched table as CSV (the DataFrame index is written as the first column)
df_enriched.to_csv(path_or_buf='trips_pages_200.csv')