# Process Data for All Flights
---

### Load Seaborn, Pandas, and other Libraries

---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import os

### Create a function to process data files one by one
---

##### Get a list of all files in the data folder:

In [2]:
files = os.listdir('data')

##### Function to do everything all at once:

In [3]:
def process_file(filename):
    # Read in airports and filter for small, medium, and large ones
    airports = pd.read_csv('all_airports(Cleaned).csv', encoding = 'latin1')
    airports = airports[(airports["Airport_Type"] == 'large_airport') | (airports["Airport_Type"] == 'small_airport') | (airports["Airport_Type"] == 'medium_airport')]
    
    # Read in the flights file
    flights = pd.read_csv(f'data/{filename}')
    print(f'Reading in {filename}')
    flights['origin'] = flights['origin'].str[1:]
    flights['destination'] = flights['destination'].str[1:]
    
    # Inner join flights on origin with small, medium, large airport codes
    flights = flights.merge(airports, left_on='origin', right_on='Airport_Code')
    
    # Rename columns
    flights.rename(columns = {'callsign' : 'Airline', 'icao24' : 'Aircraft_ID', 'typecode' : 'Aircraft_type', 
                          'origin' : 'Origin_Airport_Code', 'destination' : 'Destination_Airport_Code',
                          'firstseen' : 'Departure_Time', 'lastseen' : 'Arrival_Time',
                          'day' : 'Date', 'Airport_Type' : 'Origin_Airport_Type', 
                          'Airport_Name' : 'Origin_Airport_Name', 'Country_Name' : 'Origin_Country_Name',
                          'Longitude' : 'Origin_Airport_Longitude', 'Latitude' : 'Origin_Airport_Latitude'}, inplace=True)
    
    # Inner join flights on destination with small, medium, large airport codes
    flights = flights.merge(airports, left_on='Destination_Airport_Code', right_on='Airport_Code')
    
    # Rename columns
    flights.rename(columns = {'Airport_Type' : 'Destination_Airport_Type', 
                          'Airport_Name' : 'Destination_Airport_Name', 'Country_Name' : 'Destination_Country_Name',
                          'Longitude' : 'Destination_Airport_Longitude', 'Latitude' : 'Destination_Airport_Latitude'}, inplace=True)
    
    # Drop unnecessary columns
    flights.drop(columns =['number', 'registration', 'altitude_1', 'altitude_2', 'latitude_1', 'longitude_1','latitude_2', 'longitude_2', 'Country_x', 'Airport_Code_x', 'Country_y', 'Airport_Code_y'], axis=1, inplace=True)
    
    # Read in the airline codes
    airlines = pd.read_csv('airlines_5000.csv')
    
    # Join flights with airline codes 
    flights_airlines = flights.merge(airlines, left_on='Airline', right_on='ICAO')
    
    # Drop unnecessary columns and rename others
    flights_airlines.drop(columns =['Alias', 'IATA', 'Active', 'Airline ID' ], axis=1, inplace=True)
    flights_airlines.rename(columns = {'Airline' : 'Airline_Code'}, inplace=True)
    
    # Convert to datetimes
    flights_airlines['Date'] = pd.to_datetime(flights_airlines['Date'])
    flights_airlines['Departure_Time'] = pd.to_datetime(flights_airlines['Departure_Time'])
    flights_airlines['Arrival_Time'] = pd.to_datetime(flights_airlines['Arrival_Time'])
    
    # Reset the dataframe index and name it Row ID
    flights_airlines.reset_index(drop=True, inplace=True)
    flights_airlines.index.name='Row_ID'
    
    # Save to processed data folder
    return flights_airlines

print("Done!")

Done!


### Apply the function to all the files!

In [9]:
df_list = [process_file(file) for file in files if 'ipynb' not in file]

  flights = pd.read_csv(f'data/{filename}')


Reading in flightlist_20190101_20190131.csv
Reading in flightlist_20190201_20190228.csv
Reading in flightlist_20190301_20190331.csv
Reading in flightlist_20190401_20190430.csv
Reading in flightlist_20190501_20190531.csv
Reading in flightlist_20190601_20190630.csv
Reading in flightlist_20190701_20190731.csv
Reading in flightlist_20190801_20190831.csv
Reading in flightlist_20190901_20190930.csv
Reading in flightlist_20191001_20191031.csv
Reading in flightlist_20191101_20191130.csv
Reading in flightlist_20191201_20191231.csv
Reading in flightlist_20200101_20200131.csv
Reading in flightlist_20200201_20200229.csv
Reading in flightlist_20200301_20200331.csv
Reading in flightlist_20200401_20200430.csv
Reading in flightlist_20200501_20200531.csv
Reading in flightlist_20200601_20200630.csv
Reading in flightlist_20200701_20200731.csv
Reading in flightlist_20200801_20200831.csv
Reading in flightlist_20200901_20200930.csv
Reading in flightlist_20201001_20201031.csv
Reading in flightlist_20201101_2

In [10]:
files

['.ipynb_checkpoints',
 'flightlist_20190101_20190131.csv',
 'flightlist_20190201_20190228.csv',
 'flightlist_20190301_20190331.csv',
 'flightlist_20190401_20190430.csv',
 'flightlist_20190501_20190531.csv',
 'flightlist_20190601_20190630.csv',
 'flightlist_20190701_20190731.csv',
 'flightlist_20190801_20190831.csv',
 'flightlist_20190901_20190930.csv',
 'flightlist_20191001_20191031.csv',
 'flightlist_20191101_20191130.csv',
 'flightlist_20191201_20191231.csv',
 'flightlist_20200101_20200131.csv',
 'flightlist_20200201_20200229.csv',
 'flightlist_20200301_20200331.csv',
 'flightlist_20200401_20200430.csv',
 'flightlist_20200501_20200531.csv',
 'flightlist_20200601_20200630.csv',
 'flightlist_20200701_20200731.csv',
 'flightlist_20200801_20200831.csv',
 'flightlist_20200901_20200930.csv',
 'flightlist_20201001_20201031.csv',
 'flightlist_20201101_20201130.csv',
 'flightlist_20201201_20201231.csv',
 'flightlist_20210101_20210131.csv',
 'flightlist_20210201_20210228.csv',
 'flightlist_20

In [11]:
df_list

[       Airline_Code Aircraft_ID Aircraft_type Origin_Airport_Code  \
 Row_ID                                                              
 0               SFR      00893d           NaN                 ALE   
 1               SFR      7c5be5           NaN                 SBK   
 2               SFR      7c5be5           NaN                 SBK   
 3               SFR      7c5be5           NaN                 SBK   
 4               SFR      7c5be5           NaN                 SBK   
 ...             ...         ...           ...                 ...   
 4553            ITW      7c2b42           NaN                 SEN   
 4554            RTS      7c58ce           NaN                 SEN   
 4555            RTS      7c58ce           NaN                 SEN   
 4556            RTS      7c58ce           NaN                 SEN   
 4557            PNT      7c4dd7           NaN                 SEN   
 
        Destination_Airport_Code            Departure_Time  \
 Row_ID                   

### Merge all the processed files together

In [12]:
all_flights_data = pd.concat(df_list).reset_index(drop=True)
all_flights_data.index.name = 'Row ID'

### Save the result

In [13]:
all_flights_data.to_csv('Processed_5000_data/all_flights.csv')

In [14]:
compressed = pd.read_csv('Processed_5000_data/all_flights.csv')

In [15]:
compressed.shape

(160777, 23)