# Fix inconsistent airport codes
The original data set encodes airports in two different ways: flights in October use 5-digit numeric airport IDs, while flights in all other months use 3-letter airport codes. This notebook re-saves the data so that every flight is encoded using the 3-letter system.

# NOTE: This kernel requires extra files to run
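It needs two small CSV files containing the two sets of airport codes, saved as `../input/L_AIRPORT_ID.csv` (5-digit codes) and `../input/L_AIRPORT.csv` (3-letter codes). They can be downloaded here: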
* 5-digit codes: https://www.transtats.bts.gov/FieldInfo.asp?Field_Desc=Origin%20Airport%2C%20Airport%20ID.%20An%20identification%20number%20assigned%20by%20US%20DOT%20to%20identify%20a%20unique%20airport.%20%20Use%20this%20field%20for%20airport%20analysis%20across%20a%20range%20of%20years%20because%20an%20airport%20can%20change%20its%20airport%20code%20and%20airport%20codes%20can%20be%20reused.&Field_Type=Num&Lookup_Table=L_AIRPORT_ID&Table_ID=236&SYS_Table_Name=T_ONTIME&Sys_Field_Name=ORIGIN_AIRPORT_ID
* 3-letter codes: https://www.transtats.bts.gov/FieldInfo.asp?Field_Desc=Origin%20Airport&Field_Type=Char&Lookup_Table=L_AIRPORT&Table_ID=236&SYS_Table_Name=T_ONTIME&Sys_Field_Name=ORIGIN

In [None]:
# Imports
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
sns.set_style('white')

# Load two sets of airport codes

In [None]:
# Read the two airport lookup tables.
# `pd.DataFrame.from_csv` was deprecated and later removed from pandas (gone as of 1.0);
# `pd.read_csv(..., index_col=0)` is its replacement and likewise puts the first column in the index.
# `from_csv` also defaulted to parse_dates=True; that is deliberately not carried over because
# the index holds airport codes, not dates.
df_aircode1 = pd.read_csv('../input/L_AIRPORT.csv', index_col=0)
df_aircode2 = pd.read_csv('../input/L_AIRPORT_ID.csv', index_col=0)

In [None]:
# Format the airport codes
df_aircode1 = df_aircode1.reset_index()
df_aircode2 = df_aircode2.reset_index()
df_aircodes = pd.merge(df_aircode1,df_aircode2,on='Description')
aircode_dict = dict(zip(df_aircodes['Code_y'].astype(str),df_aircodes['Code_x']))

# Replace each 5-digit airport code with its 3-letter airport code

In [None]:
# Load data.
# Both airport columns are read as text up front so that pandas does not infer a
# different type for different parts of the file (the columns mix 3-letter codes
# with 5-digit numeric IDs).
df_fl = pd.read_csv(
    '../input/flights.csv',
    dtype={'ORIGIN_AIRPORT': str, 'DESTINATION_AIRPORT': str},
)

# Make sure all origin and destination airports are strings
df_fl['ORIGIN_AIRPORT'] = df_fl['ORIGIN_AIRPORT'].astype(str)
df_fl['DESTINATION_AIRPORT'] = df_fl['DESTINATION_AIRPORT'].astype(str)

N_flights = len(df_fl)

# Recode each airport column independently and in one vectorised pass.
# The previous row-by-row loop called `df_fl.replace` on the whole frame for every
# code it met (slow, and it touched every column), and its `elif` skipped the
# destination check on any row whose origin needed recoding.
for airport_col in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
    codes = df_fl[airport_col]

    # Anything that is not exactly 3 characters long is treated as a code to translate
    needs_recode = codes.notna() & (codes.str.len() != 3)
    old_codes = codes[needs_recode]
    distinct_old = old_codes.unique()

    # Fail loudly, naming every offending value, rather than on the first bare key
    unknown = sorted(set(distinct_old) - set(aircode_dict))
    if unknown:
        raise KeyError(
            'no 3-letter code found for ' + airport_col + ' value(s): ' + ', '.join(unknown)
        )

    df_fl.loc[needs_recode, airport_col] = old_codes.map(aircode_dict)
    print(airport_col, ': replaced', len(old_codes), 'values covering',
          len(distinct_old), 'distinct codes')