In [1]:
# all imports
import re
import pandas as pd
import numpy as np

### Create empty dictionaries for citi, ports, and states

In [2]:
citi_code = {'id':[],
            'country':[]}

port = {
    'code': [],
    'city': [],
    'state_code': []
}

state = {
    'code': [],
    'name': []
}

### Regex patterns

In [3]:
# citi_code pattern string
pattern_str = """
                \s*         # Match any spaces
                ([\d]+)     # create a group and match 1 or more numerics
                \s+         # match one or more spaces
                =           # match '='
                \s+         
                '(.*)'      # match everything within ''
              """

# citi code regex pattern
pattern = re.compile(pattern_str, re.VERBOSE)

# port regex pattern
# extracts port code, and name
port_pattern = re.compile("\s*'(\S+)'\s+=\s+'([\S\s#]+),\s*([\S\s#]+)'")

# state regex pattern
# extracts code, and name
state_pattern = re.compile("\s*'([a-zA-Z0-9]+)'='([a-zA-Z\.\s]+)'")

### Read the labels file and fill dictionaries

In [4]:
with open('I94_SAS_Labels_Descriptions.SAS', 'r') as fp:
    for i, line in enumerate(fp):
        if i > 8 and i < 245:
            match = re.search(pattern, line)
            citi_code['id'].append(match.group(1))
            citi_code['country'].append(match.group(2))
        if i > 301 and i < 893:
            match = re.search(port_pattern, line)
            try:
                port['code'].append(match.group(1))
                port['city'].append(match.group(2))
                port['state_code'].append(match.group(3))
            except:
                port['code'].append(None)
                port['city'].append(None)
                port['state_code'].append(None)
        if i > 980 and i < 1036:
            match = re.search(state_pattern, line)
            state['code'].append(match.group(1))
            state['name'].append(match.group(2))

### Create dataframes from dictionaries

In [5]:
# citi code dataframe
citi_code_df = pd.DataFrame(citi_code)
citi_code_df = citi_code_df.set_index('id')

citi_code_df.head()

Unnamed: 0_level_0,country
id,Unnamed: 1_level_1
582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
236,AFGHANISTAN
101,ALBANIA
316,ALGERIA
102,ANDORRA


In [6]:
# airport dataframe
port_df = pd.DataFrame(port)
port_df = port_df.set_index('code')
port_df['state_code'] = port_df.state_code.str.strip()
port_df = port_df.dropna(how='all')
port_df.head()

Unnamed: 0_level_0,city,state_code
code,Unnamed: 1_level_1,Unnamed: 2_level_1
ALC,ALCAN,AK
ANC,ANCHORAGE,AK
BAR,BAKER AAF - BAKER ISLAND,AK
DAC,DALTONS CACHE,AK
PIZ,DEW STATION PT LAY DEW,AK


In [7]:
values = ['AR (BPS)', 'CA (BPS)', 'CO #ARPT', 'FL #ARPT', 'LA (BPS)',
       'ME (BPS)', 'MT (BPS)', 'NM (BPS)', 'SC #ARPT', 'TX (BPS)',
       'VA #ARPT', 'VT (I-91)', 'VT (RT. 5)', 'VT (BP - SECTOR HQ)',
       'WASHINGTON #INTL', 'WA (BPS)']
# clean state_code
temp = np.where(port_df.state_code.isin(values), port_df.state_code.str[:2],\
                np.where(port_df.state_code.str.len()==2, port_df.state_code, np.nan))

us_state_codes = np.where(temp=='MX', np.nan, temp)
port_df['state_code'] = us_state_codes
port_df = port_df.dropna(how='any')

In [8]:
# states dataframe
states = pd.DataFrame(state)
states = states.set_index('code')
states.head()

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
AL,ALABAMA
AK,ALASKA
AZ,ARIZONA
AR,ARKANSAS
CA,CALIFORNIA


In [9]:
# visa category dataframe
visa_category = pd.DataFrame({
    'id': [1,2,3],
    'category': ['Business', 'Pleasure', 'Student']
})
visa_category = visa_category.set_index('id')
visa_category

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
1,Business
2,Pleasure
3,Student


In [10]:
# travel mode dataframe

travel_mode = pd.DataFrame({
    'id': [1,2,3,9],
    'mode': ['Air', 'Sea', 'land', 'Not reported']
})

travel_mode = travel_mode.set_index('id')
travel_mode

Unnamed: 0_level_0,mode
id,Unnamed: 1_level_1
1,Air
2,Sea
3,land
9,Not reported


### Save all dataframes

In [11]:
citi_code_df.to_csv('country_code.csv')
port_df.to_csv('port_immigration.csv')
states.to_csv('state_code.csv')
visa_category.to_csv('visa_category.csv')
travel_mode.to_csv('travel_mode.csv')

### Create us airports dataframe

In [12]:
airports = pd.read_csv('airport-codes_csv.csv')

In [13]:
airports.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [14]:
us_airports = airports[airports.iso_country.str.lower() == 'us']

us_airports = us_airports[us_airports.type.isin(['small_airport', 'medium_airport', 'large_airport'])]

us_intl_airports = us_airports[us_airports.name.str.contains('International')]

us_intl_airports = us_intl_airports[~us_intl_airports.municipality.isnull()]

us_intl_airports.shape

(224, 12)

In [15]:
long_lat = us_intl_airports['coordinates'].str.split(',', expand=True)
long_lat.columns = ['longitude', 'latitude']
us_intl_final = pd.concat([us_intl_airports, long_lat], axis=1).drop('coordinates', axis=1)

In [16]:
us_intl_final.to_csv('us_interantional_airport_codes.csv', index=False)