In [1]:
# all imports
import re
import pandas as pd

### Create empty dictionaries for country codes, ports, and states

In [2]:
# Column-oriented accumulators: every key is a future DataFrame column and
# starts out as its own empty list.

# country lookup: numeric id -> country name
citi_code = {column: [] for column in ('id', 'country')}

# port lookup: port code -> airport / port description
port = {column: [] for column in ('code', 'airport')}

# state lookup: state code -> state name
state = {column: [] for column in ('code', 'name')}

### Regex patterns

In [3]:
# All patterns below are raw strings so that regex escapes such as \s, \d and
# \S reach the `re` module untouched. In a normal string literal they are
# invalid string escapes, which Python reports with a warning.

# citi_code pattern string (verbose mode: whitespace and '#' comments inside
# the pattern are ignored by the regex engine).
# group 1 -> numeric id, group 2 -> text between the quotes. `.*` is greedy,
# so group 2 runs up to the LAST single quote on the line.
pattern_str = r"""
                \s*         # Match any spaces
                ([\d]+)     # create a group and match 1 or more numerics
                \s+         # match one or more spaces
                =           # match '='
                \s+         # match one or more spaces
                '(.*)'      # match everything within ''
              """

# citi code regex pattern
pattern = re.compile(pattern_str, re.VERBOSE)

# port regex pattern
# extracts port code (group 1), and name (group 2)
# NOTE: the name's character class also accepts whitespace and is greedy, so
# any padding before the closing quote is captured as part of group 2.
port_pattern = re.compile(r"\s*'(\S+)\s*'\s+=\s+'([\S\s#,]+)\s*'")

# state regex pattern
# extracts code (group 1), and name (group 2); no spaces are allowed around '='
state_pattern = re.compile(r"\s*'([a-zA-Z0-9]+)'='([a-zA-Z\.\s]+)'")

### Read the labels file and fill dictionaries

In [4]:
# Parse the three lookup tables out of the SAS labels file.
#
# Each section of the file is addressed by a fixed window of 0-based line
# indices (as produced by `enumerate`). Every line inside a window is expected
# to match that section's regex: group 1 is the code, group 2 the description.

# Start from empty columns so that re-running this cell does not append the
# same rows a second time.
for table in (citi_code, port, state):
    for column in table.values():
        column.clear()

# (first index, last index, compiled regex, target dict, code column, text column)
sections = (
    (9, 297, pattern, citi_code, 'id', 'country'),
    (302, 961, port_pattern, port, 'code', 'airport'),
    (981, 1035, state_pattern, state, 'code', 'name'),
)

with open('I94_SAS_Labels_Descriptions.SAS', 'r') as fp:
    for i, line in enumerate(fp):
        for first, last, regex, table, code_col, text_col in sections:
            if not first <= i <= last:
                continue
            match = regex.search(line)
            if match is None:
                # Fail with the offending line instead of an opaque
                # AttributeError on `None.group`.
                raise ValueError(
                    f"line {i} does not match the expected "
                    f"{code_col}/{text_col} format: {line!r}"
                )
            table[code_col].append(match.group(1))
            # The capture groups are greedy and accept whitespace, so padding
            # before the closing quote ends up inside group 2; drop it.
            table[text_col].append(match.group(2).strip())
            break  # the windows are disjoint: a line belongs to one section

### Create dataframes from dictionaries

In [7]:
# Country lookup table: built from the accumulated columns and keyed by 'id'.
citi_code_df = pd.DataFrame(citi_code).set_index('id')

# Preview the first rows.
citi_code_df.head()

Unnamed: 0_level_0,country
id,Unnamed: 1_level_1
582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
236,AFGHANISTAN
101,ALBANIA
316,ALGERIA
102,ANDORRA


In [8]:
# Port lookup table: built from the accumulated columns and keyed by 'code'.
airport_df = pd.DataFrame(port).set_index('code')

# Preview the first rows.
airport_df.head()

Unnamed: 0_level_0,airport
code,Unnamed: 1_level_1
ALC,"ALCAN, AK"
ANC,"ANCHORAGE, AK"
BAR,"BAKER AAF - BAKER ISLAND, AK"
DAC,"DALTONS CACHE, AK"
PIZ,"DEW STATION PT LAY DEW, AK"


In [9]:
# State lookup table: built from the accumulated columns and keyed by 'code'.
states = pd.DataFrame(state).set_index('code')

# Preview the first rows.
states.head()

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
AL,ALABAMA
AK,ALASKA
AZ,ARIZONA
AR,ARKANSAS
CA,CALIFORNIA


In [10]:
# Visa category lookup table, written as (id, category) rows and keyed by 'id'.
visa_category = pd.DataFrame(
    [
        (1, 'Business'),
        (2, 'Pleasure'),
        (3, 'Student'),
    ],
    columns=['id', 'category'],
).set_index('id')

visa_category

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
1,Business
2,Pleasure
3,Student


In [11]:
# Travel mode lookup table, written as (id, mode) rows and keyed by 'id'.
travel_mode = pd.DataFrame(
    [
        (1, 'Air'),
        (2, 'Sea'),
        (3, 'land'),
        (9, 'Not reported'),
    ],
    columns=['id', 'mode'],
).set_index('id')

travel_mode

Unnamed: 0_level_0,mode
id,Unnamed: 1_level_1
1,Air
2,Sea
3,land
9,Not reported


### Save all dataframes

In [12]:
# Write every lookup table to its own CSV file in the working directory.
# The index of each frame is written as the first CSV column.
for csv_name, frame in (
    ('country_code.csv', citi_code_df),
    ('port_immigration.csv', airport_df),
    ('state_code.csv', states),
    ('visa_category.csv', visa_category),
    ('travel_mode.csv', travel_mode),
):
    frame.to_csv(csv_name)