# Exploratory Data Analysis

## The Immigration Data

In [151]:
import logging
import numpy as np
import pandas as pd
import psycopg2
from pprint import pprint
from tqdm import tqdm

In [36]:
pd.set_option('display.max_columns', 28)

In [37]:
immigration_data_fnames = ['data/18-83510-I94-Data-2016/i94_jan16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_feb16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_mar16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_may16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_jun16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_jul16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_aug16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_sep16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_oct16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_nov16_sub.sas7bdat',
                           'data/18-83510-I94-Data-2016/i94_dec16_sub.sas7bdat']

Because the column names are inconsistent across files I need to figure out what they are in each

In [38]:
dfs = []
for fname in immigration_data_fnames:
    myiter = pd.read_sas(fname, 'sas7bdat', encoding='ISO-8859-1', chunksize=20)
    dfs.append(next(myiter))
    continue

In [39]:
cnames_by_fname = {t[0].split('/')[-1].split('_')[1][:3]: list(t[1].columns.values) for t in zip(immigration_data_fnames, dfs)}

In [40]:
from collections import defaultdict
cbyf_reversed = defaultdict(list)
for k, v in cnames_by_fname.items():
    cbyf_reversed[','.join(v)].append(k)

In [41]:
print(len(cbyf_reversed))
cbyf_reversed.values()

2


dict_values([['jan', 'feb', 'mar', 'apr', 'may', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'], ['jun']])

So the only screwy one is june

In [42]:
usual = list(dfs[0].columns.values)
screwy = list(dfs[5].columns.values)
print(len(usual), len(screwy))
for z in (zip(usual, screwy)):
    print(z)

28 34
('cicid', 'cicid')
('i94yr', 'i94yr')
('i94mon', 'i94mon')
('i94cit', 'i94cit')
('i94res', 'i94res')
('i94port', 'i94port')
('arrdate', 'arrdate')
('i94mode', 'i94mode')
('i94addr', 'i94addr')
('depdate', 'depdate')
('i94bir', 'i94bir')
('i94visa', 'i94visa')
('count', 'count')
('dtadfile', 'validres')
('visapost', 'delete_days')
('occup', 'delete_mexl')
('entdepa', 'delete_dup')
('entdepd', 'delete_visa')
('entdepu', 'delete_recdup')
('matflag', 'dtadfile')
('biryear', 'visapost')
('dtaddto', 'occup')
('gender', 'entdepa')
('insnum', 'entdepd')
('airline', 'entdepu')
('admnum', 'matflag')
('fltno', 'biryear')
('visatype', 'dtaddto')


In [43]:
usual == [s for s in screwy if not (s.startswith('delete_') or s == 'validres')]

True

So I should just be able to read in the `usual` columns and be good...but `read_sas` doesn't have any params like that so...

In [10]:
# let's try debugging just with january, february, and june
# this seemed to work so let's retry and get the whole thing
# immigration_data_fnames = ['data/18-83510-I94-Data-2016/i94_jan16_sub.sas7bdat',
#                            'data/18-83510-I94-Data-2016/i94_feb16_sub.sas7bdat',
#                            'data/18-83510-I94-Data-2016/i94_jun16_sub.sas7bdat']

In [44]:
# running as script off to the side

# for fname in immigration_data_fnames:
#     logging.info(fname)
#     myiter = pd.read_sas(fname, 'sas7bdat', encoding='ISO-8859-1', chunksize=500000)
#     for sub_df in tqdm(myiter):
#         try:
#             immigration_df = pd.concat([immigration_df, sub_df], join='inner')
#         except NameError:
#             print('first timer here')
#             immigration_df = sub_df
#     if 'may16' in fname:  # special case for june
#         continue
#     else:
#         pd.to_sql('immigration_data', conn, schema='capstone', if_exists='append')

In [45]:
print(immigration_df.shape)
list(immigration_df.columns.values)

(2847924, 28)


['cicid',
 'i94yr',
 'i94mon',
 'i94cit',
 'i94res',
 'i94port',
 'arrdate',
 'i94mode',
 'i94addr',
 'depdate',
 'i94bir',
 'i94visa',
 'count',
 'dtadfile',
 'visapost',
 'occup',
 'entdepa',
 'entdepd',
 'entdepu',
 'matflag',
 'biryear',
 'dtaddto',
 'gender',
 'insnum',
 'airline',
 'admnum',
 'fltno',
 'visatype']

so that seems like it takes between five and five-and-a-half hours to load all the data into a data frame

## Immigration Data Headers File

`data/I94_SAS_Labels_Descriptions.SAS` is a text file with descriptions of what each column is and what can go inside it. Will be a bit of work to parse it, but should help

In [46]:
import pdir
with open('data/I94_SAS_Labels_Descriptions.SAS') as f:
    txt = f.read()
    f.seek(0)
    lines = f.readlines()
comment_lines = [line for line in lines if line.startswith('/*') and line.endswith('*/\n')]

In [47]:
import re
clpatt = re.compile(r'^/\*\s+(?P<code>.+?)\s+-\s+(?P<description>.+)\s+\*/$')
matches = [clpatt.match(cl) for cl in comment_lines]
if not all(m is not None for m in matches):
    for i, m in enumerate(matches):
        if m is None:
            print(i)
print(f'CODE{"":16}', 'DESCRIPTION')
for m in matches:
    print(f'{m.group("code"):20}', m.group('description'))

CODE                 DESCRIPTION
I94YR                4 digit year
I94MON               Numeric month
I94CIT & I94RES      This format shows all the valid and invalid codes for processing
I94PORT              This format shows all the valid and invalid codes for processing
I94MODE              There are missing values as well as not reported (9)
I94BIR               Age of Respondent in Years
COUNT                Used for summary statistics
DTADFILE             Character Date Field - Date added to I-94 Files - CIC does not use
VISAPOST             Department of State where where Visa was issued - CIC does not use
OCCUP                Occupation that will be performed in U.S. - CIC does not use
ENTDEPA              Arrival Flag - admitted or paroled into the U.S. - CIC does not use
ENTDEPD              Departure Flag - Departed, lost I-94 or is deceased - CIC does not use
ENTDEPU              Update Flag - Either apprehended, overstayed, adjusted to perm residence - CIC does not use
MAT

In [None]:
print(list(immigration_df.columns.values))

Since those descriptions don't make a ton of sense all the time, let me add my own anntations here:

- cicid - primary key
- i94yr - year (of arrival?)
- i94mon - month (of arrival?)
- i94cit - citizenship
- i94res - country of residence
- i94port - port of arrival? these are not airport codes
- arrdate - arrival date in USA
- i94mode - mode of arrival: land, sea, air
- i94addr - state...of arrival? their intended address?
- depdate - departure date from USA
- i94bir - age of respondent in years
- i94visa - visa type: business, pleasure, student
- count - "used for summary statistics"??
- dtadfile - character date field "CIC does not use"
- visapost - "Department of State where visa was issued - CIC does not use"
- occup - occupation that will be performed. "CIC does not use"
- entdepd - "Arrival Flag - admitted or paroled into the U.S. - CIC does not use"
- entdepu - "Departure Flag - Departed, lost I-94 or is deceased - CIC does not use"
- matflag - match of arrival and departure records
- biryear - year of birth
- dtaddto - date allowed to stay until "CIC does not use"
- gender - "non-immigrant sex"
- insnum - INS number
- airline - airline they came in on
- admnum - admission number
- fltno - flight number they came in on
- visatype - class of admission legally admitting non-immigrant to temporary stay in US

## The Temperature Data


In [48]:
temperature_df = pd.read_csv('data/GlobalLandTemperaturesByCity.csv')
print(list(temperature_df.columns.values))
print(temperature_df.shape)
temperature_df.head()

['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City', 'Country', 'Latitude', 'Longitude']
(8599212, 7)


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [274]:
dim_temperature_drop = 'DROP TABLE IF EXISTS dim_temperature;'
dim_temperature_create = """CREATE TABLE IF NOT EXISTS dim_temperature (
    dt date, avg_temp numeric, avg_temp_uncertainty numeric, city varchar,
    country varchar, latitude varchar, longitude varchar
);"""
dim_temperature_insert = """INSERT INTO dim_temperature
(dt, avg_temp, avg_temp_uncertainty, city,
    country, latitude, longitude)
VALUES (%s, %s, %s, %s, %s, %s, %s);"""

In [275]:
cur.execute(dim_temperature_create)

In [276]:
for _, row in temperature_df.iterrows():
    cur.execute(dim_temperature_insert, [v if pd.notna(v) else None for v in row])

## The City Demographics Data

In [49]:
city_demo_df = pd.read_csv('data/us-cities-demographics.csv', delimiter=';')
print(list(city_demo_df.columns.values))
print(city_demo_df.shape)
city_demo_df.head()

['City', 'State', 'Median Age', 'Male Population', 'Female Population', 'Total Population', 'Number of Veterans', 'Foreign-born', 'Average Household Size', 'State Code', 'Race', 'Count']
(2891, 12)


Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [281]:
dim_cities_drop = 'DROP TABLE IF EXISTS dim_city_demographics;'

dim_cities_create = """CREATE TABLE IF NOT EXISTS dim_city_demographics
(city varchar, state varchar, median_age numeric, male_pop int, female_pop int, total_pop int, num_vets int,
foreign_born int, avg_household_size float, state_code char(2), race varchar, count int);
"""

dim_cities_insert = """INSERT INTO dim_city_demographics
(city, state, median_age, male_pop, female_pop, total_pop, num_vets, foreign_born, avg_household_size, state_code,
race, count)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""


In [284]:
cur.execute(dim_cities_drop)
cur.execute(dim_cities_create)

In [287]:
for _, row in city_demo_df.iterrows():
    assert (row['Male Population'] + row['Female Population'] == row['Total Population']) or pd.isnull(row['Male Population']) or pd.isnull(row['Female Population'])
    cur.execute(dim_cities_insert, [v if pd.notna(v) else None for v in row])

## The Airport Codes Data

In [126]:
airport_codes_df = pd.read_csv('data/airport-codes.csv')
print(list(airport_codes_df.columns.values))
print(airport_codes_df.shape)
airport_codes_df.head()

['ident', 'type', 'name', 'elevation_ft', 'continent', 'iso_country', 'iso_region', 'municipality', 'gps_code', 'iata_code', 'local_code', 'coordinates']
(55075, 12)


Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [76]:
airport_codes_df[airport_codes_df['iso_country'] != 'US'].shape

(32318, 12)

In [78]:
airport_codes_df.ident.unique().shape

(55075,)

In [101]:
airport_codes_df['coordinates'].isnull().values.any()

False

In [85]:
airport_codes_df[airport_codes_df['continent'].notnull()]

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
223,03N,small_airport,Utirik Airport,4.0,OC,MH,MH-UTI,Utirik Island,K03N,UTK,03N,"169.852005, 11.222"
1111,0TT8,heliport,Dynasty Heliport,150.0,OC,MP,MP-U-A,"San Jose, Tinian",0TT8,,0TT8,"145.64199829101562, 14.963299751281738"
10134,9OG1,heliport,Barrigada Readiness Center Heliport,311.0,OC,GU,GU-U-A,Guam,9OG1,,9OG1,"144.812142, 13.475863"
10368,AAD,small_airport,Adado Airport,1001.0,AF,SO,SO-GA,Adado,,AAD,,"46.6375, 6.095802"
10369,AAXX,small_airport,Rothera Point Airport,,AN,AQ,AQ-U-A,Rothera Point,AAXX,,,"-68.1269931793, -67.5669411575"
10370,ABE,closed,RAF Calveley,,EU,GB,GB-ENG,Cheshire,,,,"-2.603889, 53.113333"
10371,ABL,closed,RNAS/RAF Calshot,,EU,GB,GB-ENG,Hampshire,,,,"-1.30677223206, 50.8199131549"
10372,ABP,small_airport,Atkamba Airport,150.0,OC,PG,PG-WPD,Atkamba Mission,,ABP,AKA,"141.095277778, -6.0655555555600005"
10373,ABW,closed,Abau Airport,10.0,OC,PG,PG-CPM,Abau,,ABW,,"148.7389, -10.1956"
10374,AD-0001,heliport,CamÃ­ Heliport,,EU,AD,AD-04,La Massana,,,,"1.51916, 42.546257"


In [139]:
airport_codes_df[airport_codes_df['continent'].str.len() > 2]

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,longitude,latitude


## Next Steps

- ? consider questions you want to ask
- start modeling the data (on paperish)
- clean the data

## Questions To Ask

These are not final but what I'm considering now

1. Do changes in immigrant residence and citizenship change with those places' temperatures?
1. What are the countries we're seeing change most in immigrant residence and citizenship?
1. Where the people are headed, what's the % of US born? I.e., are people going to immigrant strongholds or new places?

## What I'd need to answer those questions

### Change in immigrant res and cit related to those places' temperatures

Things I'll need:

1. Country of residence over time
1. Country of citizenship over time
1. Temperature over time by country

### Do I have those things?

#### What is the time range over which I have immigration data?

First, I think it's just 2016 so while I'll be able to show what countries have the highest emigration in 2016, I won't be able to show changes for more time than that

Second, to be sure of what I have for this data, I'm going to have to probably ingest the whole thing into a Redshift table...this data is big...I'm currently trying to import all of January up higher, but that's taking a while and it's only January...I may very well have to model this somehow before I really know what I have

...

Two hours in and it's still loading January, so what I'll likely have to do is load...a bunch, and play with that, and worry about the big data later

Alternatively I could increase the chunk size to something more reasonable (1000?) and go from there

...

16 hours 50 minutes in and it's still loading January, yeah this won't work

...

Changed chunk size to 100,000, so it should only take 11 iterations to get back to the 55k iterations that took 17 hours before I killed it

Takes six seconds per iteration, let's try one million then, and after two iterations will be bigger than what took 17 hours before. That takes 48 seconds per iteration...too slow to be very satisfying, but still it takes two minutes to bring in what took 17 hours before, so I'll be curious how long this takes now...oh it's done after two iterations for crying out loud at 2.847 MM rows

...

Tried reading in the whole year but got stuck ish in June/July when I got an error that said 

```0it [00:00, ?it/s]/Users/scott/dend_capstone/venv/lib/python3.7/site-packages/ipykernel_launcher.py:17: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.```

so look into that

In [29]:
integer_columns = ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'arrdate', 'i94visa', 'count', 'admnum']
integer_but_nan_columns = ['i94mode', 'i94bir', 'depdate', 'insnum', 'biryear', 'fltno']
integer_mapper = {k: 'int32' for k in integer_columns}
immigration_df = immigration_df.astype(integer_mapper)

In [30]:
immigration_df.dtypes

cicid         int32
i94yr         int32
i94mon        int32
i94cit        int32
i94res        int32
i94port      object
arrdate       int32
i94mode     float64
i94addr      object
depdate     float64
i94bir      float64
i94visa       int32
count         int32
dtadfile     object
visapost     object
occup        object
entdepa      object
entdepd      object
entdepu      object
matflag      object
biryear     float64
dtaddto      object
gender       object
insnum       object
airline      object
admnum        int32
fltno        object
visatype     object
dtype: object

#### Exploring some of those nan-able columsn to see if I can convert to 0

In [37]:
# return sone row but sas header file says it's invalid so assume 0 == NaN
immigration_df[immigration_df['i94mode'] == 0]

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
90487,150710,2016,1,245,245,XXX,20480,0.0,,,26.0,3,1,20120831,,,,,,,1990.0,D/S,,,,-2147483648,,F1


In [40]:
# This is age of respondent in years, so I have to assume 0 is different from NaN, so I can't convert this one
# Which means I'll have to figure out how to integerize some of these as they go into the database
# So I should probably stop spending time on this
immigration_df[immigration_df['i94bir'] == 0].shape

(47, 28)

#### Let's try getting the data into a database so I can explore it better

In [67]:
conn = psycopg2.connect('host=127.0.0.1 dbname=capstone user=capstone_user password=capstone_pw')
conn.set_session(autocommit=True)
cur = conn.cursor()
#cur.execute('select * from information_schema.tables')
#rows = cur.fetchall()
#print(rows)

In [21]:
cur.execute('select * from information_schema.tables')
rows = cur.fetchall()
print(rows)

[('capstone', 'pg_catalog', 'pg_type', 'BASE TABLE', None, None, None, None, None, 'YES', 'NO', None), ('capstone', 'pg_catalog', 'pg_roles', 'VIEW', None, None, None, None, None, 'NO', 'NO', None), ('capstone', 'pg_catalog', 'pg_settings', 'VIEW', None, None, None, None, None, 'NO', 'NO', None), ('capstone', 'pg_catalog', 'pg_rules', 'VIEW', None, None, None, None, None, 'NO', 'NO', None), ('capstone', 'pg_catalog', 'pg_stat_xact_user_functions', 'VIEW', None, None, None, None, None, 'NO', 'NO', None), ('capstone', 'pg_catalog', 'pg_stat_archiver', 'VIEW', None, None, None, None, None, 'NO', 'NO', None), ('capstone', 'pg_catalog', 'pg_stat_bgwriter', 'VIEW', None, None, None, None, None, 'NO', 'NO', None), ('capstone', 'pg_catalog', 'pg_attribute', 'BASE TABLE', None, None, None, None, None, 'YES', 'NO', None), ('capstone', 'pg_catalog', 'pg_proc', 'BASE TABLE', None, None, None, None, None, 'YES', 'NO', None), ('capstone', 'pg_catalog', 'pg_class', 'BASE TABLE', None, None, None, Non

In [4]:
for fname in immigration_data_fnames:
    print(fname)
    myiter = pd.read_sas(fname, 'sas7bdat', encoding='ISO-8859-1', chunksize=500000)
    for sub_df in tqdm(myiter):
        try:
            immigration_df = pd.concat([immigration_df, sub_df], join='inner')
        except NameError:
            print('first timer here')
            immigration_df = sub_df
    if 'may16' in fname:  # special case for june
        continue
    else:
        immigration_df.to_sql('immigration_data', conn, schema='capstone', if_exists='append')
        del immigration_df

1it [00:21, 21.04s/it]

first timer here


6it [02:07, 20.81s/it]


NameError: name 'conn' is not defined

In [7]:
pd.__version__

'0.25.0'

In [9]:
import sqlalchemy

In [32]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://capstone_user:capstone_pw@localhost:5432/capstone')

In [31]:
immigration_df.shape

(2847924, 28)

In [184]:
# doesn't work in notebooks
# immigration_df.to_sql('immigration_data', engine.raw_connection(), if_exists='append')

In [51]:
with open('data/I94_SAS_Labels_Descriptions.SAS') as f:
    lines = f.readlines()
country_lines = lines[9:298]
print(country_lines[0], country_lines[-1])

   582 =  'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)'
    996 =  'No Country Code (996)' ;



In [52]:
len(country_lines)

289

In [53]:
import re
patt = re.compile(r"^\s*(?P<code>\d+)\s*=\s*'(?P<country>.+)'.*$")
matches = [patt.match(line) for line in country_lines]

In [58]:
country_codes = {int(match.group('code')): match.group('country') for match in matches}

In [62]:
assert len(country_lines) == len(country_codes)

In [63]:
list(country_codes.items())[:5]

[(582, 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)'),
 (236, 'AFGHANISTAN'),
 (101, 'ALBANIA'),
 (316, 'ALGERIA'),
 (102, 'ANDORRA')]

All right, so turn that into a table

In [69]:
dim_country_drop = 'DROP TABLE IF EXISTS dim_country;'

dim_country_create = """CREATE TABLE IF NOT EXISTS dim_country
(code int PRIMARY KEY, name varchar NOT NULL);"""

In [70]:
cur.execute(dim_country_create)

In [74]:
dim_country_insert = """INSERT INTO dim_country
(code, name)
VALUES (%s, %s)
ON CONFLICT (code) DO NOTHING;"""

In [75]:
for item in country_codes.items():
    cur.execute(dim_country_insert, item)

In [113]:
dim_airport_drop = 'DROP TABLE IF EXISTS dim_airport;'

dim_airport_create = """CREATE TABLE IF NOT EXISTS dim_airport
(identity varchar PRIMARY KEY, type varchar NOT NULL, name varchar NOT NULL,
 elevation_ft int, continent char(2), iso_country char(2), iso_region char(7) NOT NULL,
 municipality varchar, gps_code char(4), iata_code char(3), local_code char(7),
 latitude numeric NOT NULL, longitude numeric NOT NULL);"""

In [137]:
dim_airport_insert = """INSERT INTO dim_airport
(identity, type, name, elevation_ft, continent, iso_country, iso_region,
 municipality, gps_code, iata_code, local_code, latitude, longitude)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (identity) DO NOTHING;"""

In [114]:
cur.execute(dim_airport_create)

In [127]:
airport_codes_df['longitude'] = airport_codes_df['coordinates'].str.split(',').str[0]
airport_codes_df['latitude'] = airport_codes_df['coordinates'].str.split(',').str[1]

In [128]:
airport_codes_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,longitude,latitude
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",-74.93360137939453,40.07080078125
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",-101.473911,38.704022
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",-151.695999146,59.94919968
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",-86.77030181884766,34.86479949951172
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087",-91.254898,35.6087


In [133]:
airport_codes_df = airport_codes_df.drop(columns='coordinates') 
airport_codes_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,longitude,latitude
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,-74.93360137939453,40.07080078125
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,-101.473911,38.704022
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,-151.695999146,59.94919968
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,-86.77030181884766,34.86479949951172
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,-91.254898,35.6087


In [183]:
for _, row in airport_codes_df.iterrows():
    cur.execute(dim_airport_insert, [v if pd.notna(v) else None for v in row])

Well it's in there but I don't think it does much good

#### dim_date

In [268]:
dim_date_drop = 'DROP TABLE IF EXISTS dim_date;'

dim_date_create = """CREATE TABLE dim_date
(code int PRIMARY KEY, year int NOT NULL, month int NOT NULL,
 day int NOT NULL, day_of_week INT NOT NULL, ymd_dash char(10) NOT NULL,
 ymd_nodash char(8) NOT NULL, mdy_nodash char(8) NOT NULL);
"""
cur.execute(dim_date_drop)
cur.execute(dim_date_create)

In [269]:
from datetime import datetime, timedelta

In [270]:
dim_date_insert = """INSERT INTO dim_date
(code, year, month, day, day_of_week, ymd_dash, ymd_nodash, mdy_nodash)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""

In [271]:
dt = datetime(2016, 1, 1)
end_dt = datetime(2019, 12, 31)
one_day = timedelta(days=1)
code = 20454

while dt <= end_dt:
    cur.execute(dim_date_insert, 
               [code, dt.year, dt.month, dt.day, dt.weekday(), dt.strftime('%Y-%m-%d'),
                dt.strftime('%Y%m%d'), dt.strftime('%d%m%Y')]
               )
    dt = dt + one_day
    code += 1
    
dt = datetime(2015, 12, 31)
end_dt = datetime(1900, 1, 1)
code = 20453

while dt >= end_dt:
    cur.execute(dim_date_insert, 
               [code, dt.year, dt.month, dt.day, dt.weekday(), dt.strftime('%Y-%m-%d'),
                dt.strftime('%Y%m%d'), dt.strftime('%d%m%Y')]
               )
    dt = dt - one_day
    code -=1

#### dim_arrival_mode

In [215]:
dim_arr_mode_drop = 'DROP TABLE IF EXISTS dim_arrival_mode;'

dim_arr_mode_create = """CREATE TABLE IF NOT EXISTS dim_arrival_mode
(code int, mode char(12))"""

dim_arr_mode_insert = """INSERT INTO dim_arrival_mode (code, mode)
VALUES (1, 'Air'), (2, 'Sea'), (3, 'Land'), (9, 'Not reported');"""

In [216]:
cur.execute(dim_arr_mode_drop)
cur.execute(dim_arr_mode_create)
cur.execute(dim_arr_mode_insert)

#### dim_address

Though I still don't know what this column means

In [230]:
dim_address_drop = 'DROP TABLE IF EXISTS dim_address;'
dim_address_create = 'CREATE TABLE IF NOT EXISTS dim_address (code char(2), name varchar);'
dim_address_insert = 'INSERT INTO dim_address (code, name) VALUES (%s, %s);'

cur.execute(dim_address_drop)
cur.execute(dim_address_create)

In [231]:
address_lines = lines[981:1036]
patt = re.compile(r"^\s*'(?P<code>..)'\s*=\s*'(?P<name>.+)'.*$")
matches = [patt.match(line) for line in address_lines]
address_codes = {match.group('code'): match.group('name') for match in matches}
assert len(address_codes) == len(address_lines)

In [232]:
for item in sorted(address_codes.items()):
    cur.execute(dim_address_insert, item)

#### dim_visa_type

In [236]:
dim_visa_type_drop = 'DROP TABLE IF EXISTS dim_visa_type;'

dim_visa_type_create = """CREATE TABLE IF NOT EXISTS dim_visa_type
(code int, visa_type char(8))"""

dim_visa_type_insert = """INSERT INTO dim_visa_type (code, visa_type)
VALUES (1, 'Business'), (2, 'Pleasure'), (3, 'Student');"""

In [237]:
cur.execute(dim_visa_type_drop)
cur.execute(dim_visa_type_create)
cur.execute(dim_visa_type_insert)

#### dim_port

In [238]:
dim_port_drop = 'DROP TABLE IF EXISTS dim_port;'
dim_port_create = 'CREATE TABLE IF NOT EXISTS dim_port (code char(3), name varchar);'
dim_port_insert = 'INSERT INTO dim_port (code, name) VALUES (%s, %s);'

cur.execute(dim_port_drop)
cur.execute(dim_port_create)

In [248]:
port_lines = lines[302:962]
patt = re.compile(r"^\s*'(?P<code>...?)'\s*=\s*'(?P<name>.+)'.*$")
matches = [patt.match(line) for line in port_lines]
port_codes = {match.group('code'): match.group('name').strip() for match in matches}
assert len(port_codes) == len(port_lines)

In [249]:
for item in sorted(port_codes.items()):
    cur.execute(dim_port_insert, item)