In [308]:
# List the contents of ../data, the directory the rest of this notebook reads its inputs from.
!ls ../data

29_2150_compressed_GlobalLandTemperaturesByCity.csv.zip
29_2150_compressed_GlobalLandTemperaturesByCountry.csv.zip
29_2150_compressed_GlobalLandTemperaturesByMajorCity.csv.zip
29_2150_compressed_GlobalLandTemperaturesByState.csv.zip
I94_SAS_Labels_Descriptions.SAS
airport-codes_csv.csv
capstone-user.csv
datasets_29_2150_GlobalTemperatures.csv
immigration_data_sample.csv
[34moutput[m[m
[34msas_data[m[m
sub-est2019_all.csv
us-cities-demographics.csv
us_states.csv


In [665]:
import time
import os
import re

import pandas as pd
import numpy as np
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType as R, StructField as Fld
from pyspark.sql.types import IntegerType, StringType, DoubleType, DateType
from pyspark.sql.functions import udf, col, to_date
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, dayofweek, date_format, from_unixtime

In [666]:
# Base directory for the notebook's outputs; the time-table parquet path is
# built by joining a table name onto it.
output_data = "../data/output"

In [667]:
##i94 schema
# Explicit schema for the i94 CSV, declared as ordered (column name, Spark type)
# pairs. Every field is created with the default nullability, and the order
# below is the column order of the schema.
i94Schema = R([
    Fld(field_name, field_type())
    for field_name, field_type in [
        ("_c0", StringType),
        ("cicid", DoubleType),
        ("i94yr", DoubleType),
        ("i94mon", DoubleType),
        ("i94cit", DoubleType),
        ("i94res", DoubleType),
        ("i94port", StringType),
        ("arrdate", DoubleType),
        ("i94mode", DoubleType),
        ("i94addr", StringType),
        ("depdate", DoubleType),
        ("i94bir", DoubleType),
        ("i94visa", DoubleType),
        ("count", DoubleType),
        ("dtadfile", StringType),
        ("visapost", StringType),
        ("occup", StringType),
        ("entdepa", StringType),
        ("entdepd", StringType),
        ("entdepu", StringType),
        ("matflag", StringType),
        ("biryear", DoubleType),
        ("dtaddto", StringType),
        ("gender", StringType),
        ("insnum", StringType),
        ("airline", StringType),
        ("admnum", StringType),
        ("fltno", StringType),
        ("visatype", StringType),
    ]
])

In [668]:
## read
# Get (or create) the Spark session for this notebook, registered as 'sample'.
spark = (
    SparkSession.builder
    .appName('sample')
    .getOrCreate()
)

# Load the i94 sample CSV: the first row is a header and the columns are typed
# by the explicit i94Schema rather than inferred.
df_i94 = (
    spark.read
    .schema(i94Schema)
    .option('header', True)
    .csv('../data/immigration_data_sample.csv')
)

In [651]:
## for time table
# Build the time dimension: one row per distinct arrival date with its calendar parts.
print("\nStarting to process time data.")
st = time.time()

# Seconds-since-1970 for a SAS day count (days since 1960-01-01; 315619200 is
# the number of seconds between 1960-01-01 and 1970-01-01). Kept, now
# null-safe, in case other cells still reference it.
get_timestamp = udf(lambda x: None if x is None else int(x*24*60*60)-315619200, IntegerType())

# Convert the SAS day count straight to a calendar date. Going through
# from_unixtime() renders the instant in the Spark session time zone, which
# shifts every date back by one day on machines west of UTC.
SAS_EPOCH_ORDINAL = datetime(1960, 1, 1).toordinal()
sas_to_date = udf(
    lambda days: None if days is None
    else datetime.fromordinal(SAS_EPOCH_ORDINAL + int(days)).date(),
    DateType())

# Only convert while the column is still numeric so the cell can be re-run safely.
if dict(df_i94.dtypes)['arrdate'] != 'date':
    df_i94 = df_i94.withColumn('arrdate', sas_to_date(col('arrdate')))

# Rows without an arrival date cannot be a key of the time dimension.
time_table = (
    df_i94
    .where(col("arrdate").isNotNull())
    .select(
        col("arrdate").alias("arrival_date"),
        dayofmonth("arrdate").alias("day"),
        weekofyear("arrdate").alias("week"),
        month("arrdate").alias("month"),
        year("arrdate").alias("year"),
        dayofweek("arrdate").alias("weekday"))
    .dropDuplicates(['arrival_date'])
)
# NOTE: Spark evaluates lazily, so this timing only covers building the plan.
print(f" Finished processing time table. Used {(time.time() - st)/60:5.2} min.")


Starting to process time data.
 Finished processing time table. Used 0.00089 min.


In [605]:
print(' Writing time data to S3.')
st = time.time()
# write time table to parquet files partitioned by year and month
# (the destination is whatever `output_data` points at; any previous output
# at the same path is overwritten)
output_path = os.path.join(output_data, 'time_table_run.parquet')
(
    time_table.write
    .mode('overwrite')
    .partitionBy('year', 'month')
    .parquet(output_path)
)
print(f"======== time data Done in  {(time.time() - st)/60:5.2}min.")

 Writing time data to S3.


In [558]:
## for port table
# State lookup table; its 'Abbreviation' column is used further down to
# recognise US ports.
df_states = pd.read_csv('../data/us_states.csv')

# Read the SAS label file, trimming each line and discarding the blank ones.
with open('../data/I94_SAS_Labels_Descriptions.SAS', 'r') as file_object:
    res = [stripped for stripped in map(str.strip, file_object) if stripped]

# Glue the trimmed lines into one string and cut it at every ';', giving one
# chunk of text per SAS statement.
i94_desp = ''.join(res).split(';')

In [559]:
# process port description data
print("\nStarting to process port table data.")
st=time.time()

# Locate the statement that holds the I94PORT code list. Fail loudly when it is
# missing instead of silently parsing whichever statement happens to be last.
desp = next((section for section in i94_desp if 'I94PORT' in section), None)
if desp is None:
    raise ValueError("No 'I94PORT' section found in the SAS label descriptions.")

# The doubled quote in INT''L would be mistaken for an entry separator below.
desp = desp.replace("INT''L FALLS, MN", "INTL FALLS, MN")
desp = desp.replace("\t\t", "\t")
port_matches = re.findall(re.compile("\'([A-Z0-9]+\'\\t=\\t\'.+)\'"), desp)
if not port_matches:
    raise ValueError("Could not parse any port entries from the 'I94PORT' section.")
port_info = port_matches[0]

# Entries are back to back once the lines were joined, so the closing quote of
# one entry and the opening quote of the next form the separator "''".
port_info_list = port_info.split("''")
port_info_list = [x.split("\t=\t") for x in port_info_list]

df_port_info = pd.DataFrame(port_info_list, columns=['port_code', 'port_add'])

df_port_info['port_code'] = df_port_info['port_code'].str.replace("'", "", regex=False)
df_port_info['port_add'] = df_port_info['port_add'].str.replace("'", "", regex=False).str.strip()

# process the outlier
# Descriptions that do not name a real place are normalised to 'NA, NA'.
# The markers are matched literally, spelled as they appear in the label file.
for placeholder_marker in ('No PORT Code', 'Collapsed', 'UNIDENTIFED', 'UNKNOWN'):
    is_placeholder = df_port_info['port_add'].str.contains(placeholder_marker, regex=False)
    df_port_info.loc[is_placeholder, 'port_add'] = 'NA, NA'

# Two entries deviate from the "<place>, <state>" layout and are rewritten by hand.
df_port_info.loc[df_port_info['port_add'].str.contains('#INTL', regex=False), 'port_add'] = 'BELLINGHAM, WASHINGTON WA'
df_port_info.loc[df_port_info['port_add'].str.contains('PASO DEL NORTE,TX', regex=False), 'port_add'] = 'PASO DEL NORTE, TX'

# State abbreviations used to recognise US ports.
states_list = df_states['Abbreviation'].unique()
def _state_token(x, states=None):
    """Return the last space-separated token of `x` that is a known state code.

    Args:
        x: Port description string, e.g. 'ALBANY, NY'.
        states: Iterable of state abbreviations to match against. Defaults to
            the notebook-level `states_list`.

    Returns:
        The matching token, or None when no token of `x` is a state code.
    """
    known_states = set(states_list if states is None else states)
    # Scan from the right: the state abbreviation trails the place name, and a
    # leading word that happens to equal a state code must not win.
    for token in reversed(x.split(' ')):
        if token in known_states:
            return token
    return None


def is_us(x, states=None):
    """Return True when the port description `x` contains a US state code token.

    Args:
        x: Port description string.
        states: Optional iterable of state abbreviations (defaults to `states_list`).
    """
    return _state_token(x, states) is not None


def get_state(x, states=None):
    """Return the state code found in the port description `x`, or 'NA' if none.

    Args:
        x: Port description string.
        states: Optional iterable of state abbreviations (defaults to `states_list`).
    """
    state = _state_token(x, states)
    return 'NA' if state is None else state


def get_address(x, states=None):
    """Return the port description `x` without its state code and commas.

    Only the state *token* is dropped; a plain substring replace would also eat
    the same two letters inside the place name ('ALBANY, NY' -> 'ALBA').
    Descriptions without a state code are returned unchanged.

    Args:
        x: Port description string.
        states: Optional iterable of state abbreviations (defaults to `states_list`).
    """
    state = _state_token(x, states)
    if state is None:
        return x
    tokens = x.split(' ')
    # Remove the last occurrence, i.e. the token _state_token() selected.
    last_position = len(tokens) - 1 - tokens[::-1].index(state)
    del tokens[last_position]
    return ' '.join(tokens).replace(',', '').strip()

# Keep only the ports recognised as US ports. The explicit copy makes the
# filtered frame independent, so the column assignments below do not write to a
# slice of the original (SettingWithCopyWarning).
df_port_info = df_port_info[df_port_info['port_add'].apply(is_us)].copy()

# Split the description into its state code and the remaining address text.
df_port_info['state'] = df_port_info['port_add'].apply(get_state)
df_port_info['address'] = df_port_info['port_add'].apply(get_address)

# Final port dimension: port_key, state, address.
df_port_info = df_port_info[['port_code', 'state', 'address']]
df_port_info = df_port_info.rename(columns={'port_code': 'port_key'})
print(f" Finished processing port table. Used {(time.time() - st)/60:5.2} min.")


Starting to process port table data.
 Finished processing port table. Used 0.0071 min.


In [560]:
## process country tables
# Locate the statement that lists the I94CIT / I94RES codes; fail loudly when
# it is missing instead of silently parsing whichever statement is last.
desp = next((section for section in i94_desp if 'I94CIT & I94RES' in section), None)
if desp is None:
    raise ValueError("No 'I94CIT & I94RES' section found in the SAS label descriptions.")

# Each entry looks like  <digits> = '<name>'  with one or two spaces before '='
# and two or three after it. Raw string: the backslashes belong to the regex
# and are not valid Python string escapes.
region_pattern = re.compile(r"([0-9]+) {1,2}=  {1,2}'([A-Za-z0-9 ,\(\)\.:\/-]+)'")
region_list = region_pattern.findall(desp)
df_region = pd.DataFrame(region_list, columns=['region_key', 'region_name'])

df_region

Unnamed: 0,region_key,region_name
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
1,236,AFGHANISTAN
2,101,ALBANIA
3,316,ALGERIA
4,102,ANDORRA
...,...,...
284,791,No Country Code (791)
285,849,No Country Code (849)
286,914,No Country Code (914)
287,944,No Country Code (944)


In [561]:
## process i94 mode table
# Locate the statement that lists the I94MODE codes; fail loudly when it is
# missing instead of silently parsing whichever statement is last.
desp = next((section for section in i94_desp if 'I94MODE' in section), None)
if desp is None:
    raise ValueError("No 'I94MODE' section found in the SAS label descriptions.")

# Each entry looks like  <digit 1-9> = '<label>'.
mode_pattern = re.compile("([1-9]) = \'([a-zA-Z ]+)\'")
i94mode_list = mode_pattern.findall(desp)
df_i94mode = pd.DataFrame(i94mode_list, columns=['i94mode_key', 'i94mode'])
df_i94mode

Unnamed: 0,i94mode_key,i94mode
0,1,Air
1,2,Sea
2,3,Land
3,9,Not reported


In [562]:
## process state table
# Locate the statement that lists the I94ADDR codes; fail loudly when it is
# missing instead of silently parsing whichever statement is last.
desp = next((section for section in i94_desp if 'I94ADDR' in section), None)
if desp is None:
    raise ValueError("No 'I94ADDR' section found in the SAS label descriptions.")

# Each entry looks like  '<2-char code>'='<name>'. Raw string: the backslash
# belongs to the regex and is not a valid Python string escape.
state_pattern = re.compile(r"'([A-Z9]{2})'='([a-zA-Z \.]+)'")
state_info_list = state_pattern.findall(desp)
df_state = pd.DataFrame(state_info_list, columns=['state_code', 'state'])
df_state.head()

Unnamed: 0,state_code,state
0,AL,ALABAMA
1,AK,ALASKA
2,AZ,ARIZONA
3,AR,ARKANSAS
4,CA,CALIFORNIA


In [563]:
## process visa table
# Locate the statement that describes the I94VISA codes; fail loudly when it is
# missing instead of silently parsing whichever statement is last.
desp = next((section for section in i94_desp if 'I94VISA' in section), None)
if desp is None:
    raise ValueError("No 'I94VISA' section found in the SAS label descriptions.")

# Each entry looks like  <digit 1-3> = <word>  (the label is not quoted here).
visa_pattern = re.compile("([1-3]) = ([A-Za-z]+)")
visa_list = visa_pattern.findall(desp)
df_visa = pd.DataFrame(visa_list, columns=['visa_key', 'visa_broad_type'])
df_visa

Unnamed: 0,visa_key,visa_type
0,1,Business
1,2,Pleasure
2,3,Student


In [669]:
## process i94 fact table
# Keep the columns of the fact table, renaming the raw i94 fields to the key
# and attribute names used by the dimension tables.
df_i94 = df_i94.select([col('cicid').alias('i94_key'),
                       col('i94res').alias('res_region_key'),
                       col('i94cit').alias('cit_region_key'),
                       col('i94port').alias('port_key'),
                       col('arrdate').alias('arrival_date'),
                       col('i94mode').alias('i94mode_key'),
                       col('i94addr').alias('state_code'),
                       col('depdate').alias('departure_date'),
                       col('i94bir').alias('age'),
                       col('i94visa').alias('visa_key'),
                       col('count').alias('i94_count'),
                       col('dtadfile').alias('i94_file_date'),
                       col('occup'),
                       col('biryear'),
                       col('dtaddto').alias('i94_leave_date'),
                       col('gender'),
                       col('insnum'),
                       col('airline'),
                       col('admnum').alias('i94_admin_num'),
                       col('fltno'),
                       col('visatype')
                        ])

# These columns are read as doubles by the schema; store them as integers.
int_columns = ['i94_key', 'res_region_key', 'cit_region_key', 'i94mode_key',
               'age', 'visa_key', 'i94_count', 'biryear']
for int_column in int_columns:
    df_i94 = df_i94.withColumn(int_column, col(int_column).cast(IntegerType()))

# Seconds-since-1970 for a SAS day count (days since 1960-01-01; 315619200 is
# the number of seconds between 1960-01-01 and 1970-01-01). Kept for any cell
# that still references it; the date columns below use sas_to_date instead.
get_timestamp = udf(lambda x: None if x is None else int(x*24*60*60)-315619200, IntegerType())

# SAS day count -> calendar date, null-safe and independent of the session time zone.
SAS_EPOCH_ORDINAL = datetime(1960, 1, 1).toordinal()
sas_to_date = udf(
    lambda days: None if days is None
    else datetime.fromordinal(SAS_EPOCH_ORDINAL + int(days)).date(),
    DateType())

# arrival_date is already a date when the time-table cell has converted
# `arrdate` in place; departure_date is always still numeric at this point.
# Previously the departure conversion was assigned to an unused variable, so
# the column kept its raw SAS number.
column_types = dict(df_i94.dtypes)
for sas_column in ('arrival_date', 'departure_date'):
    if column_types[sas_column] != 'date':
        df_i94 = df_i94.withColumn(sas_column, sas_to_date(col(sas_column)))

# File date is a 'YYYYMMDD' string; nulls stay null instead of crashing the UDF.
str_to_date = udf(
    lambda x: None if x is None else datetime.strptime(x, "%Y%m%d").date(),
    DateType())
df_i94 = df_i94.withColumn('i94_file_date', str_to_date(df_i94.i94_file_date))

# Leave date is an 'MMDDYYYY' string; null and the 'D/S' sentinel map to null.
str_to_date = udf(
    lambda x: None if x is None or x == 'D/S' else datetime.strptime(x, "%m%d%Y").date(),
    DateType())
df_i94 = df_i94.withColumn('i94_leave_date', str_to_date(df_i94.i94_leave_date))

# Strip only a trailing '.0' (float formatting of the admission number);
# a plain replace would also delete '.0' occurring inside the value.
remove_zero = udf(lambda x: x[:-2] if x is not None and x.endswith('.0') else x)
df_i94 = df_i94.withColumn('i94_admin_num', remove_zero(df_i94.i94_admin_num))

In [696]:
## process demographics table
# The demographics file is ';'-separated.
df_demo = pd.read_csv('../data/us-cities-demographics.csv', sep=';')

# Normalise the headers: spaces become underscores and everything is lower-cased.
df_demo.columns = [header.replace(' ', '_').lower() for header in df_demo.columns]

city                       0
state                      0
median_age                 0
male_population            3
female_population          3
total_population           0
number_of_veterans        13
foreign-born              13
average_household_size    16
state_code                 0
race                       0
count                      0
dtype: int64

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign-born,average_household_size,state_code,race,count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402
