In [204]:
!ls ../data

29_2150_compressed_GlobalLandTemperaturesByCity.csv.zip
29_2150_compressed_GlobalLandTemperaturesByCountry.csv.zip
29_2150_compressed_GlobalLandTemperaturesByMajorCity.csv.zip
29_2150_compressed_GlobalLandTemperaturesByState.csv.zip
I94_SAS_Labels_Descriptions.SAS
airport-codes_csv.csv
capstone-user.csv
datasets_29_2150_GlobalTemperatures.csv
immigration_data_sample.csv
[34moutput[m[m
[34msas_data[m[m
sub-est2019_all.csv
us-cities-demographics.csv
us_states.csv


In [205]:
import configparser
import time
import os
import re

import pandas as pd
import numpy as np
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType as R, StructField as Fld
from pyspark.sql.types import IntegerType, StringType, DoubleType, DateType
from pyspark.sql.functions import udf, col, to_date
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, dayofweek, date_format, from_unixtime

In [206]:
config_s3 = configparser.ConfigParser()
config_s3.read_file(open('../aws_setup.cfg'))

S3_REGION              = config_s3.get('S3', 'REGION')
S3_BUCKET_NAME         = config_s3.get('S3', 'NAME')

In [210]:
df_tmp = pd.read_csv("s3://immigrate-demographics-s3-1629/output/label_desp_staging.csv")

PermissionError: Forbidden

In [200]:
df_tmp.loc[df_tmp['type'] == 'state']

Unnamed: 0,code,value,type
956,AL,ALABAMA,state
957,AK,ALASKA,state
958,AZ,ARIZONA,state
959,AR,ARKANSAS,state
960,CA,CALIFORNIA,state
961,CO,COLORADO,state
962,CT,CONNECTICUT,state
963,DE,DELAWARE,state
964,DC,DIST. OF COLUMBIA,state
965,FL,FLORIDA,state


In [193]:
df_tmp['type'].unique()

array(['port', 'country', 'i94_mode', 'visa', 'state'], dtype=object)

In [126]:
output_data = '../data/output/'

In [108]:
##i94 schema
i94Schema = R([
    Fld("_c0", StringType()),
    Fld("cicid", DoubleType()),
    Fld("i94yr", DoubleType()),
    Fld("i94mon", DoubleType()),
    Fld("i94cit", DoubleType()),
    Fld("i94res", DoubleType()),
    Fld("i94port", StringType()),
    Fld("arrdate", DoubleType()),
    Fld("i94mode", DoubleType()),
    Fld("i94addr", StringType()),
    Fld("depdate", DoubleType()), 
    Fld("i94bir", DoubleType()), 
    Fld("i94visa", DoubleType()),
    Fld("count", DoubleType()),
    Fld("dtadfile", StringType()),
    Fld("visapost", StringType()),
    Fld("occup", StringType()),
    Fld("entdepa", StringType()),
    Fld("entdepd", StringType()),
    Fld("entdepu", StringType()),
    Fld("matflag", StringType()),
    Fld("biryear", DoubleType()),
    Fld("dtaddto", StringType()),
    Fld("gender", StringType()),
    Fld("insnum", StringType()),
    Fld("airline", StringType()),
    Fld("admnum", StringType()),
    Fld("fltno", StringType()),
    Fld("visatype", StringType()),
])

In [109]:
# pd.read_csv(f's3a://{S3_BUCKET_NAME}/data/immigration_data_sample.csv')

In [110]:
## read
spark = SparkSession.builder \
        .config("spark.jars.packages", "com.amazonaws:aws-java-sdk:1.11.7755") \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .config("spark.jars.packages", "net.java.dev.jets3t:jets3t:0.9.3") \
        .getOrCreate()


df_i94 = spark.read.csv('../data/immigration_data_sample.csv', header=True, schema=i94Schema)
# df_i94 = spark.read.csv(f's3a://{S3_BUCKET_NAME}/data/immigration_data_sample.csv', header=True, schema=i94Schema)

In [111]:
df_i94.select(col('insnum')).dropDuplicates().show(2)

+------+
|insnum|
+------+
|  3517|
|  3993|
+------+
only showing top 2 rows



In [112]:
## for time table
print("\nStarting to process time data.")
st = time.time()
get_timestamp = udf(lambda x: int(x*24*60*60)-315619200, IntegerType())
df_i94 = df_i94.withColumn('arrdate', to_date(from_unixtime(get_timestamp(df_i94.arrdate))))

time_table = df_i94.select(col("arrdate").alias("arrival_date"),
      dayofmonth("arrdate").alias("day"), 
      weekofyear("arrdate").alias("week"), 
      month("arrdate").alias("month"), 
      year("arrdate").alias("year"), 
      dayofweek("arrdate").alias("weekday")).dropDuplicates(['arrival_date'])
print(f" Finished processing time table. Used {(time.time() - st)/60:5.2} min.")


Starting to process time data.
 Finished processing time table. Used 0.0014 min.


In [113]:
time_table.printSchema()

root
 |-- arrival_date: date (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: integer (nullable = true)



In [114]:
time_table.show(2)

+------------+---+----+-----+----+-------+
|arrival_date|day|week|month|year|weekday|
+------------+---+----+-----+----+-------+
|  2016-04-25| 25|  17|    4|2016|      2|
|  2016-04-22| 22|  16|    4|2016|      6|
+------------+---+----+-----+----+-------+
only showing top 2 rows



In [115]:
df_i94.show(2)

+-------+---------+------+------+------+------+-------+----------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|    _c0|    cicid| i94yr|i94mon|i94cit|i94res|i94port|   arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|       admnum|fltno|visatype|
+-------+---------+------+------+------+------+-------+----------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|2027561|4084316.0|2016.0|   4.0| 209.0| 209.0|    HHW|2016-04-21|    1.0|     HI|20573.0|  61.0|    2.0|  1.0|20160422|    null| null|      G|      O|   null|      M| 1955.0|07202016|     F|  null|     JL|56582674633.0|00782|      WT|
|2171295|4422636.0|2016.0|   4.0| 582.0| 582.0|    MCA|2

In [116]:
import boto3

In [133]:
s3 = boto3.resource('s3')
obj = s3.Object(S3_BUCKET_NAME, "data/I94_SAS_Labels_Descriptions.SAS")

In [134]:
obj.get()['Body'].read()

b"libname library 'Your file location' ;\r\nproc format library=library ;\r\n\r\n/* I94YR - 4 digit year */\r\n\r\n/* I94MON - Numeric month */\r\n\r\n/* I94CIT & I94RES - This format shows all the valid and invalid codes for processing */\r\n  value i94cntyl\r\n   582 =  'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)'\r\n   236 =  'AFGHANISTAN'\r\n   101 =  'ALBANIA'\r\n   316 =  'ALGERIA'\r\n   102 =  'ANDORRA'\r\n   324 =  'ANGOLA'\r\n   529 =  'ANGUILLA'\r\n   518 =  'ANTIGUA-BARBUDA'\r\n   687 =  'ARGENTINA '\r\n   151 =  'ARMENIA'\r\n   532 =  'ARUBA'\r\n   438 =  'AUSTRALIA'\r\n   103 =  'AUSTRIA'\r\n   152 =  'AZERBAIJAN'\r\n   512 =  'BAHAMAS'\r\n   298 =  'BAHRAIN'\r\n   274 =  'BANGLADESH'\r\n   513 =  'BARBADOS'\r\n   104 =  'BELGIUM'\r\n   581 =  'BELIZE'\r\n   386 =  'BENIN'\r\n   509 =  'BERMUDA'\r\n   153 =  'BELARUS'\r\n   242 =  'BHUTAN'\r\n   688 =  'BOLIVIA'\r\n   717 =  'BONAIRE, ST EUSTATIUS, SABA' \r\n   164 =  'BOSNIA-HERZEGOVINA'\r\n   336 =  'BOTSW

In [135]:
i94_byte = obj.get()['Body'].read()
i94_desp = str(i94_byte, 'utf-8').split(';')

In [136]:
i94_desp

["libname library 'Your file location' ",
 '\r\nproc format library=library ',
 "\r\n\r\n/* I94YR - 4 digit year */\r\n\r\n/* I94MON - Numeric month */\r\n\r\n/* I94CIT & I94RES - This format shows all the valid and invalid codes for processing */\r\n  value i94cntyl\r\n   582 =  'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)'\r\n   236 =  'AFGHANISTAN'\r\n   101 =  'ALBANIA'\r\n   316 =  'ALGERIA'\r\n   102 =  'ANDORRA'\r\n   324 =  'ANGOLA'\r\n   529 =  'ANGUILLA'\r\n   518 =  'ANTIGUA-BARBUDA'\r\n   687 =  'ARGENTINA '\r\n   151 =  'ARMENIA'\r\n   532 =  'ARUBA'\r\n   438 =  'AUSTRALIA'\r\n   103 =  'AUSTRIA'\r\n   152 =  'AZERBAIJAN'\r\n   512 =  'BAHAMAS'\r\n   298 =  'BAHRAIN'\r\n   274 =  'BANGLADESH'\r\n   513 =  'BARBADOS'\r\n   104 =  'BELGIUM'\r\n   581 =  'BELIZE'\r\n   386 =  'BENIN'\r\n   509 =  'BERMUDA'\r\n   153 =  'BELARUS'\r\n   242 =  'BHUTAN'\r\n   688 =  'BOLIVIA'\r\n   717 =  'BONAIRE, ST EUSTATIUS, SABA' \r\n   164 =  'BOSNIA-HERZEGOVINA'\r\n   336 =

In [137]:
for desp in i94_desp:
    if 'I94PORT' in desp:
        i94_port = desp
    if 'I94CIT & I94RES' in desp:
        i94_country = desp
    if 'I94MODE' in desp:
        i94_mode = desp
    if 'I94ADDR' in desp:
        i94_state = desp
    if 'I94VISA' in desp:
        i94_visa = desp

In [174]:
print("aa\t\t\tbb", "aa\t\t\tbb".replace("\", ""))

SyntaxError: EOL while scanning string literal (<ipython-input-174-e7790e83512f>, line 1)

In [179]:
i94_country

"\r\n\r\n/* I94YR - 4 digit year */\r\n\r\n/* I94MON - Numeric month */\r\n\r\n/* I94CIT & I94RES - This format shows all the valid and invalid codes for processing */\r\n  value i94cntyl\r\n   582 =  'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)'\r\n   236 =  'AFGHANISTAN'\r\n   101 =  'ALBANIA'\r\n   316 =  'ALGERIA'\r\n   102 =  'ANDORRA'\r\n   324 =  'ANGOLA'\r\n   529 =  'ANGUILLA'\r\n   518 =  'ANTIGUA-BARBUDA'\r\n   687 =  'ARGENTINA '\r\n   151 =  'ARMENIA'\r\n   532 =  'ARUBA'\r\n   438 =  'AUSTRALIA'\r\n   103 =  'AUSTRIA'\r\n   152 =  'AZERBAIJAN'\r\n   512 =  'BAHAMAS'\r\n   298 =  'BAHRAIN'\r\n   274 =  'BANGLADESH'\r\n   513 =  'BARBADOS'\r\n   104 =  'BELGIUM'\r\n   581 =  'BELIZE'\r\n   386 =  'BENIN'\r\n   509 =  'BERMUDA'\r\n   153 =  'BELARUS'\r\n   242 =  'BHUTAN'\r\n   688 =  'BOLIVIA'\r\n   717 =  'BONAIRE, ST EUSTATIUS, SABA' \r\n   164 =  'BOSNIA-HERZEGOVINA'\r\n   336 =  'BOTSWANA'\r\n   689 =  'BRAZIL'\r\n   525 =  'BRITISH VIRGIN ISLANDS'\r\n   2

In [186]:
desp = i94_port

In [190]:
desp = desp.replace("INT''L FALLS, MN", "INTL FALLS, MN")
desp = re.sub('[^A-Za-z0-9= \']+', '', desp)
# desp = desp.replace("\t", "")
# desp = desp.replace("\n", "")
# desp = desp.replace("\r", "")
# port_info_list = re.findall(re.compile("\'([A-Z0-9]+\'=\'.+ *)\'"), desp)

# port_info_list = [x.split("'='") + ['port'] for x in port_info_list]
# df_port = pd.DataFrame(port_info_list, columns=['port_code', 'port_add'])

In [191]:
desp

" I94PORT  This format shows all the valid and invalid codes for processing   value i94prtl   'ALC'='ALCAN AK             '   'ANC'='ANCHORAGE AK         '   'BAR'='BAKER AAF  BAKER ISLAND AK'   'DAC'='DALTONS CACHE AK     '   'PIZ'='DEW STATION PT LAY DEW AK'   'DTH'='DUTCH HARBOR AK      '   'EGL'='EAGLE AK             '   'FRB'='FAIRBANKS AK         '   'HOM'='HOMER AK             '              'HYD'='HYDER AK             '   'JUN'='JUNEAU AK            '   '5KE'='KETCHIKAN AK'   'KET'='KETCHIKAN AK         '   'MOS'='MOSES POINT INTERMEDIATE AK'   'NIK'='NIKISKI AK           '   'NOM'='NOM AK               '   'PKC'='POKER CREEK AK       '   'ORI'='PORT LIONS SPB AK'   'SKA'='SKAGWAY AK           '   'SNP'='ST PAUL ISLAND AK'   'TKI'='TOKEEN AK'   'WRA'='WRANGELL AK          '   'HSV'='MADISON COUNTY  HUNTSVILLE AL'   'MOB'='MOBILE AL            '   'LIA'='LITTLE ROCK AR BPS'   'ROG'='ROGERS ARPT AR'   'DOU'='DOUGLAS AZ           '   'LUK'='LUKEVILLE AZ         '   'MAP'='MARIPOSA

In [181]:
port_info_list = re.findall(re.compile("\'([A-Z0-9]+\'=\'.+ *)\'"), desp)
port_info_list

["XXX'='XXXAN, AK             ",
 "ANC'='ANCHORAGE, AK         ",
 "BAR'='BAKER AAF - BAKER ISLAND, AK",
 "DAC'='DALTONS CACHE, AK     ",
 "PIZ'='DEW STATION PT LAY DEW, AK",
 "DTH'='DUTCH HARBOR, AK      ",
 "EGL'='EAGLE, AK             ",
 "FRB'='FAIRBANKS, AK         ",
 "HOM'='HOMER, AK             ",
 "HYD'='HYDER, AK             ",
 "JUN'='JUNEAU, AK            ",
 "5KE'='KETCHIKAN, AK",
 "KET'='KETCHIKAN, AK         ",
 "MOS'='MOSES POINT INTERMEDIATE, AK",
 "NIK'='NIKISKI, AK           ",
 "NOM'='NOM, AK               ",
 "PKC'='POKER CREEK, AK       ",
 "ORI'='PORT LIONS SPB, AK",
 "SKA'='SKAGWAY, AK           ",
 "SNP'='ST. PAUL ISLAND, AK",
 "TKI'='TOKEEN, AK",
 "WRA'='WRANGELL, AK          ",
 "HSV'='MADISON COUNTY - HUNTSVILLE, AL",
 "MOB'='MOBILE, AL            ",
 "LIA'='LITTLE ROCK, AR (BPS)",
 "ROG'='ROGERS ARPT, AR",
 "DOU'='DOUGLAS, AZ           ",
 "LUK'='LUKEVILLE, AZ         ",
 "MAP'='MARIPOSA AZ           ",
 "NAC'='NACO, AZ              ",
 "NOG'='NOGALES, AZ  

In [149]:
re.__version__

'2.2.1'

In [29]:
print(' Writing time data to S3.')
st = time.time()
# write time table to parquet files partitioned by year and month
output_path = os.path.join(output_data, 'time_table_run.parquet')
time_table.write.parquet(output_path, 'overwrite')
print(f"======== time data Done in  {(time.time() - st)/60:5.2}min.")

 Writing time data to S3.


In [30]:
## for port table
# df_airport = pd.read_csv('../data/airport-codes_csv.csv')
df_states = pd.read_csv('../data/us_states.csv')

res = []
with open('../data/I94_SAS_Labels_Descriptions.SAS', 'r') as file_object:
    for line in file_object:
        if len(line.strip()) > 0:
            res.append(line.strip())

i94_desp = ''.join(res).split(';')

In [31]:
df_port_info.empty

False

In [32]:
# process port description data
print("\nStarting to process port table data.")
st=time.time()
for desp in i94_desp:
    if 'I94PORT' in desp:
        break
        
desp = desp.replace("INT''L FALLS, MN", "INTL FALLS, MN")
desp = desp.replace("\t\t", "\t")
port_info = re.findall(re.compile("\'([A-Z0-9]+\'\\t=\\t\'.+)\'"), desp)[0]

port_info_list = port_info.split("''")
port_info_list = [x.split("\t=\t") for x in port_info_list]

df_port_info = pd.DataFrame(port_info_list, columns=['port_code', 'port_add'])

df_port_info['port_code'] = df_port_info['port_code'].str.replace("'", "")
df_port_info['port_add'] = df_port_info['port_add'].str.replace("'", "").str.strip()

# process the outlier
no_port_idx = df_port_info['port_add'].str.contains('No PORT Code')
df_port_info.loc[no_port_idx, 'port_add'] = 'NA, NA'

collapsed_idx = df_port_info['port_add'].str.contains('Collapsed')
df_port_info.loc[collapsed_idx, 'port_add'] = 'NA, NA'

unknown_idx = df_port_info['port_add'].str.contains('UNIDENTIFED')
df_port_info.loc[unknown_idx, 'port_add'] = 'NA, NA'

unknown_idx = df_port_info['port_add'].str.contains('UNKNOWN')
df_port_info.loc[unknown_idx, 'port_add'] = 'NA, NA'

df_port_info.loc[df_port_info['port_add'].str.contains('#INTL'), 'port_add'] = 'BELLINGHAM, WASHINGTON WA'
df_port_info.loc[df_port_info['port_add'].str.contains('PASO DEL NORTE,TX'), 'port_add'] = 'PASO DEL NORTE, TX'

states_list = df_states['Abbreviation'].unique()
def is_us(x):
    if len(np.intersect1d(np.array(x.split(' ')), states_list)) > 0:
        return True
    else:
        return False
    
def get_state(x):
    find_state = np.intersect1d(np.array(x.split(' ')), states_list)
    if len(find_state) > 0:
        return find_state[0]
    else:
        return 'NA'

def get_address(x):
    find_state = np.intersect1d(np.array(x.split(' ')), states_list)
    if len(find_state) > 0:
        state = find_state[0]
        x = x.replace(state, '').strip()
        x = x.replace(',', '')
        return x
    else:
        return x   

df_port_info = df_port_info[df_port_info['port_add'].apply(lambda x: is_us(x))]
df_port_info['state'] = df_port_info['port_add'].apply(lambda x: get_state(x))
df_port_info['address'] = df_port_info['port_add'].apply(lambda x: get_address(x))
df_port_info = df_port_info[['port_code', 'state', 'address']]
df_port_info = df_port_info.rename(columns={'port_code': 'port_key'})
print(f" Finished processing port table. Used {(time.time() - st)/60:5.2} min.")


Starting to process port table data.
 Finished processing port table. Used 0.0034 min.


In [33]:
df_port_info

Unnamed: 0,port_key,state,address
0,ALC,AK,ALCAN
1,ANC,AK,ANCHORAGE
2,BAR,AK,BER AAF - BER ISLAND
3,DAC,AK,DALTONS CACHE
4,PIZ,AK,DEW STATION PT LAY DEW
...,...,...,...
586,MLI,IL,MOLINE
587,RIV,CA,RIVERSIDE
588,RME,NY,ROME
589,VNY,CA,VAN NUYS


In [88]:
region_list = re.findall(re.compile("([0-9]+) {1,2}=  {1,2}\'([A-Za-z0-9 ,\(\)\.:\/-]+)\'"), i94_country)
region_list = [list(x) + ['country'] for x in region_list]

region_list

[['582',
  'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)',
  'country'],
 ['236', 'AFGHANISTAN', 'country'],
 ['101', 'ALBANIA', 'country'],
 ['316', 'ALGERIA', 'country'],
 ['102', 'ANDORRA', 'country'],
 ['324', 'ANGOLA', 'country'],
 ['529', 'ANGUILLA', 'country'],
 ['518', 'ANTIGUA-BARBUDA', 'country'],
 ['687', 'ARGENTINA ', 'country'],
 ['151', 'ARMENIA', 'country'],
 ['532', 'ARUBA', 'country'],
 ['438', 'AUSTRALIA', 'country'],
 ['103', 'AUSTRIA', 'country'],
 ['152', 'AZERBAIJAN', 'country'],
 ['512', 'BAHAMAS', 'country'],
 ['298', 'BAHRAIN', 'country'],
 ['274', 'BANGLADESH', 'country'],
 ['513', 'BARBADOS', 'country'],
 ['104', 'BELGIUM', 'country'],
 ['581', 'BELIZE', 'country'],
 ['386', 'BENIN', 'country'],
 ['509', 'BERMUDA', 'country'],
 ['153', 'BELARUS', 'country'],
 ['242', 'BHUTAN', 'country'],
 ['688', 'BOLIVIA', 'country'],
 ['717', 'BONAIRE, ST EUSTATIUS, SABA', 'country'],
 ['164', 'BOSNIA-HERZEGOVINA', 'country'],
 ['336', 'BOTSWANA', 'country'],


In [34]:
## process country tables
for desp in i94_desp:
    if 'I94CIT & I94RES' in desp:
        break

region_list = re.findall(re.compile("([0-9]+) {1,2}=  {1,2}\'([A-Za-z0-9 ,\(\)\.:\/-]+)\'"), desp)
df_region = pd.DataFrame(region_list, columns=['region_key', 'region_name'])

df_region

Unnamed: 0,region_key,region_name
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
1,236,AFGHANISTAN
2,101,ALBANIA
3,316,ALGERIA
4,102,ANDORRA
...,...,...
284,791,No Country Code (791)
285,849,No Country Code (849)
286,914,No Country Code (914)
287,944,No Country Code (944)


In [89]:
i94mode_list= re.findall(re.compile("([1-9]) = \'([a-zA-Z ]+)\'"), i94_mode)

In [92]:
i94mode_list

[('1', 'Air'), ('2', 'Sea'), ('3', 'Land'), ('9', 'Not reported')]

In [36]:
## process i94 mode table
for desp in i94_desp:
    if 'I94MODE' in desp:
        break

i94mode_list= re.findall(re.compile("([1-9]) = \'([a-zA-Z ]+)\'"), desp)
df_i94mode = pd.DataFrame(i94mode_list, columns=['i94mode_key', 'i94mode'])
df_i94mode

Unnamed: 0,i94mode_key,i94mode
0,1,Air
1,2,Sea
2,3,Land
3,9,Not reported


In [37]:
## process state table
for desp in i94_desp:
    if 'I94ADDR' in desp:
        break

state_info_list = re.findall(re.compile("\'([A-Z9]{2})\'=\'([a-zA-Z \.]+)\'"), desp)
df_state = pd.DataFrame(state_info_list, columns=['state_code', 'state'])
df_state.head()

Unnamed: 0,state_code,state
0,AL,ALABAMA
1,AK,ALASKA
2,AZ,ARIZONA
3,AR,ARKANSAS
4,CA,CALIFORNIA


In [38]:
## process visa table
for desp in i94_desp:
    if 'I94VISA' in desp:
        break

visa_list = re.findall(re.compile("([1-3]) = ([A-Za-z]+)"), desp)
df_visa = pd.DataFrame(visa_list, columns=['visa_key', 'visa_broad_type'])
df_visa

Unnamed: 0,visa_key,visa_broad_type
0,1,Business
1,2,Pleasure
2,3,Student


In [117]:
## rename i94 table
col_rename = {'cicid': 'i94_key', 
              'i94res': 'res_region_key',
              'i94cit': 'cit_region_key', 
              'i94port': 'port_key',
              'arrdate': 'arrival_date', 
              'i94mode': 'i94mode_key',
              'i94addr': 'state_code', 
              'depdate': 'departure_date', 
              'i94bir': 'age', 
              'i94visa': 'visa_key', 
              'count': 'i94_count', 
              'dtadfile': 'i94_file_date', 
              'dtaddto': 'i94_leave_date', 
              'admnum': 'i94_admin_num'}
for old_col, new_col in col_rename.items():
    df_i94 = df_i94.withColumnRenamed(old_col, new_col)

In [118]:
## process i94 fact table 
select_cols = ['i94_key', 'res_region_key', 'cit_region_key', 'port_key', 
               'arrival_date', 'i94yr', 'i94mon', 'i94mode_key', 'state_code', 
               'departure_date', 'age', 'visa_key', 'i94_count', 'i94_file_date', 
               'occup', 'biryear', 'i94_leave_date', 'gender', 'insnum', 'airline',
               'i94_admin_num', 'fltno', 'visatype']

df_i94 = df_i94.select(select_cols)

# process data type
int_cols = ['i94_key', 'res_region_key', 'cit_region_key', \
            'i94yr', 'i94mon', 'i94mode_key', 'age', 'visa_key', 'i94_count', 'biryear']
for int_col in int_cols:
    df_i94 = df_i94.withColumn(int_col, col(int_col).cast(IntegerType()))
    
get_timestamp = udf(lambda x: None if x is None else int(x*24*60*60)-315619200, IntegerType())
df_i94 = df_i94.withColumn('departure_date', to_date(from_unixtime(get_timestamp(df_i94.departure_date))))

str_to_date = udf(lambda x: datetime.strptime(x, "%Y%m%d"), DateType())
df_i94 = df_i94.withColumn('i94_file_date', str_to_date(df_i94.i94_file_date))

str_to_date = udf(lambda x: None if x == 'D/S' else datetime.strptime(x, "%m%d%Y"), DateType())
df_i94 = df_i94.withColumn('i94_leave_date', str_to_date(df_i94.i94_leave_date))

remove_zero = udf(lambda x: x.replace('.0', ''))
df_i94 = df_i94.withColumn('i94_admin_num', remove_zero(df_i94.i94_admin_num))

In [120]:
df_i94.select(['departure_date']).show(2)

+--------------+
|departure_date|
+--------------+
|    2016-04-28|
|    2016-04-23|
+--------------+
only showing top 2 rows



In [121]:
df_i94.printSchema()

root
 |-- i94_key: integer (nullable = true)
 |-- res_region_key: integer (nullable = true)
 |-- cit_region_key: integer (nullable = true)
 |-- port_key: string (nullable = true)
 |-- arrival_date: date (nullable = true)
 |-- i94yr: integer (nullable = true)
 |-- i94mon: integer (nullable = true)
 |-- i94mode_key: integer (nullable = true)
 |-- state_code: string (nullable = true)
 |-- departure_date: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- visa_key: integer (nullable = true)
 |-- i94_count: integer (nullable = true)
 |-- i94_file_date: date (nullable = true)
 |-- occup: string (nullable = true)
 |-- biryear: integer (nullable = true)
 |-- i94_leave_date: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- i94_admin_num: string (nullable = true)
 |-- fltno: string (nullable = true)
 |-- visatype: string (nullable = true)



In [74]:
datetime.now().year

2020

In [80]:
df_i94.agg({"i94yr":"max"}).collect()[0][0]

2016

In [71]:
df_i94.show()

+-------+--------------+--------------+--------+------------+-----+------+-----------+----------+--------------+---+--------+---------+-------------+-----+-------+--------------+------+------+-------+-------------+-----+--------+
|i94_key|res_region_key|cit_region_key|port_key|arrival_date|i94yr|i94mon|i94mode_key|state_code|departure_date|age|visa_key|i94_count|i94_file_date|occup|biryear|i94_leave_date|gender|insnum|airline|i94_admin_num|fltno|visatype|
+-------+--------------+--------------+--------+------------+-----+------+-----------+----------+--------------+---+--------+---------+-------------+-----+-------+--------------+------+------+-------+-------------+-----+--------+
|4084316|           209|           209|     HHW|     20566.0| 2016|     4|          1|        HI|       20573.0| 61|       2|        1|   2016-04-22| null|   1955|    2016-07-20|     F|  null|     JL|  56582674633|00782|      WT|
|4422636|           582|           582|     MCA|     20567.0| 2016|     4|      

In [201]:
## process demographics table
df_demo = pd.read_csv('../data/us-cities-demographics.csv', delimiter=';')
df_demo.columns = ['_'.join(x.split(' ')).lower() for x in df_demo.columns]

In [212]:
df_demo.columns

Index(['city', 'state', 'median_age', 'male_population', 'female_population',
       'total_population', 'number_of_veterans', 'foreign-born',
       'average_household_size', 'state_code', 'race', 'count'],
      dtype='object')

In [211]:
df_demo.groupby(['state_code']).size()

state_code
AK      5
AL     34
AR     29
AZ     80
CA    676
CO     80
CT     39
DC      5
DE      5
FL    222
GA     55
HI      5
IA     34
ID     15
IL     91
IN     51
KS     35
KY     10
LA     40
MA     69
MD     50
ME      5
MI     79
MN     54
MO     45
MS      9
MT     10
NC     70
ND     10
NE     10
NH     10
NJ     57
NM     20
NV     45
NY     54
OH     49
OK     30
OR     40
PA     33
PR     13
RI     19
SC     24
SD     10
TN     44
TX    273
UT     48
VA     70
WA     85
WI     45
dtype: int64