In [1]:
import pandas as pd
import re
import os

import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, udf, lit, explode, split, regexp_extract, col, isnan, isnull, desc, when, sum, to_date, desc, regexp_replace, count, to_timestamp, current_timestamp
from pyspark.sql.types import IntegerType, TimestampType

In [2]:
# Notebook-wide pandas display options.
# https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
# Show full cell contents. `None` is the supported "no limit" value; the old
# `-1` sentinel was deprecated in pandas 1.0 and is rejected by newer releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts integers here and uses -1 for "no limit".
    pd.set_option('display.max_colwidth', -1)
# Never elide columns when rendering wide frames.
pd.set_option('display.max_columns', None)

# modify visualization of the notebook, for easier view
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a <style> block that caps paragraph width and shrinks heading sizes.
display(HTML("""<style> p { max-width:90% !important; } h1 {font-size:2rem!important } h2 {font-size:1.6rem!important } 
h3 {font-size:1.4rem!important } h4 {font-size:1.3rem!important }h5 {font-size:1.2rem!important }h6 {font-size:1.1rem!important }</style>"""))


In [3]:
def create_spark_session():
    """
    Build (or reuse) a Hive-enabled SparkSession.

    The session is configured with the spark-packages repository and the
    ``saurfang:spark-sas7bdat`` package, whose reader this notebook uses to load
    the ``.sas7bdat`` input files.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    builder = SparkSession.builder
    # Extra repository to resolve packages from, and the SAS reader package itself.
    builder = builder.config("spark.jars.repositories", "https://repos.spark-packages.org/")
    builder = builder.config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    return builder.enableHiveSupport().getOrCreate()

In [4]:
# Module-level SparkSession; the functions defined in later cells read this global.
spark = create_spark_session()

In [5]:
# from https://knowledge.udacity.com/questions/66798
from datetime import datetime, timedelta
from pyspark.sql import types as T
def convert_datetime(x):
    """
    Convert a SAS date (a day count relative to 1960-01-01) to a ``datetime``.

    Args:
        x: Day offset from 1960-01-01; any value accepted by ``int()``.

    Returns:
        ``datetime(1960, 1, 1)`` shifted by ``int(x)`` days, or ``None`` when
        ``x`` is missing, not numeric, NaN/infinite, or outside the range a
        ``datetime`` can represent.
    """
    try:
        return datetime(1960, 1, 1) + timedelta(days=int(x))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: non-numeric text or NaN;
        # OverflowError: infinity or a result beyond datetime's range.
        # A bare `except:` would also hide KeyboardInterrupt and SystemExit.
        return None



def cleanImmigrationFactData(path_to_files, output_path):
    """
        Description:
            Read every file found in ``path_to_files`` with the spark-sas7bdat
            reader, clean it and write the result as parquet partitioned by
            ``i94yr`` / ``i94mon`` under ``output_path``.

            Cleaning steps applied to each file:
              * every column of Spark type ``double`` is converted to integer;
              * ``dtaddto`` is parsed from a ``MMddyyyy`` string into a date;
              * ``arrdate`` and ``depdate`` (SAS day counts) are converted to dates.

            The first file replaces whatever is already stored at
            ``output_path``; the remaining files are appended to it, so the
            final dataset contains all input files.

        Usage:
            cleanImmigrationFactData('path/to/sas/files/', 'data/imm')

        Args:
            path_to_files (str): Directory containing the input files.
            output_path (str): Destination directory of the parquet dataset.

        Notes:
            Uses the module-level ``spark`` session.
    """

    filelist = os.listdir(path_to_files)

    print(f"The dataset contains {len(filelist)} files")

    # The UDFs do not depend on the file being processed, so build them once.
    # Snippet taken from https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
    toInt = udf(lambda x: int(x) if x is not None else x, IntegerType())
    udf_datetime_from_sas = udf(convert_datetime, T.DateType())

    for index, file in enumerate(filelist):

        # os.path.join behaves the same whether or not path_to_files ends with a
        # separator; the same path is used for the size check and for loading.
        filepath = os.path.join(path_to_files, file)

        size = os.path.getsize(filepath)
        print(f'Processing {filepath} - dim(bytes): {size} ')

        # No persist(): the frame feeds exactly one action (the write below), so
        # caching it would only hold memory that was never released.
        df_I94 = spark.read.format('com.github.saurfang.sas.spark').load(filepath)

        for colname, coltype in df_I94.dtypes:
            if coltype == 'double':
                df_I94 = df_I94.withColumn(colname, toInt(colname))

        # Convert strings to dates
        # NOTE(review): the original applied this exact conversion to `dtaddto`
        # twice; the second call was probably meant for another date column -
        # confirm against the dataset schema.
        df_I94 = df_I94.withColumn('dtaddto', to_date(col("dtaddto"), "MMddyyyy"))

        # Convert SAS date to dates
        df_I94 = df_I94.withColumn("arrdate", udf_datetime_from_sas("arrdate")) \
            .withColumn("depdate", udf_datetime_from_sas("depdate"))

        # write data out
        print(f'Exporting cleaned file to {output_path}')
        # 'overwrite' replaces everything under output_path, so using it for every
        # file would leave only the last file's data on disk. Overwrite once to
        # clear stale output, then append the remaining files.
        write_mode = 'overwrite' if index == 0 else 'append'
        df_I94.write.format('parquet').mode(write_mode).partitionBy('i94yr', 'i94mon').save(output_path)
        
        

In [6]:
# Directory holding the raw I94 input files; passed to cleanImmigrationFactData.
I94_DATASET_PATH = '../../../../../data/18-83510-I94-Data-2016/'
# Root folder for cleaned output. Sub-folder names are string-concatenated onto
# it ('{}imm', '{}dim'), so the trailing slash is required.
CLEAN_DATA_DIR='data/'

In [7]:
# Clean every raw I94 file and store the result under the 'imm' sub-folder.
cleanImmigrationFactData(I94_DATASET_PATH, f'{CLEAN_DATA_DIR}imm')

The dataset contains 12 files
Processing ../../../../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat - dim(bytes): 471990272 
Exporting cleaned file to data/imm
Processing ../../../../../data/18-83510-I94-Data-2016/i94_sep16_sub.sas7bdat - dim(bytes): 569180160 
Exporting cleaned file to data/imm
Processing ../../../../../data/18-83510-I94-Data-2016/i94_nov16_sub.sas7bdat - dim(bytes): 444334080 
Exporting cleaned file to data/imm
Processing ../../../../../data/18-83510-I94-Data-2016/i94_mar16_sub.sas7bdat - dim(bytes): 481296384 
Exporting cleaned file to data/imm
Processing ../../../../../data/18-83510-I94-Data-2016/i94_jun16_sub.sas7bdat - dim(bytes): 716570624 
Exporting cleaned file to data/imm
Processing ../../../../../data/18-83510-I94-Data-2016/i94_aug16_sub.sas7bdat - dim(bytes): 625541120 
Exporting cleaned file to data/imm
Processing ../../../../../data/18-83510-I94-Data-2016/i94_may16_sub.sas7bdat - dim(bytes): 525008896 
Exporting cleaned file to data/imm
Processing 

### clean_dimension_data

In [8]:
def readDimensionData(label_file):
    """
    Load a label description file into a text DataFrame.

    Args:
        label_file (str): Path of the file to read.

    Returns:
        DataFrame produced by ``spark.read.text(..., wholetext=True)``, i.e. the
        file content is kept together rather than split into one row per line.
    """
    print(f'Reading {label_file}')
    return spark.read.text(label_file, wholetext=True)

def extractAirportCodes(df_label_full):
    """
    Parse the ``$i94prtl`` section of the label file into one row per port.

    Args:
        df_label_full (DataFrame): Frame holding the label file text in a
            ``value`` column (see ``readDimensionData``).

    Returns:
        DataFrame with the added columns ``port_code``, ``city_state``, ``city``
        and ``state``. ``city`` / ``state`` are the parts of ``city_state``
        before / after the first comma; ``state`` has surrounding spaces removed.
        The ``value`` column and the intermediate columns are dropped.
    """

    print('Extracting airport codes from dataframe')
    # airport codes: everything between "$i94prtl" and the next ';'
    # (raw string: `\$` is a regex escape, not a Python one)
    pattern = r'(\$i94prtl)([^;]+)'

    df_extract = df_label_full.withColumn('I94PORT', regexp_extract(col('value'), pattern, 2))
    # One row per line of the section.
    df_extract = df_extract.withColumn('port', explode(split('I94PORT', '[\r\n]+'))).drop('value').drop('I94PORT')

    # The description pattern starts with "=<TAB>'"; the character class is a raw
    # string so that `\-`, `\/` and `\.` reach the regex engine unchanged without
    # Python invalid-escape warnings.
    description_pattern = "(=\t')" + r"([0-9A-Za-z ,\-()\/\.#&]+)(')"

    df_I94PORT = (
        df_extract
        .withColumn('port_code', regexp_extract(col('port'), "(?<=')[0-9A-Z. ]+(?=')", 0))
        .withColumn('city_state', regexp_extract(col('port'), description_pattern, 2))
        .withColumn('city', split(col('city_state'), ',').getItem(0))
        .withColumn('state', split(col('city_state'), ',').getItem(1))
        # Splitting "CITY, ST   " on ',' leaves " ST   ": strip both ends, not
        # only the trailing padding, so the state can be matched against codes.
        .withColumn('state', regexp_replace(col('state'), '^ +| +$', ''))
        .where(col('port') != '')
        .drop('port')
    )

    return df_I94PORT

def extractCountryCodes(df_label_full):
    """
    Parse the ``i94cntyl`` section of the label file into one row per country.

    Args:
        df_label_full (DataFrame): Frame holding the label file text in a
            ``value`` column (see ``readDimensionData``).

    Returns:
        DataFrame with the added columns ``country_code`` (first run of digits
        on the line) and ``country`` (first quoted text on the line made only of
        the allowed characters). The ``value`` column and the intermediate
        columns are dropped.
    """

    print('Extracting country codes from dataframe')
    # Everything between "i94cntyl" and the next ';'
    pattern = '(i94cntyl)([^;]+)'

    df_extract = df_label_full.withColumn('I94RES', regexp_extract(col('value'), pattern, 2))
    # One row per line of the section.
    df_extract = df_extract.withColumn('raw', explode(split('I94RES', '[\r\n]+'))).drop('value').drop('I94RES')

    # Raw string: `\-` is a regex escape; in a normal literal it triggers an
    # invalid-escape warning. The resulting regex is unchanged.
    # NOTE(review): a quoted label containing characters outside this class
    # yields an empty `country` - confirm that is acceptable for the label file.
    country_pattern = r"'([A-Za-z ,\-()0-9]+)'"

    df_I94RES = (
        df_extract
        .withColumn('country_code', regexp_extract(col('raw'), "[0-9]+", 0))
        .withColumn('country', regexp_extract(col('raw'), country_pattern, 1))
        .where(col('raw') != '')
        .drop('raw')
    )

    return df_I94RES

def extractStateCodes(df_label_full):
    """
    Parse the ``i94addrl`` section of the label file into one row per state.

    Args:
        df_label_full (DataFrame): Frame holding the label file text in a
            ``value`` column (see ``readDimensionData``).

    Returns:
        DataFrame with the added columns ``state_code`` (first quoted token on
        the line) and ``state`` (quoted text following ``=``). The ``value``
        column and the intermediate columns are dropped.
    """

    print('Extracting state codes from dataframe')
    # Everything between "i94addrl" and the next ';'
    pattern = '(i94addrl)([^;]+)'

    df_extract = df_label_full.withColumn('i94addrl', regexp_extract(col('value'), pattern, 2))
    # One row per line of the section.
    df_extract = df_extract.withColumn('raw', explode(split('i94addrl', '[\r\n]+'))).drop('value').drop('i94addrl')

    # Raw string so `\s` reaches the regex engine without a Python escape warning.
    # The name class allows spaces and periods: with plain `[A-Z]+` any
    # multi-word or abbreviated name failed to match and came out empty.
    state_pattern = r"(=\s*')([A-Z. ]+)(')"

    df_I94ADDR = (
        df_extract
        .withColumn('state_code', regexp_extract(col('raw'), "(?<=')[0-9A-Z. ]+(?=')", 0))
        .withColumn('state', regexp_extract(col('raw'), state_pattern, 2))
        .where(col('raw') != '')
        .drop('raw')
    )

    return df_I94ADDR

def buildVisaData():
    """
    Create the visa lookup DataFrame.

    Returns:
        DataFrame with columns ``I94VISA`` (numeric code) and ``category``
        (label), one row per hard-coded code.
    """

    print('Building visa code df')
    # Codes are consecutive starting at 1, in this order.
    visa_labels = ('Business', 'Pleasure', 'Student')
    visa_rows = list(enumerate(visa_labels, start=1))

    return spark.createDataFrame(visa_rows, ['I94VISA', 'category'])

def buildModeData():
    """
    Create the entry-mode lookup DataFrame.

    Returns:
        DataFrame with columns ``I94MODE`` (numeric code) and ``category``
        (label), one row per hard-coded code.
    """

    print('Building entry mode code df')
    # NOTE(review): 'Not reported' is mapped to code 4 here; verify this against
    # the label description file, which may use a different code for it.
    mode_codes = (1, 2, 3, 4)
    mode_labels = ('Air', 'Sea', 'Land', 'Not reported')
    mode_rows = list(zip(mode_codes, mode_labels))

    return spark.createDataFrame(mode_rows, ['I94MODE', 'category'])

def writeDataFrame(df_output, output_path, output_filename):
    """
    Save a DataFrame as a CSV file, creating the target folder if needed.

    The frame is converted with ``toPandas()`` and written with pandas'
    ``to_csv`` using its default options.

    Args:
        df_output: Object exposing ``toPandas()`` (e.g. a Spark DataFrame).
        output_path (str): Folder to write into; created when missing.
        output_filename (str): Name of the CSV file inside ``output_path``.
    """
    destination = '{}/{}'.format(output_path, output_filename)
    print('Writing data to {}'.format(destination))
    os.makedirs(output_path, exist_ok=True)
    pandas_frame = df_output.toPandas()
    pandas_frame.to_csv(destination)

In [9]:
# Label description file that holds the code -> description mappings.
I94_LABELS = 'I94_SAS_Labels_Descriptions.SAS'

df_labels = readDimensionData(I94_LABELS)

# Each dimension is written from its OWN frame. Previously every call passed
# df_I94PORT, so all five CSV files contained the port data.
df_I94PORT = extractAirportCodes(df_labels)
writeDataFrame(df_I94PORT,'{}dim'.format(CLEAN_DATA_DIR),'I94PORT.csv')

df_I94RES = extractCountryCodes(df_labels)
writeDataFrame(df_I94RES,'{}dim'.format(CLEAN_DATA_DIR),'I94RES.csv')

df_I94ADDR = extractStateCodes(df_labels)
writeDataFrame(df_I94ADDR,'{}dim'.format(CLEAN_DATA_DIR),'I94ADDR.csv')

df_I94VISA = buildVisaData()
writeDataFrame(df_I94VISA,'{}dim'.format(CLEAN_DATA_DIR),'I94VISA.csv')

df_I94MODE = buildModeData()
writeDataFrame(df_I94MODE,'{}dim'.format(CLEAN_DATA_DIR),'I94MODE.csv')

Reading I94_SAS_Labels_Descriptions.SAS
Extracting airport codes from dataframe
Writing data to data/dim/I94PORT.csv
Extracting country codes from dataframe
Writing data to data/dim/I94RES.csv
Extracting state codes from dataframe
Writing data to data/dim/I94ADDR.csv
Building visa code df
Writing data to data/dim/I94VISA.csv
Building entry mode code df
Writing data to data/dim/I94MODE.csv
