In [1]:
import pandas as pd
import re
import os

import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, unix_timestamp, udf, lit, explode, split, regexp_extract, col, isnan, isnull, desc, when, sum, to_date, desc, regexp_replace, count, to_timestamp, current_timestamp
from pyspark.sql.types import IntegerType, TimestampType

In [2]:
#setting visualization options
# https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)  

# modify visualization of the notebook, for easier view
from IPython.core.display import display, HTML
display(HTML("""<style> p { max-width:90% !important; } h1 {font-size:2rem!important } h2 {font-size:1.6rem!important } 
h3 {font-size:1.4rem!important } h4 {font-size:1.3rem!important }h5 {font-size:1.2rem!important }h6 {font-size:1.1rem!important }</style>"""))# Do all imports and installs here


In [3]:
def create_spark_session():
    """
    This function creates a Spark Sesson and includes necessary Jar and adoop packages in the configuration. 
    """
    spark=SparkSession \
    .builder \
    .config("spark.jars.repositories", "https://repos.spark-packages.org/") \
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11") \
    .enableHiveSupport() \
    .getOrCreate()
    return spark

In [4]:
spark = create_spark_session()

In [5]:
filepath = 'data/imm/i94yr=2016/i94mon=6'
df_I94 = spark.read.format('parquet').load(filepath).persist()

In [6]:
df_I94.printSchema()

root
 |-- cicid: integer (nullable = true)
 |-- i94cit: integer (nullable = true)
 |-- i94res: integer (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: date (nullable = true)
 |-- i94mode: integer (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: date (nullable = true)
 |-- i94bir: integer (nullable = true)
 |-- i94visa: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- validres: integer (nullable = true)
 |-- delete_days: integer (nullable = true)
 |-- delete_mexl: integer (nullable = true)
 |-- delete_dup: integer (nullable = true)
 |-- delete_visa: integer (nullable = true)
 |-- delete_recdup: integer (nullable = true)
 |-- dtadfile: date (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: integer (nullable = tru

In [6]:
df_I94.limit(5).toPandas().head()


Unnamed: 0,cicid,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,5927066,575,575,HOU,2016-06-28,1,FL,2016-07-02,38,2,1,,SNJ,,G,O,,M,1978,2016-12-27,F,,NK,-1953324344,499,B2
1,5927067,689,689,MIA,2016-06-28,1,FL,,48,2,1,,SPL,,G,,,,1968,2016-12-27,M,,JJ,-1953323444,8090,B2
2,5927069,689,689,WAS,2016-06-28,1,DC,2016-08-07,56,2,1,,SPL,,G,O,,M,1960,2016-12-27,F,,UA,-1953321644,860,B2
3,5927077,689,689,NYC,2016-06-28,1,NY,2016-10-09,29,2,1,,BRA,,G,O,,M,1987,2016-12-27,M,,AA,-1952625044,974,B2
4,5927081,689,689,ORL,2016-06-28,1,GA,,31,2,1,,SPL,,G,,,,1985,2016-12-27,F,,DL,-1952620344,196,B2


In [13]:
df_I94.where(col('delete_recdup')!=0).limit(5).toPandas().head()


Unnamed: 0,cicid,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,validres,delete_days,delete_mexl,delete_dup,delete_visa,delete_recdup,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype


In [16]:
I94_TEST_FILE = '../../../../../data/18-83510-I94-Data-2016/i94_jun16_sub.sas7bdat'

df_I94_jun = spark.read.format('com.github.saurfang.sas.spark').load(I94_TEST_FILE).persist()

In [15]:
df_I94_jun.limit(5).toPandas().head()


Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,validres,delete_days,delete_mexl,delete_dup,delete_visa,delete_recdup,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,4.0,2016.0,6.0,135.0,135.0,XXX,20612.0,,,,59.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,Z,,U,,1957.0,10032016,,,,14938460000.0,,WT
1,5.0,2016.0,6.0,135.0,135.0,XXX,20612.0,,,,50.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,Z,,U,,1966.0,10032016,,,,17460060000.0,,WT
2,6.0,2016.0,6.0,213.0,213.0,XXX,20609.0,,,,27.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,T,,U,,1989.0,D/S,,,,1679298000.0,,F1
3,7.0,2016.0,6.0,213.0,213.0,XXX,20611.0,,,,23.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,T,,U,,1993.0,D/S,,,,1140963000.0,,F1
4,16.0,2016.0,6.0,245.0,245.0,XXX,20632.0,,,,24.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,T,,U,,1992.0,D/S,,,,1934535000.0,,F1


In [28]:
df_I94_jun=df_I94_jun.drop('delete_days','delete_dup', 'delete_mexl','delete_visa','delete_recdup','validres')
df_I94_jun.printSchema()


root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- count: double (nullable = true)
 |-- delete_mexl: double (nullable = true)
 |-- delete_visa: double (nullable = true)
 |-- delete_recdup: double (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: double (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: str

In [19]:
I94_TEST_FILE = '../../../../../data/18-83510-I94-Data-2016/i94_jan16_sub.sas7bdat'

df_I94_jan = spark.read.format('com.github.saurfang.sas.spark').load(I94_TEST_FILE).persist()
df_I94_jan.printSchema()



root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- count: double (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: double (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: double (nullable = 

In [24]:
df_I94_jan=df_I94_jan.drop('validres')

In [22]:
filepath = 'data/imm/i94yr=2016/'
filelist = os.walk(filepath)

for root, subFolders, files in filelist:

    for file in files:
        if file[-4:] != '.crc':
            local_filename = os.path.join(root, file)
            df_I94 = spark.read.format('parquet').load(local_filename).persist()
            print(f'{local_filename}:\t{len(df_I94.columns)}')

data/imm/i94yr=2016/i94mon=10/part-00008-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00015-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00001-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00014-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00011-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00007-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00005-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00013-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00003-23549729-8549-440f-b0ef-cc091ab18f23.c000.snappy.parquet:	26
data/imm/i94yr=2016/i94mon=10/part-00004-23549729-8549-440f-b0ef-cc091ab18f23.c000

In [23]:
def error_capture():
    
    try:
        
        three=4/0
        var = UNSET
    
    except Exception as e:
        
        if type(e).__name__ == 'NameError':
        
            print(f'This type of error is ok: {type(e).__name__}')
        else:
            
            print(f'This type is not ok: {e}')

In [24]:
error_capture()

This type is not ok: division by zero


In [None]:
my_bucket = s3.Bucket(S3_BUCKET)

for my_bucket_object in my_bucket.objects.all():
    print(f'{my_bucket_object.key}\t\t{my_bucket_object.size}')