In [55]:
import pandas as pd
import re
import os

import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, udf, lit, explode, split, regexp_extract, col, isnan, isnull, desc, when, sum, to_date, desc, regexp_replace, count, to_timestamp
from pyspark.sql.types import IntegerType, TimestampType

In [56]:
#setting visualization options
# https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)  

# modify visualization of the notebook, for easier view
from IPython.core.display import display, HTML
display(HTML("""<style> p { max-width:90% !important; } h1 {font-size:2rem!important } h2 {font-size:1.6rem!important } 
h3 {font-size:1.4rem!important } h4 {font-size:1.3rem!important }h5 {font-size:1.2rem!important }h6 {font-size:1.1rem!important }</style>"""))# Do all imports and installs here


In [57]:
def create_spark_session():
    """
    This function creates a Spark Sesson and includes necessary Jar and adoop packages in the configuration. 
    """
    spark=SparkSession \
    .builder \
    .config("spark.jars.repositories", "https://repos.spark-packages.org/") \
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11") \
    .enableHiveSupport() \
    .getOrCreate()
    return spark

In [58]:
spark = create_spark_session()

In [59]:
I94_DATASET_PATH = '../../../../../data/18-83510-I94-Data-2016/'

filelist = os.listdir(I94_DATASET_PATH)
print(f"The dataset contains {len(filelist)} files")

The dataset contains 12 files


In [60]:
for file in filelist:
    size = os.path.getsize('{}/{}'.format(I94_DATASET_PATH, file))
    print(f'{file} - dim(bytes): {size}')

i94_apr16_sub.sas7bdat - dim(bytes): 471990272
i94_sep16_sub.sas7bdat - dim(bytes): 569180160
i94_nov16_sub.sas7bdat - dim(bytes): 444334080
i94_mar16_sub.sas7bdat - dim(bytes): 481296384
i94_jun16_sub.sas7bdat - dim(bytes): 716570624
i94_aug16_sub.sas7bdat - dim(bytes): 625541120
i94_may16_sub.sas7bdat - dim(bytes): 525008896
i94_jan16_sub.sas7bdat - dim(bytes): 434176000
i94_oct16_sub.sas7bdat - dim(bytes): 556269568
i94_jul16_sub.sas7bdat - dim(bytes): 650117120
i94_feb16_sub.sas7bdat - dim(bytes): 391905280
i94_dec16_sub.sas7bdat - dim(bytes): 523304960


#### Create a dataframe
*Note*: If this fails with `Failed to find data source: com.github.saurfang.sas.spark` then reset the Udactiy workspace

In [61]:
I94_TEST_FILE = '../../../../../data/18-83510-I94-Data-2016/i94_aug16_sub.sas7bdat'

df_I94 = spark.read.format('com.github.saurfang.sas.spark').load(I94_TEST_FILE).persist()


#### Inspect the df

In [62]:
df_I94.limit(5).toPandas().head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,22.0,2016.0,8.0,323.0,323.0,NYC,20667.0,1.0,FL,,23.0,3.0,1.0,20160801,RID,,U,,,,1993.0,D/S,M,,EK,64510500000.0,201,F1
1,55.0,2016.0,8.0,209.0,209.0,AGA,20667.0,1.0,CA,,41.0,2.0,1.0,20160801,,,A,,,,1975.0,09142016,M,3955.0,JL,57571870000.0,941,GMT
2,56.0,2016.0,8.0,209.0,209.0,AGA,20667.0,1.0,GU,,24.0,2.0,1.0,20160801,,,A,,,,1992.0,09152016,F,3661.0,UA,57571890000.0,874,GMT
3,61.0,2016.0,8.0,213.0,213.0,CHI,20667.0,1.0,WA,20774.0,27.0,3.0,1.0,20160801,BMB,,U,O,,M,1989.0,D/S,M,,UA,59059190000.0,906,F1
4,64.0,2016.0,8.0,111.0,111.0,BOS,20667.0,1.0,MS,20670.0,34.0,2.0,1.0,20160804,,,G,O,,M,1982.0,08242016,F,32572.0,QK,61043090000.0,8456,WT


In [63]:
df_I94.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- count: double (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: double (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: double (nullable = 

#### Convert doubles to integers

In [64]:
# Snippet taken from https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
toInt = udf(lambda x: int(x) if x!=None else x, IntegerType())

for colname, coltype in df_I94.dtypes:
    if coltype == 'double':
        df_I94 = df_I94.withColumn(colname, toInt(colname))

#### Convert strings to dates

In [65]:
df_I94 = df_I94.withColumn('dtadfile',to_date((col("dtadfile")),"yyyyMMdd"))

In [66]:
df_I94 = df_I94.withColumn('dtaddto',to_date(col("dtaddto"),"MMddyyyy"))

#### Convert SAS epoch dates from integers to dates

In [67]:
# from https://knowledge.udacity.com/questions/66798
from datetime import datetime, timedelta
from pyspark.sql import types as T
def convert_datetime(x):
    try:
        start = datetime(1960, 1, 1)
        return start + timedelta(days=int(x))
    except:
        return None
udf_datetime_from_sas = udf(lambda x: convert_datetime(x), T.DateType())

In [68]:
df_I94.limit(5).toPandas().head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,22,2016,8,323,323,NYC,20667,1,FL,,23,3,1,2016-08-01,RID,,U,,,,1993,,M,,EK,85986190,201,F1
1,55,2016,8,209,209,AGA,20667,1,CA,,41,2,1,2016-08-01,,,A,,,,1975,2016-09-14,M,3955.0,JL,1737294085,941,GMT
2,56,2016,8,209,209,AGA,20667,1,GU,,24,2,1,2016-08-01,,,A,,,,1992,2016-09-15,F,3661.0,UA,1737319685,874,GMT
3,61,2016,8,213,213,CHI,20667,1,WA,20774.0,27,3,1,2016-08-01,BMB,,U,O,,M,1989,,M,,UA,-1070347714,906,F1
4,64,2016,8,111,111,BOS,20667,1,MS,20670.0,34,2,1,2016-08-04,,,G,O,,M,1982,2016-08-24,F,32572.0,QK,913547189,8456,WT


In [69]:
# df_I94 = df_I94.withColumn('dtaddto',to_date(col("dtaddto"),"MMddyyyy")
df_I94 = df_I94.withColumn("arrdate", udf_datetime_from_sas("arrdate")) 

In [70]:
df_I94.limit(5).toPandas().head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,22,2016,8,323,323,NYC,2016-08-01,1,FL,,23,3,1,2016-08-01,RID,,U,,,,1993,,M,,EK,85986190,201,F1
1,55,2016,8,209,209,AGA,2016-08-01,1,CA,,41,2,1,2016-08-01,,,A,,,,1975,2016-09-14,M,3955.0,JL,1737294085,941,GMT
2,56,2016,8,209,209,AGA,2016-08-01,1,GU,,24,2,1,2016-08-01,,,A,,,,1992,2016-09-15,F,3661.0,UA,1737319685,874,GMT
3,61,2016,8,213,213,CHI,2016-08-01,1,WA,20774.0,27,3,1,2016-08-01,BMB,,U,O,,M,1989,,M,,UA,-1070347714,906,F1
4,64,2016,8,111,111,BOS,2016-08-01,1,MS,20670.0,34,2,1,2016-08-04,,,G,O,,M,1982,2016-08-24,F,32572.0,QK,913547189,8456,WT


In [71]:
df_I94 = df_I94.withColumn("depdate", udf_datetime_from_sas("depdate")) 

In [72]:
df_I94.limit(5).toPandas().head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,22,2016,8,323,323,NYC,2016-08-01,1,FL,,23,3,1,2016-08-01,RID,,U,,,,1993,,M,,EK,85986190,201,F1
1,55,2016,8,209,209,AGA,2016-08-01,1,CA,,41,2,1,2016-08-01,,,A,,,,1975,2016-09-14,M,3955.0,JL,1737294085,941,GMT
2,56,2016,8,209,209,AGA,2016-08-01,1,GU,,24,2,1,2016-08-01,,,A,,,,1992,2016-09-15,F,3661.0,UA,1737319685,874,GMT
3,61,2016,8,213,213,CHI,2016-08-01,1,WA,2016-11-16,27,3,1,2016-08-01,BMB,,U,O,,M,1989,,M,,UA,-1070347714,906,F1
4,64,2016,8,111,111,BOS,2016-08-01,1,MS,2016-08-04,34,2,1,2016-08-04,,,G,O,,M,1982,2016-08-24,F,32572.0,QK,913547189,8456,WT


In [73]:
df_I94.printSchema()

root
 |-- cicid: integer (nullable = true)
 |-- i94yr: integer (nullable = true)
 |-- i94mon: integer (nullable = true)
 |-- i94cit: integer (nullable = true)
 |-- i94res: integer (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: date (nullable = true)
 |-- i94mode: integer (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: date (nullable = true)
 |-- i94bir: integer (nullable = true)
 |-- i94visa: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- dtadfile: date (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: integer (nullable = true)
 |-- dtaddto: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: integer (nullable

In [74]:
df_I94.select("gender").groupBy("gender").count().show()

+------+-------+
|gender|  count|
+------+-------+
|     F|1930751|
|  null| 233112|
|     M|1936845|
|     U|   2514|
|     X|    348|
+------+-------+



In [110]:
df_I94.select("biryear").groupBy("biryear").count().show()

+-------+-----+
|biryear|count|
+-------+-----+
|   1959|44392|
|   1990|61346|
|   1975|64323|
|   1977|61655|
|   1924|  179|
|   2003|41252|
|   2007|34049|
|   1974|66670|
|   2015|14362|
|   1927|  486|
|   1955|35212|
|   2006|35340|
|   1978|60687|
|   1925|  227|
|   1961|49941|
|   2013|14677|
|   1942|11013|
|   1939| 7328|
|   1944|14304|
|   null|   31|
+-------+-----+
only showing top 20 rows



In [112]:
min_by=df_I94.agg({"biryear": "min"}).collect()[0][0]
max_by=df_I94.agg({"biryear": "max"}).collect()[0][0]
print(f'The oldest arrival was born in {min_by} and the youngest in {max_by}')

The oldest arrival was born in 1911 and the youngest in 2016


In [76]:
df_I94.select("airline").groupBy("airline").count().show()

+-------+------+
|airline| count|
+-------+------+
|     DZ|     1|
|    01B|     1|
|    926|     3|
|     CI| 26072|
|      7|     2|
|     TC|     1|
|     FI| 17935|
|     AZ| 25670|
|     IC|     1|
|    78B|     1|
|     UA|380789|
|     EA|  2012|
|     Q7|    20|
|     VP|     1|
|    743|    34|
|    FYG|     4|
|     3M|  1278|
|    YEA|     4|
|     RO|     1|
|     SL|     4|
+-------+------+
only showing top 20 rows



In [77]:
df_I94.select("arrDate").groupBy("arrDate").count().show()

+----------+------+
|   arrDate| count|
+----------+------+
|2016-08-15|130941|
|2016-08-31| 98063|
|2016-08-23|124442|
|2016-08-26|127757|
|2016-08-01|147570|
|2016-08-16|126259|
|2016-08-06|150617|
|2016-08-05|152439|
|2016-08-20|143306|
|2016-08-03|139811|
|2016-08-12|148702|
|2016-08-19|146621|
|2016-08-10|135493|
|2016-08-13|145134|
|2016-08-30| 90944|
|2016-08-07|137923|
|2016-08-27|126097|
|2016-08-18|143132|
|2016-08-04|147395|
|2016-08-21|133151|
+----------+------+
only showing top 20 rows



#### Get max and min arrDates

In [78]:
df_I94.agg({"arrDate": "max"}).collect()[0][0]

datetime.date(2016, 8, 31)

In [79]:
df_I94.agg({"arrDate": "min"}).collect()[0][0]

datetime.date(2016, 8, 1)

#### Get max and min depDates

In [107]:
df_I94.agg({"depDate": "max"}).collect()[0][0]

datetime.date(2016, 11, 22)

In [108]:
df_I94.agg({"depDate": "min"}).collect()[0][0]

datetime.date(2016, 8, 2)

#### Find cases where dep data is before arrival date

In [100]:
df_I94.where(col('arrDate') > col('depDate')).count()

564

#### Drop rows where the arrival data is after the departure date

In [105]:
df_I94=df_I94.where(col('arrDate') <= col('depDate'))

#### Count and delete duplicates

In [106]:
count_before=df_I94.count()
df_I94 = df_I94.drop_duplicates()
count_after=df_I94.count()
print(f'{count_before-count_after} duplicate rows dropped (out of {count_before})')

0 duplicate rows dropped (out of 3451218)


#### What kind of DF is this?

In [92]:
if isinstance(df, pd.DataFrame):
    print('pandas')
else:
    print('spark')

spark


#### Find Nulls

In [104]:
# https://stackoverflow.com/questions/44627386/how-to-find-count-of-null-and-nan-values-for-each-column-in-a-pyspark-dataframe
from pyspark.sql.functions import isnan, when, count, col

df_I94.select([count(when(isnull(c), c)).alias(c) for c in df_I94.columns]).show()


+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-------+-------+-------+-------+-------+-------+-------+------+-------+-------+------+-----+--------+
|cicid|i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|  occup|entdepa|entdepd|entdepu|matflag|biryear|dtaddto|gender| insnum|airline|admnum|fltno|visatype|
+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-------+-------+-------+-------+-------+-------+-------+------+-------+-------+------+-----+--------+
|    0|    0|     0|  6142|     0|      0|      0|   1884| 184462| 651788|   816|      0|    0|       0| 2378159|4061341|     18| 639255|4099524| 637603|    816| 451416|233112|3573888| 145501|     0|23238|       0|
+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-------+-------+-------+---

#### Deal with Nulls