In [1]:
import pandas as pd
import re
import os

import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, udf, lit, explode, split, regexp_extract, col, isnan, isnull, desc, when, sum, to_date, desc, regexp_replace, count, to_timestamp
from pyspark.sql.types import IntegerType, TimestampType

In [2]:
#setting visualization options
# https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)  

# modify visualization of the notebook, for easier view
from IPython.core.display import display, HTML
display(HTML("""<style> p { max-width:90% !important; } h1 {font-size:2rem!important } h2 {font-size:1.6rem!important } 
h3 {font-size:1.4rem!important } h4 {font-size:1.3rem!important }h5 {font-size:1.2rem!important }h6 {font-size:1.1rem!important }</style>"""))# Do all imports and installs here


In [3]:
def create_spark_session():
    """
    This function creates a Spark Sesson and includes necessary Jar and adoop packages in the configuration. 
    """
    spark=SparkSession \
    .builder \
    .config("spark.jars.repositories", "https://repos.spark-packages.org/") \
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11") \
    .enableHiveSupport() \
    .getOrCreate()
    return spark

In [4]:
spark = create_spark_session()

In [5]:
I94_DATASET_PATH = '../../../../../data/18-83510-I94-Data-2016/'
CLEAN_DATA_DIR='data/'

The dataset contains 12 files


In [None]:
filelist = os.listdir(I94_DATASET_PATH)
print(f"The dataset contains {len(filelist)} files")

In [6]:
for file in filelist:
    size = os.path.getsize('{}/{}'.format(I94_DATASET_PATH, file))
    print(f'{file} - dim(bytes): {size}')

i94_apr16_sub.sas7bdat - dim(bytes): 471990272
i94_sep16_sub.sas7bdat - dim(bytes): 569180160
i94_nov16_sub.sas7bdat - dim(bytes): 444334080
i94_mar16_sub.sas7bdat - dim(bytes): 481296384
i94_jun16_sub.sas7bdat - dim(bytes): 716570624
i94_aug16_sub.sas7bdat - dim(bytes): 625541120
i94_may16_sub.sas7bdat - dim(bytes): 525008896
i94_jan16_sub.sas7bdat - dim(bytes): 434176000
i94_oct16_sub.sas7bdat - dim(bytes): 556269568
i94_jul16_sub.sas7bdat - dim(bytes): 650117120
i94_feb16_sub.sas7bdat - dim(bytes): 391905280
i94_dec16_sub.sas7bdat - dim(bytes): 523304960


#### Create a dataframe
*Note*: If this fails with `Failed to find data source: com.github.saurfang.sas.spark` then reset the Udactiy workspace

In [None]:
# from https://knowledge.udacity.com/questions/66798
from datetime import datetime, timedelta
from pyspark.sql import types as T
def convert_datetime(x):
    try:
        start = datetime(1960, 1, 1)
        return start + timedelta(days=int(x))
    except:
        return None



def cleanImmigrationFactData(path_to_files, output_path)
"""
    Description:

    Usage:
"""


    filelist = os.listdir(path_to_files)

    print(f"The dataset contains {len(filelist)} files")
    
    for file in filelist:
        
         print(f'Processing {file} - dim(bytes): {size}')
        
        df_I94 = spark.read.format('com.github.saurfang.sas.spark').load(I94_TEST_FILE).persist()

        # Snippet taken from https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
        toInt = udf(lambda x: int(x) if x!=None else x, IntegerType())

        for colname, coltype in df_I94.dtypes:
            if coltype == 'double':
                df_I94 = df_I94.withColumn(colname, toInt(colname))
        
        # Convert strings to dates
        df_I94 = df_I94.withColumn('dtaddto',to_date(col("dtaddto"),"MMddyyyy")) \
        .withColumn('dtaddto',to_date(col("dtaddto"),"MMddyyyy"))
        
        # Convert SAS date to dates
        udf_datetime_from_sas = udf(lambda x: convert_datetime(x), T.DateType())
        
        df_I94 = df_I94.withColumn("arrdate", udf_datetime_from_sas("arrdate")) \
        .withColumn("depdate", udf_datetime_from_sas("depdate")) 
        
        # write data out
        print(f'Exporting cleaned file to {S3_bucket_I94}')
        df_I94.write.format('parquet').mode('overwrite').partitionBy('i94yr','i94mon').save(S3_bucket_I94)

In [5]:
I94_TEST_FILE = '../../../../../data/18-83510-I94-Data-2016/i94_aug16_sub.sas7bdat'

df_I94 = spark.read.format('com.github.saurfang.sas.spark').load(I94_TEST_FILE).persist()


#### Inspect the df

In [8]:
df_I94.limit(5).toPandas().head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,22.0,2016.0,8.0,323.0,323.0,NYC,20667.0,1.0,FL,,23.0,3.0,1.0,20160801,RID,,U,,,,1993.0,D/S,M,,EK,64510500000.0,201,F1
1,55.0,2016.0,8.0,209.0,209.0,AGA,20667.0,1.0,CA,,41.0,2.0,1.0,20160801,,,A,,,,1975.0,09142016,M,3955.0,JL,57571870000.0,941,GMT
2,56.0,2016.0,8.0,209.0,209.0,AGA,20667.0,1.0,GU,,24.0,2.0,1.0,20160801,,,A,,,,1992.0,09152016,F,3661.0,UA,57571890000.0,874,GMT
3,61.0,2016.0,8.0,213.0,213.0,CHI,20667.0,1.0,WA,20774.0,27.0,3.0,1.0,20160801,BMB,,U,O,,M,1989.0,D/S,M,,UA,59059190000.0,906,F1
4,64.0,2016.0,8.0,111.0,111.0,BOS,20667.0,1.0,MS,20670.0,34.0,2.0,1.0,20160804,,,G,O,,M,1982.0,08242016,F,32572.0,QK,61043090000.0,8456,WT


In [9]:
df_I94.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- count: double (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: double (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: double (nullable = 

#### Convert doubles to integers

In [6]:
# Snippet taken from https://www.1week4.com/it/machine-learning/udacity-data-engineering-capstone-project/
toInt = udf(lambda x: int(x) if x!=None else x, IntegerType())

for colname, coltype in df_I94.dtypes:
    if coltype == 'double':
        df_I94 = df_I94.withColumn(colname, toInt(colname))

#### Convert strings to dates

In [7]:
df_I94 = df_I94.withColumn('dtadfile',to_date((col("dtadfile")),"yyyyMMdd"))

In [8]:
df_I94 = df_I94.withColumn('dtaddto',to_date(col("dtaddto"),"MMddyyyy"))

#### Convert SAS epoch dates from integers to dates

In [9]:
# from https://knowledge.udacity.com/questions/66798
from datetime import datetime, timedelta
from pyspark.sql import types as T
def convert_datetime(x):
    try:
        start = datetime(1960, 1, 1)
        return start + timedelta(days=int(x))
    except:
        return None
udf_datetime_from_sas = udf(lambda x: convert_datetime(x), T.DateType())

#### df_I94.limit(5).toPandas().head()

In [10]:
# df_I94 = df_I94.withColumn('dtaddto',to_date(col("dtaddto"),"MMddyyyy")
df_I94 = df_I94.withColumn("arrdate", udf_datetime_from_sas("arrdate")) 

In [15]:
df_I94.limit(5).toPandas().head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,22,2016,8,323,323,NYC,2016-08-01,1,FL,,23,3,1,2016-08-01,RID,,U,,,,1993,,M,,EK,85986190,201,F1
1,55,2016,8,209,209,AGA,2016-08-01,1,CA,,41,2,1,2016-08-01,,,A,,,,1975,2016-09-14,M,3955.0,JL,1737294085,941,GMT
2,56,2016,8,209,209,AGA,2016-08-01,1,GU,,24,2,1,2016-08-01,,,A,,,,1992,2016-09-15,F,3661.0,UA,1737319685,874,GMT
3,61,2016,8,213,213,CHI,2016-08-01,1,WA,20774.0,27,3,1,2016-08-01,BMB,,U,O,,M,1989,,M,,UA,-1070347714,906,F1
4,64,2016,8,111,111,BOS,2016-08-01,1,MS,20670.0,34,2,1,2016-08-04,,,G,O,,M,1982,2016-08-24,F,32572.0,QK,913547189,8456,WT


In [11]:
df_I94 = df_I94.withColumn("depdate", udf_datetime_from_sas("depdate")) 

In [17]:
df_I94.limit(5).toPandas().head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,22,2016,8,323,323,NYC,2016-08-01,1,FL,,23,3,1,2016-08-01,RID,,U,,,,1993,,M,,EK,85986190,201,F1
1,55,2016,8,209,209,AGA,2016-08-01,1,CA,,41,2,1,2016-08-01,,,A,,,,1975,2016-09-14,M,3955.0,JL,1737294085,941,GMT
2,56,2016,8,209,209,AGA,2016-08-01,1,GU,,24,2,1,2016-08-01,,,A,,,,1992,2016-09-15,F,3661.0,UA,1737319685,874,GMT
3,61,2016,8,213,213,CHI,2016-08-01,1,WA,2016-11-16,27,3,1,2016-08-01,BMB,,U,O,,M,1989,,M,,UA,-1070347714,906,F1
4,64,2016,8,111,111,BOS,2016-08-01,1,MS,2016-08-04,34,2,1,2016-08-04,,,G,O,,M,1982,2016-08-24,F,32572.0,QK,913547189,8456,WT


In [12]:
df_I94.printSchema()

root
 |-- cicid: integer (nullable = true)
 |-- i94yr: integer (nullable = true)
 |-- i94mon: integer (nullable = true)
 |-- i94cit: integer (nullable = true)
 |-- i94res: integer (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: date (nullable = true)
 |-- i94mode: integer (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: date (nullable = true)
 |-- i94bir: integer (nullable = true)
 |-- i94visa: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- dtadfile: date (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: integer (nullable = true)
 |-- dtaddto: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: integer (nullable

In [19]:
df_I94.select("gender").groupBy("gender").count().show()

+------+-------+
|gender|  count|
+------+-------+
|     F|1930751|
|  null| 233112|
|     M|1936845|
|     U|   2514|
|     X|    348|
+------+-------+



In [20]:
df_I94.select("i94port").groupBy("i94port").count().orderBy(desc('count')).show()

+-------+------+
|i94port| count|
+-------+------+
|    NYC|696609|
|    LOS|466224|
|    MIA|439593|
|    SFR|249132|
|    HHW|218639|
|    CHI|213908|
|    NEW|183121|
|    ORL|178448|
|    BOS|116619|
|    ATL|116060|
|    WAS|115528|
|    HOU|114214|
|    AGA|114174|
|    DAL| 91816|
|    FTL| 84006|
|    SEA| 76597|
|    LVG| 75121|
|    DET| 60681|
|    SAI| 52872|
|    PHI| 41332|
+-------+------+
only showing top 20 rows



In [21]:
df_I94.select("biryear").groupBy("biryear").count().show()

+-------+-----+
|biryear|count|
+-------+-----+
|   1959|47945|
|   1990|80446|
|   1975|69434|
|   1977|66979|
|   1924|  218|
|   2003|44558|
|   2007|37341|
|   1974|71795|
|   2015|16330|
|   1927|  568|
|   1955|38837|
|   2006|38628|
|   1978|66008|
|   1925|  279|
|   1908|    1|
|   1961|53447|
|   2013|16791|
|   1942|12432|
|   1944|16074|
|   1939| 8256|
+-------+-----+
only showing top 20 rows



In [22]:
min_by=df_I94.agg({"biryear": "min"}).collect()[0][0]
max_by=df_I94.agg({"biryear": "max"}).collect()[0][0]
print(f'The oldest arrival was born in {min_by} and the youngest in {max_by}')

The oldest arrival was born in 1904 and the youngest in 2016


In [23]:
df_I94.select("airline").groupBy("airline").count().show()

+-------+------+
|airline| count|
+-------+------+
|     DZ|     1|
|    01B|     1|
|    926|     3|
|     CI| 26072|
|      7|     2|
|     TC|     1|
|     FI| 17935|
|     AZ| 25670|
|     IC|     1|
|    78B|     1|
|     UA|380789|
|     EA|  2012|
|     Q7|    20|
|     VP|     1|
|    743|    34|
|    FYG|     4|
|     3M|  1278|
|    YEA|     4|
|     RO|     1|
|     SL|     4|
+-------+------+
only showing top 20 rows



In [24]:
df_I94.select("arrDate").groupBy("arrDate").count().show()

+----------+------+
|   arrDate| count|
+----------+------+
|2016-08-15|130941|
|2016-08-31| 98063|
|2016-08-23|124442|
|2016-08-26|127757|
|2016-08-01|147570|
|2016-08-16|126259|
|2016-08-06|150617|
|2016-08-05|152439|
|2016-08-20|143306|
|2016-08-03|139811|
|2016-08-12|148702|
|2016-08-19|146621|
|2016-08-10|135493|
|2016-08-13|145134|
|2016-08-30| 90944|
|2016-08-07|137923|
|2016-08-27|126097|
|2016-08-18|143132|
|2016-08-04|147395|
|2016-08-21|133151|
+----------+------+
only showing top 20 rows



#### Get max and min arrDates

In [25]:
df_I94.agg({"arrDate": "max"}).collect()[0][0]

datetime.date(2016, 8, 31)

In [26]:
df_I94.agg({"arrDate": "min"}).collect()[0][0]

datetime.date(2016, 8, 1)

#### Get max and min depDates

In [27]:
df_I94.agg({"depDate": "max"}).collect()[0][0]

datetime.date(2016, 11, 22)

In [28]:
df_I94.agg({"depDate": "min"}).collect()[0][0]

datetime.date(1920, 8, 23)

#### Find cases where dep data is before arrival date

In [29]:
df_I94.where(col('arrDate') > col('depDate')).count()

564

#### Drop rows where the arrival data is after the departure date

In [13]:
df_I94=df_I94.where(col('arrDate') <= col('depDate'))

#### Count and delete duplicates

In [14]:
count_before=df_I94.count()
df_I94 = df_I94.drop_duplicates()
count_after=df_I94.count()
print(f'{count_before-count_after} duplicate rows dropped (out of {count_before})')

0 duplicate rows dropped (out of 3451218)


#### What kind of DF is this?

In [32]:
# if isinstance(df, pd.DataFrame):
#     print('pandas')
# else:
#     print('spark')

NameError: name 'df' is not defined

#### Find Nulls

In [33]:
# https://stackoverflow.com/questions/44627386/how-to-find-count-of-null-and-nan-values-for-each-column-in-a-pyspark-dataframe
from pyspark.sql.functions import isnan, when, count, col

df_I94.select([count(when(isnull(c), c)).alias(c) for c in df_I94.columns]).show()


+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-------+-------+-------+-------+-------+-------+-------+------+-------+-------+------+-----+--------+
|cicid|i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|  occup|entdepa|entdepd|entdepu|matflag|biryear|dtaddto|gender| insnum|airline|admnum|fltno|visatype|
+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-------+-------+-------+-------+-------+-------+-------+------+-------+-------+------+-----+--------+
|    0|    0|     0|  4364|     0|      0|      0|   1882| 153714|      0|    31|      0|    0|       0| 2236256|3444991|      9|   1832|3450560|     19|     31|  58194|228740|3081252| 109278|     0|10170|       0|
+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-------+-------+-------+---

#### Deal with Nulls

In [15]:
S3_bucket_I94 = 'data/S3bucket_temp/I94_data'
df_I94.write.format('parquet').mode('overwrite').partitionBy('i94yr','i94mon').save(S3_bucket_I94)

#### Investigate join to airport_codes

In [34]:
df_airport_codes = spark.read.csv('../../../airport-codes_csv.csv', sep=',', inferSchema=True, header=True)

In [35]:
df_I94.join(df_airport_codes, df_I94.i94port == df_airport_codes.local_code,"left").count()

4050921

In [36]:
df_I94.join(df_airport_codes, df_I94.i94port == df_airport_codes.local_code).count()

2838134

In [37]:
df_I94.join(df_airport_codes, df_I94.i94port == df_airport_codes.local_code,"left").where(isnull("local_code"))

DataFrame[cicid: int, i94yr: int, i94mon: int, i94cit: int, i94res: int, i94port: string, arrdate: date, i94mode: int, i94addr: string, depdate: date, i94bir: int, i94visa: int, count: int, dtadfile: date, visapost: string, occup: string, entdepa: string, entdepd: string, entdepu: string, matflag: string, biryear: int, dtaddto: date, gender: string, insnum: string, airline: string, admnum: int, fltno: string, visatype: string, ident: string, type: string, name: string, elevation_ft: int, continent: string, iso_country: string, iso_region: string, municipality: string, gps_code: string, iata_code: string, local_code: string, coordinates: string]

#### Check the cities where there is no match between local codes and port code
*Hint* It ain't good news.  I don't think the airport_codes datset is going to be much use

In [38]:
df_I94.join(df_airport_codes, df_I94.i94port == df_airport_codes.local_code,"left") \
.where(isnull("local_code")) \
.select("i94port") \
.groupBy("i94port") \
.count() \
.orderBy(desc('count')).show()

+-------+------+
|i94port| count|
+-------+------+
|    NYC|598401|
|    CHI|146630|
|    WAS| 85730|
|    AGA| 84123|
|    FTL| 74761|
|    LVG| 71248|
|    PHI| 34663|
|    YHC| 17254|
|    SPM| 15900|
|    SAJ| 12906|
|    TAM| 11344|
|    PBB| 10748|
|    SNJ|  8842|
|    POO|  8656|
|    WPB|  4106|
|    XXX|  3285|
|    DER|  2679|
|    THO|  2447|
|    SYS|  1887|
|    X96|  1730|
+-------+------+
only showing top 20 rows



#### Now check What we find in I94_SAS_Labels_Descriptions

In [39]:
text_file = 'I94_SAS_Labels_Descriptions.SAS'

# wholetext=true means we read the file into a singel row - handier for regex
df_label_full = spark.read.text(text_file, wholetext=True)

In [40]:
df_label_full.printSchema()

root
 |-- value: string (nullable = true)



In [41]:
pattern='(\$i94prtl)([^;]+)'

In [42]:

df_new = df_label_full.withColumn('I94PORT', regexp_extract(col('value'),pattern,2))


In [43]:
df_new = df_new.withColumn('port',explode(split('I94PORT','[\r\n]+'))).drop('value').drop('I94PORT')
# df_new.toPandas().head()

In [44]:
df_I94_code = df_new.withColumn('code',regexp_extract(col('port'),"(?<=')[0-9A-Z. ]+(?=')",0)) \
    .withColumn('city_state',regexp_extract(col('port'),"(=\t')([0-9A-Za-z ,\-()\/\.#&]+)(')",2)) \
    .withColumn('city', split(col('city_state'),',').getItem(0)) \
    .withColumn('state', split(col('city_state'),',').getItem(1)) \
    .withColumn('state', regexp_replace(col('state'), ' *$', '')) \
    .where(col('port')!='') \
    .drop('port') \

In [45]:
df_I94_code.show()

+----+--------------------+--------------------+-----+
|code|          city_state|                city|state|
+----+--------------------+--------------------+-----+
| ALC|ALCAN, AK        ...|               ALCAN|   AK|
| ANC|ANCHORAGE, AK    ...|           ANCHORAGE|   AK|
| BAR|BAKER AAF - BAKER...|BAKER AAF - BAKER...|   AK|
| DAC|DALTONS CACHE, AK...|       DALTONS CACHE|   AK|
| PIZ|DEW STATION PT LA...|DEW STATION PT LA...|   AK|
| DTH|DUTCH HARBOR, AK ...|        DUTCH HARBOR|   AK|
| EGL|EAGLE, AK        ...|               EAGLE|   AK|
| FRB|FAIRBANKS, AK    ...|           FAIRBANKS|   AK|
| HOM|HOMER, AK        ...|               HOMER|   AK|
| HYD|HYDER, AK        ...|               HYDER|   AK|
| JUN|JUNEAU, AK       ...|              JUNEAU|   AK|
| 5KE|       KETCHIKAN, AK|           KETCHIKAN|   AK|
| KET|KETCHIKAN, AK    ...|           KETCHIKAN|   AK|
| MOS|MOSES POINT INTER...|MOSES POINT INTER...|   AK|
| NIK|NIKISKI, AK      ...|             NIKISKI|   AK|
| NOM|NOM,

In [51]:
df_I94_code.where(col('code')=='OCA').show()

+----+-------------------+---------------+-----+
|code|         city_state|           city|state|
+----+-------------------+---------------+-----+
| OCA|Ocean Reef Club, FL|Ocean Reef Club|   FL|
+----+-------------------+---------------+-----+



In [46]:
df_I94.join(df_I94_code, df_I94.i94port == df_I94_code.code,"left") \
.where(isnull("code")) \
.select("i94port") \
.groupBy("i94port") \
.count() \
.orderBy(desc('count')).show()

+-------+-----+
|i94port|count|
+-------+-----+
+-------+-----+



#### Add a new row for the missing airport

In [47]:
# columns = ['code', 'city_state', 'city','state']
# vals = [('OCA','Ocean Reef Club, FL','Ocean Reef Club', 'FL' )]

# df_OCA = spark.createDataFrame(vals, columns)

# df_I94_code = df_I94_code.union(df_OCA)

In [48]:
# df_I94.join(df_I94_code, df_I94.i94port == df_I94_code.code,"left") \
# .where(isnull("code")) \
# .select("i94port") \
# .groupBy("i94port") \
# .count() \
# .orderBy(desc('count')).show()

+-------+-----+
|i94port|count|
+-------+-----+
+-------+-----+



> /* I94VISA - Visa codes collapsed into three categories:
   1 = Business
   2 = Pleasure
   3 = Student
*/

In [49]:
columns = ['I94VISA', 'category']
vals = [(1,'Business'),(2,'Pleasure'),(3,'Student')]

df_I94VISA = spark.createDataFrame(vals, columns)

In [50]:
df_I94VISA.show()

+-------+--------+
|I94VISA|category|
+-------+--------+
|      1|Business|
|      2|Pleasure|
|      3| Student|
+-------+--------+



In [None]:
columns = ['I94VISA', 'category']
vals = [(1,'Business'),(2,'Pleasure'),(3,'Student')]

df_I94VISA = spark.createDataFrame(vals, columns)

#### Extract entry mode
**I94MODE - There are missing values as well as not reported (9)**


|Code|Entry Mode|
|---|---|
|1|'Air'|
|2 |'Sea'|
|3 | 'Land'|
|9 | 'Not reported'|

In [87]:
columns = ['I94MODE', 'category']
vals = [(1,'Air'),(2,'Sea'),(3,'Land'),(4,'Not reported')]

df_I94MODE = spark.createDataFrame(vals, columns)

In [88]:
df_I94MODE.show()

+-------+------------+
|I94MODE|    category|
+-------+------------+
|      1|         Air|
|      2|         Sea|
|      3|        Land|
|      4|Not reported|
+-------+------------+



#### Extract countries

In [111]:
pattern='(i94cntyl)([^;]+)'

In [112]:

df_new = df_label_full.withColumn('I94RES', regexp_extract(col('value'),pattern,2))


In [113]:
# df_new.toPandas().head()

In [114]:
# df_new = df_new.withColumn('port',explode(split('I94PORT','[\r\n]+'))).drop('value').drop('I94PORT')
df_new = df_new.withColumn('raw',explode(split('I94RES','[\r\n]+'))).drop('value').drop('I94RES')
df_new.toPandas().head()

Unnamed: 0,raw
0,
1,"582 = 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)'"
2,236 = 'AFGHANISTAN'
3,101 = 'ALBANIA'
4,316 = 'ALGERIA'


In [115]:
#df_I94_code = df_new.withColumn('code',regexp_extract(col('port'),"(?<=')[0-9A-Z. ]+(?=')",0)) \
#     .withColumn('city_state',regexp_extract(col('port'),"(=\t')([0-9A-Za-z ,\-()\/\.#&]+)(')",2)) \
#     .withColumn('city', split(col('city_state'),',').getItem(0)) \
#     .withColumn('state', split(col('city_state'),',').getItem(1)) \
#     .withColumn('state', regexp_replace(col('state'), ' *$', '')) \
#     .where(col('port')!='') \
#     .drop('port') \

df_I94RES = df_new.withColumn('code',regexp_extract(col('raw'),"[0-9]+",0)) \
.withColumn('country',regexp_extract(col('raw'),"\'([A-Za-z ,\-()0-9]+)\'",1)) \
.where(col('raw')!='') \
.drop('raw')


In [125]:
df_I94RES.show()

+----+--------------------+
|code|             country|
+----+--------------------+
| 582|MEXICO Air Sea, a...|
| 236|         AFGHANISTAN|
| 101|             ALBANIA|
| 316|             ALGERIA|
| 102|             ANDORRA|
| 324|              ANGOLA|
| 529|            ANGUILLA|
| 518|     ANTIGUA-BARBUDA|
| 687|          ARGENTINA |
| 151|             ARMENIA|
| 532|               ARUBA|
| 438|           AUSTRALIA|
| 103|             AUSTRIA|
| 152|          AZERBAIJAN|
| 512|             BAHAMAS|
| 298|             BAHRAIN|
| 274|          BANGLADESH|
| 513|            BARBADOS|
| 104|             BELGIUM|
| 581|              BELIZE|
+----+--------------------+
only showing top 20 rows



In [117]:
df_I94RES.printSchema()

root
 |-- code: string (nullable = true)
 |-- country: string (nullable = true)



In [86]:
df_I94.join(df_I94RES, df_I94.i94res == df_I94RES.code,"left") \
.where(isnull("code")) \
.select("i94res") \
.groupBy("i94res") \
.count() \
.orderBy(desc('count')).show()

+------+-----+
|i94res|count|
+------+-----+
+------+-----+



#### Extract states

In [118]:
pattern='(i94addrl)([^;]+)'

In [119]:

df_new = df_label_full.withColumn('i94addrl', regexp_extract(col('value'),pattern,2))


In [120]:
# df_new.toPandas().head()

In [121]:
# df_new = df_new.withColumn('port',explode(split('I94PORT','[\r\n]+'))).drop('value').drop('I94PORT')
df_new = df_new.withColumn('raw',explode(split('i94addrl','[\r\n]+'))).drop('value').drop('i94addrl')
df_new.toPandas().head()

Unnamed: 0,raw
0,
1,\t'AL'='ALABAMA'
2,\t'AK'='ALASKA'
3,\t'AZ'='ARIZONA'
4,\t'AR'='ARKANSAS'


In [122]:
#df_I94_code = df_new.withColumn('code',regexp_extract(col('port'),"(?<=')[0-9A-Z. ]+(?=')",0)) \
#     .withColumn('city_state',regexp_extract(col('port'),"(=\t')([0-9A-Za-z ,\-()\/\.#&]+)(')",2)) \
#     .withColumn('city', split(col('city_state'),',').getItem(0)) \
#     .withColumn('state', split(col('city_state'),',').getItem(1)) \
#     .withColumn('state', regexp_replace(col('state'), ' *$', '')) \
#     .where(col('port')!='') \
#     .drop('port') \


df_I94ADDR = df_new.withColumn('code',regexp_extract(col('raw'),"(?<=')[0-9A-Z. ]+(?=')",0)) \
.withColumn('state',regexp_extract(col('raw'),"(=\s*\')([A-Z]+)(\')",2)) \
.where(col('raw')!='') \
.drop('raw')

In [124]:
df_I94ADDR.show()

+----+-----------+
|code|      state|
+----+-----------+
|  AL|    ALABAMA|
|  AK|     ALASKA|
|  AZ|    ARIZONA|
|  AR|   ARKANSAS|
|  CA| CALIFORNIA|
|  CO|   COLORADO|
|  CT|CONNECTICUT|
|  DE|   DELAWARE|
|  DC|           |
|  FL|    FLORIDA|
|  GA|    GEORGIA|
|  GU|       GUAM|
|  HI|     HAWAII|
|  ID|      IDAHO|
|  IL|   ILLINOIS|
|  IN|    INDIANA|
|  IA|       IOWA|
|  KS|     KANSAS|
|  KY|   KENTUCKY|
|  LA|  LOUISIANA|
+----+-----------+
only showing top 20 rows



In [84]:
df_I94RES.printSchema()

root
 |-- code: integer (nullable = true)
 |-- country: string (nullable = true)



In [86]:
df_I94.join(df_I94RES, df_I94.i94res == df_I94RES.code,"left") \
.where(isnull("code")) \
.select("i94res") \
.groupBy("i94res") \
.count() \
.orderBy(desc('count')).show()

+------+-----+
|i94res|count|
+------+-----+
+------+-----+

