In [1]:
# imports and installs done
import pandas as pd
import os
import configparser
import datetime as dt

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col, udf, dayofmonth, dayofweek, month, year, weekofyear
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *

import requests
# Turn off the warnings urllib3 would otherwise emit through requests
requests.packages.urllib3.disable_warnings()

import utility
import functions

import importlib
# Re-execute the utility module so the names imported from it below
# reflect its current source
importlib.reload(utility)
from utility import clean_spark_immigration_data, clean_spark_temperature_data
from utility import clean_spark_demographics_data, print_formatted_float

In [2]:
# Load the AWS credentials stored in the local dl.cfg file and expose them
# through the process environment.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, which would only
# surface later as a confusing KeyError on 'KEYS'. Opening the file ourselves
# and using read_file() fails immediately with FileNotFoundError instead.
with open('dl.cfg') as config_file:
    config.read_file(config_file)

os.environ['AWS_ACCESS_KEY_ID'] = config['KEYS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['KEYS']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Build (or reuse) a Hive-enabled Spark session that pulls the
# spark-sas7bdat package from the spark-packages repository.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

## I. Scope of the project and Gathering Data

The objective of this project is to analyze immigration events using several datasets. An ETL pipeline is to be created for the I-94 immigration, global land temperature and US demographics datasets. The result is used to get an idea of immigration patterns to the US.

#### Immigration Dataset 

This data comes from the US National Tourism and Trade Office. In the past, all foreign visitors to the U.S. arriving via air or sea were required to complete a paper Customs and Border Protection Form I-94 Arrival/Departure Record or Form I-94W Nonimmigrant Visa Waiver Arrival/Departure Record, and this dataset comes from these forms.
This dataset forms the core of the data warehouse. The customer repository has a year's worth of data for 2016, divided by month.
For this project the data is in a folder located at ../../data/18-83510-I94-Data-2016/. Each month's data is stored in the SAS binary database storage format sas7bdat. For this project we have chosen to work with the data for the month of April. The data extraction, transformation and loading utility functions have been designed to work with any month's worth of data.

In [4]:
# Read the April 2016 I-94 extract (SAS7BDAT) into a pandas DataFrame.
fname = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
# `format` is passed by keyword: recent pandas releases accept only the
# file path positionally in read_sas and raise TypeError otherwise.
df = pd.read_sas(fname, format='sas7bdat', encoding="ISO-8859-1")

In [5]:
# Preview the first rows to see which I-94 columns are available
df.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,6.0,2016.0,4.0,692.0,692.0,XXX,20573.0,,,,...,U,,1979.0,10282016,,,,1897628000.0,,B2
1,7.0,2016.0,4.0,254.0,276.0,ATL,20551.0,1.0,AL,,...,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1
2,15.0,2016.0,4.0,101.0,101.0,WAS,20545.0,1.0,MI,20691.0,...,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2
3,16.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,...,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2
4,17.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,...,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2


#### World Temperature Data

This dataset comes from Kaggle and is read from *'../../data2/GlobalLandTemperaturesByCity.csv'*.
It provides global land temperatures by city.

In [6]:
# Load the city-level global land temperature CSV into pandas
temp_df = pd.read_csv('../../data2/GlobalLandTemperaturesByCity.csv')

In [7]:
# Preview the first rows of the raw temperature data
temp_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [8]:
# (rows, columns) of the raw temperature frame
temp_df.shape

(8599212, 7)

#### US Demographic Data

This data comes from OpenSoft. It contains information about the demographics of all US cities and census-designated places with a population greater than or equal to 65,000. 
The original data source is the US Census Bureau's 2015 American Community Survey.

In [9]:
# Load the US cities demographics file; its fields are separated by ';'
demographic_df = pd.read_csv("us-cities-demographics.csv", sep=';')

In [10]:
# Preview the first rows of the raw demographics data
demographic_df.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [11]:
# (rows, columns) of the raw demographics frame
demographic_df.shape

(2891, 12)

## II. Exploring and Assessing the data

### Explore the Data
Identify data quality issues, like missing values, duplicate data, etc.

Clean up the data based on what you have identified.

### 1. Immigration Data

In [12]:
# List every monthly I-94 file in the data folder; os.listdir returns the
# names in arbitrary order, hence the non-chronological listing below
files = os.listdir('../../data/18-83510-I94-Data-2016/')
files

['i94_apr16_sub.sas7bdat',
 'i94_sep16_sub.sas7bdat',
 'i94_nov16_sub.sas7bdat',
 'i94_mar16_sub.sas7bdat',
 'i94_jun16_sub.sas7bdat',
 'i94_aug16_sub.sas7bdat',
 'i94_may16_sub.sas7bdat',
 'i94_jan16_sub.sas7bdat',
 'i94_oct16_sub.sas7bdat',
 'i94_jul16_sub.sas7bdat',
 'i94_feb16_sub.sas7bdat',
 'i94_dec16_sub.sas7bdat']

In [13]:
# Read in the data for April 2016 (the same file used in the data-gathering section)
fname = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
# `format` is passed by keyword: recent pandas releases accept only the
# file path positionally in read_sas and raise TypeError otherwise.
df = pd.read_sas(fname, format='sas7bdat', encoding="ISO-8859-1")

In [14]:
# Dimensions of the April 2016 immigration frame as (rows, columns)
df.shape

(3096313, 28)

In [15]:
# Show the first 5 rows of the immigration dataframe
df.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,6.0,2016.0,4.0,692.0,692.0,XXX,20573.0,,,,...,U,,1979.0,10282016,,,,1897628000.0,,B2
1,7.0,2016.0,4.0,254.0,276.0,ATL,20551.0,1.0,AL,,...,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1
2,15.0,2016.0,4.0,101.0,101.0,WAS,20545.0,1.0,MI,20691.0,...,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2
3,16.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,...,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2
4,17.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,...,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2


In [16]:
# Don't run this cell, will take a long time to execute. Just showing the files
# for name in files:
#     # read the data into a data frame
#     fname = '../../data/18-83510-I94-Data-2016/' + name
#     df_f =  pd.read_sas(fname, 'sas7bdat', encoding="ISO-8859-1")
#     print(f'{fname} df.shape = ', df_f.shape)

In [17]:
# Count the missing values in each row and list rows with over 50% missing values
rows_missing_values = df.isnull().sum(axis=1)
# A row holds one value per column, so "over 50% missing" means more than
# half of the column count (df.shape[1]). Comparing against the row count
# (df.shape[0]) could never match, because a row has at most df.shape[1]
# missing values.
rows_missing_values[rows_missing_values > df.shape[1] / 2]


Series([], dtype: int64)

In [34]:
# Tally the missing entries in every column and print the result
missing_values = df.isna().sum()
print(missing_values)

cicid             0
i94yr             0
i94mon            0
i94cit            0
i94res            0
i94port           0
arrdate           0
i94mode         239
i94addr      152372
depdate      142457
i94bir          802
i94visa           0
count             0
dtadfile          1
visapost    1881250
occup       3088187
entdepa         238
entdepd      138429
entdepu     3095921
matflag      138429
biryear         802
dtaddto         477
gender       414269
insnum      2982605
airline       83627
admnum            0
fltno         19549
visatype          0
dtype: int64


In [35]:
# Non-null entries per column (the complement of the missing-value tally)
df.count()

cicid       3096313
i94yr       3096313
i94mon      3096313
i94cit      3096313
i94res      3096313
i94port     3096313
arrdate     3096313
i94mode     3096074
i94addr     2943941
depdate     2953856
i94bir      3095511
i94visa     3096313
count       3096313
dtadfile    3096312
visapost    1215063
occup          8126
entdepa     3096075
entdepd     2957884
entdepu         392
matflag     2957884
biryear     3095511
dtaddto     3095836
gender      2682044
insnum       113708
airline     3012686
admnum      3096313
fltno       3076764
visatype    3096313
dtype: int64

In [36]:
# (rows, columns) before any column is dropped
df.shape

(3096313, 28)

#### Dropping the columns with a very large number of missing values

In [37]:
def clean_immigration(df):
    """Remove the sparsest columns and any fully empty rows from I-94 data.

    :param df: pandas dataframe holding one month of immigration records
    :return: new dataframe without the 'occup', 'entdepu' and 'insnum'
        columns and without rows whose remaining values are all missing
    """
    # The per-column missing-value counts show that these columns are
    # missing in the vast majority of rows.
    mostly_empty = ['occup', 'entdepu', 'insnum']

    # drop() and dropna() each return a new frame, so the caller's
    # dataframe is left unmodified.
    return df.drop(columns=mostly_empty).dropna(how='all')

In [38]:
# Apply the cleaning step; the result is bound to a new name so the raw df stays available
new_df = clean_immigration(df)

In [39]:
# (rows, columns) after cleaning, to compare against the raw frame
new_df.shape

(3096313, 25)

### 2. Temperature Dataset

In [40]:
# Summary statistics of the numeric temperature columns
temp_df.describe()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
count,8235082.0,8235082.0
mean,16.72743,1.028575
std,10.35344,1.129733
min,-42.704,0.034
25%,10.299,0.337
50%,18.831,0.591
75%,25.21,1.349
max,39.651,15.396


#### Dropping rows with missing average temperature values and dropping duplicates

In [47]:
# Non-null entries per column; a lower count than the row total reveals missing values
temp_df.count()

dt                               8599212
AverageTemperature               8235082
AverageTemperatureUncertainty    8235082
City                             8599212
Country                          8599212
Latitude                         8599212
Longitude                        8599212
dtype: int64

#### Dropping the rows that contain a missing average temperature value and dropping duplicate rows

In [42]:
def clean_temperature_data(df):
    """Drop unusable and repeated readings from the global temperatures data.

    :param df: pandas dataframe representing global temperatures
    :return: new dataframe without rows lacking 'AverageTemperature' and
        with a single row per ('dt', 'City', 'Country') combination
    """
    # Columns that together identify one reading
    reading_key = ['dt', 'City', 'Country']

    # First discard rows with no temperature, then keep the first row seen
    # for every reading key.
    with_temperature = df.dropna(subset=['AverageTemperature'])
    return with_temperature.drop_duplicates(subset=reading_key)

In [43]:
# Clean the raw temperature frame into a new variable so temp_df stays available for comparison
new_temp_df = clean_temperature_data(temp_df)


In [44]:
# (rows, columns) of the temperature data before cleaning
temp_df.shape

(8599212, 7)

In [45]:
# (rows, columns) of the temperature data after cleaning
new_temp_df.shape

(8190783, 7)

### 3. Demographic Data

In [46]:
# Summary statistics of the numeric demographics columns
demographic_df.describe()

Unnamed: 0,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,Count
count,2891.0,2888.0,2888.0,2891.0,2878.0,2878.0,2875.0,2891.0
mean,35.494881,97328.43,101769.6,198966.8,9367.832523,40653.6,2.742543,48963.77
std,4.401617,216299.9,231564.6,447555.9,13211.219924,155749.1,0.433291,144385.6
min,22.9,29281.0,27348.0,63215.0,416.0,861.0,2.0,98.0
25%,32.8,39289.0,41227.0,80429.0,3739.0,9224.0,2.43,3435.0
50%,35.3,52341.0,53809.0,106782.0,5397.0,18822.0,2.65,13780.0
75%,38.0,86641.75,89604.0,175232.0,9368.0,33971.75,2.95,54447.0
max,70.5,4081698.0,4468707.0,8550405.0,156961.0,3212500.0,4.98,3835726.0


In [48]:
# Non-null entries per column; a lower count than the row total reveals missing values
demographic_df.count()

City                      2891
State                     2891
Median Age                2891
Male Population           2888
Female Population         2888
Total Population          2891
Number of Veterans        2878
Foreign-born              2878
Average Household Size    2875
State Code                2891
Race                      2891
Count                     2891
dtype: int64

#### Dropping the rows that contain missing values and dropping duplicate rows

In [49]:
def clean_demographics_data(df):
    """Drop incomplete and repeated records from the US demographics data.

    :param df: pandas dataframe of US demographics dataset
    :return: new dataframe without rows that miss any of the checked
        population/household columns and with a single row per
        ('City', 'State', 'State Code', 'Race') combination
    """
    # Columns that must be populated for a row to be kept
    required = [
        'Male Population',
        'Female Population',
        'Number of Veterans',
        'Foreign-born',
        'Average Household Size',
    ]
    # Columns that together identify one record
    record_key = ['City', 'State', 'State Code', 'Race']

    # Remove incomplete rows first, then keep the first row per record key
    complete_rows = df.dropna(subset=required)
    return complete_rows.drop_duplicates(subset=record_key)

In [51]:
# Clean the raw demographics frame into a new variable so demographic_df stays available for comparison
new_demographic_df = clean_demographics_data(demographic_df)

In [52]:
# (rows, columns) of the demographics data before cleaning
demographic_df.shape

(2891, 12)

In [53]:
# (rows, columns) of the demographics data after cleaning
new_demographic_df.shape

(2875, 12)