In [1]:
# Do all imports and installs here
from pyspark.sql.functions import udf
from pyspark.sql.session import SparkSession
import datetime
import pandas as pd
import pyspark.sql.functions as F
import psycopg2


In [2]:
# Start (or reuse) the Spark session for this notebook. The hadoop-aws 2.7.0
# package is requested through spark.jars.packages when the session is built.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .appName("Project: i94")
    .getOrCreate()
)

In [4]:
# Load the i94 country lookup into pandas; single quotes are treated as the
# quoting character when the file is parsed.
csv_i94cnty = pd.read_csv(
    'D:/Capstone-Project/Project-Workspace/inputs/i94cnty.csv',
    quotechar="'",
)

In [5]:
# Preview the first rows of the i94 country lookup to check that it parsed as expected.
csv_i94cnty.head()

Unnamed: 0,i94cnty,cntyname
0,0,INVALID: STATELESS
1,54,No Country Code (54)
2,100,No Country Code (100)
3,101,ALBANIA
4,102,ANDORRA


In [6]:
# Convert the pandas i94 country lookup into a Spark DataFrame
# (no explicit schema is passed, so Spark infers the column types).
sp_i94cnty = spark.createDataFrame(data=csv_i94cnty)

In [7]:
# Display the first 5 rows of the Spark i94 country lookup.
sp_i94cnty.show(5)

+-------+--------------------+
|i94cnty|            cntyname|
+-------+--------------------+
|      0|  INVALID: STATELESS|
|     54|No Country Code (54)|
|    100|No Country Code (...|
|    101|             ALBANIA|
|    102|             ANDORRA|
+-------+--------------------+
only showing top 5 rows



In [8]:
# Print the inferred column names and types of the i94 country lookup.
sp_i94cnty.printSchema()

root
 |-- i94cnty: long (nullable = true)
 |-- cntyname: string (nullable = true)



In [9]:
# Disabled: optional JSON export of the i94-only lookup. Its target is a
# sub-folder of the directory that the final dim_cnty write overwrites.
#sp_i94cnty.coalesce(1).write.format('json').mode('overwrite').save('D:/capstone/solarhenge/dim_cnty/i94cnty')


# Reference for the ISO 3166-1 country codes loaded below: https://en.wikipedia.org/wiki/ISO_3166-1

In [10]:
# Load the ISO 3166-1 country list from its tab-separated (tsv) file.
# The separator is supplied as the regex r'\t', which is why the python
# parsing engine is selected explicitly.
tsv_iso_3166 = pd.read_csv(
    'D:/Capstone-Project/Project-Workspace/inputs/ISO_3166-1.tsv',
    sep=r'\t',
    engine='python',
)

In [11]:
# Preview the first rows of the ISO 3166-1 lookup to check that it parsed as expected.
tsv_iso_3166.head()

Unnamed: 0,i94cnty,cntyname
0,4,Afghanistan
1,248,Åland Islands
2,8,Albania
3,12,Algeria
4,16,American Samoa


In [12]:
# Convert the pandas ISO 3166-1 lookup into a Spark DataFrame
# (no explicit schema is passed, so Spark infers the column types).
sp_iso_3166 = spark.createDataFrame(data=tsv_iso_3166)

In [13]:
# Display the first 5 rows of the Spark ISO 3166-1 lookup.
sp_iso_3166.show(5)

+-------+--------------+
|i94cnty|      cntyname|
+-------+--------------+
|      4|   Afghanistan|
|    248| Åland Islands|
|      8|       Albania|
|     12|       Algeria|
|     16|American Samoa|
+-------+--------------+
only showing top 5 rows



In [14]:
# Print the inferred column names and types of the ISO 3166-1 lookup.
sp_iso_3166.printSchema()

root
 |-- i94cnty: long (nullable = true)
 |-- cntyname: string (nullable = true)



In [15]:
# Disabled: optional JSON export of the ISO-only lookup. Its target is a
# sub-folder of the directory that the final dim_cnty write overwrites.
#sp_iso_3166.coalesce(1).write.format('json').mode('overwrite').save('D:/capstone/solarhenge/dim_cnty/iso_3166')


In [16]:
# Build the country dimension: every ISO 3166-1 row, plus the i94 rows whose
# code is not already present in the ISO list.
#
# A plain union followed by dropDuplicates keeps an arbitrary row for each
# code that appears in both sources, so the surviving country name could
# differ from run to run. The left_anti join makes the precedence explicit.
# NOTE(review): ISO-first precedence is assumed from the original union order;
# swap sp_iso_3166 and sp_i94cnty below if the i94 names should win instead.
#
# unionByName aligns columns by name rather than by position, and the subset
# uses the column's actual lower-case name instead of relying on Spark's
# default case-insensitive resolution. The trailing dropDuplicates still
# guards against a code repeated within a single source.
sp_dim_cnty = (
    sp_iso_3166
    .unionByName(sp_i94cnty.join(sp_iso_3166, on='i94cnty', how='left_anti'))
    .dropDuplicates(subset=['i94cnty'])
)

In [17]:
# Print the column names and types of the merged country dimension.
sp_dim_cnty.printSchema()

root
 |-- i94cnty: long (nullable = true)
 |-- cntyname: string (nullable = true)



In [18]:
# Expose the merged country dimension to Spark SQL under the view name "dim_cnty"
# (any existing temporary view with that name is replaced).
sp_dim_cnty.createOrReplaceTempView(name="dim_cnty")

In [20]:
# Sanity check: count the rows of the dim_cnty view whose country name is null.
# (`where 1 = 1` is a no-op predicate; it lets every real filter sit on its own `and` line.)
spark.sql("""
select count(*)
from dim_cnty
where 1 = 1
and cntyname is null
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [21]:
# Persist the country dimension as JSON. coalesce(1) collapses the frame to a
# single partition before the write, and 'overwrite' mode replaces whatever
# already exists at the target path.
(
    sp_dim_cnty
    .coalesce(1)
    .write
    .format('json')
    .mode('overwrite')
    .save('D:/capstone/solarhenge/dim_cnty')
)
