# Reference for visa classes: https://en.m.wikipedia.org/wiki/Visa_policy_of_the_United_States#Classes_of_visas

In [1]:
# Do all imports and installs here
from pyspark.sql.functions import udf
from pyspark.sql.session import SparkSession
import datetime
import pandas as pd
import pyspark.sql.functions as F
import psycopg2

# Build (or reuse) the notebook's Spark session. The hadoop-aws 2.7.0 artifact
# is requested through spark.jars.packages when the session is created.
spark = (
    SparkSession.builder
    .appName("project: data engineering capstone project")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)



In [3]:
# Read tsv file(s)
# Use a literal tab as the separator (not the regex r'\t'): a single-character
# delimiter lets pandas use its default C parser, which honours quoted fields,
# whereas a regex delimiter requires the slower Python engine and ignores quoting.
tsv_i94visatype = pd.read_csv(
    'D:/Capstone-Project/Project-Workspace/inputs/Classes_of_visas.tsv',
    sep='\t',
)

# Preview the first rows to confirm the file parsed into the expected columns.
tsv_i94visatype.head()


Unnamed: 0,visatype,description
0,A1,"Head of state and immediate family, prime mini..."
1,A2,"Minister of state, other foreign government of..."
2,A3,"Attendant, servant, or personal employee of A1..."
3,B1,"Temporary visitor for business, domestic emplo..."
4,B2,"Temporary visitor for holiday, tourism, medica..."


In [4]:
# Convert the pandas frame into a Spark DataFrame. No schema is passed, so the
# column types are inferred by Spark from the pandas data.
sp_i94visatype = spark.createDataFrame(tsv_i94visatype)

# Print the resulting schema to check the inferred column names and types.
sp_i94visatype.printSchema()



root
 |-- visatype: string (nullable = true)
 |-- description: string (nullable = true)



In [5]:
# Register the Spark DataFrame as the temp view "dim_visatype" so the following
# cells can query it through spark.sql; an existing view of that name is replaced.
sp_i94visatype.createOrReplaceTempView("dim_visatype")

In [6]:
# Preview the visa types, leaving out every code that starts with "NATO".
preview_query = """
select *
from dim_visatype
where 1 = 1
and visatype not like 'NATO%'
"""
preview_rows = spark.sql(preview_query)
preview_rows.show()

+--------+--------------------+
|visatype|         description|
+--------+--------------------+
|      A1|Head of state and...|
|      A2|Minister of state...|
|      A3|Attendant, servan...|
|      B1|Temporary visitor...|
|      B2|Temporary visitor...|
|   B1/B2|Temporary visitor...|
|      C1|   Person in transit|
|    C1/D|Combined Transit ...|
|      C2|Person in transit...|
|      C3|Foreign governmen...|
|     CW1|Commonwealth of N...|
|     CW2|Spouse or child o...|
|       D|Crewmember (sea o...|
|      E1|Treaty trader, sp...|
|      E2|Treaty investor, ...|
|     E2C|Commonwealth of N...|
|      E3|Treaty traders an...|
|     E3D|Spouse or child o...|
|     E3R|        Returning E3|
|      F1|Student (academic...|
+--------+--------------------+
only showing top 20 rows



In [7]:
# Keep the non-NATO visa types as a DataFrame so a later cell can write it out.
visatype_query = """
select *
from dim_visatype
where 1 = 1
and visatype not like 'NATO%'
"""
df = spark.sql(visatype_query)

In [8]:
# Write the dimension as JSON. coalesce(1) reduces the DataFrame to a single
# partition before writing, and 'overwrite' replaces any existing output at
# the target path.
(
    df.coalesce(1)
    .write
    .mode('overwrite')
    .format('json')
    .save('D:/capstone/solarhenge/dim_visatype')
)
