Analysis


In [2]:
from pyspark.sql import SparkSession
# Base directory of the parquet tables this notebook reads.
output_data = 'data/output'

# getOrCreate() returns the already-active session if one exists,
# otherwise it builds a new one with the app name below.
spark = (
    SparkSession.builder
    .appName("ND027_Capstone_Project_query")
    .getOrCreate()
)


Create Views


In [3]:
# Load the fact and dimension tables from parquet under `output_data`.
immigration_fact_tbl = spark.read.parquet(f'{output_data}/immigration_fact')
users_dimensions_tbl = spark.read.parquet(f'{output_data}/users_dimension')
countries_dimension_tbl = spark.read.parquet(f'{output_data}/countries_dimension')
cities_demographics_dimension_tbl = spark.read.parquet(f'{output_data}/cities_demographics_dimension')

# Register every table as a session-scoped temp view so it can be queried
# through spark.sql(). createOrReplaceTempView() returns None, so the call is
# made purely for its side effect; binding its result to a `*_view` variable
# would only create misleading names that hold None.
immigration_fact_tbl.createOrReplaceTempView('immigration_fact_view')
users_dimensions_tbl.createOrReplaceTempView('users_dimensions_view')
countries_dimension_tbl.createOrReplaceTempView('countries_dimension_view')
cities_demographics_dimension_tbl.createOrReplaceTempView('cities_demographics_dimension_view')

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

Analyze the data

In [5]:
# Total number of rows in the immigration fact view.
row_count_df = spark.sql("""
    SELECT count(*) FROM immigration_fact_view
""")
row_count_df.show()

+--------+
|count(1)|
+--------+
|  219268|
+--------+



In [6]:
# Preview the first 10 rows of the countries dimension view.
countries_preview_df = spark.sql("""
    SELECT * FROM countries_dimension_view
""")
countries_preview_df.show(n=10)

+------------+--------------------+
|country_code|                name|
+------------+--------------------+
|         527|TURKS AND CAICOS ...|
|         420|              TUVALU|
|         352|              UGANDA|
|         162|             UKRAINE|
|         296|UNITED ARAB EMIRATES|
|         135|      UNITED KINGDOM|
|         695|             URUGUAY|
|         163|          UZBEKISTAN|
|         410|             VANUATU|
|         696|           VENEZUELA|
+------------+--------------------+
only showing top 10 rows



In [15]:
# Number of immigration records (non-null cicid) per i94cit code, joined to the
# countries dimension to attach the country name, largest count first.
#
# GROUP BY iv.i94cit already yields exactly one row per i94cit, so the
# SELECT DISTINCT that used to wrap the aggregate was redundant and is dropped;
# the result set is unchanged.
#
# NOTE(review): i94cit is displayed as a double (e.g. 135.0) while country_code
# is displayed as an integer (135), so the join relies on Spark's implicit
# numeric cast - consider casting i94cit to an integer when the fact table is
# written.
spark.sql("""
    SELECT *
    FROM
        (SELECT
            iv.i94cit,
            COUNT(iv.cicid) AS count_immigrants
        FROM immigration_fact_view iv
        GROUP BY iv.i94cit
        ) AS immigrants_by_country
    JOIN countries_dimension_view cv
        ON immigrants_by_country.i94cit = cv.country_code
    ORDER BY immigrants_by_country.count_immigrants DESC
""").show()

+------+----------------+------------+--------------------+
|i94cit|count_immigrants|country_code|                name|
+------+----------------+------------+--------------------+
| 135.0|           33237|         135|      UNITED KINGDOM|
| 111.0|           20288|         111|              FRANCE|
| 209.0|           12476|         209|               JAPAN|
| 245.0|           11095|         245|          CHINA, PRC|
| 582.0|            9846|         582|MEXICO Air Sea, a...|
| 689.0|            8109|         689|              BRAZIL|
| 117.0|            7867|         117|               ITALY|
| 438.0|            7002|         438|           AUSTRALIA|
| 129.0|            5363|         129|               SPAIN|
| 213.0|            5161|         213|               INDIA|
| 687.0|            4802|         687|          ARGENTINA |
| 123.0|            4387|         123|         NETHERLANDS|
| 130.0|            4055|         130|              SWEDEN|
| 691.0|            3774|         691|  

In [14]:
# Number of immigration records (non-null cicid) per i94port code, joined to the
# city demographics dimension on city_code, largest count first.
#
# GROUP BY iv.i94port already yields exactly one row per i94port, so the
# SELECT DISTINCT that used to wrap the aggregate was redundant and is dropped;
# the result set is unchanged.
spark.sql("""
    SELECT i94port, count_immigrants, City, state_code, total_population, foreign_born
    FROM
        (SELECT
            iv.i94port,
            COUNT(iv.cicid) AS count_immigrants
        FROM immigration_fact_view iv
        GROUP BY iv.i94port
        ) AS immigrants_by_city
    JOIN cities_demographics_dimension_view cv
        ON immigrants_by_city.i94port = cv.city_code
    ORDER BY count_immigrants DESC
""").show()

+-------+----------------+---------------+----------+----------------+------------+
|i94port|count_immigrants|           City|state_code|total_population|foreign_born|
+-------+----------------+---------------+----------+----------------+------------+
|    NYC|           32779|       New York|        NY|        42752025|    16062500|
|    MIA|           22660|          Miami|        FL|         2204945|     1303945|
|    LOS|           20983|    Los Angeles|        CA|        19859480|     7427125|
|    ORL|           12332|        Orlando|        FL|         1354585|      252790|
|    CHI|           11823|        Chicago|        IL|        13602780|     2867315|
|    SFR|           11056|  San Francisco|        CA|         4324080|     1485995|
|    FTL|            8087|Fort Lauderdale|        FL|          892935|      237910|
|    HOU|            7305|        Houston|        TX|        11493140|     3481050|
|    LVG|            7216|      Las Vegas|        NV|         3118845|      