### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [3]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())

global azure_account_name
global azure_data_container_name
global azure_user_container_name
global azure_user_token

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"
azure_user_token = r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

### Assignment 1 ###

The code below demonstrates how to explore and load the data provided for the assignment from Azure Blob Storage and how to save any outputs that you generate to a separate user container.

**Key points**

- The data provided for the assignment is stored in Azure Blob Storage and outputs that you generate will be stored in Azure Blob Storage as well. Hadoop and Spark can both interact with Azure Blob Storage similar to how they interact with HDFS, but where the replication and distribution is handled by Azure instead. This makes it possible to read or write data in Azure over HTTPS where the path is prefixed by `wasbs://`.
- There are two containers, one for the data which is read only and one for any outputs that you generate,
  - `wasbs://campus-data@madsstorage002.blob.core.windows.net/`
  - `wasbs://campus-user@madsstorage002.blob.core.windows.net/`
- You can use variable interpolation to insert your global username variable into paths automatically.
  - This works for bash commands as well.

In [4]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

25/08/18 21:30:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.executor.instances,4
spark.driver.memory,4g
spark.app.name,rsh224 (notebook)
spark.kubernetes.executor.podNamePrefix,rsh224-notebook-a26ce898bc83cec3
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224


In [5]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

In [6]:
# Use the hdfs command to explore the data in Azure Blob Storage

!hdfs dfs -ls wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd/
!hdfs dfs -ls wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd/daily/

Found 5 items
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/daily
-rwxrwxrwx   1       3659 2025-08-01 21:31 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-countries.txt
-rwxrwxrwx   1   35272064 2025-08-01 21:31 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-inventory.txt
-rwxrwxrwx   1       1086 2025-08-01 21:31 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-states.txt
-rwxrwxrwx   1   11150502 2025-08-01 21:31 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-stations.txt
Found 264 items
-rwxrwxrwx   1    1385743 2025-08-01 21:30 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/daily/1750.csv.gz
-rwxrwxrwx   1       3358 2025-08-01 21:30 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/daily/1763.csv.gz
-rwxrwxrwx   1       3327 2025-08-01 21:30 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/dail

In [7]:
directory_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd'

In [8]:
# Define the input path for the last year in daily

daily_relative_path = f'ghcnd/daily/2025.csv.gz'
daily_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/{daily_relative_path}'

print(daily_path)

wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/daily/2025.csv.gz


In [9]:
# Load a subset of the last year in daily into Spark from Azure Blob Storage using spark.read.csv

schema = StructType([
    StructField("ID", StringType()),           # Character Station code
    StructField("DATE", StringType()),         # Date Observation date formatted as YYYYMMDD
    StructField("ELEMENT", StringType()),      # Character Element type indicator
    StructField("VALUE", DoubleType()),        # Real Data value for ELEMENT
    StructField("MEASUREMENT", StringType()),  # Character Measurement Flag
    StructField("QUALITY", StringType()),      # Character Quality Flag
    StructField("SOURCE", StringType()),       # Character Source Flag
    StructField("TIME", StringType()),         # Time Observation time formatted as HHMM
])

daily = spark.read.csv(
    path=f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd/daily/',
    schema=schema
)

print(type(daily))
daily.printSchema()
print(daily)
daily.show(100, False)

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- ID: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- ELEMENT: string (nullable = true)
 |-- VALUE: double (nullable = true)
 |-- MEASUREMENT: string (nullable = true)
 |-- QUALITY: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- TIME: string (nullable = true)

DataFrame[ID: string, DATE: string, ELEMENT: string, VALUE: double, MEASUREMENT: string, QUALITY: string, SOURCE: string, TIME: string]


                                                                                

+-----------+--------+-------+------+-----------+-------+------+----+
|ID         |DATE    |ELEMENT|VALUE |MEASUREMENT|QUALITY|SOURCE|TIME|
+-----------+--------+-------+------+-----------+-------+------+----+
|ASN00030019|20100101|PRCP   |24.0  |NULL       |NULL   |a     |NULL|
|ASN00030021|20100101|PRCP   |200.0 |NULL       |NULL   |a     |NULL|
|ASN00030022|20100101|TMAX   |294.0 |NULL       |NULL   |a     |NULL|
|ASN00030022|20100101|TMIN   |215.0 |NULL       |NULL   |a     |NULL|
|ASN00030022|20100101|PRCP   |408.0 |NULL       |NULL   |a     |NULL|
|ASN00029121|20100101|PRCP   |820.0 |NULL       |NULL   |a     |NULL|
|ASN00029126|20100101|TMAX   |371.0 |NULL       |NULL   |S     |NULL|
|ASN00029126|20100101|TMIN   |225.0 |NULL       |NULL   |S     |NULL|
|ASN00029126|20100101|PRCP   |0.0   |NULL       |NULL   |a     |NULL|
|ASN00029126|20100101|TAVG   |298.0 |H          |NULL   |S     |NULL|
|ASN00029127|20100101|TMAX   |371.0 |NULL       |NULL   |a     |NULL|
|ASN00029127|2010010

In [10]:
daily = daily.withColumn(
    'DATETIME',
    F.to_timestamp(
        F.concat_ws(
            " ",
            F.col('DATE'),
            F.when(F.col('TIME') == '2400', F.lit('0000')).otherwise(F.coalesce(F.col('TIME'), F.lit('0000')))),
        'yyyyMMdd HHmm'
    )
)

daily = daily.withColumn('DATE', F.to_date(F.col('DATE'), 'yyyyMMdd'))

daily.show(20, False)

[Stage 1:>                                                          (0 + 1) / 1]

+-----------+----------+-------+-----+-----------+-------+------+----+-------------------+
|ID         |DATE      |ELEMENT|VALUE|MEASUREMENT|QUALITY|SOURCE|TIME|DATETIME           |
+-----------+----------+-------+-----+-----------+-------+------+----+-------------------+
|ASN00030019|2010-01-01|PRCP   |24.0 |NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030021|2010-01-01|PRCP   |200.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030022|2010-01-01|TMAX   |294.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030022|2010-01-01|TMIN   |215.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030022|2010-01-01|PRCP   |408.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00029121|2010-01-01|PRCP   |820.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00029126|2010-01-01|TMAX   |371.0|NULL       |NULL   |S     |NULL|2010-01-01 00:00:00|
|ASN00029126|2010-01-01|TMIN   |225.0|NULL       |NULL   |S     |NULL|2010-01-01 00:00:00|

                                                                                

## question 2(b)

get the last file containing the latest data file

In [11]:
!hdfs dfs -ls {directory_path}/daily | tail -n 1

-rwxrwxrwx   1   85656432 2025-08-01 21:30 wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/daily/2025.csv.gz


In [12]:
daily_latest = spark.read.csv(
    path=f'{directory_path}/daily/2025.csv.gz',
    schema = schema,
)

daily_latest.show(20, False)

+-----------+--------+-------+-----+-----------+-------+------+----+
|ID         |DATE    |ELEMENT|VALUE|MEASUREMENT|QUALITY|SOURCE|TIME|
+-----------+--------+-------+-----+-----------+-------+------+----+
|ASN00030019|20250101|PRCP   |0.0  |NULL       |NULL   |a     |NULL|
|ASN00030021|20250101|PRCP   |0.0  |NULL       |NULL   |a     |NULL|
|ASN00030022|20250101|TMAX   |414.0|NULL       |NULL   |a     |NULL|
|ASN00030022|20250101|TMIN   |247.0|NULL       |NULL   |a     |NULL|
|ASN00030022|20250101|PRCP   |0.0  |NULL       |NULL   |a     |NULL|
|ASN00030025|20250101|PRCP   |0.0  |NULL       |NULL   |a     |NULL|
|ASN00029118|20250101|PRCP   |0.0  |NULL       |NULL   |a     |NULL|
|ASN00029121|20250101|PRCP   |0.0  |NULL       |NULL   |a     |NULL|
|ASN00029126|20250101|TMAX   |414.0|NULL       |NULL   |S     |NULL|
|ASN00029126|20250101|TMIN   |198.0|NULL       |NULL   |S     |NULL|
|ASN00029126|20250101|PRCP   |0.0  |NULL       |NULL   |a     |NULL|
|ASN00029126|20250101|TAVG   |321.

In [13]:
# Define the input path for stations

stations_relative_path = f'ghcnd/ghcnd-stations.txt'
stations_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/{stations_relative_path}'

print(stations_path)

wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-stations.txt


In [14]:
# Load the stations metadata into Spark from Azure Blob Storage using spark.read.text without any other processing

stations_txt = spark.read.text(stations_path).limit(1000)

print(type(stations_txt))
stations_txt.printSchema()
print(stations_txt)
stations_txt.show(20, False)

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- value: string (nullable = true)

DataFrame[value: string]
+-------------------------------------------------------------------------------------+
|value                                                                                |
+-------------------------------------------------------------------------------------+
|ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       |
|ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    |
|AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196|
|AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194|
|AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217|
|AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218|
|AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930|
|AFM000409

In [15]:
stations = stations_txt.select(
    F.substring('value', 1,11).alias("ID"),
    F.substring('value', 13,8).cast("double").alias("LATITUDE"),
    F.substring('value', 22,9).cast("double").alias("LONGITUDE"),
    F.substring('value', 32,6).cast("double").alias("ELEVATION"),
    F.substring('value', 39,2).alias("STATE"),
    F.substring('value', 42,30).alias("NAME"),
    F.substring('value', 73,3).alias("GSN"),
    F.substring('value', 77,3).alias("HCN_CRN_FLAG"),
    F.substring('value', 81,5).cast("int").alias("WMO_ID"),
)

stations.printSchema()
print(stations)
stations.show(20, False)

root
 |-- ID: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- STATE: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- GSN: string (nullable = true)
 |-- HCN_CRN_FLAG: string (nullable = true)
 |-- WMO_ID: integer (nullable = true)

DataFrame[ID: string, LATITUDE: double, LONGITUDE: double, ELEVATION: double, STATE: string, NAME: string, GSN: string, HCN_CRN_FLAG: string, WMO_ID: int]


[Stage 4:>                                                          (0 + 1) / 1]

+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+
|ID         |LATITUDE|LONGITUDE|ELEVATION|STATE|NAME                          |GSN|HCN_CRN_FLAG|WMO_ID|
+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+
|ACW00011604|17.1167 |-61.7833 |10.1     |     |ST JOHNS COOLIDGE FLD         |   |            |NULL  |
|ACW00011647|17.1333 |-61.7833 |19.2     |     |ST JOHNS                      |   |            |NULL  |
|AE000041196|25.333  |55.517   |34.0     |     |SHARJAH INTER. AIRP           |GSN|            |41196 |
|AEM00041194|25.255  |55.364   |10.4     |     |DUBAI INTL                    |   |            |41194 |
|AEM00041217|24.433  |54.651   |26.8     |     |ABU DHABI INTL                |   |            |41217 |
|AEM00041218|24.262  |55.609   |264.9    |     |AL AIN INTL                   |   |            |41218 |
|AF000040930|35.317  |69.017   |3366.0   |     |NORTH-SALANG    

                                                                                

In [16]:
countries_path = f'{directory_path}/ghcnd-countries.txt'

countries_txt = spark.read.text(countries_path)

countries_txt.printSchema()

countries_txt.show(20, False)

root
 |-- value: string (nullable = true)

+----------------------------------+
|value                             |
+----------------------------------+
|AC Antigua and Barbuda            |
|AE United Arab Emirates           |
|AF Afghanistan                    |
|AG Algeria                        |
|AJ Azerbaijan                     |
|AL Albania                        |
|AM Armenia                        |
|AO Angola                         |
|AQ American Samoa [United States] |
|AR Argentina                      |
|AS Australia                      |
|AU Austria                        |
|AY Antarctica                     |
|BA Bahrain                        |
|BB Barbados                       |
|BC Botswana                       |
|BD Bermuda [United Kingdom]       |
|BE Belgium                        |
|BF Bahamas, The                   |
|BG Bangladesh                     |
+----------------------------------+
only showing top 20 rows



In [17]:
countries = countries_txt.select(
    F.substring('value', 1,2).alias('COUNTRY_CODE'),
    F.substring('value', 4,61).alias('COUNTRY_NAME')
)

countries.printSchema()

countries.show(20, False)

root
 |-- COUNTRY_CODE: string (nullable = true)
 |-- COUNTRY_NAME: string (nullable = true)

+------------+-------------------------------+
|COUNTRY_CODE|COUNTRY_NAME                   |
+------------+-------------------------------+
|AC          |Antigua and Barbuda            |
|AE          |United Arab Emirates           |
|AF          |Afghanistan                    |
|AG          |Algeria                        |
|AJ          |Azerbaijan                     |
|AL          |Albania                        |
|AM          |Armenia                        |
|AO          |Angola                         |
|AQ          |American Samoa [United States] |
|AR          |Argentina                      |
|AS          |Australia                      |
|AU          |Austria                        |
|AY          |Antarctica                     |
|BA          |Bahrain                        |
|BB          |Barbados                       |
|BC          |Botswana                       |
|BD          

In [18]:
states_path = f'{directory_path}/ghcnd-states.txt'

states_txt = spark.read.text(states_path).limit(1000)

states_txt.show(20, False)

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------------------------------------+
|value                                             |
+--------------------------------------------------+
|AB ALBERTA                                        |
|AK ALASKA                                         |
|AL ALABAMA                                        |
|AR ARKANSAS                                       |
|AS AMERICAN SAMOA                                 |
|AZ ARIZONA                                        |
|BC BRITISH COLUMBIA                               |
|CA CALIFORNIA                                     |
|CO COLORADO                                       |
|CT CONNECTICUT                                    |
|DC DISTRICT OF COLUMBIA                           |
|DE DELAWARE                                       |
|FL FLORIDA                                        |
|FM MICRONESIA                                     |
|GA GEORGIA                                        |
|GU GUAM                                      

                                                                                

In [19]:
states = states_txt.select(
    F.substring('value', 1,2).alias('CODE'),
    F.substring('value', 4,47).alias('NAME')
)

states.printSchema()
states.show(20, False)

root
 |-- CODE: string (nullable = true)
 |-- NAME: string (nullable = true)

+----+-----------------------------------------------+
|CODE|NAME                                           |
+----+-----------------------------------------------+
|AB  |ALBERTA                                        |
|AK  |ALASKA                                         |
|AL  |ALABAMA                                        |
|AR  |ARKANSAS                                       |
|AS  |AMERICAN SAMOA                                 |
|AZ  |ARIZONA                                        |
|BC  |BRITISH COLUMBIA                               |
|CA  |CALIFORNIA                                     |
|CO  |COLORADO                                       |
|CT  |CONNECTICUT                                    |
|DC  |DISTRICT OF COLUMBIA                           |
|DE  |DELAWARE                                       |
|FL  |FLORIDA                                        |
|FM  |MICRONESIA                          

In [20]:
inventory_path = f'{directory_path}/ghcnd-inventory.txt'

inventory_txt = spark.read.text(inventory_path)

inventory_txt.show(20, False)

+---------------------------------------------+
|value                                        |
+---------------------------------------------+
|ACW00011604  17.1167  -61.7833 TMAX 1949 1949|
|ACW00011604  17.1167  -61.7833 TMIN 1949 1949|
|ACW00011604  17.1167  -61.7833 PRCP 1949 1949|
|ACW00011604  17.1167  -61.7833 SNOW 1949 1949|
|ACW00011604  17.1167  -61.7833 SNWD 1949 1949|
|ACW00011604  17.1167  -61.7833 PGTM 1949 1949|
|ACW00011604  17.1167  -61.7833 WDFG 1949 1949|
|ACW00011604  17.1167  -61.7833 WSFG 1949 1949|
|ACW00011604  17.1167  -61.7833 WT03 1949 1949|
|ACW00011604  17.1167  -61.7833 WT08 1949 1949|
|ACW00011604  17.1167  -61.7833 WT16 1949 1949|
|ACW00011647  17.1333  -61.7833 TMAX 1961 1961|
|ACW00011647  17.1333  -61.7833 TMIN 1961 1961|
|ACW00011647  17.1333  -61.7833 PRCP 1957 1970|
|ACW00011647  17.1333  -61.7833 SNOW 1957 1970|
|ACW00011647  17.1333  -61.7833 SNWD 1957 1970|
|ACW00011647  17.1333  -61.7833 WT03 1961 1961|
|ACW00011647  17.1333  -61.7833 WT16 196

In [71]:
inventory = inventory_txt.select(
    F.substring('value', 1,11).alias('ID'),
    F.substring('value', 13,8).alias('LATITUDE'),
    F.substring('value', 22,9).alias('LONGITUDE'),
    F.substring('value', 32,4).alias('ELEMENT'),
    F.substring('value', 37,4).alias('FIRST_YEAR'),
    F.substring('value', 42,4).alias('LAST_YEAR'),    
)

inventory.printSchema()
inventory.show(20, False)

root
 |-- ID: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEMENT: string (nullable = true)
 |-- FIRST_YEAR: string (nullable = true)
 |-- LAST_YEAR: string (nullable = true)

+-----------+--------+---------+-------+----------+---------+
|ID         |LATITUDE|LONGITUDE|ELEMENT|FIRST_YEAR|LAST_YEAR|
+-----------+--------+---------+-------+----------+---------+
|ACW00011604| 17.1167| -61.7833|TMAX   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|TMIN   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|PRCP   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|SNOW   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|SNWD   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|PGTM   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WDFG   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WSFG   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WT03   |1949      |1949     |
|ACW00011604|

In [72]:
print('daily:')
daily.show(5,False)
print('stations:')
stations.show(5,False)
print('countries:')
countries.show(5,False)
print('states:')
states.show(5,False)
print('inventory:')
inventory.show(5,False)

daily:
+-----------+----------+-------+-----+-----------+-------+------+----+-------------------+
|ID         |DATE      |ELEMENT|VALUE|MEASUREMENT|QUALITY|SOURCE|TIME|DATETIME           |
+-----------+----------+-------+-----+-----------+-------+------+----+-------------------+
|ASN00030019|2010-01-01|PRCP   |24.0 |NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030021|2010-01-01|PRCP   |200.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030022|2010-01-01|TMAX   |294.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030022|2010-01-01|TMIN   |215.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
|ASN00030022|2010-01-01|PRCP   |408.0|NULL       |NULL   |a     |NULL|2010-01-01 00:00:00|
+-----------+----------+-------+-----+-----------+-------+------+----+-------------------+
only showing top 5 rows

stations:
+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+------------+
|ID         |LATITUDE|

In [73]:
print(f'Inventory: {inventory.count()}')
print(f'Stations: {stations.count()}')
print(f'Countries: {countries.count()}')
print(f'States: {states.count()}')

Inventory: 766784
Stations: 1000
Countries: 219
States: 74


In [74]:
# print(f'daily row count: {daily.count()}')

# Question 3

In [75]:
stations.show(20, False)

+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+------------+
|ID         |LATITUDE|LONGITUDE|ELEVATION|STATE|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_CODE|
+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+------------+
|ACW00011604|17.1167 |-61.7833 |10.1     |     |ST JOHNS COOLIDGE FLD         |   |            |NULL  |AC          |
|ACW00011647|17.1333 |-61.7833 |19.2     |     |ST JOHNS                      |   |            |NULL  |AC          |
|AE000041196|25.333  |55.517   |34.0     |     |SHARJAH INTER. AIRP           |GSN|            |41196 |AE          |
|AEM00041194|25.255  |55.364   |10.4     |     |DUBAI INTL                    |   |            |41194 |AE          |
|AEM00041217|24.433  |54.651   |26.8     |     |ABU DHABI INTL                |   |            |41217 |AE          |
|AEM00041218|24.262  |55.609   |264.9    |     |AL AIN INTL     

In [76]:
stations = stations.withColumn(
    'COUNTRY_CODE',
    F.substring('ID', 1, 2)
).withColumnRenamed('NAME', 'STATION_NAME')

stations.show(5, False)

+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+------------+
|ID         |LATITUDE|LONGITUDE|ELEVATION|STATE|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_CODE|
+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+------------+
|ACW00011604|17.1167 |-61.7833 |10.1     |     |ST JOHNS COOLIDGE FLD         |   |            |NULL  |AC          |
|ACW00011647|17.1333 |-61.7833 |19.2     |     |ST JOHNS                      |   |            |NULL  |AC          |
|AE000041196|25.333  |55.517   |34.0     |     |SHARJAH INTER. AIRP           |GSN|            |41196 |AE          |
|AEM00041194|25.255  |55.364   |10.4     |     |DUBAI INTL                    |   |            |41194 |AE          |
|AEM00041217|24.433  |54.651   |26.8     |     |ABU DHABI INTL                |   |            |41217 |AE          |
+-----------+--------+---------+---------+-----+----------------

In [77]:
countries.show(5, False)

+------------+---------------------+
|COUNTRY_CODE|COUNTRY_NAME         |
+------------+---------------------+
|AC          |Antigua and Barbuda  |
|AE          |United Arab Emirates |
|AF          |Afghanistan          |
|AG          |Algeria              |
|AJ          |Azerbaijan           |
+------------+---------------------+
only showing top 5 rows



In [78]:
stations_with_country = stations.join(
    countries,
    on="COUNTRY_CODE",
    how="left"
)

stations_with_country.orderBy(F.rand()).show(20, False)

+------------+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+------------+
|COUNTRY_CODE|ID         |LATITUDE|LONGITUDE|ELEVATION|STATE|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME|
+------------+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+------------+
|AR          |AR000000008|-29.783 |-57.983  |70.0     |     |CURUZU CUATIA AERO            |   |            |87286 |Argentina   |
|AS          |ASN00001024|-15.9572|127.0628 |300.0    |     |ELLENBRAE                     |   |            |NULL  |Australia   |
|AS          |ASN00003051|-16.7372|125.9086 |401.0    |     |MOUNT BARNETT                 |   |            |NULL  |Australia   |
|AJ          |AJ000037605|41.3    |45.6     |440.0    |     |BOGDANOVKA                    |   |            |37605 |Azerbaijan  |
|AS          |ASN00007127|-28.6   |116.3    |-999.9   |     |GULLEWA HOUSE                

In [79]:
stations_with_country.filter((F.col('STATE').isNotNull()) & (F.col('STATE') != '')).show(20, False)

+------------+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+---------------------+
|COUNTRY_CODE|ID         |LATITUDE|LONGITUDE|ELEVATION|STATE|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME         |
+------------+-----------+--------+---------+---------+-----+------------------------------+---+------------+------+---------------------+
|AC          |ACW00011604|17.1167 |-61.7833 |10.1     |     |ST JOHNS COOLIDGE FLD         |   |            |NULL  |Antigua and Barbuda  |
|AC          |ACW00011647|17.1333 |-61.7833 |19.2     |     |ST JOHNS                      |   |            |NULL  |Antigua and Barbuda  |
|AE          |AE000041196|25.333  |55.517   |34.0     |     |SHARJAH INTER. AIRP           |GSN|            |41196 |United Arab Emirates |
|AE          |AEM00041194|25.255  |55.364   |10.4     |     |DUBAI INTL                    |   |            |41194 |United Arab Emirates |
|AE          |AEM00041217|2

In [80]:
states.orderBy(F.rand()).show(5, False)

+----+---------------------------+
|CODE|NAME                       |
+----+---------------------------+
|ME  |MAINE                      |
|UM  |U.S. MINOR OUTLYING ISLANDS|
|RI  |RHODE ISLAND               |
|IN  |INDIANA                    |
|FM  |MICRONESIA                 |
+----+---------------------------+
only showing top 5 rows



In [81]:
stations_country_states = stations_with_country.withColumnRenamed('STATE', 'STATE_CODE').join(
        states.withColumnRenamed('CODE', 'STATE_CODE').withColumnRenamed('NAME', 'STATE_NAME'),
        on="STATE_CODE",
        how="left"
    )

stations_country_states.filter(F.col('STATE_CODE') != "").show(5, False)

+----------+------------+-----------+--------+---------+---------+------------------------------+---+------------+------+--------------+----------+
|STATE_CODE|COUNTRY_CODE|ID         |LATITUDE|LONGITUDE|ELEVATION|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME  |STATE_NAME|
+----------+------------+-----------+--------+---------+---------+------------------------------+---+------------+------+--------------+----------+
|VA        |US          |US1VAAM0001|37.3685 |-78.2274 |115.2    |RICE 7.3 NE                   |   |            |NULL  |United States |VIRGINIA  |
|VA        |US          |US1VAAM0002|37.3009 |-78.1609 |118.0    |JETERSVILLE 3.6 W             |   |            |NULL  |United States |VIRGINIA  |
|VA        |US          |US1VAAP0001|37.4083 |-78.9638 |219.5    |CONCORD 4.6 NNE               |   |            |NULL  |United States |VIRGINIA  |
|VA        |US          |US1VAAP0004|37.2836 |-78.667  |205.4    |PAMPLIN 1.6 NNE               |   |           

In [99]:
stations_country_states.orderBy(F.rand()).show(20, False)

+----------+------------+-----------+--------+---------+---------+------------------------------+---+------------+------+-------------------------------+--------------+
|STATE_CODE|COUNTRY_CODE|ID         |LATITUDE|LONGITUDE|ELEVATION|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME                   |STATE_NAME    |
+----------+------------+-----------+--------+---------+---------+------------------------------+---+------------+------+-------------------------------+--------------+
|          |AR          |ARM00087178|-27.386 |-55.971  |131.1    |POSADAS                       |GSN|            |87178 |Argentina                      |NULL          |
|          |AR          |ARM00087582|-34.559 |-58.416  |5.5      |AEROPARQUE JORGE NEWBERY      |   |            |87582 |Argentina                      |NULL          |
|          |AO          |AO000066422|-15.2   |12.15    |45.0     |MOCAMEDES                     |GSN|            |66422 |Angola                         |NU

In [83]:
inventory.show(20, False)

+-----------+--------+---------+-------+----------+---------+
|ID         |LATITUDE|LONGITUDE|ELEMENT|FIRST_YEAR|LAST_YEAR|
+-----------+--------+---------+-------+----------+---------+
|ACW00011604| 17.1167| -61.7833|TMAX   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|TMIN   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|PRCP   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|SNOW   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|SNWD   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|PGTM   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WDFG   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WSFG   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WT03   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WT08   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WT16   |1949      |1949     |
|ACW00011647| 17.1333| -61.7833|TMAX   |1961      |1961     |
|ACW00011647| 17.1333| -61.7833|TMIN   |1961      |1961     |
|ACW0001

In [84]:
station_service_life = inventory.groupBy('ID').agg(
    F.min('FIRST_YEAR').alias('FIRST_YEAR'),
    F.min('LAST_YEAR').alias('LAST_YEAR')
)

station_service_life.show(20, False)

+-----------+----------+---------+
|ID         |FIRST_YEAR|LAST_YEAR|
+-----------+----------+---------+
|ACW00011604|1949      |1949     |
|ACW00011647|1957      |1961     |
|AE000041196|1944      |2025     |
|AEM00041194|1983      |2025     |
|AEM00041217|1983      |2025     |
|AEM00041218|1994      |2025     |
|AF000040930|1973      |1988     |
|AFM00040938|1973      |2020     |
|AFM00040948|1966      |2018     |
|AFM00040990|1973      |2020     |
|AG000060390|1940      |2005     |
|AG000060590|1892      |2025     |
|AG000060611|1958      |2004     |
|AG000060680|1940      |2004     |
|AGE00135039|1852      |1966     |
|AGE00147704|1909      |1937     |
|AGE00147705|1877      |1938     |
|AGE00147706|1893      |1920     |
|AGE00147707|1878      |1879     |
|AGE00147708|1879      |2005     |
+-----------+----------+---------+
only showing top 20 rows



In [85]:
inventory.groupBy('ID').agg(
    F.countDistinct('ELEMENT').alias('COUNT_ELEMENTS')
).orderBy(F.col('COUNT_ELEMENTS').desc()).show(20, False)

+-----------+--------------+
|ID         |COUNT_ELEMENTS|
+-----------+--------------+
|USW00013880|70            |
|USW00014607|70            |
|USW00023066|67            |
|USW00013958|66            |
|USW00093817|65            |
|USW00024121|65            |
|USW00093058|65            |
|USW00014944|64            |
|USW00024156|63            |
|USW00024157|63            |
|USW00024127|63            |
|USW00094849|61            |
|USW00094908|61            |
|USW00025309|61            |
|USW00014914|61            |
|USW00093822|61            |
|USW00013722|61            |
|USW00026510|61            |
|USW00003822|60            |
|USW00003813|60            |
+-----------+--------------+
only showing top 20 rows



### Question 3(d)

In [86]:
def get_element_counts(elements):
    """
    Returns dict with count of core and other elements

    Parameters
    ----------
    elements: list of strings

    Returns
    -------
    dict { "num_core": number, "num_other": number }    
    
    """    
    
    CORE_ELEMENTS = {'PRCP','SNOW','SNWD','TMAX','TMIN'}   

    elements_set = set(elements)
    
    return {
        "num_core": len(elements_set & CORE_ELEMENTS),
        "num_other": len(elements_set - CORE_ELEMENTS)
    }

get_element_counts_udf = F.udf(get_element_counts, StructType([
    StructField("num_core", IntegerType()),
    StructField("num_other", IntegerType())
]))


In [87]:
inventory_with_counts = (
    inventory.groupBy('ID')
    .agg(F.collect_list('ELEMENT').alias('ELEMENTS'))
    .withColumn('ELEMENT_COUNTS', get_element_counts_udf('ELEMENTS'))
    .select(
        'ID',
        'ELEMENTS',
        F.col('ELEMENT_COUNTS.num_core').alias('NUM_CORE_ELEMENTS'),
        F.col('ELEMENT_COUNTS.num_other').alias('NUM_OTHER_ELEMENTS')
    )
)

inventory_with_counts.show(20, False)

+-----------+------------------------------+-----------------+------------------+
|ID         |ELEMENTS                      |NUM_CORE_ELEMENTS|NUM_OTHER_ELEMENTS|
+-----------+------------------------------+-----------------+------------------+
|AE000041196|[TMAX, TMIN, PRCP, TAVG]      |3                |1                 |
|AEM00041218|[TMAX, TMIN, PRCP, TAVG]      |3                |1                 |
|AF000040930|[TMAX, TMIN, PRCP, SNWD, TAVG]|4                |1                 |
|AG000060590|[TMAX, TMIN, PRCP, TAVG]      |3                |1                 |
|AGE00147704|[TMAX, TMIN, PRCP]            |3                |0                 |
|AGE00147705|[TMAX, TMIN, PRCP]            |3                |0                 |
|AGE00147706|[TMAX, TMIN, PRCP]            |3                |0                 |
|AGE00147708|[TMAX, TMIN, PRCP, SNWD, TAVG]|4                |1                 |
|AGE00147709|[TMAX, TMIN, PRCP]            |3                |0                 |
|AGE00147714|[TM

 How many stations collect all five core elements?

In [88]:
inventory_with_counts.printSchema()

root
 |-- ID: string (nullable = true)
 |-- ELEMENTS: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- NUM_CORE_ELEMENTS: integer (nullable = true)
 |-- NUM_OTHER_ELEMENTS: integer (nullable = true)



In [94]:
inventory_with_counts.filter(F.col('NUM_CORE_ELEMENTS') == 5).count()

20504

How many collect only precipitation and no other elements?

In [97]:
inventory_with_counts.filter(
    (F.size('ELEMENTS') == 1) &
    (F.array_contains(F.col('ELEMENTS'), 'PRCP'))
).count()

16267

## Question 3(e)

In [105]:
station_enriched = stations_country_states.join(
    inventory_with_counts,
    on='ID',
    how='left'
)

station_enriched.show(20, False)

+-----------+----------+------------+--------+---------+---------+------------------------------+---+------------+------+---------------------+----------+------------------------------------------+-----------------+------------------+
|ID         |STATE_CODE|COUNTRY_CODE|LATITUDE|LONGITUDE|ELEVATION|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME         |STATE_NAME|ELEMENTS                                  |NUM_CORE_ELEMENTS|NUM_OTHER_ELEMENTS|
+-----------+----------+------------+--------+---------+---------+------------------------------+---+------------+------+---------------------+----------+------------------------------------------+-----------------+------------------+
|AE000041196|          |AE          |25.333  |55.517   |34.0     |SHARJAH INTER. AIRP           |GSN|            |41196 |United Arab Emirates |NULL      |[TMAX, TMIN, PRCP, TAVG]                  |3                |1                 |
|AEM00041218|          |AE          |24.262  |55.609   |264.

In [106]:
station_enriched.printSchema()

root
 |-- ID: string (nullable = true)
 |-- STATE_CODE: string (nullable = true)
 |-- COUNTRY_CODE: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- STATION_NAME: string (nullable = true)
 |-- GSN: string (nullable = true)
 |-- HCN_CRN_FLAG: string (nullable = true)
 |-- WMO_ID: integer (nullable = true)
 |-- COUNTRY_NAME: string (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- ELEMENTS: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- NUM_CORE_ELEMENTS: integer (nullable = true)
 |-- NUM_OTHER_ELEMENTS: integer (nullable = true)



In [107]:
station_enriched.count()

1000

converting array of strings in 'ELEMENTS' columns to string like 'TMAX;TMIN;PRCP' to allow storing in csv format

In [113]:
station_enriched_formatted = station_enriched.withColumn(
    "ELEMENTS",
    F.concat_ws(';', F.col('ELEMENTS'))
)

station_enriched_formatted.show(20, False)

+-----------+----------+------------+--------+---------+---------+------------------------------+---+------------+------+---------------------+----------+----------------------------------+-----------------+------------------+
|ID         |STATE_CODE|COUNTRY_CODE|LATITUDE|LONGITUDE|ELEVATION|STATION_NAME                  |GSN|HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME         |STATE_NAME|ELEMENTS                          |NUM_CORE_ELEMENTS|NUM_OTHER_ELEMENTS|
+-----------+----------+------------+--------+---------+---------+------------------------------+---+------------+------+---------------------+----------+----------------------------------+-----------------+------------------+
|AE000041196|          |AE          |25.333  |55.517   |34.0     |SHARJAH INTER. AIRP           |GSN|            |41196 |United Arab Emirates |NULL      |TMAX;TMIN;PRCP;TAVG               |3                |1                 |
|AEM00041218|          |AE          |24.262  |55.609   |264.9    |AL AIN INTL               

In [109]:
# Define an output path as an exmaple

output_relative_path = f'{username}/stations-enriched'
output_path = f'wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/{output_relative_path}'

print(output_path)

wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations-enriched


In [117]:
# Save the stations metadata to Azure Blob Storage from Spark

station_enriched_formatted.write.mode("overwrite").option('header', True).csv(output_path)

25/08/19 00:11:57 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1
25/08/19 00:11:58 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1


In [118]:
# Use the hdfs command to explore the data in Azure Blob Storage

!hdfs dfs -ls wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/{username}/stations-enriched/

Found 3 items
-rw-r--r--   1 rsh224 supergroup          0 2025-08-19 00:11 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations-enriched/_SUCCESS
-rw-r--r--   1 rsh224 supergroup      30449 2025-08-19 00:11 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations-enriched/part-00000-00b83bb9-4c39-4a36-be86-07379435497b-c000.csv
-rw-r--r--   1 rsh224 supergroup      59050 2025-08-19 00:11 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations-enriched/part-00001-00b83bb9-4c39-4a36-be86-07379435497b-c000.csv


In [119]:

data = spark.read.csv(output_path, header=True, inferSchema=True)

data.show(20, False)

[Stage 329:>                                                        (0 + 2) / 2]

+-----------+----------+------------+--------+---------+---------+-------------------+----+------------+------+------------+----------+-----------------------------+-----------------+------------------+
|ID         |STATE_CODE|COUNTRY_CODE|LATITUDE|LONGITUDE|ELEVATION|STATION_NAME       |GSN |HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME|STATE_NAME|ELEMENTS                     |NUM_CORE_ELEMENTS|NUM_OTHER_ELEMENTS|
+-----------+----------+------------+--------+---------+---------+-------------------+----+------------+------+------------+----------+-----------------------------+-----------------+------------------+
|RSM00030846|NULL      |RS          |51.35   |112.467  |743.0    |ULETY              |NULL|NULL        |30846 |Russia      |NULL      |TMAX;TMIN;PRCP;SNWD;TAVG     |4                |1                 |
|RSM00030853|NULL      |RS          |51.633  |114.317  |616.0    |KARYMSKAJA         |NULL|NULL        |30853 |Russia      |NULL      |TMAX;TMIN;PRCP;SNWD;TAVG     |4                |1    

                                                                                

In [121]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()

25/08/19 00:13:24 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
