In [5]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())

global azure_account_name
global azure_data_container_name
global azure_user_container_name
global azure_user_token

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"
azure_user_token = r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

In [6]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=4, worker_memory=8, master_memory=8)

25/08/27 14:36:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.executor.instances,4
spark.app.name,rsh224 (notebook)
spark.cores.max,16
spark.kubernetes.executor.podNamePrefix,rsh224-notebook-40f76898e96205a4
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224


In [7]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

In [8]:
directory_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd'

In [9]:
# Load a subset of the last year in daily into Spark from Azure Blob Storage using spark.read.csv

schema = StructType([
    StructField("ID", StringType()),           # Character Station code
    StructField("DATE", StringType()),         # Date Observation date formatted as YYYYMMDD
    StructField("ELEMENT", StringType()),      # Character Element type indicator
    StructField("VALUE", DoubleType()),        # Real Data value for ELEMENT
    StructField("MEASUREMENT", StringType()),  # Character Measurement Flag
    StructField("QUALITY", StringType()),      # Character Quality Flag
    StructField("SOURCE", StringType()),       # Character Source Flag
    StructField("TIME", StringType()),         # Time Observation time formatted as HHMM
])

daily = spark.read.csv(
    path=f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd/daily/',
    schema=schema
)

print(type(daily))
daily.printSchema()
print(daily)
daily.show(100, False)

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- ID: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- ELEMENT: string (nullable = true)
 |-- VALUE: double (nullable = true)
 |-- MEASUREMENT: string (nullable = true)
 |-- QUALITY: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- TIME: string (nullable = true)

DataFrame[ID: string, DATE: string, ELEMENT: string, VALUE: double, MEASUREMENT: string, QUALITY: string, SOURCE: string, TIME: string]


                                                                                

+-----------+--------+-------+------+-----------+-------+------+----+
|ID         |DATE    |ELEMENT|VALUE |MEASUREMENT|QUALITY|SOURCE|TIME|
+-----------+--------+-------+------+-----------+-------+------+----+
|ASN00030019|20100101|PRCP   |24.0  |NULL       |NULL   |a     |NULL|
|ASN00030021|20100101|PRCP   |200.0 |NULL       |NULL   |a     |NULL|
|ASN00030022|20100101|TMAX   |294.0 |NULL       |NULL   |a     |NULL|
|ASN00030022|20100101|TMIN   |215.0 |NULL       |NULL   |a     |NULL|
|ASN00030022|20100101|PRCP   |408.0 |NULL       |NULL   |a     |NULL|
|ASN00029121|20100101|PRCP   |820.0 |NULL       |NULL   |a     |NULL|
|ASN00029126|20100101|TMAX   |371.0 |NULL       |NULL   |S     |NULL|
|ASN00029126|20100101|TMIN   |225.0 |NULL       |NULL   |S     |NULL|
|ASN00029126|20100101|PRCP   |0.0   |NULL       |NULL   |a     |NULL|
|ASN00029126|20100101|TAVG   |298.0 |H          |NULL   |S     |NULL|
|ASN00029127|20100101|TMAX   |371.0 |NULL       |NULL   |a     |NULL|
|ASN00029127|2010010

In [10]:
stations_enriched_path = f'wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/{username}/stations-enriched'

In [11]:
stations_enriched = spark.read.csv(stations_enriched_path, header=True, inferSchema=False)

                                                                                

In [12]:
stations_enriched.show(20, False)

[Stage 2:>                                                          (0 + 1) / 1]

+-----------+----------+------------+--------+---------+---------+--------------------+----+------------+------+------------+----------+------------------------+-----------------+------------------+
|ID         |STATE_CODE|COUNTRY_CODE|LATITUDE|LONGITUDE|ELEVATION|STATION_NAME        |GSN |HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME|STATE_NAME|ELEMENTS                |NUM_CORE_ELEMENTS|NUM_OTHER_ELEMENTS|
+-----------+----------+------------+--------+---------+---------+--------------------+----+------------+------+------------+----------+------------------------+-----------------+------------------+
|AFM00040990|NULL      |AF          |31.5    |65.85    |1010.0   |KANDAHAR AIRPORT    |NULL|NULL        |40990 |Afghanistan |NULL      |TMAX;TMIN;PRCP;SNWD;TAVG|4                |1                 |
|AGE00147712|NULL      |AG          |36.17   |1.34     |112.0    |ORLEANSVILLE (CHLEF)|NULL|NULL        |NULL  |Algeria     |NULL      |TMAX;TMIN;PRCP          |3                |0                 |
|AGE0

                                                                                

In [13]:
inventory_path = f'{directory_path}/ghcnd-inventory.txt'

inventory_txt = spark.read.text(inventory_path)

inventory_txt.show(20, False)

+---------------------------------------------+
|value                                        |
+---------------------------------------------+
|ACW00011604  17.1167  -61.7833 TMAX 1949 1949|
|ACW00011604  17.1167  -61.7833 TMIN 1949 1949|
|ACW00011604  17.1167  -61.7833 PRCP 1949 1949|
|ACW00011604  17.1167  -61.7833 SNOW 1949 1949|
|ACW00011604  17.1167  -61.7833 SNWD 1949 1949|
|ACW00011604  17.1167  -61.7833 PGTM 1949 1949|
|ACW00011604  17.1167  -61.7833 WDFG 1949 1949|
|ACW00011604  17.1167  -61.7833 WSFG 1949 1949|
|ACW00011604  17.1167  -61.7833 WT03 1949 1949|
|ACW00011604  17.1167  -61.7833 WT08 1949 1949|
|ACW00011604  17.1167  -61.7833 WT16 1949 1949|
|ACW00011647  17.1333  -61.7833 TMAX 1961 1961|
|ACW00011647  17.1333  -61.7833 TMIN 1961 1961|
|ACW00011647  17.1333  -61.7833 PRCP 1957 1970|
|ACW00011647  17.1333  -61.7833 SNOW 1957 1970|
|ACW00011647  17.1333  -61.7833 SNWD 1957 1970|
|ACW00011647  17.1333  -61.7833 WT03 1961 1961|
|ACW00011647  17.1333  -61.7833 WT16 196

In [14]:
inventory = inventory_txt.select(
    F.substring('value', 1,11).alias('ID'),
    F.substring('value', 13,8).alias('LATITUDE'),
    F.substring('value', 22,9).alias('LONGITUDE'),
    F.substring('value', 32,4).alias('ELEMENT'),
    F.substring('value', 37,4).alias('FIRST_YEAR'),
    F.substring('value', 42,4).alias('LAST_YEAR'),    
)

inventory.printSchema()
inventory.show(20, False)

root
 |-- ID: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEMENT: string (nullable = true)
 |-- FIRST_YEAR: string (nullable = true)
 |-- LAST_YEAR: string (nullable = true)

+-----------+--------+---------+-------+----------+---------+
|ID         |LATITUDE|LONGITUDE|ELEMENT|FIRST_YEAR|LAST_YEAR|
+-----------+--------+---------+-------+----------+---------+
|ACW00011604| 17.1167| -61.7833|TMAX   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|TMIN   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|PRCP   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|SNOW   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|SNWD   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|PGTM   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WDFG   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WSFG   |1949      |1949     |
|ACW00011604| 17.1167| -61.7833|WT03   |1949      |1949     |
|ACW00011604|

### Answer 1(a)

In [16]:
num_stations_active_25 = inventory.filter('LAST_YEAR == 2025').select('ID').distinct().count()

In [17]:
num_stations_active_25

38481

In [21]:
num_gsn = stations_enriched.filter(F.col('GSN') == 'GSN').select('ID').count()

num_gsn

991

In [22]:
num_hsn = stations_enriched.filter(F.col('HCN_CRN_FLAG') == 'HCN').count()

num_hsn

1218

In [23]:
num_crn = stations_enriched.filter(F.col('HCN_CRN_FLAG') == 'CRN').count()

num_crn

234

In [32]:
num_multiple_networks = stations_enriched.filter(
    F.col('GSN').isNotNull() & F.col('HCN_CRN_FLAG').isNotNull()
).count()

num_multiple_networks

15

### Answer 1(b)

In [33]:
num_stations_south_hemi = stations_enriched.filter(F.col('LATITUDE') < 0).count()

num_stations_south_hemi

25316

In [37]:
num_us_territory = stations_enriched.filter(
    (F.col('COUNTRY_NAME').like('%United States%')) &
    (F.col('COUNTRY_NAME') != 'United States')
).count()

num_us_territory

414

### Answer (c)

In [38]:
countries_path = f'{directory_path}/ghcnd-countries.txt'

countries_txt = spark.read.text(countries_path)

countries = countries_txt.select(
    F.substring('value', 1,2).alias('COUNTRY_CODE'),
    F.substring('value', 4,61).alias('COUNTRY_NAME')
)

countries.printSchema()

countries.show(20, False)

root
 |-- COUNTRY_CODE: string (nullable = true)
 |-- COUNTRY_NAME: string (nullable = true)

+------------+-------------------------------+
|COUNTRY_CODE|COUNTRY_NAME                   |
+------------+-------------------------------+
|AC          |Antigua and Barbuda            |
|AE          |United Arab Emirates           |
|AF          |Afghanistan                    |
|AG          |Algeria                        |
|AJ          |Azerbaijan                     |
|AL          |Albania                        |
|AM          |Armenia                        |
|AO          |Angola                         |
|AQ          |American Samoa [United States] |
|AR          |Argentina                      |
|AS          |Australia                      |
|AU          |Austria                        |
|AY          |Antarctica                     |
|BA          |Bahrain                        |
|BB          |Barbados                       |
|BC          |Botswana                       |
|BD          

In [43]:
stations_count_country = stations_enriched.groupBy('COUNTRY_CODE').agg(F.countDistinct('ID').alias('NUM_STATIONS')).select('COUNTRY_CODE', 'NUM_STATIONS')

stations_count_country.show()

+------------+------------+
|COUNTRY_CODE|NUM_STATIONS|
+------------+------------+
|          TI|          62|
|          MX|        5249|
|          NI|          10|
|          SW|        1721|
|          UG|           8|
|          GM|        1123|
|          TO|          10|
|          HU|          10|
|          NH|           6|
|          RS|        1123|
|          MB|           2|
|          CJ|           1|
|          IV|          21|
|          EG|          23|
|          PS|          13|
|          HO|           8|
|          AR|         101|
|          CG|          13|
|          TL|           1|
|          SU|          28|
+------------+------------+
only showing top 20 rows



In [45]:
countries_with_station_counts = countries.join(stations_count_country, on='COUNTRY_CODE', how='left')

countries_with_station_counts.orderBy(F.desc('NUM_STATIONS')).show()

+------------+--------------------+------------+
|COUNTRY_CODE|        COUNTRY_NAME|NUM_STATIONS|
+------------+--------------------+------------+
|          US|      United States |       75846|
|          AS|          Australia |       17088|
|          CA|             Canada |        9269|
|          BR|              Brazil|        5989|
|          MX|             Mexico |        5249|
|          IN|              India |        3807|
|          SW|             Sweden |        1721|
|          SF|       South Africa |        1166|
|          GM|            Germany |        1123|
|          RS|             Russia |        1123|
|          FI|            Finland |         922|
|          NO|             Norway |         461|
|          NL|        Netherlands |         386|
|          KZ|         Kazakhstan |         329|
|          WA|            Namibia |         283|
|          RQ|Puerto Rico [Unit...|         260|
|          CH|             China  |         228|
|          SP|      

In [55]:
output_path = f'wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net'

In [64]:
output_countries_count_path = f'{output_path}/rsh224/countries_count_stations'
countries_with_station_counts.write.mode('overwrite').option('header', True).csv(output_countries_count_path)

25/08/27 15:45:29 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1


In [65]:
!hdfs dfs -ls {output_path}/rsh224/

Found 3 items
drwxr-xr-x   - rsh224 supergroup          0 2025-08-27 15:45 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/countries_count_stations
-rwxrwxrwx   1                            6 2025-07-09 15:22 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/rsh224
drwxr-xr-x   - rsh224 supergroup          0 2025-08-26 19:41 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations-enriched


In [68]:
!hdfs dfs -ls {output_path}/rsh224/countries_count_stations/

Found 2 items
-rw-r--r--   1 rsh224 supergroup          0 2025-08-27 15:45 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/countries_count_stations/_SUCCESS
-rw-r--r--   1 rsh224 supergroup       4133 2025-08-27 15:45 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/countries_count_stations/part-00000-fe0afcaa-f455-4b1b-9f9f-84f85636a5c2-c000.csv


In [69]:
states_path = f'{directory_path}/ghcnd-states.txt'

states_txt = spark.read.text(states_path)

states = states_txt.select(
    F.substring('value', 1,2).alias('CODE'),
    F.substring('value', 4,47).alias('NAME')
)

states.show(20, False)

+----+-----------------------------------------------+
|CODE|NAME                                           |
+----+-----------------------------------------------+
|AB  |ALBERTA                                        |
|AK  |ALASKA                                         |
|AL  |ALABAMA                                        |
|AR  |ARKANSAS                                       |
|AS  |AMERICAN SAMOA                                 |
|AZ  |ARIZONA                                        |
|BC  |BRITISH COLUMBIA                               |
|CA  |CALIFORNIA                                     |
|CO  |COLORADO                                       |
|CT  |CONNECTICUT                                    |
|DC  |DISTRICT OF COLUMBIA                           |
|DE  |DELAWARE                                       |
|FL  |FLORIDA                                        |
|FM  |MICRONESIA                                     |
|GA  |GEORGIA                                        |
|GU  |GUAM

In [70]:
stations_enriched.printSchema()

root
 |-- ID: string (nullable = true)
 |-- STATE_CODE: string (nullable = true)
 |-- COUNTRY_CODE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- STATION_NAME: string (nullable = true)
 |-- GSN: string (nullable = true)
 |-- HCN_CRN_FLAG: string (nullable = true)
 |-- WMO_ID: string (nullable = true)
 |-- COUNTRY_NAME: string (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- ELEMENTS: string (nullable = true)
 |-- NUM_CORE_ELEMENTS: string (nullable = true)
 |-- NUM_OTHER_ELEMENTS: string (nullable = true)



In [72]:
stations_states = stations_enriched.select('ID', 'STATE_CODE', 'STATE_NAME')

In [75]:
stations_states.select('STATE_CODE').distinct().count()

76

In [76]:
stations_states.select('STATE_NAME').distinct().count()

74

In [78]:
states_stations_count = stations_enriched.filter(
    (F.col('COUNTRY_CODE') == 'US') &
    (F.col('STATE_CODE').isNotNull()) &
    (F.col('STATE_CODE') != '')
).groupBy('STATE_CODE').agg(F.countDistinct('ID').alias('STATIONS_COUNT')).select('STATE_CODE', 'STATIONS_COUNT')

states_stations_count.show()

+----------+--------------+
|STATE_CODE|STATIONS_COUNT|
+----------+--------------+
|        ND|           588|
|        NH|           495|
|        AZ|          1692|
|        NM|          2295|
|        AR|           961|
|        KS|          2401|
|        LA|           849|
|        NY|          1912|
|        UT|           986|
|        WA|          1694|
|        AK|          1049|
|        IA|          1106|
|        IL|          2234|
|        WY|          1348|
|        CT|           434|
|        WV|           552|
|        MT|          1312|
|        SD|          1191|
|        MI|          1494|
|        MS|           673|
+----------+--------------+
only showing top 20 rows



In [81]:
states_enriched = states.withColumnRenamed('CODE', 'STATE_CODE').join(states_stations_count, on='STATE_CODE', how='left').select(
    'STATE_CODE',
    'NAME',
    F.coalesce('STATIONS_COUNT', F.lit(0)).alias('STATIONS_COUNT')
)

states_enriched.show()

+----------+--------------------+--------------+
|STATE_CODE|                NAME|STATIONS_COUNT|
+----------+--------------------+--------------+
|        AB|             ALBERTA|             0|
|        AK|              ALASKA|          1049|
|        AL|ALABAMA          ...|          1151|
|        AR|            ARKANSAS|           961|
|        AS|      AMERICAN SAMOA|             0|
|        AZ|             ARIZONA|          1692|
|        BC|    BRITISH COLUMBIA|             0|
|        CA|          CALIFORNIA|          3166|
|        CO|            COLORADO|          4784|
|        CT|         CONNECTICUT|           434|
|        DC|DISTRICT OF COLUMBIA|            18|
|        DE|            DELAWARE|           148|
|        FL|             FLORIDA|          2244|
|        FM|          MICRONESIA|             0|
|        GA|             GEORGIA|          1407|
|        GU|                GUAM|             0|
|        HI|              HAWAII|           801|
|        IA|        

In [83]:
output_states_count_path = f'{output_path}/rsh224/states_count_stations'
states_enriched.write.mode('overwrite').option('header', True).csv(output_states_count_path)

25/08/27 16:15:41 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1
25/08/27 16:15:42 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1


In [84]:
!hdfs dfs -ls {output_path}/rsh224/

Found 4 items
drwxr-xr-x   - rsh224 supergroup          0 2025-08-27 15:45 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/countries_count_stations
-rwxrwxrwx   1                            6 2025-07-09 15:22 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/rsh224
drwxr-xr-x   - rsh224 supergroup          0 2025-08-27 16:15 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/states_count_stations
drwxr-xr-x   - rsh224 supergroup          0 2025-08-26 19:41 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations-enriched


In [86]:
!hdfs dfs -cat {output_path}/rsh224/states_count_stations/*

STATE_CODE,NAME,STATIONS_COUNT
AB,ALBERTA,0
AK,ALASKA,1049
AL,ALABAMA,1151
AR,ARKANSAS,961
AS,AMERICAN SAMOA,0
AZ,ARIZONA,1692
BC,BRITISH COLUMBIA,0
CA,CALIFORNIA,3166
CO,COLORADO,4784
CT,CONNECTICUT,434
DC,DISTRICT OF COLUMBIA,18
DE,DELAWARE,148
FL,FLORIDA,2244
FM,MICRONESIA,0
GA,GEORGIA,1407
GU,GUAM,0
HI,HAWAII,801
IA,IOWA,1106
ID,IDAHO,846
IL,ILLINOIS,2234
IN,INDIANA,2020
KS,KANSAS,2401
KY,KENTUCKY,1012
LA,LOUISIANA,849
MA,MASSACHUSETTS,852
MB,MANITOBA,0
MD,MARYLAND,759
ME,MAINE,586
MH,MARSHALL ISLANDS,0
MI,MICHIGAN,1494
MN,MINNESOTA,2675
MO,MISSOURI,1624
MP,NORTHERN MARIANA ISLANDS,0
MS,MISSISSIPPI,673
MT,MONTANA,1312
NB,NEW BRUNSWICK,0
NC,NORTH CAROLINA,2747
ND,NORTH DAKOTA,588
NE,NEBRASKA,2436
NH,NEW HAMPSHIRE,495
NJ,NEW JERSEY,870
NL,NEWFOUNDLAND AND LABRADOR,0
NM,NEW MEXICO,2295
NS,NOVA SCOTIA,0
NT,NORTHWEST TERRITORIES,0
NU,NUNAVUT,0
NV,NEVADA,714
NY,NEW YORK,1912
OH,OHIO,1479
OK,OKLAHOMA,1116
ON,ONTARIO,0
OR,OREGON,2031
PA,PENNSYLVANIA,1641
PE,PRINCE EDWARD ISLAND,0
PI,PACIFI

In [21]:
stop_spark()