In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Get username with any domain information removed.
    """

    return re.sub('@.*', '', getpass.getuser())


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    user = username()
    
    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", f"{worker_memory}g")
        .config("spark.driver.memory", f"{master_memory}g")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))


# Print function docstrings

help(start_spark)
help(stop_spark)
help(display_spark)
help(show_as_html)

Help on function start_spark in module __main__:

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)
    Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)

Help on function stop_spark in module __main__:

stop_spark()
    Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).

Help on function display_spark in module __main__:

display_spark()
    Display the status of the active Spark session if one is currently running.

Help on function show_as_html in module __main__:

show_as_html(df, n=20)
    Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n 

In [2]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=4, worker_memory=8, master_memory=4)

0,1
spark.ui.port,4821
spark.sql.shuffle.partitions,64
spark.executor.instances,4
spark.dynamicAllocation.enabled,false
spark.driver.memory,4g
spark.app.startTime,1682571717023
spark.master,spark://masternode2:7077
spark.cores.max,16
spark.executor.id,driver
spark.sql.warehouse.dir,file:/users/home/mda205/spark-warehouse


In [3]:
# Write your imports and code here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
import textwrap
import glob
import math as m
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import os
import pandas as pd
import shapely
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

  shapely_geos_version, geos_capi_version_string


In [4]:
def savefile(filename, tablename):

    name = "mda205"

    data_path = f"hdfs:///user/{name}/outputs/{filename}/"


    tablename.write.mode("overwrite").csv(data_path)
    
    (tablename.write
        .option("compression", "gzip")
        .mode("overwrite")
        .csv(data_path))

In [5]:
schema_stations = StructType([
    StructField("Station_ID", StringType(), True),
    StructField("State_Code", StringType(), True),
    StructField("Country_Code", StringType(), True),
    StructField("Lat", DoubleType(), True),
    StructField("Lon", DoubleType(), True),
    StructField("Elevation", IntegerType(), True),
    StructField("Station_Name", StringType(), True),
    StructField("GSN_flag", StringType(), True),
    StructField("HCN_flag", StringType(), True),
    StructField("WMO_ID", IntegerType(), True),
    StructField("Country_Name", StringType(), True),
    StructField("State_Name", StringType(), True),
    StructField("Station_Elements", StringType(), True),
    StructField("FirstYear", IntegerType(), True),
    StructField("LastYear", IntegerType(), True),
    StructField("Element_Count", IntegerType(), True),
    StructField("Core_Element_Count", IntegerType(), True),
    StructField("Not_Core_Elements", IntegerType(), True)
])

stations=(
    spark.read.format("com.databricks.spark.csv")
    .option("header", "False")
    .option("inferSchema", "False")
    .schema(schema_stations)
    .load("hdfs:///user/mda205/outputs/no_array_stations"))

schema_daily = StructType([
    StructField("Station_ID", StringType(), True),
    StructField("DATE", DateType(), True),
    StructField("Element", StringType(), True),
    StructField("Element_Value", IntegerType(), True),
    StructField("MEASUREMENT_FLAG", StringType(), True),
    StructField("QUALITY_FLAG", StringType(), True),
    StructField("SOURCE_FLAG", StringType(), True),
    StructField("OBSERVATION_TIME", StringType(), True),
])

#Q2 c

#Bringing in the small directories 
states=(
    spark.read.text("hdfs:///data/ghcnd/ghcnd-states.txt"))

states=states.withColumn(
    'State_Code', F.substring('value', 1,2)).withColumn(
    'State_Name', F.substring('value', 4,47))

states=states.drop(F.col("value"))
#--------
countries=(
    spark.read.text("hdfs:///data/ghcnd/ghcnd-countries.txt"))

countries=countries.withColumn(
    'Country_Code', F.substring('value', 1,2)).withColumn(
    'Country_Name', F.substring('value', 4,60))

countries=countries.drop(F.col("value"))
#--------
inventory=(
    spark.read.text("hdfs:///data/ghcnd/ghcnd-inventory.txt"))

inventory=inventory.withColumn(
    'Station_ID', F.substring('value', 1,11)).withColumn(
    'Lat', F.substring('value', 13,8)).withColumn(
    'Lon', F.substring('value', 22,9)).withColumn(
    'Element', F.substring('value', 32,4)).withColumn(
    'First_Year', F.substring('value', 37,4)).withColumn(
    'Last_Year', F.substring('value', 42,4))

inventory=inventory.drop(F.col("value"))

In [6]:
stations_broadcasted = F.broadcast(stations).cache()

## Analysis Q1 

An Q1 a

In [7]:
x=stations.agg(
    F.countDistinct("Station_ID").alias("station count"),
    F.sum(F.when(
        F.col("LastYear")==2022, 1)).alias("active in 2022 count"),
    F.sum(F.when(
        F.col("HCN_flag")=="HCN", 1)).alias("HCN count"),
    F.sum(F.when(
        F.col("HCN_flag")=="CRN", 1)).alias("CRN count"),
    F.count("GSN_flag").alias("GSN count"),
    F.sum(F.when(
        (F.col("GSN_flag")=="GSN") & 
        (F.col("HCN_flag")=="CRN"), 1)).alias("GSN & CRN count"),
    F.sum(F.when(
        (F.col("GSN_flag")=="GSN") & 
        (F.col("HCN_flag")=="HCN"), 1)).alias("GSN & HCN count"))

x.show()

+-------------+--------------------+---------+---------+---------+---------------+---------------+
|station count|active in 2022 count|HCN count|CRN count|GSN count|GSN & CRN count|GSN & HCN count|
+-------------+--------------------+---------+---------+---------+---------------+---------------+
|       124247|                8448|     1218|      234|      991|           null|             15|
+-------------+--------------------+---------+---------+---------+---------------+---------------+



An Q1 b

In [8]:
countryCount=(stations.groupBy(["Country_Code"])
              .agg(
                  F.countDistinct("Station_ID").alias("Station_Count"))
             .sort([F.col("Country_Code").asc()]))

In [9]:
countryCount.show(5)

+------------+-------------+
|Country_Code|Station_Count|
+------------+-------------+
|          AC|            2|
|          AE|            4|
|          AF|            4|
|          AG|           87|
|          AJ|           66|
+------------+-------------+
only showing top 5 rows



In [10]:
countries_stationCount=(countries
              .join(countryCount, on="Country_Code", how='left'))

countries_stationCount.show(5)
savefile("countries", countries_stationCount) 

+------------+--------------------+-------------+
|Country_Code|        Country_Name|Station_Count|
+------------+--------------------+-------------+
|          AC|Antigua and Barbuda |            2|
|          AE|United Arab Emira...|            4|
|          AF|         Afghanistan|            4|
|          AG|            Algeria |           87|
|          AJ|         Azerbaijan |           66|
+------------+--------------------+-------------+
only showing top 5 rows



In [11]:
statesCount=(stations.groupBy(["State_Code"])
              .agg(
                  F.countDistinct("Station_ID").alias("Station_Count"))
             .sort([F.col("State_Code").asc()]))

In [12]:
states_stationsCount=(states
              .join(statesCount, on="State_Code", how='left')
              .withColumn("Station_Count", F.col("Station_Count")))

states_stationsCount.show(5)
savefile("states", states_stationsCount) 

+----------+--------------------+-------------+
|State_Code|          State_Name|Station_Count|
+----------+--------------------+-------------+
|        AB|             ALBERTA|         1444|
|        AK|              ALASKA|         1034|
|        AL|ALABAMA          ...|         1089|
|        AR|            ARKANSAS|          926|
|        AS|      AMERICAN SAMOA|           21|
+----------+--------------------+-------------+
only showing top 5 rows



An Q1 c

In [13]:
y=stations.agg(
    F.sum(F.when(F.col("Lat")<=0, 1)).alias("southern hemisphere"),
    F.sum(F.when((stations.Country_Name.contains("[United States]")), 1)).alias("US Territories"))
y.show()

+-------------------+--------------+
|southern hemisphere|US Territories|
+-------------------+--------------+
|              25337|           371|
+-------------------+--------------+



## Analysis Q2

In [14]:
def distance_between_two_points(lat_A, lon_A, lat_B, lon_B):
    
    # i tried to use geopy but it didnt seem to work when wrapped in the UDF 
    # this was from user Michael0x2a on stackoverflow with minor changes
    
    R = 6373.0 #earth's radius in km

    #math likes radians...kind of annoying
    lat1=m.radians(lat_A)
    lon1=m.radians(lon_A)
    lat2=m.radians(lat_B)
    lon2=m.radians(lon_B)

    #some equation off stackoverflow 
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    
    a = m.sin(dlat / 2)**2 + m.cos(lat1) * m.cos(lat2) * m.sin(dlon / 2)**2
    c = 2*m.atan2(m.sqrt(a), m.sqrt(1 - a))

    distance = R*c

    return distance

An Q2 a

In [15]:
# wrapping function in pyspark UDF
distance_between_two_points_UDF=(
    F.udf(distance_between_two_points, FloatType()))

An Q2 b

In [16]:
NZStations=stations_broadcasted.alias("stationsCopy")

# filter to NZ and reduce table to only needed columns
NZStationsA= (NZStations.filter((F.col("Country_Code")=="NZ"))
              .select(
                  F.col("Station_ID"),
                  F.col("Lat").cast(FloatType()),
                  F.col("Lon").cast(FloatType())))

NZStationsB=(NZStationsA
             .withColumnRenamed("Station_ID", "ID_Station_B")
             .withColumnRenamed("Lat", "Lat_Station_B")
             .withColumnRenamed("Lon", "Lon_Station_B"))

# this might make less shuffling but also might not matter because stations is small
NZStationsB_broadcasted = F.broadcast(NZStationsB) 

# joining tables
NZstations_joined=(NZStationsA.crossJoin(NZStationsB_broadcasted)
                   .withColumnRenamed("Station_ID", "ID_Station_A")
                   .withColumnRenamed("Lat", "Lat_Station_A")
                   .withColumnRenamed("Lon", "Lon_Station_A"))

# adding column of calculation of distance using UDF
NZstations_distance_calculated=(NZstations_joined.withColumn("Distance_Between_Stations_km", 
                                 distance_between_two_points_UDF(
                                     F.col("Lat_Station_A"), 
                                     F.col("Lon_Station_A"),
                                     F.col("Lat_Station_B"),
                                     F.col("Lon_Station_B"))))

# dropping where stations compared were identical, making single row with both station IDs
NZstations_distance_calculated=(NZstations_distance_calculated
                     .where(NZstations_distance_calculated.Distance_Between_Stations_km!=0)
                     .withColumn("Station_IDs_compared", 
                                 F.concat(F.col("ID_Station_A"), 
                                          F.lit("__"), 
                                          F.col("ID_Station_B"))))

NZstations_pairwise_distances=(NZstations_distance_calculated
            .select(
                F.col("Station_IDs_compared"), 
                F.col("Distance_Between_Stations_km")
            ))

# look, its 2 weeks after i did this whole bit and i'm now aware at how....unnecesary 
# I made it, I fixed some but I'm not redoing the whole thing at this point

In [17]:
NZstations_pairwise_distances.show(5, truncate=80)

+------------------------+----------------------------+
|    Station_IDs_compared|Distance_Between_Stations_km|
+------------------------+----------------------------+
|NZ000933090__NZ000939450|                   1553.7822|
|NZ000933090__NZM00093929|                    1417.352|
|NZ000933090__NZ000093844|                    951.2242|
|NZ000933090__NZ000093417|                   220.26935|
|NZ000933090__NZM00093781|                    516.1928|
+------------------------+----------------------------+
only showing top 5 rows



In [18]:
# savefile("NZ_stations_pairwise_distances", NZstations_pairwise_distances) 
# dont need to keep saving when running book

In [19]:
y=(NZstations_pairwise_distances
   .orderBy(
       F.col("Distance_Between_Stations_km").desc()).limit(1)
   .withColumnRenamed("Distance_Between_Stations_km", "max_distance")
   .withColumnRenamed("Station_IDs_compared", "Farthest_apart_stations"))
   
y.show(truncate=60)

+------------------------+------------+
| Farthest_apart_stations|max_distance|
+------------------------+------------+
|NZ000939450__NZ000093994|   2800.0547|
+------------------------+------------+



## Analysis Q3

An Q3 a

In [20]:
!hdfs getconf -confKey "dfs.blocksize" 

134217728


In [21]:
!hdfs fsck hdfs:///data/ghcnd/daily/2023.csv.gz -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=mda205&files=1&blocks=1&path=%2Fdata%2Fghcnd%2Fdaily%2F2023.csv.gz
FSCK started by mda205 (auth:SIMPLE) from /192.168.40.11 for path /data/ghcnd/daily/2023.csv.gz at Thu Apr 27 17:02:43 NZST 2023

/data/ghcnd/daily/2023.csv.gz 27521531 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073824428_83608 len=27521531 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			0
 Total symlinks:		0

Replicated Blocks:
 Total size:	27521531 B
 Total files:	1
 Total blocks (validated):	1 (avg. block size 27521531 B)
 Minimally replicated blocks:	1 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	4
 Average block replication:	8.0
 Missing blocks:		0
 Corrupt blocks:		0
 Missing replicas:		0 (0.0 %)
 Blocks queued for replication:	0

Erasure Coded Block Groups:

In [22]:
!hdfs fsck hdfs:///data/ghcnd/daily/2022.csv.gz -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=mda205&files=1&blocks=1&path=%2Fdata%2Fghcnd%2Fdaily%2F2022.csv.gz
FSCK started by mda205 (auth:SIMPLE) from /192.168.40.11 for path /data/ghcnd/daily/2022.csv.gz at Thu Apr 27 17:02:46 NZST 2023

/data/ghcnd/daily/2022.csv.gz 166075423 bytes, replicated: replication=8, 2 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073824426_83606 len=134217728 Live_repl=8
1. BP-700027894-132.181.129.68-1626517177804:blk_1073824427_83607 len=31857695 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			0
 Total symlinks:		0

Replicated Blocks:
 Total size:	166075423 B
 Total files:	1
 Total blocks (validated):	2 (avg. block size 83037711 B)
 Minimally replicated blocks:	2 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	4
 Average block replication:	8.0
 Missing block

An Q3 b

In [23]:
def get_daily_data_for_select_year(startyear, endyear=False):
    
    """retreive data for specified year or years. if only looking at one year,
    startyear is treated as target year. enter years as integers and in chronological order
    endyear defaults to false if no second argument is given"""
    
    if endyear != False:
        
        years=range(startyear, endyear + 1)
        
        # ChatGPT helped with the join part
        yearsconcat="{"+",".join([f"{year}" for year in years])+"}*"
        
        filepath=("hdfs:///data/ghcnd/daily/"+yearsconcat+"*")
       
    elif endyear==False:
        targetYear=f'{startyear}'
        
        filepath=("hdfs:///data/ghcnd/daily/"+targetYear+"*")

    data=(
        spark.read.format("com.databricks.spark.csv")
        .option("header", "false")
        .option("inferSchema", "false")
        .option("dateFormat", "yyyymmdd")
        .schema(schema_daily).load(filepath))
    
    return data

In [24]:
data2022=get_daily_data_for_select_year(2022)

k=data2022.select(F.count(data2022.Element).alias("# of observations in 2022"))

k.show()

+-------------------------+
|# of observations in 2022|
+-------------------------+
|                 37375779|
+-------------------------+



In [25]:
data2023=get_daily_data_for_select_year(2023)

r=data2023.select(F.count(data2023.Element).alias("# of observations in 2023"))

r.show()

+-------------------------+
|# of observations in 2023|
+-------------------------+
|                  6031842|
+-------------------------+



An Q3 c

In [26]:
data2014_23=get_daily_data_for_select_year(2014, 2023)

w=data2014_23.select(F.count(data2014_23.Element).alias("# of observations since 2014"))

w.show()

+----------------------------+
|# of observations since 2014|
+----------------------------+
|                   337279894|
+----------------------------+



## Analysis Q4 has it's own notebook

In [None]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()