In [65]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, to_timestamp, col, when, sum, avg, count, split, explode, udf, collect_list, monotonically_increasing_id, row_number, size 
from pyspark.sql.functions import substring, col, to_date
from pyspark import Row
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import ArrayType, TimestampType
import datetime
from matplotlib import pyplot as plt

In [2]:
sc = SparkContext.getOrCreate()

In [3]:
spark = SparkSession.builder.getOrCreate()

# Step 1 - Setting Up the Data

## 1. Load the global weather data into your big data technology of choice.

In [4]:
weather = spark.read \
               .format("csv") \
               .option("header", "true") \
               .load("data/2019/*.csv")
weather.createOrReplaceTempView('weather')       

In [5]:
weather.show()

+------+-----+--------+----+----+------+------+-----+----+-----+-----+------+-----+-----+-----+------+
|STN---| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD| GUST|   MAX|  MIN| PRCP| SNDP|FRSHTT|
+------+-----+--------+----+----+------+------+-----+----+-----+-----+------+-----+-----+-----+------+
|958360|99999|20190101|78.8|54.9|9999.9|9999.9|999.9| 8.8| 13.0|999.9| 96.1*| 61.9|0.00G|999.9|000000|
|958360|99999|20190102|73.1|53.7|9999.9|9999.9|999.9| 9.5| 14.0|999.9| 89.2*|57.4*|0.00G|999.9|000000|
|958360|99999|20190103|79.5|47.4|9999.9|9999.9|999.9| 3.2|  8.0|999.9| 96.6*| 57.2|0.00G|999.9|000000|
|958360|99999|20190104|82.7|52.0|9999.9|9999.9|999.9|13.0| 19.0|999.9|109.8*| 60.6|0.02G|999.9|000000|
|958360|99999|20190105|61.9|47.7|9999.9|9999.9|999.9| 8.5| 15.9|999.9| 70.5*|52.3*|0.02G|999.9|010000|
|958360|99999|20190106|68.6|48.1|9999.9|9999.9|999.9| 9.2| 13.0|999.9| 79.9*| 52.0|0.00G|999.9|000000|
|958360|99999|20190107|75.3|53.3|9999.9|9999.9|999.9| 5.9|  9.9|999.9| 87

In [6]:
countries = spark.read \
                 .format("csv") \
                 .option("header", "true") \
                 .load("countrylist.csv")
countries.createOrReplaceTempView('countries')

In [7]:
countries.show()

+------------+--------------------+
|COUNTRY_ABBR|        COUNTRY_FULL|
+------------+--------------------+
|          AA|               ARUBA|
|          AC| ANTIGUA AND BARBUDA|
|          AF|         AFGHANISTAN|
|          AG|             ALGERIA|
|          AI|    ASCENSION ISLAND|
|          AJ|          AZERBAIJAN|
|          AL|             ALBANIA|
|          AM|             ARMENIA|
|          AN|             ANDORRA|
|          AO|              ANGOLA|
|          AQ|      AMERICAN SAMOA|
|          AR|           ARGENTINA|
|          AS|           AUSTRALIA|
|          AT|ASHMORE AND CARTI...|
|          AU|             AUSTRIA|
|          AV|            ANGUILLA|
|          AX|             ANTIGUA|
|          AY|          ANTARCTICA|
|          AZ|              AZORES|
|          BA|             BAHRAIN|
+------------+--------------------+
only showing top 20 rows



In [8]:
stations = spark.read \
                .format("csv") \
                .option("header", "true") \
                .load("stationlist.csv")
stations.createOrReplaceTempView('stations')                

In [9]:
stations.show()

+------+------------+
|STN_NO|COUNTRY_ABBR|
+------+------------+
|012240|          NO|
|020690|          SW|
|020870|          SW|
|021190|          SW|
|032690|          UK|
|033450|          UK|
|039290|          UK|
|039790|          EI|
|040480|          IC|
|041300|          IC|
|060100|          FO|
|061443|          DA|
|063401|          NL|
|071910|          FR|
|092640|          GM|
|123766|          PL|
|125990|          PL|
|129700|          HU|
|132240|          HR|
|156500|          BU|
+------+------------+
only showing top 20 rows



In [10]:
stations.dtypes

[('STN_NO', 'string'), ('COUNTRY_ABBR', 'string')]

## 2. Join the stationlist.csv with the countrylist.csv to get the full country name for each station number.


In [11]:
stations_n_countries = spark.sql("""
select a.STN_NO
     , b.COUNTRY_ABBR
     , b.COUNTRY_FULL
from stations a inner join countries b on a.COUNTRY_ABBR = b.COUNTRY_ABBR
""")
stations_n_countries.createOrReplaceTempView('stations_n_countries') 

In [12]:
stations_n_countries.show()

+------+------------+--------------+
|STN_NO|COUNTRY_ABBR|  COUNTRY_FULL|
+------+------------+--------------+
|012240|          NO|        NORWAY|
|020690|          SW|        SWEDEN|
|020870|          SW|        SWEDEN|
|021190|          SW|        SWEDEN|
|032690|          UK|UNITED KINGDOM|
|033450|          UK|UNITED KINGDOM|
|039290|          UK|UNITED KINGDOM|
|039790|          EI|       IRELAND|
|040480|          IC|       ICELAND|
|041300|          IC|       ICELAND|
|060100|          FO| FAROE ISLANDS|
|061443|          DA|       DENMARK|
|063401|          NL|   NETHERLANDS|
|071910|          FR|        FRANCE|
|092640|          GM|       GERMANY|
|123766|          PL|        POLAND|
|125990|          PL|        POLAND|
|129700|          HU|       HUNGARY|
|132240|          HR|       CROATIA|
|156500|          BU|      BULGARIA|
+------+------------+--------------+
only showing top 20 rows



In [13]:
stations.count() - stations_n_countries.count()

97

## 3. Join the global weather data with the full country names by station number.

In [14]:
df = spark.sql("""
select a.*
     , b.COUNTRY_ABBR
     , b.COUNTRY_FULL
from weather a inner join stations_n_countries b on a.`STN---` = b.STN_NO
""")
df.createOrReplaceTempView('df')

In [16]:
df.show()

+------+-----+--------+----+----+------+------+-----+----+-----+-----+-----+-----+-----+-----+------+------------+------------+
|STN---| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD| GUST|  MAX|  MIN| PRCP| SNDP|FRSHTT|COUNTRY_ABBR|COUNTRY_FULL|
+------+-----+--------+----+----+------+------+-----+----+-----+-----+-----+-----+-----+-----+------+------------+------------+
|010875|99999|20190101|41.1|30.1|9999.9|9999.9|  5.9|46.7| 59.1| 74.0|44.6*|37.4*|99.99|999.9|011010|          NO|      NORWAY|
|010875|99999|20190102|40.5|29.0|9999.9|9999.9|  6.2|20.5| 32.1| 44.1|41.0*|37.4*|99.99|999.9|010000|          NO|      NORWAY|
|010875|99999|20190103|43.0|36.6|9999.9|9999.9|  6.1|13.5| 21.0|999.9|44.6*|41.0*|0.00I|999.9|000000|          NO|      NORWAY|
|010875|99999|20190104|46.7|44.4|9999.9|9999.9|  5.8|27.4| 33.0| 40.0|48.2*|42.8*|99.99|999.9|010000|          NO|      NORWAY|
|010875|99999|20190105|46.5|44.1|9999.9|9999.9|  6.1|18.3| 25.1|999.9|48.2*|44.6*|99.99|999.9|010000|   

In [15]:
weather.count() - df.count()

15249

In [17]:
weather.count()

4158416

In [66]:
df = df.withColumn('NTEMP', df.TEMP.cast('float')) \
       .withColumn('NWDSP', df.WDSP.cast('float')) \
       .withColumn('tornado', substring(col('FRSHTT'), -1, 1).cast('int')) \
       .withColumn('dt', to_date('YEARMODA', 'yyyyMMdd'))
df.createOrReplaceTempView('df')

In [67]:
df.dtypes

[('STN---', 'string'),
 ('WBAN', 'string'),
 ('YEARMODA', 'string'),
 ('TEMP', 'string'),
 ('DEWP', 'string'),
 ('SLP', 'string'),
 ('STP', 'string'),
 ('VISIB', 'string'),
 ('WDSP', 'string'),
 ('MXSPD', 'string'),
 ('GUST', 'string'),
 ('MAX', 'string'),
 ('MIN', 'string'),
 ('PRCP', 'string'),
 ('SNDP', 'string'),
 ('FRSHTT', 'string'),
 ('COUNTRY_ABBR', 'string'),
 ('COUNTRY_FULL', 'string'),
 ('NTEMP', 'float'),
 ('NWDSP', 'float'),
 ('tornado', 'int'),
 ('dt', 'date')]

# Step 2 - Questions

## 1. Which country had the hottest average mean temperature over the year?

In [40]:
spark.sql("""
select *
from df
where NTEMP >= 9999.9
""").show()

+------+----+--------+----+----+---+---+-----+----+-----+----+---+---+----+----+------+------------+------------+-----+-----+
|STN---|WBAN|YEARMODA|TEMP|DEWP|SLP|STP|VISIB|WDSP|MXSPD|GUST|MAX|MIN|PRCP|SNDP|FRSHTT|COUNTRY_ABBR|COUNTRY_FULL|NTEMP|NWDSP|
+------+----+--------+----+----+---+---+-----+----+-----+----+---+---+----+----+------+------------+------------+-----+-----+
+------+----+--------+----+----+---+---+-----+----+-----+----+---+---+----+----+------+------------+------------+-----+-----+



In [33]:
spark.sql("""
select country
     , avg_temp
from (
    select COUNTRY_FULL as country
         , avg(NTEMP) as avg_temp
    from df
    where NTEMP < 9999.9
    group by COUNTRY_FULL
    )
order by avg_temp desc
limit 1
""").show()

+--------+-----------------+
| country|         avg_temp|
+--------+-----------------+
|DJIBOUTI|90.06114474836602|
+--------+-----------------+



##  2. Which country had the most consecutive days of tornadoes/funnel cloud formations?

In [69]:
last_event = df.withColumn("last_event", lag('dt').over(Window.partitionBy('COUNTRY_FULL').orderBy('dt')))

In [71]:
lag_in_day = last_event.withColumn('lag_in_day', (col('dt').astype("long") - col('last_event').astype("long")))

In [72]:
lag_in_day.toPandas()

Py4JJavaError: An error occurred while calling o409.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 54 in stage 276.0 failed 1 times, most recent failure: Lost task 54.0 in stage 276.0 (TID 7251, 192.168.15.142, executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
	at java.base/java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:120)
	at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:95)
	at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:156)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:223)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:176)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:539)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:343)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda$2141/0x0000000840f02040.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
	at org.apache.spark.rdd.RDD$$Lambda$2137/0x0000000840ef1040.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2104/0x0000000840ebe040.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2164)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:385)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3450)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3447)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
	at java.base/java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:120)
	at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:95)
	at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:156)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:223)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:176)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:539)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:343)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda$2141/0x0000000840f02040.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
	at org.apache.spark.rdd.RDD$$Lambda$2137/0x0000000840ef1040.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2104/0x0000000840ebe040.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


## 3. Which country had the second highest average mean wind speed over the year?

In [42]:
spark.sql("""
select *
from df
where NWDSP >= 999.9
""").show()

+------+-----+--------+-----+-----+------+------+-----+-----+-----+-----+------+------+-----+-----+------+------------+------------+-----+-----+
|STN---| WBAN|YEARMODA| TEMP| DEWP|   SLP|   STP|VISIB| WDSP|MXSPD| GUST|   MAX|   MIN| PRCP| SNDP|FRSHTT|COUNTRY_ABBR|COUNTRY_FULL|NTEMP|NWDSP|
+------+-----+--------+-----+-----+------+------+-----+-----+-----+-----+------+------+-----+-----+------+------------+------------+-----+-----+
|232050|99999|20191223| -0.2| -3.6|1015.5|1014.1|  5.7|999.9|999.9|999.9|  16.0| -7.6*|0.02F|999.9|101000|          RS|      RUSSIA| -0.2|999.9|
|232050|99999|20191224|-17.3|-22.1|1019.0|1017.4|  4.7|999.9|999.9|999.9| -9.6*|-22.7*|0.01F| 19.7|100000|          RS|      RUSSIA|-17.3|999.9|
|232050|99999|20191225|-21.8|-27.0|1023.1|1021.6| 12.4|999.9|999.9|999.9| -17.9| -25.6|0.00F| 19.7|000000|          RS|      RUSSIA|-21.8|999.9|
|255610|99999|20190113|-31.5|-38.2|1021.2|1020.0| 31.1|999.9|999.9|999.9|-28.5*| -35.5|0.00I|  7.9|000000|          RS|      RUSSI

In [55]:
spark.sql("""
select country
     , avg_wind_speed
from (
        select *
            , row_number() over (order by country desc) as rn
        from (
                select COUNTRY_FULL as country
                    , avg(NWDSP) as avg_wind_speed
                from df
                where NWDSP < 999.9
                group by COUNTRY_FULL
        )
)
where rn = 2
""").show()

+-------+-----------------+
|country|   avg_wind_speed|
+-------+-----------------+
| ZAMBIA|5.920833328117927|
+-------+-----------------+

