In [27]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, to_timestamp, col, when, sum, avg, count, split, explode, udf, collect_list, monotonically_increasing_id, row_number, size 
from pyspark.sql.functions import substring, col, to_date, datediff
from pyspark import Row
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import ArrayType, TimestampType
import datetime
from matplotlib import pyplot as plt

In [2]:
sc = SparkContext.getOrCreate()

In [3]:
spark = SparkSession.builder.getOrCreate()

# Step 1 - Setting Up the Data

## 1. Load the global weather data into your big data technology of choice.

In [4]:
weather = spark.read \
               .format("csv") \
               .option("header", "true") \
               .load("data/2019/*.csv")
weather.createOrReplaceTempView('weather')       

In [5]:
weather.show()

+------+-----+--------+----+----+------+------+-----+----+-----+-----+------+-----+-----+-----+------+
|STN---| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD| GUST|   MAX|  MIN| PRCP| SNDP|FRSHTT|
+------+-----+--------+----+----+------+------+-----+----+-----+-----+------+-----+-----+-----+------+
|958360|99999|20190101|78.8|54.9|9999.9|9999.9|999.9| 8.8| 13.0|999.9| 96.1*| 61.9|0.00G|999.9|000000|
|958360|99999|20190102|73.1|53.7|9999.9|9999.9|999.9| 9.5| 14.0|999.9| 89.2*|57.4*|0.00G|999.9|000000|
|958360|99999|20190103|79.5|47.4|9999.9|9999.9|999.9| 3.2|  8.0|999.9| 96.6*| 57.2|0.00G|999.9|000000|
|958360|99999|20190104|82.7|52.0|9999.9|9999.9|999.9|13.0| 19.0|999.9|109.8*| 60.6|0.02G|999.9|000000|
|958360|99999|20190105|61.9|47.7|9999.9|9999.9|999.9| 8.5| 15.9|999.9| 70.5*|52.3*|0.02G|999.9|010000|
|958360|99999|20190106|68.6|48.1|9999.9|9999.9|999.9| 9.2| 13.0|999.9| 79.9*| 52.0|0.00G|999.9|000000|
|958360|99999|20190107|75.3|53.3|9999.9|9999.9|999.9| 5.9|  9.9|999.9| 87

In [6]:
countries = spark.read \
                 .format("csv") \
                 .option("header", "true") \
                 .load("countrylist.csv")
countries.createOrReplaceTempView('countries')

In [7]:
countries.show()

+------------+--------------------+
|COUNTRY_ABBR|        COUNTRY_FULL|
+------------+--------------------+
|          AA|               ARUBA|
|          AC| ANTIGUA AND BARBUDA|
|          AF|         AFGHANISTAN|
|          AG|             ALGERIA|
|          AI|    ASCENSION ISLAND|
|          AJ|          AZERBAIJAN|
|          AL|             ALBANIA|
|          AM|             ARMENIA|
|          AN|             ANDORRA|
|          AO|              ANGOLA|
|          AQ|      AMERICAN SAMOA|
|          AR|           ARGENTINA|
|          AS|           AUSTRALIA|
|          AT|ASHMORE AND CARTI...|
|          AU|             AUSTRIA|
|          AV|            ANGUILLA|
|          AX|             ANTIGUA|
|          AY|          ANTARCTICA|
|          AZ|              AZORES|
|          BA|             BAHRAIN|
+------------+--------------------+
only showing top 20 rows



In [8]:
stations = spark.read \
                .format("csv") \
                .option("header", "true") \
                .load("stationlist.csv")
stations.createOrReplaceTempView('stations')                

In [9]:
stations.show()

+------+------------+
|STN_NO|COUNTRY_ABBR|
+------+------------+
|012240|          NO|
|020690|          SW|
|020870|          SW|
|021190|          SW|
|032690|          UK|
|033450|          UK|
|039290|          UK|
|039790|          EI|
|040480|          IC|
|041300|          IC|
|060100|          FO|
|061443|          DA|
|063401|          NL|
|071910|          FR|
|092640|          GM|
|123766|          PL|
|125990|          PL|
|129700|          HU|
|132240|          HR|
|156500|          BU|
+------+------------+
only showing top 20 rows



In [10]:
stations.dtypes

[('STN_NO', 'string'), ('COUNTRY_ABBR', 'string')]

## 2. Join the stationlist.csv with the countrylist.csv to get the full country name for each station number.


In [11]:
stations_n_countries = spark.sql("""
select a.STN_NO
     , b.COUNTRY_ABBR
     , b.COUNTRY_FULL
from stations a inner join countries b on a.COUNTRY_ABBR = b.COUNTRY_ABBR
""")
stations_n_countries.createOrReplaceTempView('stations_n_countries') 

In [12]:
stations_n_countries.show()

+------+------------+--------------+
|STN_NO|COUNTRY_ABBR|  COUNTRY_FULL|
+------+------------+--------------+
|012240|          NO|        NORWAY|
|020690|          SW|        SWEDEN|
|020870|          SW|        SWEDEN|
|021190|          SW|        SWEDEN|
|032690|          UK|UNITED KINGDOM|
|033450|          UK|UNITED KINGDOM|
|039290|          UK|UNITED KINGDOM|
|039790|          EI|       IRELAND|
|040480|          IC|       ICELAND|
|041300|          IC|       ICELAND|
|060100|          FO| FAROE ISLANDS|
|061443|          DA|       DENMARK|
|063401|          NL|   NETHERLANDS|
|071910|          FR|        FRANCE|
|092640|          GM|       GERMANY|
|123766|          PL|        POLAND|
|125990|          PL|        POLAND|
|129700|          HU|       HUNGARY|
|132240|          HR|       CROATIA|
|156500|          BU|      BULGARIA|
+------+------------+--------------+
only showing top 20 rows



In [13]:
stations.count() - stations_n_countries.count()

97

## 3. Join the global weather data with the full country names by station number.

In [14]:
df = spark.sql("""
select a.*
     , b.COUNTRY_ABBR
     , b.COUNTRY_FULL
from weather a inner join stations_n_countries b on a.`STN---` = b.STN_NO
""")
df.createOrReplaceTempView('df')

In [15]:
df.show()

+------+-----+--------+----+----+------+------+-----+----+-----+-----+-----+-----+-----+-----+------+------------+------------+
|STN---| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD| GUST|  MAX|  MIN| PRCP| SNDP|FRSHTT|COUNTRY_ABBR|COUNTRY_FULL|
+------+-----+--------+----+----+------+------+-----+----+-----+-----+-----+-----+-----+-----+------+------------+------------+
|010875|99999|20190101|41.1|30.1|9999.9|9999.9|  5.9|46.7| 59.1| 74.0|44.6*|37.4*|99.99|999.9|011010|          NO|      NORWAY|
|010875|99999|20190102|40.5|29.0|9999.9|9999.9|  6.2|20.5| 32.1| 44.1|41.0*|37.4*|99.99|999.9|010000|          NO|      NORWAY|
|010875|99999|20190103|43.0|36.6|9999.9|9999.9|  6.1|13.5| 21.0|999.9|44.6*|41.0*|0.00I|999.9|000000|          NO|      NORWAY|
|010875|99999|20190104|46.7|44.4|9999.9|9999.9|  5.8|27.4| 33.0| 40.0|48.2*|42.8*|99.99|999.9|010000|          NO|      NORWAY|
|010875|99999|20190105|46.5|44.1|9999.9|9999.9|  6.1|18.3| 25.1|999.9|48.2*|44.6*|99.99|999.9|010000|   

In [16]:
weather.count() - df.count()

15249

In [17]:
weather.count()

4158416

In [18]:
df = df.withColumn('NTEMP', df.TEMP.cast('float')) \
       .withColumn('NWDSP', df.WDSP.cast('float')) \
       .withColumn('tornado', substring(col('FRSHTT'), -1, 1).cast('int')) \
       .withColumn('dt', to_date('YEARMODA', 'yyyyMMdd'))
df.createOrReplaceTempView('df')

In [19]:
df.dtypes

[('STN---', 'string'),
 ('WBAN', 'string'),
 ('YEARMODA', 'string'),
 ('TEMP', 'string'),
 ('DEWP', 'string'),
 ('SLP', 'string'),
 ('STP', 'string'),
 ('VISIB', 'string'),
 ('WDSP', 'string'),
 ('MXSPD', 'string'),
 ('GUST', 'string'),
 ('MAX', 'string'),
 ('MIN', 'string'),
 ('PRCP', 'string'),
 ('SNDP', 'string'),
 ('FRSHTT', 'string'),
 ('COUNTRY_ABBR', 'string'),
 ('COUNTRY_FULL', 'string'),
 ('NTEMP', 'float'),
 ('NWDSP', 'float'),
 ('tornado', 'int'),
 ('dt', 'date')]

# Step 2 - Questions

## 1. Which country had the hottest average mean temperature over the year?

In [20]:
spark.sql("""
select *
from df
where NTEMP >= 9999.9
""").show()

+------+----+--------+----+----+---+---+-----+----+-----+----+---+---+----+----+------+------------+------------+-----+-----+-------+---+
|STN---|WBAN|YEARMODA|TEMP|DEWP|SLP|STP|VISIB|WDSP|MXSPD|GUST|MAX|MIN|PRCP|SNDP|FRSHTT|COUNTRY_ABBR|COUNTRY_FULL|NTEMP|NWDSP|tornado| dt|
+------+----+--------+----+----+---+---+-----+----+-----+----+---+---+----+----+------+------------+------------+-----+-----+-------+---+
+------+----+--------+----+----+---+---+-----+----+-----+----+---+---+----+----+------+------------+------------+-----+-----+-------+---+



In [21]:
spark.sql("""
select country
     , avg_temp
from (
    select COUNTRY_FULL as country
         , avg(NTEMP) as avg_temp
    from df
    where NTEMP < 9999.9
    group by COUNTRY_FULL
    )
order by avg_temp desc
limit 1
""").show()

+--------+-----------------+
| country|         avg_temp|
+--------+-----------------+
|DJIBOUTI|90.06114474836602|
+--------+-----------------+



##  2. Which country had the most consecutive days of tornadoes/funnel cloud formations?

In [39]:
tornadoes = spark.sql("""
select dt
     , COUNTRY_FULL as country
     , max(tornado) as tornado
from df
where tornado = 1
group by dt, COUNTRY_FULL
order by COUNTRY_FULL, dt
""")

In [40]:
tornadoes.show()

+----------+-----------+-------+
|        dt|    country|tornado|
+----------+-----------+-------+
|2019-07-04|    ALGERIA|      1|
|2019-02-06|     ANGOLA|      1|
|2019-04-05|     ANGOLA|      1|
|2019-06-06|   ANGUILLA|      1|
|2019-05-10|  ARGENTINA|      1|
|2019-09-23|      ARUBA|      1|
|2019-01-20|    AUSTRIA|      1|
|2019-06-16|    AUSTRIA|      1|
|2019-06-19|    AUSTRIA|      1|
|2019-10-14|    AUSTRIA|      1|
|2019-10-19|    AUSTRIA|      1|
|2019-02-25|BAHAMAS THE|      1|
|2019-03-27|BAHAMAS THE|      1|
|2019-04-02|BAHAMAS THE|      1|
|2019-04-15|BAHAMAS THE|      1|
|2019-04-29|BAHAMAS THE|      1|
|2019-06-02|BAHAMAS THE|      1|
|2019-06-15|BAHAMAS THE|      1|
|2019-07-04|BAHAMAS THE|      1|
|2019-08-06|BAHAMAS THE|      1|
+----------+-----------+-------+
only showing top 20 rows



In [41]:
last_event = tornadoes.withColumn("last_event", lag('dt').over(Window.partitionBy('country').orderBy('dt')))

In [42]:
last_event.show()

+----------+--------------+-------+----------+
|        dt|       country|tornado|last_event|
+----------+--------------+-------+----------+
|2019-08-22|    BANGLADESH|      1|      null|
|2019-01-17|         JAPAN|      1|      null|
|2019-01-24|         JAPAN|      1|2019-01-17|
|2019-03-02|         JAPAN|      1|2019-01-24|
|2019-04-02|         JAPAN|      1|2019-03-02|
|2019-06-10|         JAPAN|      1|2019-04-02|
|2019-06-11|         JAPAN|      1|2019-06-10|
|2019-08-10|         JAPAN|      1|2019-06-11|
|2019-08-30|         JAPAN|      1|2019-08-10|
|2019-12-03|         JAPAN|      1|2019-08-30|
|2019-12-04|         JAPAN|      1|2019-12-03|
|2019-12-14|         JAPAN|      1|2019-12-04|
|2019-06-12|        JERSEY|      1|      null|
|2019-11-09|      MALDIVES|      1|      null|
|2019-05-17|      TANZANIA|      1|      null|
|2019-01-19|         MALTA|      1|      null|
|2019-05-14|         MALTA|      1|2019-01-19|
|2019-06-12|UNITED KINGDOM|      1|      null|
|2019-06-20|U

In [43]:
lags = last_event.withColumn('lag', datediff(col('dt'), col('last_event')))

In [44]:
lags.show()

+----------+--------------+-------+----------+----+
|        dt|       country|tornado|last_event| lag|
+----------+--------------+-------+----------+----+
|2019-08-22|    BANGLADESH|      1|      null|null|
|2019-01-17|         JAPAN|      1|      null|null|
|2019-01-24|         JAPAN|      1|2019-01-17|   7|
|2019-03-02|         JAPAN|      1|2019-01-24|  37|
|2019-04-02|         JAPAN|      1|2019-03-02|  31|
|2019-06-10|         JAPAN|      1|2019-04-02|  69|
|2019-06-11|         JAPAN|      1|2019-06-10|   1|
|2019-08-10|         JAPAN|      1|2019-06-11|  60|
|2019-08-30|         JAPAN|      1|2019-08-10|  20|
|2019-12-03|         JAPAN|      1|2019-08-30|  95|
|2019-12-04|         JAPAN|      1|2019-12-03|   1|
|2019-12-14|         JAPAN|      1|2019-12-04|  10|
|2019-06-12|        JERSEY|      1|      null|null|
|2019-11-09|      MALDIVES|      1|      null|null|
|2019-05-17|      TANZANIA|      1|      null|null|
|2019-01-19|         MALTA|      1|      null|null|
|2019-05-14|

In [46]:
new_period = lags.withColumn('is_new_period', when( col('lag') > 1, 1).otherwise(0))

In [47]:
new_period.show()

+----------+--------------+-------+----------+----+-------------+
|        dt|       country|tornado|last_event| lag|is_new_period|
+----------+--------------+-------+----------+----+-------------+
|2019-08-22|    BANGLADESH|      1|      null|null|            0|
|2019-01-17|         JAPAN|      1|      null|null|            0|
|2019-01-24|         JAPAN|      1|2019-01-17|   7|            1|
|2019-03-02|         JAPAN|      1|2019-01-24|  37|            1|
|2019-04-02|         JAPAN|      1|2019-03-02|  31|            1|
|2019-06-10|         JAPAN|      1|2019-04-02|  69|            1|
|2019-06-11|         JAPAN|      1|2019-06-10|   1|            0|
|2019-08-10|         JAPAN|      1|2019-06-11|  60|            1|
|2019-08-30|         JAPAN|      1|2019-08-10|  20|            1|
|2019-12-03|         JAPAN|      1|2019-08-30|  95|            1|
|2019-12-04|         JAPAN|      1|2019-12-03|   1|            0|
|2019-12-14|         JAPAN|      1|2019-12-04|  10|            1|
|2019-06-1

In [50]:
country_period_id = new_period.withColumn("country_period_id", sum('is_new_period').over(Window.partitionBy('country').orderBy('dt')))

In [51]:
country_period_id.show()

+----------+--------------+-------+----------+----+-------------+-----------------+
|        dt|       country|tornado|last_event| lag|is_new_period|country_period_id|
+----------+--------------+-------+----------+----+-------------+-----------------+
|2019-08-22|    BANGLADESH|      1|      null|null|            0|                0|
|2019-01-17|         JAPAN|      1|      null|null|            0|                0|
|2019-01-24|         JAPAN|      1|2019-01-17|   7|            1|                1|
|2019-03-02|         JAPAN|      1|2019-01-24|  37|            1|                2|
|2019-04-02|         JAPAN|      1|2019-03-02|  31|            1|                3|
|2019-06-10|         JAPAN|      1|2019-04-02|  69|            1|                4|
|2019-06-11|         JAPAN|      1|2019-06-10|   1|            0|                4|
|2019-08-10|         JAPAN|      1|2019-06-11|  60|            1|                5|
|2019-08-30|         JAPAN|      1|2019-08-10|  20|            1|           

In [52]:
country_period_id.createOrReplaceTempView('tornado_period')

In [56]:
spark.sql("""
select *
from (
    select country
         , country_period_id
         , count(1) as qty
    from tornado_period
    where lag = 1
    group by country, country_period_id
)
order by qty desc
limit 1
""").show()

+-------+-----------------+---+
|country|country_period_id|qty|
+-------+-----------------+---+
|  JAPAN|                4|  1|
+-------+-----------------+---+



## 3. Which country had the second highest average mean wind speed over the year?

In [42]:
spark.sql("""
select *
from df
where NWDSP >= 999.9
""").show()

+------+-----+--------+-----+-----+------+------+-----+-----+-----+-----+------+------+-----+-----+------+------------+------------+-----+-----+
|STN---| WBAN|YEARMODA| TEMP| DEWP|   SLP|   STP|VISIB| WDSP|MXSPD| GUST|   MAX|   MIN| PRCP| SNDP|FRSHTT|COUNTRY_ABBR|COUNTRY_FULL|NTEMP|NWDSP|
+------+-----+--------+-----+-----+------+------+-----+-----+-----+-----+------+------+-----+-----+------+------------+------------+-----+-----+
|232050|99999|20191223| -0.2| -3.6|1015.5|1014.1|  5.7|999.9|999.9|999.9|  16.0| -7.6*|0.02F|999.9|101000|          RS|      RUSSIA| -0.2|999.9|
|232050|99999|20191224|-17.3|-22.1|1019.0|1017.4|  4.7|999.9|999.9|999.9| -9.6*|-22.7*|0.01F| 19.7|100000|          RS|      RUSSIA|-17.3|999.9|
|232050|99999|20191225|-21.8|-27.0|1023.1|1021.6| 12.4|999.9|999.9|999.9| -17.9| -25.6|0.00F| 19.7|000000|          RS|      RUSSIA|-21.8|999.9|
|255610|99999|20190113|-31.5|-38.2|1021.2|1020.0| 31.1|999.9|999.9|999.9|-28.5*| -35.5|0.00I|  7.9|000000|          RS|      RUSSI

In [55]:
spark.sql("""
select country
     , avg_wind_speed
from (
        select *
            , row_number() over (order by country desc) as rn
        from (
                select COUNTRY_FULL as country
                    , avg(NWDSP) as avg_wind_speed
                from df
                where NWDSP < 999.9
                group by COUNTRY_FULL
        )
)
where rn = 2
""").show()

+-------+-----------------+
|country|   avg_wind_speed|
+-------+-----------------+
| ZAMBIA|5.920833328117927|
+-------+-----------------+

