# Environment preparation

## Download general-purpose datasets

In [None]:
%%bash
# Download the ds-with-spark archive and unpack it into data/ds-spark.
# Safe to re-run: an archive that is already present is not fetched again,
# and extraction overwrites files without prompting.
set -euo pipefail

DEST=data/ds-spark
ARCHIVE="$DEST/ds-with-spark.zip"

mkdir -p "$DEST"
if [[ ! -f "$ARCHIVE" ]]; then
    if ! wget -q -O "$ARCHIVE" https://storage.googleapis.com/academy-data/ds-with-spark.zip; then
        # Remove whatever was written so the next run retries the download.
        rm -f "$ARCHIVE"
        echo "Download of ds-with-spark.zip failed." >&2
        exit 1
    fi
fi

# -o: overwrite existing files (the cell has no terminal to answer prompts);
# -d: extract into DEST instead of relying on a preceding `cd` succeeding.
unzip -o "$ARCHIVE" -d "$DEST"

## Download the weather dataset format description

In [None]:
%%bash
# Fetch the GHCN daily by-year format document (RTF) into the working directory.
# Safe to re-run: an existing copy is kept instead of saving a ".1" duplicate.
set -euo pipefail

DOC=ghcn-daily-by_year-format.rtf

if [[ ! -f "$DOC" ]]; then
    if ! wget -q -O "$DOC" "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/$DOC"; then
        # Remove whatever was written so the next run retries the download.
        rm -f "$DOC"
        echo "Download of $DOC failed." >&2
        exit 1
    fi
fi

In [None]:
%%bash 
pip3 install striprtf

In [None]:
from striprtf.striprtf import rtf_to_text

# Load the downloaded RTF document, drop every line break from the raw markup,
# convert it to plain text with striprtf and print the result.
with open('ghcn-daily-by_year-format.rtf', 'r') as rtf_file:
    # NOTE(review): removing newlines from raw RTF may glue a control word to
    # the text that follows it when the newline was the only delimiter —
    # confirm the printed text is complete.
    data = ''.join(rtf_file.read().split('\n'))

text = rtf_to_text(data)
print(text)

## Download and show data files

In [None]:
%%bash
# Download one gzipped CSV per year in [START, END] into data/weather.
# Safe to re-run: years already on disk are skipped. Without this, wget would
# save a second copy as "<year>.csv.gz.1" in the same directory, and the
# notebook later reads every file under data/weather, duplicating the rows.
set -euo pipefail

START=2017
END=2018
DEST=data/weather
BASE_URL=https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year

mkdir -p "$DEST"
for ((year = START; year <= END; year++)); do
    target="$DEST/$year.csv.gz"
    if [[ -f "$target" ]]; then
        echo "Skipping $year (already downloaded)."
        continue
    fi
    if ! wget -q -O "$target" "$BASE_URL/$year.csv.gz"; then
        # Do not leave a partial or empty file behind in the data directory.
        rm -f "$target"
        echo "Download failed for $year." >&2
        exit 1
    fi
    # Only reported after a successful download.
    echo "Finished $year."
done

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook, connected to the
# standalone master at spark-master:7077.
spark = (
    SparkSession.builder
    .appName("pyspark-1")
    .master("spark://spark-master:7077")
    # The original key "spark.executor.enabled" is not a Spark property.
    # Event logging is switched on with "spark.eventLog.enabled"; without it
    # the "spark.eventLog.dir" setting below has no effect.
    # NOTE(review): the event log directory must already exist — confirm
    # /opt/workspace/history is present on the driver.
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/opt/workspace/history")
    .getOrCreate()
)

In [None]:
# Read every file under the weather download directory as CSV.
# No header or schema options are passed to the reader.
weather_dir = "data/weather"
data = spark.read.csv(weather_dir)

In [None]:
# Preview ten rows whose column `_c5` is populated.
rows_with_c5 = data.where('_c5 is not null')
rows_with_c5.show(10)

In [None]:
# Column names to apply to the weather DataFrame, in positional order.
header = [
    "ID",
    "YYYYMMDD",
    "ELEMENT",
    "DATA_VALUE",
    "M-FLAG",
    "Q-FLAG",
    "S-FLAG",
    "OBS-TIME",
]
header

In [None]:
 # Display the column names currently held in the loaded DataFrame's schema.
 data.schema.names

In [None]:
from functools import reduce

# Rename all columns positionally in one call instead of folding
# withColumnRenamed over the column indices. toDF expects exactly one new name
# per existing column, so a mismatch between `header` and the data fails here.
df = data.toDF(*header)
df.printSchema()
df.show(10)

In [None]:
# Show how many partitions the DataFrame's underlying RDD has.
df.rdd.getNumPartitions()

In [None]:
# Stop the SparkSession created earlier in this notebook; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()