## Preamble: Execute this if checking any answer!

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, which re-imports
# all modules before each cell executes. This allows update of external libraries
# without need to reload the package or restart the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
import pathlib

import numpy as np
import xarray

sys.path.append("../scripts")
import normalize_text_bootcamp
import utils_bootcamp
import plotting

In [None]:
# Location of the normalized 2017 tweets dataset (.nc file).
# NOTE: the second assignment overrides the first, so the relative path is the one
# actually used. The absolute /p/project/... path only takes effect if the relative
# line is removed or commented out.
FOLDER_TO_TWEETS = "/p/project/training2223/a2/data/tweets/tweets_2017_normalized.nc"
FOLDER_TO_TWEETS = "../../data/tweets/tweets_2017_normalized.nc"

# Location of the 2017 precipitation dataset (.nc file).
# NOTE: again the later, relative assignment wins over the absolute one.
FOLDER_TO_PRECIPITATION = "/p/project/training2223/a2/data/precipitation/ds_precipitation_2017.nc"
FOLDER_TO_PRECIPITATION = "../../data/precipitation/ds_precipitation_2017.nc"

In [None]:
def load_tweets_dataset():
    """Load the tweets dataset from ``FOLDER_TO_TWEETS``.

    The file is read fully into memory with ``xarray.load_dataset`` and then
    passed through ``utils_bootcamp.reset_index_coordinate``.

    Returns
    -------
    The dataset returned by ``utils_bootcamp.reset_index_coordinate``.
    """
    tweets_raw = xarray.load_dataset(FOLDER_TO_TWEETS)
    return utils_bootcamp.reset_index_coordinate(tweets_raw)


def load_precipitation_dataset():
    """Load the precipitation dataset from ``FOLDER_TO_PRECIPITATION``.

    Returns
    -------
    The dataset read fully into memory by ``xarray.load_dataset``.
    """
    return xarray.load_dataset(FOLDER_TO_PRECIPITATION)

## Task 0.1

In [None]:
# load a fresh copy of the precipitation dataset for this task
ds_prec = load_precipitation_dataset()

In [None]:
# show the dtype of every variable stored in the dataset, one per line
for variable in ds_prec.variables.values():
    print(variable.dtype)

`np.datetime64[ns]` may be less well known. Its documentation can be found [here](https://numpy.org/doc/stable/reference/arrays.datetime.html). To instantiate an object including date and time, we just pass our date and time as a single string to the NumPy object `np.datetime64`. For example:

In [None]:
# date and time are given together as one string: <date>T<time>
np.datetime64("2011-06-15T00:00")

## Task 1.1

In [None]:
# load a fresh copy of the precipitation dataset for this task
ds_prec = load_precipitation_dataset()

In [None]:
# Waterloo station is located at 51.5031°N 0.1132°W; the coordinates used here are
# Latitude = 51.5019408, Longitude = -0.1131576.
# We keep every grid point within ±0.2° of the station (with the rough rule 0.1° ~ 10 km
# that is about 20 km in each direction) at 2:30pm on 2nd January 2017.
station_latitude = 51.5019408
station_longitude = -0.1131576
half_width = 0.2

time = np.datetime64("2017-01-02T14:30")

# boolean mask selecting the box around the station
near_station = (
    (ds_prec.latitude < station_latitude + half_width)
    & (ds_prec.latitude > station_latitude - half_width)
    & (ds_prec.longitude > station_longitude - half_width)
    & (ds_prec.longitude < station_longitude + half_width)
)

# pick the single time step, then drop everything outside the box
ds_london = ds_prec.sel(time=time).where(near_station, drop=True)
ds_london

In [None]:
# threshold handed to the symlog helpers (presumably the extent of the linear region)
linear_thresh = 1e-7
symlog_kwargs = {"log": "symlog", "linear_thresh": linear_thresh}

# bin edges are computed from the full precipitation field `ds_prec`,
# not only from the London subset that is plotted below
bins = plotting.histograms.get_bin_edges(
    data=ds_prec.tp.values,
    n_bins=60,
    vmin=None,
    vmax=None,
    **symlog_kwargs,
)

# create the figure `figure` and its axes `ax`
figure, ax = plotting.utils_plotting.create_figure_axes()

# histogram of the London precipitation values on the user-defined bins
ds_london.tp.plot.hist(bins=bins, ax=ax)

# x-axis: symmetrical log scale
plotting.utils_plotting.set_x_log(ax=ax, **symlog_kwargs)

# y-axis: logarithmic scale
plotting.utils_plotting.set_y_log(ax=ax, log=True)

It wasn't raining at that time!

## Task 1.2

In [None]:
# load a fresh copy of the precipitation dataset for this task
ds_prec = load_precipitation_dataset()

In [None]:
# Waterloo station is located at 51.5031°N 0.1132°W; the coordinates used here are
# Latitude = 51.5019408, Longitude = -0.1131576.
# We keep every grid point within ±0.2° of the station (with the rough rule 0.1° ~ 10 km
# that is about 20 km in each direction) for all time steps strictly between
# np.datetime64('2017-01-02T00:00') and np.datetime64('2017-01-03T00:00'), i.e. on 2nd January 2017.
station_latitude = 51.5019408
station_longitude = -0.1131576
half_width = 0.2

day_start = np.datetime64("2017-01-02T00:00")
day_end = np.datetime64("2017-01-03T00:00")

# spatial part of the mask: box around the station
within_box = (
    (ds_prec.latitude < station_latitude + half_width)
    & (ds_prec.latitude > station_latitude - half_width)
    & (ds_prec.longitude > station_longitude - half_width)
    & (ds_prec.longitude < station_longitude + half_width)
)

# temporal part of the mask: both bounds are exclusive
within_day = (ds_prec.time > day_start) & (ds_prec.time < day_end)

ds_london = ds_prec.where(within_box & within_day, drop=True)
ds_london

In [None]:
# threshold handed to the symlog helpers (presumably the extent of the linear region)
linear_thresh = 1e-7
symlog_kwargs = {"log": "symlog", "linear_thresh": linear_thresh}

# bin edges are computed from the full precipitation field `ds_prec`,
# not only from the London subset that is plotted below
bins = plotting.histograms.get_bin_edges(
    data=ds_prec.tp.values,
    n_bins=60,
    vmin=None,
    vmax=None,
    **symlog_kwargs,
)

# create the figure `figure` and its axes `ax`
figure, ax = plotting.utils_plotting.create_figure_axes()

# histogram of the London precipitation values on the user-defined bins
ds_london.tp.plot.hist(bins=bins, ax=ax)

# x-axis: symmetrical log scale
plotting.utils_plotting.set_x_log(ax=ax, **symlog_kwargs)

# y-axis: logarithmic scale
plotting.utils_plotting.set_y_log(ax=ax, log=True)

It rained that day!

## Task 1.3

In [None]:
# load a fresh copy of the precipitation dataset for this task
ds_prec = load_precipitation_dataset()

In [None]:
# let's look at the UK, which is the source of our Tweets:
# longitude roughly -10 to 1, latitude roughly 51 to 61
uk_longitude_range = [-10, 1]
uk_latitude_range = [51, 61]

# precipitation field at a single time step
snapshot = ds_prec.sel(time=np.datetime64("2017-07-01T17:30:00"))
snapshot.tp.plot(xlim=uk_longitude_range, ylim=uk_latitude_range)

## Task 1.4

In [None]:
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# load a fresh copy of the precipitation dataset for this task
ds_prec = load_precipitation_dataset()

In [None]:
# we again look at the UK: longitude roughly -10 to 1, latitude roughly 51 to 61
uk_longitude_range = [-10, 1]
uk_latitude_range = [51, 61]

# logarithmic colour normalisation spanning 1e-6 to 1e-2
log_norm = matplotlib.colors.LogNorm(vmax=1e-2, vmin=1e-6)

# precipitation field at a single time step
snapshot = ds_prec.sel(time=np.datetime64("2017-07-01T12:30:00"))
snapshot.tp.plot(
    norm=log_norm,
    cmap="plasma",
    xlim=uk_longitude_range,
    ylim=uk_latitude_range,
)

## Task 2.1

In [None]:
# load a fresh copy of the tweets dataset for this task
ds_tweets = load_tweets_dataset()

In [None]:
# preview the original text of the first 100 entries
tweet_texts = ds_tweets["text_original"].values
tweet_texts[:100]

In [None]:
# Sum the entries flagged by utils_bootcamp.is_nan for "longitude" and for "latitude".
# NOTE(review): the two results are combined with `+`; this presumably relies on them being
# boolean arrays, where `+` acts as an element-wise OR -- confirm against utils_bootcamp.is_nan.
# no nan values found in longitude/ latitude -> all Tweets have ascribed location
np.sum(utils_bootcamp.is_nan(ds_tweets, "longitude") + utils_bootcamp.is_nan(ds_tweets, "latitude"))

How was the dataset queried?
* Only focus on the year 2017 (or 2017 - 2020 for the full dataset)
* Need location for all Tweets
* English language
* Query looked for emojis/keywords related to rain/sun

## Task 2.2

In [None]:
# load a fresh copy of the tweets dataset for this task
ds_tweets = load_tweets_dataset()

In [None]:
# Weather-related emojis to look for in the tweet texts.
# NOTE(review): several emojis are listed twice in a row; these appear to be the same symbol
# with and without the U+FE0F variation selector -- confirm before de-duplicating.
emojis = [
    "🏔️",
    "🏔",
    "☀️",
    "☀",
    "🌞",
    "⛅",
    "⛈️",
    "⛈",
    "🌤️",
    "🌤",
    "🌥️",
    "🌥",
    "🌦️",
    "🌦",
    "🌧️",
    "🌧",
    "🌨️",
    "🌨",
    "🌩️",
    "🌩",
    "☔",
    "⛄",
]
# Full search vocabulary: the `emojis` list followed by weather-related English terms.
# Note that some terms are substrings of others (e.g. "rain" in "rain gauge", "flood" in "flash flood").
keywords = emojis + [
    "blizzard",
    "cloudburst",
    "downpour",
    "drizzle",
    "flash flood",
    "flood",
    "flood stage",
    "forecast",
    "freezing rain",
    "hail",
    "ice storm",
    "lightning",
    "precipitation",
    "rain",
    "rain gauge",
    "rain shadow",
    "rainbands",
    "rain shower",
    "snow",
    "snow shower",
    "snowstorm",
    "sun",
    "sunny",
    "thunder",
    "thunderstorm",
]

In [None]:
# plot how the keywords are distributed over the original tweet texts
tweet_texts = ds_tweets.text_original.values
plotting.histograms.plot_distribution_keywords(tweet_texts, keywords)

## Task 2.3:

In [None]:
# load a fresh copy of the tweets dataset for this task
ds_tweets = load_tweets_dataset()

In [None]:
def get_grouped_dataset(ds, group_by, sort_by="id"):
    """Count entries per group and order the groups by descending count.

    Parameters
    ----------
    ds
        Object providing ``groupby`` (the tweets dataset in this notebook).
    group_by
        Name handed to ``ds.groupby``.
    sort_by
        Variable of the counted result used for sorting (default ``"id"``).

    Returns
    -------
    The result of ``ds.groupby(group_by).count()`` sorted by ``sort_by``,
    largest values first.
    """
    return ds.groupby(group_by).count().sortby(sort_by, ascending=False)


# number of entries per "source" value, sorted by the "id" count (largest first)
ds_grouped = get_grouped_dataset(ds_tweets, group_by="source")

In [None]:
# print the ten sources at the top of the sorted result together with their counts
n_top = 10
top_sources = ds_grouped["source"].values[:n_top]
top_counts = ds_grouped["id"].values[:n_top]
for source_name, n_tweets in zip(top_sources, top_counts):
    print(f"{source_name}: {n_tweets}")

## Task 2.4:

In [None]:
# load a fresh copy of the tweets dataset for this task
ds_tweets = load_tweets_dataset()

In [None]:
# histogram of the values of the `raining` variable
ds_tweets.raining.plot.hist()

More "non raining" than "raining" labels but basically balanced.