# Thesis

## Step 1. Exploring the data

In [7]:
from IPython.display import HTML
import random

def hide_toggle(for_next=False):
    """Build an HTML snippet with a link that shows/hides a code cell's input.

    Args:
        for_next: When False (default) the link toggles the input area of the
            currently selected cell. When True it toggles the cell that
            follows instead, and the emitted script also hides the input of
            the current cell.

    Returns:
        An ``IPython.display.HTML`` object holding a ``<script>`` that defines
        the toggle function and an ``<a>`` link that calls it.
    """
    # Selector strings are pasted verbatim into the generated JavaScript.
    current = """$('div.cell.code_cell.rendered.selected')"""
    following = current + '.next()'

    # Which cell the link controls, what the link says, and the extra JS that
    # hides this cell's own input (only emitted when toggling the next cell).
    selector = following if for_next else current
    label = 'Toggle show/hide' + (' next cell' if for_next else '')
    hide_self_js = current + '.find("div.input").hide();' if for_next else ''

    # Random suffix so that each generated toggle function gets its own name.
    func_name = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    template = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """
    markup = template.format(
        f_name=func_name,
        cell_selector=selector,
        js_hide_current=hide_self_js,
        toggle_text=label,
    )
    return HTML(markup)

# Last expression of the cell: displays the show/hide link for this cell's input.
hide_toggle()

From the __[DWD website](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/)__ we can download hourly observations for the following parameters:

1. air temperature
2. cloud types
3. cloudiness
4. dew point
5. extreme wind
6. moisture
7. precipitation
8. pressure
9. soil temperature
10. solar
11. sun
12. visibility
13. weather phenomena
14. wind
15. wind synop

In [3]:
import pandas as pd
import requests
from requests_html import HTMLSession
from typing import List
from functions import get_date


In [4]:
def explore_data(parameter:str, start_year:int, end_year:int) -> int:
    """Count historical archive links of a parameter that cover a year span.

    Fetches the ``hourly/<parameter>/historical/`` directory listing from the
    DWD open-data server and counts the links whose interval, as reported by
    ``get_date``, starts no later than ``start_year`` and ends no earlier
    than ``end_year``.

    Args:
        parameter: Directory name of the observed parameter
            (e.g. ``"air_temperature"``).
        start_year: Year by which a link's interval must already have begun.
        end_year: Year up to which a link's interval must still extend.

    Returns:
        Number of matching links. Returns 0 when the listing cannot be
        fetched (connection error or HTTP error status); the error is printed.
    """
    url = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/' + parameter +'/historical/'
    try:
        # The context manager closes the session even when the request fails.
        with HTMLSession() as session:
            response = session.get(url)
            # Surface 4xx/5xx (e.g. a mistyped parameter) instead of silently
            # parsing an error page that contains no matching links.
            response.raise_for_status()
            links = response.html.absolute_links
    except requests.exceptions.RequestException as e:
        print(e)
        # Without a listing there is nothing to count; previously execution
        # fell through and crashed on the unbound `response`.
        return 0

    matches = 0
    for link in links:
        try:
            # NOTE(review): get_date is a project helper; presumably it returns
            # (start, end) years parsed from the link - confirm in functions.py.
            interval = get_date(link)
            start_interval = int(interval[0])
            end_interval = int(interval[1])
        except Exception:
            # Best effort: links that get_date cannot parse are skipped.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            continue
        if start_interval <= start_year and end_interval >= end_year:
            matches += 1
    return matches

In [5]:
def show_available_data(parameters: List[str]):
    """Print a table of how many archive links cover each decade up to 2020.

    For every parameter and every start decade from 1950 to 2010, calls
    ``explore_data(parameter, decade, 2020)`` and stores the count in a
    DataFrame with one column per parameter and one row per decade, labelled
    ``"<decade>'s - present"``. The finished table is printed; nothing is
    returned.

    Args:
        parameters: Parameter directory names to query, used as column names.
    """
    decades = range(1950,2020,10)
    # Build each row label once so the index and the assignments cannot drift.
    labels = {decade: str(decade) + "'s - present" for decade in decades}

    data_balance = pd.DataFrame(columns = parameters, index = list(labels.values()))
    for parameter in parameters:
        for decade, label in labels.items():
            # Single .loc assignment instead of chained df[col][row] = value,
            # which writes to a temporary under pandas Copy-on-Write.
            data_balance.loc[label, parameter] = explore_data(parameter, decade, 2020)
    print(data_balance)

In [6]:
# Tabulate, for four parameters, how many archive links cover each start decade through 2020.
show_available_data(["air_temperature","dew_point", "moisture", "precipitation"])

                 air_temperature dew_point moisture precipitation
1950's - present              29        48       48             0
1960's - present              67        57       57             0
1970's - present              75        59       59             0
1980's - present              97       118      118             0
1990's - present             129       149      149             0
2000's - present             155       185      185           144
2010's - present             481       481      481           927
