# Exploring the effects of changing agroclimatological conditions on the potential occurrence of major winter wheat diseases: A spatio-temporal analysis for Germany from 1960 to today

## Step 1. Exploring the data

From the __[DWD website](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/)__ we can access the following data:

1. air temperature
2. cloud types
3. cloudiness
4. dew point
5. extreme wind
6. moisture
7. precipitation
8. pressure
9. soil temperature
10. solar
11. sun
12. visibility
13. weather phenomena
14. wind
15. wind synop

In [7]:
import pandas as pd
import re
import requests
from requests_html import HTMLSession
from typing import List
from functions import get_date, hide_toggle
import wget
from zipfile import ZipFile
from os.path import exists
from time import sleep

In [8]:
def get_links(parameters:List[str], time:List[str] = ["1_minute","5_minutes","10_minutes","hourly"]) -> dict:
    """Collect the links found on the DWD "historical" listing pages.

    For every combination of an interval in ``time`` and a name in
    ``parameters`` the page ``.../climate/<interval>/<parameter>/historical/``
    is requested and the absolute links found on it are stored.

    Args:
        parameters: Parameter directory names, e.g. ``"air_temperature"``.
        time: Interval directory names. The default list is only read here,
            never mutated.

    Returns:
        A nested dict ``{interval: {parameter: links}}``. A combination whose
        request raised a ``requests`` exception keeps an empty set (the error
        is printed), so callers can always iterate over the value.
    """
    base_url = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/'
    # Start from empty sets instead of None so a failed request cannot break
    # callers that loop over the links.
    dwd_links = {interval: {parameter: set() for parameter in parameters} for interval in time}
    for interval in time:
        for parameter in parameters:
            url = base_url + str(interval) + '/' + parameter + '/historical/'
            try:
                # The context manager closes the session once the page has
                # been read, instead of leaking one session per request.
                with HTMLSession() as session:
                    response = session.get(url)
                    dwd_links[interval][parameter] = response.html.absolute_links
            except requests.exceptions.RequestException as e:
                print(e)
    return dwd_links

def count_datapoints(dwd_links:dict,time,parameter:str, start_year:int, end_year:int) -> int:
    """Count the links whose date range covers ``start_year`` to ``end_year``.

    Args:
        dwd_links: Nested mapping ``{time: {parameter: iterable_of_links}}``.
        time: Key selecting the interval in ``dwd_links``.
        parameter: Key selecting the parameter within that interval.
        start_year: A link counts only if its first date value is <= this.
        end_year: A link counts only if its second date value is >= this.

    Returns:
        The number of matching links. Links for which ``get_date`` or the
        ``int`` conversion fails are skipped.
    """
    # A failed fetch may have left None for this combination; treat it as
    # "no links" instead of raising TypeError.
    links = dwd_links[time][parameter] or ()
    count = 0
    for link in links:
        try:
            dates = get_date(link)
            start_interval = int(dates[0])
            end_interval = int(dates[1])
        except Exception:
            # Best effort: not every link on the page is a data file.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            continue
        if start_interval <= start_year and end_interval >= end_year:
            count += 1
    return count


In [9]:

def show_available_data(dwd_links:dict,time, parameters: List[str]):
    """Print a table of link counts per parameter and starting decade.

    Rows are the decades 1950 to 2010, labelled ``"<year>'s - present"``;
    columns are ``parameters``. Each cell holds
    ``count_datapoints(dwd_links, time, parameter, <year>, 2020)``.

    Args:
        dwd_links: Nested mapping ``{time: {parameter: links}}``.
        time: Interval key passed through to ``count_datapoints``.
        parameters: Column names, also used as parameter keys.
    """
    decades = range(1950, 2020, 10)
    labels = [str(year) + "'s - present" for year in decades]
    data_balance = pd.DataFrame(columns=parameters, index=labels)
    for parameter in parameters:
        for year, label in zip(decades, labels):
            # Use .loc so the value is written into the frame itself; chained
            # indexing (df[col][row] = ...) may assign to a temporary copy.
            data_balance.loc[label, parameter] = count_datapoints(dwd_links, time, parameter, year, 2020)
    print(data_balance)

### Of these data, the most relevant for disease models are air temperature, dew point, moisture and precipitation. The following table summarizes, for each parameter and each period, the number of hourly data files whose records span from that decade to the present.

In [10]:
# Parameters used for the disease models; `parameters` and `dwd_links` stay
# module-level names because later cells read them.
parameters = [
    "air_temperature",
    "dew_point",
    "moisture",
    "precipitation",
]
dwd_links = get_links(parameters)
show_available_data(dwd_links, time="hourly", parameters=parameters)

                 air_temperature dew_point moisture precipitation
1950's - present              29        48       48             0
1960's - present              67        57       57             0
1970's - present              75        59       59             0
1980's - present              97       118      118             0
1990's - present             129       149      149             0
2000's - present             155       185      185           144
2010's - present             481       481      481           927


In [11]:
def ids_datapoints(dwd_links:dict,time,parameter:str, start_year:int, end_year:int) -> List[str]:
    """Return the id tokens of links whose date range covers the given years.

    A link qualifies when its first date value is <= ``start_year`` and its
    second date value is >= ``end_year``. The id is the first five-digit token
    enclosed in underscores in the link, returned including the underscores
    (e.g. ``"_00098_"``).

    Args:
        dwd_links: Nested mapping ``{time: {parameter: iterable_of_links}}``.
        time: Key selecting the interval in ``dwd_links``.
        parameter: Key selecting the parameter within that interval.
        start_year: Upper bound for the start of a link's date range.
        end_year: Lower bound for the end of a link's date range.

    Returns:
        One id token per qualifying link, in iteration order. Links that
        cannot be parsed are skipped.
    """
    # A failed fetch may have left None for this combination.
    links = dwd_links[time][parameter] or ()
    station_ids = []
    for link in links:
        try:
            dates = get_date(link)
            start_interval = int(dates[0])
            end_interval = int(dates[1])
            if start_interval <= start_year and end_interval >= end_year:
                # Raw string: "\d" in a normal string is an invalid escape.
                station_ids.append(re.findall(r"_\d{5}_", str(link))[0])
        except Exception:
            # Best effort: skip links without a parsable date range or id.
            continue
    return station_ids
def common_stations(paramer1:str, parameter2:str, time):
    """Print how many ids two parameters share, per starting decade.

    For each decade start from 1950 to 2010 the ids returned by
    ``ids_datapoints`` (up to 2020) for both parameters are intersected and
    the size of the intersection is printed. Reads the module-level
    ``dwd_links``.
    """
    print(f"{paramer1} and {parameter2} have the following number of stations that measured in:")
    for decade_start in range(1950, 2020, 10):
        ids_first = set(ids_datapoints(dwd_links, time, paramer1, decade_start, 2020))
        ids_second = ids_datapoints(dwd_links, time, parameter2, decade_start, 2020)
        shared = ids_first & set(ids_second)
        print(f"{decade_start} - present: {len(shared)}")


In [12]:
# Compare which ids report both dew point and air temperature (hourly data).
common_stations(paramer1="dew_point", parameter2="air_temperature", time="hourly")

dew_point and air_temperature have the following number of stations that measured in:
1950 - present: 24
1960 - present: 46
1970 - present: 52
1980 - present: 87
1990 - present: 116
2000 - present: 143
2010 - present: 480


## Step 2. Downloading the data

In [13]:
def create_df(parameter,time,start_year,end_year, is_test = True):
    """Download (if needed) and combine the zipped CSV data of one parameter.

    Iterates over the module-level ``dwd_links[time][parameter]``. Each
    archive is read from ``downloads/<time>/<parameter>/`` when it is already
    there and downloaded into that directory otherwise. Every archive member
    whose name does not contain ``"Metadat"`` is parsed as a ``;``-separated
    CSV, and all parsed tables are concatenated.

    Args:
        parameter: Parameter key in ``dwd_links[time]``; also the directory name.
        time: Interval key in ``dwd_links``; also the directory name.
        start_year: Currently unused; kept so existing calls keep working.
        end_year: Currently unused; kept so existing calls keep working.
        is_test: When true, only the first 10 links are processed.

    Returns:
        The concatenated ``DataFrame``, or an empty one if nothing was read.
        Archives that cannot be downloaded or opened are reported and skipped.
    """
    import os  # local import: the file only imports ``exists`` from os.path up front

    limit = 10 if is_test else 99999
    target_dir = "downloads/" + time + "/" + parameter + "/"
    # Make sure the target directory exists before downloading into it.
    os.makedirs(target_dir, exist_ok=True)

    frames = []
    for processed, link in enumerate(dwd_links[time][parameter]):
        if processed >= limit:
            # Stop instead of walking the remaining links without doing work.
            break
        filename = str.split(link, "/")[-1]
        file_zip = target_dir + filename

        if not exists(file_zip):
            # Announce the download before it starts, not after it finished.
            print("downloading" + filename + "...", end=" ", flush=True)
            try:
                file_zip = wget.download(link, target_dir)
            except Exception as err:
                # One unreachable file should not abort the whole run.
                print("Not able to download:", filename, "Reason:", err)
                continue
        else:
            print("FOUND " + filename, end=" ", flush=True)
            # NOTE(review): this pause only happens for files already on
            # disk - confirm it is still wanted.
            sleep(1)
        try:
            with ZipFile(file_zip) as myzip:
                # `member` keeps `filename` intact for the error message below.
                for member in myzip.namelist():
                    if "Metadat" not in member:
                        with myzip.open(member) as myfile:
                            frames.append(pd.read_csv(myfile, sep=";"))
        except Exception as err:
            print("Not able to open:", filename, "Reason:", err)

    if not frames:
        return pd.DataFrame()
    # Concatenate once; concatenating inside the loop copies all rows each time.
    return pd.concat(frames)

# Trial run in test mode (is_test=True) on the hourly moisture data.
print(create_df(parameter="moisture", time="hourly", start_year=2020, end_year=2020, is_test=True))

FOUND stundenwerte_TF_00098_19490101_19541206_hist.zip FOUND stundenwerte_TF_02601_19490101_20211231_hist.zip 

In [None]:
# Full run (is_test=False) on the hourly dew point data.
print(create_df(parameter="dew_point", time="hourly", start_year=2020, end_year=2020, is_test=False))

FOUND stundenwerte_TD_02485_20110502_20211231_hist.zip
FOUND stundenwerte_TD_00379_20170901_20211231_hist.zip
FOUND stundenwerte_TD_03821_19900611_20211231_hist.zip
FOUND stundenwerte_TD_06182_20040601_20120327_hist.zip
FOUND stundenwerte_TD_05906_19490101_20211231_hist.zip
FOUND stundenwerte_TD_06264_20040601_20211231_hist.zip
FOUND stundenwerte_TD_02925_19760401_20211231_hist.zip
FOUND stundenwerte_TD_04485_20050201_20170301_hist.zip
FOUND stundenwerte_TD_00685_19790101_20061102_hist.zip
FOUND stundenwerte_TD_01207_20060301_20211231_hist.zip
FOUND stundenwerte_TD_02503_19750701_20030502_hist.zip
downloadingstundenwerte_TD_04039_19960207_20211231_hist.zip...
downloadingstundenwerte_TD_05642_19950320_20010928_hist.zip...
downloadingstundenwerte_TD_02814_20021101_20211231_hist.zip...
downloadingstundenwerte_TD_00303_19930901_20211231_hist.zip...
downloadingstundenwerte_TD_04261_20060301_20211231_hist.zip...
downloadingstundenwerte_TD_01339_19930301_20211231_hist.zip...
downloadingstunde

In [None]:
# Full run (is_test=False) on the hourly air temperature data.
print(create_df(parameter="air_temperature", time="hourly", start_year=2020, end_year=2020, is_test=False))

FOUND stundenwerte_TU_01303_19510101_20211231_hist.zip
FOUND stundenwerte_TU_07244_19810101_20040930_hist.zip
FOUND stundenwerte_TU_01612_19730101_20211231_hist.zip
FOUND stundenwerte_TU_00685_20020101_20061031_hist.zip
FOUND stundenwerte_TU_05068_19530101_19710101_hist.zip
FOUND stundenwerte_TU_07395_20071201_20211231_hist.zip
FOUND stundenwerte_TU_02014_19490101_20211231_hist.zip
FOUND stundenwerte_TU_05155_19480101_20140901_hist.zip
FOUND stundenwerte_TU_03897_20020101_20211231_hist.zip
FOUND stundenwerte_TU_04104_19480101_20211231_hist.zip
FOUND stundenwerte_TU_03028_19710401_20211231_hist.zip
FOUND stundenwerte_TU_00460_19610101_20211231_hist.zip
FOUND stundenwerte_TU_03879_19480101_19970101_hist.zip
FOUND stundenwerte_TU_00131_20041101_20211231_hist.zip
FOUND stundenwerte_TU_03639_20020101_20211231_hist.zip
FOUND stundenwerte_TU_00535_20040601_20211231_hist.zip
downloadingstundenwerte_TU_00891_19510101_20211231_hist.zip...
downloadingstundenwerte_TU_00954_20000501_20211231_hist.z

In [None]:
# Full run (is_test=False) on the 10-minute precipitation data.
print(create_df(parameter="precipitation", time="10_minutes", start_year=2020, end_year=2020, is_test=False))

In [None]:
# Full run (is_test=False) on the hourly precipitation data.
print(create_df(parameter="precipitation", time="hourly", start_year=2020, end_year=2020, is_test=False))

In [None]:
# "sun", "solar" and "wind" are three separate parameters. Written as the
# single string "sun,solar,wind" they matched no key in dwd_links, and the
# resulting KeyError was silently swallowed, so nothing was downloaded.
extra_parameters = ["sun", "solar", "wind"]
# dwd_links was built for the disease-model parameters only, so the hourly
# links for the extra parameters have to be fetched before create_df can use them.
dwd_links["hourly"].update(get_links(extra_parameters, ["hourly"])["hourly"])
for parameter in extra_parameters:
    try:
        create_df(parameter, "hourly", 0, 0, False)
    except Exception as err:
        # Best effort: report the failure and carry on with the next parameter.
        print("Could not process", parameter, "Reason:", err)

### The following code just emails me when the files have been downloaded or the models have been trained

In [None]:
import smtplib, ssl, os

def mail_me():
    """Send a fixed "job finished" notification e-mail through Gmail over SSL.

    The sender's password is read from the first line of ``mailingfile.txt``
    in the current working directory. Errors from reading the file or from
    the SMTP session are not caught here.
    """
    from email.message import EmailMessage  # local import: not imported at file level

    port = 465  # For SSL
    smtp_server = "smtp.gmail.com"
    sender_email = "computersays521@gmail.com"  # Enter your address
    receiver_email = "radulescu_serban_petre@yahoo.com"  # Enter receiver address

    # Close the file after reading and drop the line ending, which is not
    # part of the password.
    with open("mailingfile.txt", "r") as password_file:
        password = password_file.readline().rstrip("\r\n")

    # Build real headers: in the indented triple-quoted string the "Subject:"
    # line started with spaces, so it was not a valid header line.
    message = EmailMessage()
    message["Subject"] = "Hi there"
    message["From"] = sender_email
    message["To"] = receiver_email
    message.set_content(
        "It seems that what you were working on is done. Or you had some errors. "
        "Either way the program is completed. "
    )

    context = ssl.create_default_context()
    with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
        server.login(sender_email, password)
        server.send_message(message)

In [None]:
# Send the notification e-mail now.
mail_me()