# Exploring the effects of changing agroclimatological conditions on potential occurence of major winter wheat diseases: A spatio-temporal analysis for Germany from 1960 to today

## Step 1. Exploring the data

From the __[DWD website](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/)__ we can access the following data:

1. air temperature
1. cloud types
3. cloudiness
4. dew point
5. extreme wind
6. moisture
7. precipitation
8. pressure
9. soil temperature
10. solar
11. sun
12. visibility
13. weather phenomena
14. wind
15. wind synop

In [2]:
import pandas as pd
import re
import requests
from requests_html import HTMLSession
from typing import List
from functions import get_date, hide_toggle
import wget
from zipfile import ZipFile
from os.path import exists
from time import sleep

In [3]:
def get_links(parameters:List[str], time:List[str] = ["1_minute","5_minutes","10_minutes","hourly"]) -> dict:
    dwd_links = {interval:{key: None for key in parameters} for interval in time}
    for interval in time:
        for parameter in parameters:
            url = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/' + str(interval) + '/' + parameter + '/historical/'
            try:
                session = HTMLSession()
                response = session.get(url)
                dwd_links[interval][parameter] = response.html.absolute_links

            except requests.exceptions.RequestException as e:
                print(e)
    return dwd_links

def count_datapoints(dwd_links:dict,time,parameter:str, start_year:int, end_year:int) -> int:
    i=0
    for link in dwd_links[time][parameter]:
        try:
            start_interval = int(get_date(link)[0])
            end_interval = int(get_date(link)[1])
            if ((start_interval <= start_year)  & (end_interval >= end_year)):
                i = i+1
        except:
            pass
    return i


In [4]:

def show_available_data(dwd_links:dict,time, parameters: List[str]):
    data_balance = pd.DataFrame(columns = parameters, index = [str(i) + "'s - present" for i in range(1950,2020,10)])
    for parameter in parameters:
        for i in range(1950,2020,10):
            data_balance[parameter][str(i)+ "'s - present"] = count_datapoints(dwd_links,time,parameter, i, 2020)
    print(data_balance)

### From this data, the most relevant for disease models are air temperature, dew point, moisture and precipitation. The following table summarizes the hourly data points we have for each parameter and each period.

In [5]:
parameters =["air_temperature","dew_point", "moisture", "precipitation"]
dwd_links = get_links(parameters)
show_available_data(dwd_links,"hourly",parameters)

                 air_temperature dew_point moisture precipitation
1950's - present              29        48       48             0
1960's - present              67        57       57             0
1970's - present              75        59       59             0
1980's - present              97       118      118             0
1990's - present             129       149      149             0
2000's - present             155       185      185           144
2010's - present             481       481      481           927


In [6]:
def ids_datapoints(dwd_links:dict,time,parameter:str, start_year:int, end_year:int) -> List[str]:
    list = []
    for link in dwd_links[time][parameter]:
        try:
            start_interval = int(get_date(link)[0])
            end_interval = int(get_date(link)[1])
            if ((start_interval <= start_year)  & (end_interval >= end_year)):
                id = re.findall("_\d{5}_",str(link))[0]
                ##print("link")
                list.append(id)
        except:
            pass
    return list
def common_stations(paramer1:str, parameter2:str, time):
    print(f"{paramer1} and {parameter2} have the following number of stations that measured in:")
    for i in range(1950,2020,10):
        list1 = ids_datapoints(dwd_links,time,paramer1,i,2020)
        list2 = ids_datapoints(dwd_links,time,parameter2,i,2020)
        list1_as_set = set(list1)
        intersection = list1_as_set.intersection(list2)
        intersection_as_list = list(intersection)
        print(str(i) + " - present" + ": " + str(len(intersection_as_list)))


In [7]:
common_stations("dew_point", "air_temperature","hourly")

dew_point and air_temperature have the following number of stations that measured in:
1950 - present: 24
1960 - present: 46
1970 - present: 52
1980 - present: 87
1990 - present: 116
2000 - present: 143
2010 - present: 480


## Step 2. Downloading the data

In [8]:
def create_df(parameter,time,start_year,end_year, is_test = True):
    test_count = 0
    if(is_test):
        limit =10
    else:
        limit = 99999
    df = pd.DataFrame()
    for link in dwd_links[time][parameter]:   
        if (test_count <= limit):
            test_count+=1
            filename = str.split(link,"/")[-1]
            
            if(not exists("downloads/" + time + "/" + parameter + "/" + filename)):
                file_zip = wget.download(link,"downloads/"+time+"/" + parameter + "/")
                print("downloading" + filename + "...",end = " ", flush=True)
            else:
                file_zip = "downloads/"+ time+ "/" + parameter + "/" + filename
                print("FOUND "+ filename, end =" ", flush=True)
                sleep(1)
            try:    
                with ZipFile(file_zip) as myzip:
                    for filename in myzip.namelist():
                        if "Metadat" not in filename:
                            with myzip.open(filename) as myfile:
                                this_df = pd.read_csv(myfile, sep =";")
                                df = pd.concat([df,this_df])
            except:
                print("Not able to open:",filename, "Reason: unknown.")
    return df

print(create_df("moisture","hourly",2020,2020,True))

FOUND stundenwerte_TF_00656_19490101_20211231_hist.zip FOUND stundenwerte_TF_00876_19750701_19901231_hist.zip FOUND stundenwerte_TF_04063_20030701_20211231_hist.zip FOUND stundenwerte_TF_04354_20060101_20211231_hist.zip FOUND stundenwerte_TF_04160_20040701_20211231_hist.zip FOUND stundenwerte_TF_00953_19500701_20211231_hist.zip FOUND stundenwerte_TF_02750_20051101_20211231_hist.zip FOUND stundenwerte_TF_01886_20160811_20211231_hist.zip FOUND stundenwerte_TF_03857_20050101_20211231_hist.zip FOUND stundenwerte_TF_07075_20050201_20211231_hist.zip FOUND stundenwerte_TF_06344_20041201_20211231_hist.zip         STATIONS_ID  MESS_DATUM  QN_8  ABSF_STD  VP_STD  TF_STD  P_STD  \
0               656  1949010100     3       3.9     4.9    -1.8  941.9   
1               656  1949010103     3       4.2     5.3    -1.4  941.9   
2               656  1949010106     3       4.2     5.3    -0.8  941.9   
3               656  1949010109     3       4.2     5.3    -1.4  941.9   
4               656  1949

In [9]:
print(create_df("dew_point","hourly",2020,2020,False))

FOUND stundenwerte_TD_06109_20040901_20211231_hist.zip FOUND stundenwerte_TD_04275_20041201_20211231_hist.zip FOUND stundenwerte_TD_00880_19830103_20211231_hist.zip FOUND stundenwerte_TD_02115_19520301_20211231_hist.zip FOUND stundenwerte_TD_03126_19750701_20211231_hist.zip FOUND stundenwerte_TD_03668_19490101_20211231_hist.zip FOUND stundenwerte_TD_02362_20040801_20211231_hist.zip FOUND stundenwerte_TD_00164_19750701_20211231_hist.zip FOUND stundenwerte_TD_07393_20100101_20211231_hist.zip FOUND stundenwerte_TD_00377_20041001_20211231_hist.zip FOUND stundenwerte_TD_01262_19920517_20211231_hist.zip         STATIONS_ID  MESS_DATUM  QN_8    TT    TD  eor
0              6109  2004090100     1  12.4  11.2  eor
1              6109  2004090101     1  12.5  11.2  eor
2              6109  2004090102     1  13.0  11.2  eor
3              6109  2004090103     1  12.5  11.1  eor
4              6109  2004090104     1  12.1  10.7  eor
...             ...         ...   ...   ...   ...  ...
259385    

In [10]:
print(create_df("air_temperature","hourly",2020,2020,False))

FOUND stundenwerte_TU_14003_19610103_19920629_hist.zip FOUND stundenwerte_TU_02905_20020101_20211231_hist.zip downloadingstundenwerte_TU_07374_20060301_20211231_hist.zip... FOUND stundenwerte_TU_00722_19510101_20211231_hist.zip FOUND stundenwerte_TU_02410_19610102_20211231_hist.zip FOUND stundenwerte_TU_03513_19890501_20211231_hist.zip FOUND stundenwerte_TU_01526_20041101_20211231_hist.zip FOUND stundenwerte_TU_07412_20061001_20211231_hist.zip FOUND stundenwerte_TU_01358_19510101_20211231_hist.zip FOUND stundenwerte_TU_00880_19560101_20211231_hist.zip FOUND stundenwerte_TU_01239_20040701_20080923_hist.zip        STATIONS_ID  MESS_DATUM  QN_9  TT_TU  RF_TU  eor
0            14003  1961010306     1   -4.0   93.0  eor
1            14003  1961010307     1   -3.0   86.0  eor
2            14003  1961010308     1   -2.0   86.0  eor
3            14003  1961010309     1    4.0   60.0  eor
4            14003  1961010310     1    5.0   65.0  eor
...            ...         ...   ...    ...    ... 

In [11]:
print(create_df("precipitation","10_minutes",2020,2020,False))

downloading10minutenwerte_nieder_00517_20041102_20060629_hist.zip... downloading10minutenwerte_nieder_01282_20200101_20211231_hist.zip... downloading10minutenwerte_nieder_03432_20060915_20091231_hist.zip... downloading10minutenwerte_nieder_02331_20200101_20211231_hist.zip... downloading10minutenwerte_nieder_00427_20100101_20191231_hist.zip... downloading10minutenwerte_nieder_01216_20100101_20191231_hist.zip... downloading10minutenwerte_nieder_05854_20100101_20191231_hist.zip... downloading10minutenwerte_nieder_05162_20040708_20091231_hist.zip... downloading10minutenwerte_nieder_04393_20100101_20191231_hist.zip... downloading10minutenwerte_nieder_05797_20051201_20091231_hist.zip... downloading10minutenwerte_nieder_05871_20200101_20211231_hist.zip...         STATIONS_ID    MESS_DATUM   QN  RWS_DAU_10  RWS_10  RWS_IND_10    QN  \
0               517  200411021400  3.0        -999  -999.0        -999   NaN   
1               517  200411021410  3.0        -999     0.0        -999   NaN   
2

In [12]:
print(create_df("precipitation","hourly",2020,2020,False))

downloadingstundenwerte_RR_01435_20021101_20211231_hist.zip... downloadingstundenwerte_RR_01336_20040901_20211231_hist.zip... downloadingstundenwerte_RR_00978_20050201_20211231_hist.zip... downloadingstundenwerte_RR_01056_20051201_20211231_hist.zip... downloadingstundenwerte_RR_04039_19960207_20211231_hist.zip... downloadingstundenwerte_RR_07368_20071101_20211231_hist.zip... downloadingstundenwerte_RR_01544_19950901_20211231_hist.zip... downloadingstundenwerte_RR_00731_20071201_20211231_hist.zip... downloadingstundenwerte_RR_06093_20040501_20211231_hist.zip... downloadingstundenwerte_RR_03304_20070401_20211231_hist.zip... downloadingstundenwerte_RR_07077_20041101_20211231_hist.zip...         STATIONS_ID  MESS_DATUM  QN_8    R1  RS_IND  WRTR  eor
0              1435  2002110106     1   0.0       0  -999  eor
1              1435  2002110206     1   3.1       1  -999  eor
2              1435  2002110306     1   0.0       0  -999  eor
3              1435  2002110406     1   0.0       1  -9

### The following code is written just to email me when the files are downloaded/models are trained

In [16]:
import smtplib, ssl, os

def mail_me():
    port = 465  # For SSL
    smtp_server = "smtp.gmail.com"
    sender_email = "computersays521@gmail.com"  # Enter your address
    receiver_email = "radulescu_serban_petre@yahoo.com"  # Enter receiver address
    password = open("mailingfile.txt","r").readlines()[0]
    message = """\
    Subject: Hi there

    It seems that what you were working on is done. Or you had some errors. Either way the program is completed. """

    context = ssl.create_default_context()
    with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
        server.login(sender_email, password)
        server.sendmail(sender_email, receiver_email, message)

In [17]:
mail_me()