# Download Selected Time Series from DWD FTP Server

## 1. Development, Trial

In [194]:
import pandas as pd
import os

In [2]:
# Show what is in the current working directory of the notebook.
os.listdir(os.curdir)

['.ipynb_checkpoints',
 'data',
 'Exercises_gdms0200_V001.docx',
 'Exercises_gdms0200_V001.pdf',
 'gdms0201_DWD_NRW_fun_V001.qgz',
 'gnb0201_DWD_station_compare_V001.ipynb',
 'gnb0203_DWD_download_time_series_V001.ipynb',
 'gnbNNNN_ftp_crawler_V001.ipynb',
 'Untitled.ipynb']

In [152]:
# Read the DWD station table that was selected with QGIS and exported as a
# semicolon-separated CSV file.
df = pd.read_csv(
    "data/generated/DWD_stations_NRW_long_TS.csv",
    delimiter=";",
)

In [153]:
# Display the station table loaded from the CSV file.
df

Unnamed: 0,station_id,date_from,date_to,altitude,latitude,longitude,name,state
0,1078,1940-01-01,2018-12-31,37,51.296,6.7686,Düsseldorf,Nordrhein-Westfalen
1,1300,1931-01-01,2018-12-31,351,51.254,8.1565,Eslohe,Nordrhein-Westfalen
2,1303,1888-01-01,2018-12-31,150,51.4041,6.9677,Essen-Bredeney,Nordrhein-Westfalen
3,1327,1937-01-01,2018-12-31,147,50.7119,6.7905,Weilerswist-Lommersum,Nordrhein-Westfalen
4,1590,1937-01-01,2018-12-31,37,51.4942,6.2463,Geldern-Walbeck,Nordrhein-Westfalen
5,2110,1938-01-01,2018-12-31,57,51.0411,6.1042,Heinsberg-Schleiden,Nordrhein-Westfalen
6,2483,1928-01-01,2018-12-31,839,51.1803,8.4891,Kahler Asten,Nordrhein-Westfalen
7,2497,1937-01-01,2018-12-31,505,50.5014,6.5264,Kall-Sistig,Nordrhein-Westfalen
8,2968,1903-01-01,2018-12-31,43,50.9894,6.9777,Köln-Stammheim,Nordrhein-Westfalen
9,3028,1951-01-01,2018-12-31,157,51.7855,8.8388,"Lippspringe, Bad",Nordrhein-Westfalen


In [154]:
# The time series filenames look like jahreswerte_KL_01078_19400101_20181231_hist.zip
# The filename can be split at "_"; the station_id is element 2 (0-based counting).
# The search pattern uniquely identifying the station_id would be "_01078_".

#s_pattern = [] 
#for station_id in df["station_id"]:
#    s_pattern.append ("_"+("%05d" % station_id)+"_")
#for s in s_pattern:
#    print(s)

# But there is a smarter way to find the full filenames to be downloaded.

In [155]:
# Connection settings for the anonymous login on the DWD open data server.
server = "opendata.dwd.de"
user, passwd = "anonymous", ""

# Remote directory with the historical annual "kl" archives.
ftpdir = (
    "/climate_environment/CDC/observations_germany"
    "/climate/annual/kl/historical/"
)

# Local directory (relative to the notebook) for the DWD files.
localdir = "data/DWD"

In [156]:
# Create the local directory if it is missing (an existing directory is not an
# error) and show what it currently contains.
os.makedirs(name=localdir, exist_ok=True)
os.listdir(path=localdir)

['jahreswerte_KL_01078_19400101_20181231_hist',
 'jahreswerte_KL_01078_19400101_20181231_hist.zip',
 'jahreswerte_KL_01303_18880101_20181231_hist',
 'jahreswerte_KL_01303_18880101_20181231_hist.zip',
 'KL_Jahreswerte_Beschreibung_Stationen.txt',
 'KL_Tageswerte_Beschreibung_Stationen.txt',
 'TU_Stundenwerte_Beschreibung_Stationen.txt']

In [157]:
import ftplib

# Connect with a socket timeout (seconds): without it an unreachable or
# stalled server blocks this cell forever instead of raising an error.
ftp = ftplib.FTP(server, timeout=30)
res = ftp.login(user=user, passwd=passwd)
print(res)

230 Login successful.


In [138]:
def cb_append(input, lst):
    """Line callback for ``ftp.retrlines``: append ``input`` to ``lst``.

    ``retrlines`` calls its callback with one argument (the received line).
    The target list has to be bound beforehand, e.g. with
    ``functools.partial(cb_append, lst=lines)``.

    Parameters
    ----------
    input : str
        The line handed over by ``retrlines``.
    lst : list
        The list that collects the lines; it is modified in place.
    """
    lst.append(input)

In [None]:
# Start with an empty list; it is filled with the lines returned by the
# directory listing command.
lines = list()

In [70]:
# The common way to retrieve the lines is: ftp.retrlines("LIST", callback_function).
# The callback is called with exactly one argument, so its definition usually looks like
# def callback_function(input):
#     [...]
# It is not directly possible to pass a second argument such as the list "lines" to the function.
# functools.partial is used to bind that argument in advance.
import functools
# The callback defined in this notebook is cb_append (the name "cb" does not exist).
s = ftp.retrlines("LIST "+ftpdir, functools.partial(cb_append, lst = lines))

In [113]:
# Cut the first listing line at fixed column positions to inspect its fields.
line = lines[0]
cuts = [0, 1, 10, 15, 20, 30, 42, 46, 49, 55, None]
[line[start:stop] for start, stop in zip(cuts, cuts[1:])]

['-',
 'rw-r--r--',
 '    1',
 ' 9261',
 '     15101',
 '      232713',
 ' Oct',
 ' 07',
 ' 08:28',
 ' KL_Jahreswerte_Beschreibung_Stationen.txt']

In [131]:
# Build the lookup {station_id: file name} from the directory listing.
fdict = {}
for line in lines:
    # Fixed column positions of the listing line: type flag, size, file name.
    [ftype, fsize, fname] = [line[0:1], int(line[31:42]), line[56:]]
    lst = fname.split("_")
    if len(lst)<3:
        print("Unexpected number of '_' delimiters. Skip string.")
        # Go on with the next line. Otherwise the code below would work with
        # a stale dsid_str and register this name under a wrong station_id.
        continue

    [dtres, dtype, dsid_str] = lst[0:3]

    try:
        dsid = int(dsid_str)
    except ValueError:
        # Element 2 is not a number, e.g. for the station description file.
        print("No station_id at pos. 2. Skip file.")
        continue

    print("station_id = %d" % dsid)
    fdict[dsid] = fname



No station_id at pos. 2. Skip file.
station_id = 1
station_id = 3
station_id = 44
station_id = 52
station_id = 61
station_id = 70
station_id = 71
station_id = 72
station_id = 73
station_id = 78
station_id = 91
station_id = 98
station_id = 102
station_id = 116
station_id = 129
station_id = 131
station_id = 132
station_id = 142
station_id = 150
station_id = 151
station_id = 154
station_id = 161
station_id = 164
station_id = 167
station_id = 169
station_id = 172
station_id = 174
station_id = 175
station_id = 177
station_id = 183
station_id = 184
station_id = 185
station_id = 186
station_id = 191
station_id = 193
station_id = 198
station_id = 202
station_id = 207
station_id = 211
station_id = 217
station_id = 221
station_id = 222
station_id = 231
station_id = 232
station_id = 243
station_id = 257
station_id = 259
station_id = 268
station_id = 273
station_id = 282
station_id = 283
station_id = 284
station_id = 288
station_id = 294
station_id = 298
station_id = 303
station_id = 314
station_i

In [133]:
# Show the archive name found on the server for every selected station.
# A station_id that is missing in fdict raises a KeyError here.
for station_id in df["station_id"]:
    archive_name = fdict[station_id]
    print("%5d -> %s" % (station_id, archive_name))

 1078 -> jahreswerte_KL_01078_19400101_20181231_hist.zip
 1300 -> jahreswerte_KL_01300_19310101_20181231_hist.zip
 1303 -> jahreswerte_KL_01303_18880101_20181231_hist.zip
 1327 -> jahreswerte_KL_01327_19370101_20181231_hist.zip
 1590 -> jahreswerte_KL_01590_19370101_20181231_hist.zip
 2110 -> jahreswerte_KL_02110_19630101_20181231_hist.zip
 2483 -> jahreswerte_KL_02483_19280101_20181231_hist.zip
 2497 -> jahreswerte_KL_02497_19370101_20181231_hist.zip
 2968 -> jahreswerte_KL_02968_19030101_20181231_hist.zip
 3028 -> jahreswerte_KL_03028_19510101_20181231_hist.zip
 4063 -> jahreswerte_KL_04063_19510101_20181231_hist.zip
 4371 -> jahreswerte_KL_04371_19310101_20181231_hist.zip


## 2. Summarized and Cleaned

### Create the list of station_ids whose time series have to be downloaded.

In [203]:
import pandas as pd

# Read the DWD station table that was selected with QGIS and exported as a
# semicolon-separated CSV file.
df = pd.read_csv(
    "data/generated/DWD_stations_NRW_long_TS.csv",
    delimiter=";",
)

# Print the id of every selected station, one per line.
# (As a plain list they would be: list(df["station_id"].values))
for station_id in df["station_id"]:
    print(station_id)

1078
1300
1303
1327
1590
2110
2483
2497
2968
3028
4063
4371


### FTP and Local Settings

In [210]:
# Connection settings for the anonymous login on the DWD open data server.
server = "opendata.dwd.de"
user, passwd = "anonymous", ""

# Remote directory with the historical annual "kl" archives.
ftpdir = (
    "/climate_environment/CDC/observations_germany"
    "/climate/annual/kl/historical/"
)

# Local directory (relative to the notebook) for the DWD files.
localdir = "data/DWD"

### Create Local Dir

In [211]:
import os

# Create the local directory; exist_ok=True keeps the call from failing when
# the directory is already there. Then show its current contents.
os.makedirs(name=localdir, exist_ok=True)
os.listdir(path=localdir)

['jahreswerte_KL_01078_19400101_20181231_hist',
 'jahreswerte_KL_01078_19400101_20181231_hist.zip',
 'jahreswerte_KL_01303_18880101_20181231_hist',
 'jahreswerte_KL_01303_18880101_20181231_hist.zip',
 'KL_Jahreswerte_Beschreibung_Stationen.txt',
 'KL_Tageswerte_Beschreibung_Stationen.txt',
 'TU_Stundenwerte_Beschreibung_Stationen.txt']

### Define function generating a dictionary from ftp dir with station_id as key.

In [214]:
# To summarize everything in one function:
def gen_dir_dict_with_key_station_id(ftp, ftpdir, verbose=True):
    """Map station ids to the file names found in an FTP directory.

    Runs ``LIST`` on ``ftpdir`` and splits the file name of every entry at
    "_". In names such as ``jahreswerte_KL_01078_19400101_20181231_hist.zip``
    element 2 (0-based) is the station id. Entries without an integer at that
    position are skipped. If two entries carry the same station id, the later
    one wins.

    Parameters
    ----------
    ftp : object with a ``retrlines(cmd, callback)`` method
        Usually a connected and logged-in ``ftplib.FTP`` instance.
    ftpdir : str
        Remote directory to be listed.
    verbose : bool, default True
        Print one status line per directory entry. Pass ``False`` to avoid
        flooding the notebook output for large directories.

    Returns
    -------
    dict or None
        ``{station_id (int): file name (str)}``, or ``None`` if the listing
        failed or returned no lines (an error message is printed).
    """
    # Imported here so that the function does not depend on the notebook cell
    # that imports ftplib having been run before.
    import ftplib

    lines = []
    try:
        ftp.retrlines("LIST " + ftpdir, lines.append)
    except ftplib.all_errors as err:
        # Only FTP and network errors are handled; KeyboardInterrupt and
        # programming errors are no longer swallowed.
        print("Error: ftp.retrlines() failed. ftp timeout? Reconnect!")
        print("Reason: %s" % (err,))
        return None

    if not lines:
        print("Error: ftp dir is empty")
        return None

    fdict = {}
    for line in lines:
        # Assumes the common 9-field Unix-style listing (permissions, links,
        # owner, group, size, month, day, time or year, name). Splitting on
        # whitespace at most 8 times keeps blanks inside the name and does not
        # depend on fixed column widths.
        fields = line.split(None, 8)
        if len(fields) < 9:
            if verbose:
                print("Unexpected directory listing line. Skip.")
            continue
        fname = fields[8]

        parts = fname.split("_")
        if len(parts) < 3:
            if verbose:
                print("Unexpected number of '_' delimiters. Skip.")
            continue

        try:
            dsid = int(parts[2])
        except ValueError:
            if verbose:
                print("%s : No station_id where excepted. Skip." % fname)
            continue

        if verbose:
            print("%s : station_id = %5d" % (fname, dsid))
        fdict[dsid] = fname

    return fdict

### Connect and Retrieve

In [215]:
import ftplib

# Connect with a socket timeout (seconds): without it an unreachable or
# stalled server blocks this cell forever instead of raising an error.
ftp = ftplib.FTP(server, timeout=30)
res = ftp.login(user=user, passwd=passwd)
print(res)

230 Login successful.


In [216]:
# List the remote directory and build the lookup with station_id as key.
fdict = gen_dir_dict_with_key_station_id(ftp=ftp, ftpdir=ftpdir)

KL_Jahreswerte_Beschreibung_Stationen.txt : No station_id where excepted. Skip.
jahreswerte_KL_00001_19310101_19851231_hist.zip : station_id =     1
jahreswerte_KL_00003_18510101_20101231_hist.zip : station_id =     3
jahreswerte_KL_00044_19720101_20181231_hist.zip : station_id =    44
jahreswerte_KL_00052_19730101_20011231_hist.zip : station_id =    52
jahreswerte_KL_00061_19760101_19771231_hist.zip : station_id =    61
jahreswerte_KL_00070_19740101_19851231_hist.zip : station_id =    70
jahreswerte_KL_00071_19870101_20181231_hist.zip : station_id =    71
jahreswerte_KL_00072_19790101_19941231_hist.zip : station_id =    72
jahreswerte_KL_00073_19530101_20181231_hist.zip : station_id =    73
jahreswerte_KL_00078_19610101_20181231_hist.zip : station_id =    78
jahreswerte_KL_00091_19790101_20181231_hist.zip : station_id =    91
jahreswerte_KL_00098_18870101_19541231_hist.zip : station_id =    98
jahreswerte_KL_00102_20020101_20181231_hist.zip : station_id =   102
jahreswerte_KL_00116_19

jahreswerte_KL_05419_18820101_20051231_hist.zip : station_id =  5419
jahreswerte_KL_05424_19500101_20181231_hist.zip : station_id =  5424
jahreswerte_KL_05426_19530101_20181231_hist.zip : station_id =  5426
jahreswerte_KL_05429_19010101_19831231_hist.zip : station_id =  5429
jahreswerte_KL_05431_19470101_19751231_hist.zip : station_id =  5431
jahreswerte_KL_05433_19570101_20181231_hist.zip : station_id =  5433
jahreswerte_KL_05434_19410101_19911231_hist.zip : station_id =  5434
jahreswerte_KL_05435_19810101_20031231_hist.zip : station_id =  5435
jahreswerte_KL_05440_18810101_20181231_hist.zip : station_id =  5440
jahreswerte_KL_05442_18950101_20051231_hist.zip : station_id =  5442
jahreswerte_KL_05445_19910101_20071231_hist.zip : station_id =  5445
jahreswerte_KL_05453_19930101_20031231_hist.zip : station_id =  5453
jahreswerte_KL_05460_19790101_19831231_hist.zip : station_id =  5460
jahreswerte_KL_05463_19950101_19971231_hist.zip : station_id =  5463
jahreswerte_KL_05467_18840101_2011

### Select the file names to be downloaded from the dict and download with FTP.