# Download the weather data

In [4]:
# https://opendata.smhi.se/apidocs/

import requests
import pandas as pd
from pathlib import Path

api = "https://opendata-download-metobs.smhi.se/api/"

## Parameter selection

I have already defined the parameters to download, shown below. In comments, I provide the parameters' description.

If you need specific or only one parameter, change list to include the corresponding element(s), e.g., `params = [4]`

In [6]:
# Hourly data
# 1  = Air temperature, once/hour (mean)
# 4  = Wind speed,      once/hour (mean)
# 10 = Sunshine time,   once/hour

# Daily data
# 2  = Air temperature, once/day (mean), at 00:00 hrs.
# 5  = Precipitation,   once/day (sum of 24 hours), at 06:00 a.m.
# 8	 = Snow depth,      once/day, at 06:00 am

params = [2, 5, 8, 10]

In [8]:
Path("./smhi_data/").mkdir(parents=True, exist_ok=True)

stations_list = []
stations_LongLat = {}

for p in params:
    # Fetch stations for parameter p

    url = api + f"version/latest/parameter/{p}.json"
    res = requests.get(url)
    station_id = []
    for r in res.json()["station"]:
        if r["measuringStations"] == "ADDITIONAL" or r["active"] == "False":
            continue
        station_id.append(r["id"])
        stations_LongLat[r["id"]] = (r["latitude"], r["longitude"])

    stations_list.append(station_id)

# To make it easier to select stations, we can take the union of the stations for the different parameters. 
# We ignore "amount of Sunshine time" for union calculation as it has a low amount of stations 
# (which are not overlapping any other stations).
stations_list_union = list(set(stations_list[0]) & set(stations_list[1]) & set(stations_list[2]))

In [10]:
# Number of stations for each parameter.
for s in stations_list:
    print(len(s))

919
2090
1883
20


# Check swedish energy zones

To predict electricity prices, you may want to consider the Swedish electricity zones.

Sweden is divided into [four electricity regions](https://www.tekniskaverken.se/privat/other-languages/save-electricity/electricity-market/), as shown in the picture below. 

![Swedish energy zones](https://tekniskaverken.imagevault.media/publishedmedia/5o14j5u8dactydhw88la/karta-elomraden-sverige.png)

Below, I provide some simple steps on how to get data only from the stations included in a preferred zone.

In [46]:
# If above SE2 line, then it is SE1

# Below this line is SE2
a2 = (66.188660, 14.291650)
b2 = (64.297761, 21.484794)

# Below this line is SE3
a3 = (61.532329, 11.963278)
b3 = (61.156689, 17.719862)

# Below this line is SE4
a4 = (57.260111, 11.920329)
b4 = (57.177640, 18.843320)

# Function for checking above/below a line.
is_below = lambda px, py, x1, y1, x2, y2: (px - x1)*(y2-y1) - (py -y1)*(x2-x1) < 0

In [48]:
# Get a dict with key: station, and value: energy zone.
# Here we add the sun-amount again.
stations_zone = {}

for s in stations_list_union + stations_list[3]:
    if is_below(*stations_LongLat[s], *a4, *b4):
        stations_zone[s] = "SE4"
        continue
    if is_below(*stations_LongLat[s], *a3, *b3):
        stations_zone[s] = "SE3"
        continue
    if is_below(*stations_LongLat[s], *a2, *b2):
        stations_zone[s] = "SE2"
        continue
    stations_zone[s] = "SE1"

Let's say, for example, we want to keep only zone S3 because our data correspond to places in that zone.

In [50]:
data = [[s, *list(stations_LongLat[s]), stations_zone[s]] for s in (stations_list_union + stations_list[3])]

zone_frame = pd.DataFrame(data,
                          columns=["stations", "latitude", "longitude", "energy zone"])

# Keeping only zone S3
zone_frame = zone_frame.loc[zone_frame['energy zone'] == "SE3"]

Path(f"./smhi_data_2022-today/").mkdir(parents=True, exist_ok=True)
filepath = Path(f"./smhi_data_2022-today/station_zone_data.csv")

zone_frame.to_csv(filepath, index=False)
zone_frame

Unnamed: 0,stations,latitude,longitude,energy zone
2,106500,60.8321,16.0581,SE3
4,92170,59.2891,12.0542,SE3
16,84020,58.0389,14.9853,SE3
17,96310,59.5259,16.0192,SE3
18,108600,60.1829,18.3964,SE3
...,...,...,...,...
747,86655,58.5824,16.1485,SE3
748,98735,59.3534,18.0634,SE3
750,99275,59.4422,19.5020,SE3
753,78645,57.6727,18.3448,SE3


The code below fetches the data for the corresponding energy zone, i.e., zone SE3.

The output of the code block is several folders named "parameter_x" (x equals the parameter's number) included in the folder "smhi_data".

In [40]:
for p in params:
    # Go through each parameter p

    Path(f"./smhi_data/parameter_{p}").mkdir(parents=True, exist_ok=True)
    
    # special case for Sunshine time
    if p == 10:
        for s in stations_list[3]:
            # Fetch data from each station s
            url = api + f"version/latest/parameter/{p}/station/{s}/period/corrected-archive/data.csv"
            res = requests.get(url)

            if stations_zone[s] == "SE3":
                with open(f"./smhi_data/parameter_{p}/station_{s}-{stations_zone[s]}.csv", "wb") as binary_file:
                    # Write bytes to file
                    binary_file.write(res.content)
        continue

    for s in stations_list_union:
        # Fetch data from each station s
        url = api + f"version/latest/parameter/{p}/station/{s}/period/corrected-archive/data.csv"
        res = requests.get(url)

        if stations_zone[s] == "SE3":
            with open(f"./smhi_data/parameter_{p}/station_{s}-{stations_zone[s]}.csv", "wb") as binary_file:
                # Write bytes to file
                binary_file.write(res.content)

The code below creates the weather data you will use for your project.

Those data are stored in a folder named "smhi_data_2014-today" containing only data from 2014-01-01 to match, e.g., the Jordbruksverket data period.

You can change this parameter in the corresponding code part below.

In [57]:
for param in params:
    # defining the path, i.e., the folder where the data from the corresponding parameter exist, and accessing all .csv files
    p = Path(f"./smhi_data/parameter_{param}").glob("*.csv")
    Path(f"./smhi_data_2022-today/parameter_{param}").mkdir(parents=True, exist_ok=True)

    # defining the target path, i.e., the folder where the processed data file should be saved
    target_path = Path(f"./smhi_data_2022-today/parameter_{param}")
    files = [x for x in p if x.is_file()]

    # Start Date should be aligned with the data from the company:
    start_date = pd.Timestamp("2014-01-01")                       # ---> change the date here
    
    # Read, clean, re-save data
    for f in files:
        try:
            print(f) # keep track of the files toggling
    
            '''
            IMPORTANT:
            We try to find the start of the data set based on the different formats in the original files.
            If you HAVE ADDED ADDITIONAL parameters, check the corresponding files to revise the code below properly.
            '''
            for i in range(1, 20, 1): # csv files variable length header 
                try:
                    if param == 2: # this format is met in files corresponding to parameter 2
                        df = pd.read_csv(f, sep=";", skiprows=i, usecols=["Representativt dygn", "Lufttemperatur", "Kvalitet"])
                        df.rename(columns={'Representativt dygn': 'Datum'}, inplace=True)
                    elif param == 5: # this format is met in files corresponding to parameter 5
                        df = pd.read_csv(f, sep=";", skiprows=i, usecols=["Representativt dygn", "Nederbördsmängd", "Kvalitet"])
                        df.rename(columns={'Representativt dygn': 'Datum'}, inplace=True)
                    elif param == 8: # this format is met in files corresponding to parameter 8
                        df = pd.read_csv(f, sep=";", skiprows=i, usecols=["Datum", "Tid (UTC)", "Snödjup", "Kvalitet"])
                    elif param == 10: # this format is met in files corresponding to parameter 10
                        df = pd.read_csv(f, sep=";", skiprows=i, usecols=["Datum", "Tid (UTC)", "Solskenstid", "Kvalitet"])
                    break
                except:
                    continue
        
            # Check that the file has correct timespan
            last_time = df["Datum"].tail(1).apply(pd.to_datetime)
        
            if last_time.iloc[0] > start_date:
                # Quick remove all unneccecary data and save
                idx = df["Datum"].str.contains("2014").idxmax()   # ---> change the date here, too
                df = df.drop(df.index[:idx])
                df.to_csv((target_path / f.name), index=False)
                
        except:
            print("Bad file")
            continue