# Seasonal weather data for major EU cities

- First we will scrape https://climatedata.eu/ for the cities in our defined list.
- Because climatedata uses special location codes in its URLs, we will define a dictionary that maps each city to its code
- After scraping the data we can subset for seasonal avg temperatures, precipitation and sunshine

In [1]:
import pandas as pd
import lxml

## 1. Getting the data
### 1.1 First, we define a dictionary to hold the url codes for each city

In [2]:
# City name -> location code used in the `loc` parameter of the climatedata.eu URL.
# NOTE(review): Luxembourg's code starts with "be" like Brussels, unlike the
# per-country prefixes of the other entries -- confirm it points at the right page.
city_codes = {
    "Amsterdam": "nlxx0002",
    "Athens": "grxx0004",
    "Belgrade": "srxx0005",
    "Berlin": "gmxx0007",
    "Bern": "szxx0006",
    "Bratislava": "loxx0001",
    "Brussels": "bexx0005",
    "Bucharest": "roxx0003",
    "Budapest": "huxx0002",
    "Copenhagen": "daxx0009",
    "Dublin": "eixx0014",
    "Helsinki": "fixx0002",
    "Lisbon": "poxx0039",
    "Ljubljana": "sixx0002",
    "London": "ukxx0085",
    "Luxembourg": "bexx0027",
    "Madrid": "spxx0050",
    "Monaco": "mnxx0001",
    "Nicosia": "cyxx0005",
    "Oslo": "noxx0029",
    "Paris": "frxx0076",
    "Prague": "ezxx0012",
    "Reykjavik": "icxx0002",
    "Riga": "lgxx0004",
    "Rome": "itxx0067",
    "Sarajevo": "bkxx0004",
    "Skopje": "mkxx0001",
    "Sofia": "buxx0005",
    "Stockholm": "swxx0031",
    "Tallinn": "enxx0004",
    "Tirana": "alxx0002",
    "Valletta": "mtxx0001",
    "Vienna": "auxx0025",
    "Vilnius": "lhxx0005",
    "Warsaw": "plxx0028",
    "Zagreb": "hrxx0005",
}

In [3]:
# Index 2 of the tables parsed from the Amsterdam page is the Jan-June half of the climate table.
amsterdam1 = pd.read_html('https://www.climatedata.eu/climate.php?loc=nlxx0002&lang=en')[2]
amsterdam1

Unnamed: 0.1,Unnamed: 0,Jan,Feb,Mar,Apr,May,June
0,Average high in °C,5,6,9,12,17,19
1,Average low in °C,1,0,2,4,8,10
2,Av. precipitation - mm,62,43,58,41,48,67
3,Days with precip.,17,13,17,14,14,14
4,Hours of sunshine,53,80,116,166,217,200


In [4]:
# Index 3 of the tables parsed from the same Amsterdam page is the July-Dec half.
amsterdam2 = pd.read_html('https://www.climatedata.eu/climate.php?loc=nlxx0002&lang=en')[3]
amsterdam2

Unnamed: 0.1,Unnamed: 0,July,Aug,Sep,Oct,Nov,Dec
0,Average high in °C,21,22,18,14,9,7
1,Average low in °C,13,12,10,7,4,2
2,Av. precipitation - mm,65,61,82,85,89,75
3,Days with precip.,13,13,16,17,19,18
4,Hours of sunshine,208,201,133,103,58,45


In [5]:
# Same lookup for Athens: table index 2 holds Jan-June.
athens1 = pd.read_html('https://www.climatedata.eu/climate.php?loc=grxx0004&lang=en')[2]
athens1

Unnamed: 0.1,Unnamed: 0,Jan,Feb,Mar,Apr,May,June
0,Average high in °C,14,14,16,19,24,29
1,Average low in °C,7,7,8,11,16,20
2,Av. precipitation - mm,48,40,39,26,15,5
3,Days with precip.,13,13,11,9,6,4
4,Hours of sunshine,129,136,183,230,291,336


In [6]:
# Same lookup for Athens: table index 3 holds July-Dec.
athens2 = pd.read_html('https://www.climatedata.eu/climate.php?loc=grxx0004&lang=en')[3]
athens2

Unnamed: 0.1,Unnamed: 0,July,Aug,Sep,Oct,Nov,Dec
0,Average high in °C,32,32,28,23,19,15
1,Average low in °C,23,23,20,16,12,9
2,Av. precipitation - mm,5,7,9,47,55,64
3,Days with precip.,2,1,4,9,11,14
4,Hours of sunshine,364,340,277,208,152,127


### 1.2 Now we'll write a function to gather all the cities' data in one go

In [7]:
# Three-city subset of the city codes, for quick trial runs of the scraper.
test_dict = {
    "Amsterdam": "nlxx0002",
    "Athens": "grxx0004",
    "Vienna": "auxx0025",
}

In [8]:
def get_data(city_dict) -> pd.DataFrame:
    """Scrape the monthly climate tables for every city in ``city_dict``.

    For each ``city -> code`` pair the climatedata.eu page is downloaded once,
    its Jan-June and July-Dec tables are merged into a single twelve-month
    table, and a ``city`` column is added.

    Parameters
    ----------
    city_dict : dict
        Maps a city name to the location code used in the ``loc`` URL parameter.

    Returns
    -------
    pandas.DataFrame
        The per-city tables stacked on top of each other (each city keeps its
        own row index). Empty if ``city_dict`` is empty or every city failed.

    Warns
    -----
    UserWarning
        Emitted once at the end, listing every city that could not be scraped
        together with the error that was raised for it.
    """
    import warnings

    frames = []
    errors = []

    for city, code in city_dict.items():
        try:
            # download and parse the page once; both half-year tables come from it
            tables = pd.read_html(f'https://www.climatedata.eu/climate.php?loc={code}&lang=en')

            # tables[2] is Jan-June, tables[3] is July-Dec; merge joins them on
            # the columns they share (the weather-metric label column)
            table_combo = tables[2].merge(tables[3]).assign(city=city)
            frames.append(table_combo)

        except Exception as exc:
            # keep going with the remaining cities, but remember what failed;
            # `Exception` (not a bare except) lets Ctrl-C interrupt a long scrape
            errors.append((city, exc))

    if errors:
        details = "; ".join(f"{city} ({type(exc).__name__}: {exc})" for city, exc in errors)
        warnings.warn(f"Could not fetch data for: {details}", stacklevel=2)

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not frames:
        return pd.DataFrame()

    # concatenate once at the end instead of growing a frame inside the loop
    # (DataFrame.append no longer exists in pandas >= 2.0)
    return pd.concat(frames)
    

In [27]:
test_weather = get_data(test_dict)

In [10]:
city_weather = get_data(city_codes)

In [11]:
city_weather.head(20)

Unnamed: 0.1,Unnamed: 0,Jan,Feb,Mar,Apr,May,June,July,Aug,Sep,Oct,Nov,Dec,city
0,Average high in °C,5,6,9,12,17,19,21,22,18,14,9,7,Amsterdam
1,Average low in °C,1,0,2,4,8,10,13,12,10,7,4,2,Amsterdam
2,Av. precipitation - mm,62,43,58,41,48,67,65,61,82,85,89,75,Amsterdam
3,Days with precip.,17,13,17,14,14,14,13,13,16,17,19,18,Amsterdam
4,Hours of sunshine,53,80,116,166,217,200,208,201,133,103,58,45,Amsterdam
0,Average high in °C,14,14,16,19,24,29,32,32,28,23,19,15,Athens
1,Average low in °C,7,7,8,11,16,20,23,23,20,16,12,9,Athens
2,Av. precipitation - mm,48,40,39,26,15,5,5,7,9,47,55,64,Athens
3,Days with precip.,13,13,11,9,6,4,2,1,4,9,11,14,Athens
4,Hours of sunshine,129,136,183,230,291,336,364,340,277,208,152,127,Athens


## 2. Now that we have our dataframe, it's time to inspect and subset
### 2.1 checking dtypes and updating colnames

In [12]:
city_weather.dtypes

Unnamed: 0    object
Jan            int64
Feb            int64
Mar            int64
Apr            int64
May            int64
June           int64
July           int64
Aug            int64
Sep            int64
Oct            int64
Nov            int64
Dec            int64
city          object
dtype: object

In [13]:
# The metric-label column comes out of the scrape as "Unnamed: 0"; give it a descriptive name.
city_weather = city_weather.rename(columns={"Unnamed: 0":"weather"})

In [14]:
city_weather.head()

Unnamed: 0,weather,Jan,Feb,Mar,Apr,May,June,July,Aug,Sep,Oct,Nov,Dec,city
0,Average high in °C,5,6,9,12,17,19,21,22,18,14,9,7,Amsterdam
1,Average low in °C,1,0,2,4,8,10,13,12,10,7,4,2,Amsterdam
2,Av. precipitation - mm,62,43,58,41,48,67,65,61,82,85,89,75,Amsterdam
3,Days with precip.,17,13,17,14,14,14,13,13,16,17,19,18,Amsterdam
4,Hours of sunshine,53,80,116,166,217,200,208,201,133,103,58,45,Amsterdam


### 2.2 Grouping months into seasons

#### Keep in mind that the data is not standardised (e.g. we have temperatures in Celsius, precipitation in mm, and counts of days and hours)

In [15]:
test_weather

Unnamed: 0.1,Unnamed: 0,Jan,Feb,Mar,Apr,May,June,July,Aug,Sep,Oct,Nov,Dec,city
0,Average high in °C,5,6,9,12,17,19,21,22,18,14,9,7,Amsterdam
1,Average low in °C,1,0,2,4,8,10,13,12,10,7,4,2,Amsterdam
2,Av. precipitation - mm,62,43,58,41,48,67,65,61,82,85,89,75,Amsterdam
3,Days with precip.,17,13,17,14,14,14,13,13,16,17,19,18,Amsterdam
4,Hours of sunshine,53,80,116,166,217,200,208,201,133,103,58,45,Amsterdam
0,Average high in °C,14,14,16,19,24,29,32,32,28,23,19,15,Athens
1,Average low in °C,7,7,8,11,16,20,23,23,20,16,12,9,Athens
2,Av. precipitation - mm,48,40,39,26,15,5,5,7,9,47,55,64,Athens
3,Days with precip.,13,13,11,9,6,4,2,1,4,9,11,14,Athens
4,Hours of sunshine,129,136,183,230,291,336,364,340,277,208,152,127,Athens


In [30]:
# Manual winter average (Dec, Jan, Feb) for every row, rounded to the nearest whole number.
round((test_weather["Dec"] + test_weather["Jan"] + test_weather["Feb"]) / 3)

0      6.0
1      1.0
2     60.0
3     16.0
4     59.0
0     14.0
1      8.0
2     51.0
3     13.0
4    131.0
0      2.0
1     -3.0
2     41.0
3     15.0
4     62.0
dtype: float64

In [31]:
test_weather.head()

Unnamed: 0.1,Unnamed: 0,Jan,Feb,Mar,Apr,May,June,July,Aug,Sep,Oct,Nov,Dec,city
0,Average high in °C,5,6,9,12,17,19,21,22,18,14,9,7,Amsterdam
1,Average low in °C,1,0,2,4,8,10,13,12,10,7,4,2,Amsterdam
2,Av. precipitation - mm,62,43,58,41,48,67,65,61,82,85,89,75,Amsterdam
3,Days with precip.,17,13,17,14,14,14,13,13,16,17,19,18,Amsterdam
4,Hours of sunshine,53,80,116,166,217,200,208,201,133,103,58,45,Amsterdam


In [46]:
def get_season_avg(df):
    """Replace the twelve monthly columns with four rounded seasonal averages.

    Each season is the mean of its three months, rounded to the nearest whole
    number: winter = Dec/Jan/Feb, spring = Mar/Apr/May, summer = June/July/Aug,
    autumn = Sep/Oct/Nov.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Jan, Feb, Mar, Apr, May, June, July, Aug,
        Sep, Oct, Nov, Dec``. Any other columns are carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the month columns removed and ``winter``, ``spring``,
        ``summer`` and ``autumn`` appended, in that order. The input frame is
        not modified.

    Raises
    ------
    KeyError
        If any of the month columns is missing.
    """
    season_dict = {"winter": ["Dec", "Jan", "Feb"],
                   "spring": ["Mar", "Apr", "May"],
                   "summer": ["June", "July", "Aug"],
                   "autumn": ["Sep", "Oct", "Nov"],
                   }

    # work on a copy so the caller's frame does not silently gain a column
    result = df.copy()

    for season, months in season_dict.items():
        # mean of the season's three months, rounded to a whole number
        result[season] = ((result[months[0]] + result[months[1]] + result[months[2]]) / 3).round()
        # the monthly columns are no longer needed once the season is computed
        result = result.drop(columns=months)

    return result

In [47]:
get_season_avg(test_weather)

Unnamed: 0.1,Unnamed: 0,city,winter,spring,summer,autumn
0,Average high in °C,Amsterdam,6.0,13.0,21.0,14.0
1,Average low in °C,Amsterdam,1.0,5.0,12.0,7.0
2,Av. precipitation - mm,Amsterdam,60.0,49.0,64.0,85.0
3,Days with precip.,Amsterdam,16.0,15.0,13.0,17.0
4,Hours of sunshine,Amsterdam,59.0,166.0,203.0,98.0
0,Average high in °C,Athens,14.0,20.0,31.0,23.0
1,Average low in °C,Athens,8.0,12.0,22.0,16.0
2,Av. precipitation - mm,Athens,51.0,27.0,6.0,37.0
3,Days with precip.,Athens,13.0,9.0,2.0,8.0
4,Hours of sunshine,Athens,131.0,235.0,347.0,212.0


In [48]:
city_seasonal_avg = get_season_avg(city_weather)

In [49]:
city_seasonal_avg

Unnamed: 0,weather,city,winter,spring,summer,autumn
0,Average high in °C,Amsterdam,6.0,13.0,21.0,14.0
1,Average low in °C,Amsterdam,1.0,5.0,12.0,7.0
2,Av. precipitation - mm,Amsterdam,60.0,49.0,64.0,85.0
3,Days with precip.,Amsterdam,16.0,15.0,13.0,17.0
4,Hours of sunshine,Amsterdam,59.0,166.0,203.0,98.0
...,...,...,...,...,...,...
0,Average high in °C,Zagreb,4.0,16.0,25.0,15.0
1,Average low in °C,Zagreb,-0.0,8.0,16.0,9.0
2,Av. precipitation - mm,Zagreb,54.0,69.0,93.0,79.0
3,Days with precip.,Zagreb,7.0,9.0,9.0,7.0


### 2.3 Now to pivot the table so each city only has one row

In [56]:
test_df = city_seasonal_avg.loc[city_seasonal_avg["city"] == "Amsterdam"]

In [61]:
test_df

Unnamed: 0,weather,city,winter,spring,summer,autumn
0,Average high in °C,Amsterdam,6.0,13.0,21.0,14.0
1,Average low in °C,Amsterdam,1.0,5.0,12.0,7.0
2,Av. precipitation - mm,Amsterdam,60.0,49.0,64.0,85.0
3,Days with precip.,Amsterdam,16.0,15.0,13.0,17.0
4,Hours of sunshine,Amsterdam,59.0,166.0,203.0,98.0


In [68]:
# Flat column labels for the pivoted table: "city" followed by one
# "<season>_<metric>" label per season/metric combination, seasons outermost.
colnames = ["city"] + [
    f"{season}_{metric}"
    for season in ("autumn", "spring", "summer", "winter")
    for metric in ("prec_mm", "high", "low", "prec_days", "sun_hrs")
]

In [67]:
# Spread the weather metrics across columns so the city ends up on a single row;
# the result has two-level (season, weather) column labels.
# NOTE(review): the commented-out rename would not work as written -- DataFrame.rename
# expects a mapping or function for `columns`, not a list. Flattening the two-level
# columns with `colnames` is presumably the intent -- confirm.
test_df.pivot_table(index="city", columns=["weather"]).reset_index()#.rename(columns=[colnames])

Unnamed: 0_level_0,city,autumn,autumn,autumn,autumn,autumn,spring,spring,spring,spring,...,summer,summer,summer,summer,summer,winter,winter,winter,winter,winter
weather,Unnamed: 1_level_1,Av. precipitation - mm,Average high in °C,Average low in °C,Days with precip.,Hours of sunshine,Av. precipitation - mm,Average high in °C,Average low in °C,Days with precip.,...,Av. precipitation - mm,Average high in °C,Average low in °C,Days with precip.,Hours of sunshine,Av. precipitation - mm,Average high in °C,Average low in °C,Days with precip.,Hours of sunshine
0,Amsterdam,85.0,14.0,7.0,17.0,98.0,49.0,13.0,5.0,15.0,...,64.0,21.0,12.0,13.0,203.0,60.0,6.0,1.0,16.0,59.0
