In [1]:
import pandas as pd
import os
import glob
import csv
import numpy as np

### Lookup table Municipality - Province

In [3]:
# classifications of municipalities by province in the Netherlands 
df_2021 = pd.read_excel('data/Source_0/gemeenten-alfabetisch-2021.xlsx')[['Gemeentenaam', 'Provincienaam']]
df_2020 = pd.read_excel('data/Source_0/Gemeenten alfabetisch 2020.xlsx')[['Gemeentenaam', 'Provincienaam']]
# for 2018 and 2019 files, you need pip install xlrd
df_2019 = pd.read_excel('data/Source_0/Gemeenten alfabetisch 2019.xls')[['Gemeentenaam', 'Provincienaam']]
df_2018 = pd.read_excel('data/Source_0/Gemeenten alfabetisch 2018.xls')[['Gemeentenaam', 'Provincienaam']]

# 2020 and 2019 are exactly the same
# combine 2020 and 2018 in order to get all unique rows from both data sources
df_combi1820 = df_2018.merge(df_2019, on=['Gemeentenaam', 'Provincienaam'], how='outer')

# combine 2021 and the newly created 2018-2020 dataframe
df_final = df_combi1820.merge(df_2021, on=['Gemeentenaam', 'Provincienaam'], how='outer')

df_final = df_final[df_final["Provincienaam"]!="Friesland"]
df_final

Unnamed: 0,Gemeentenaam,Provincienaam
0,Aa en Hunze,Drenthe
1,Aalburg,Noord-Brabant
2,Aalsmeer,Noord-Holland
3,Aalten,Gelderland
5,Alblasserdam,Zuid-Holland
...,...,...
403,Terschelling,Fryslân
404,Tytsjerksteradiel,Fryslân
405,Vlieland,Fryslân
406,Waadhoeke,Fryslân


### Source 1 preprocessing

In [6]:
# Source file cleaning
source_number = 1
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_1 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv")

# remove "" from column names and string values
source_1.columns = source_1.columns.str[1:]
source_1.columns = source_1.columns.str[:-1]
source_1["Wijken en buurten"] = source_1["Wijken en buurten"].str[1:]
source_1["Wijken en buurten"] = source_1["Wijken en buurten"].str[:-1]
source_1["Regioaanduiding/Soort regio (omschrijving)"] = source_1["Regioaanduiding/Soort regio (omschrijving)"].str[1:]
source_1["Regioaanduiding/Soort regio (omschrijving)"] = source_1["Regioaanduiding/Soort regio (omschrijving)"].str[:-1]
# filter only municipalities
source_1 = source_1[source_1["Regioaanduiding/Soort regio (omschrijving)"]=="Gemeente  "]
# rename column for merging
source_1.rename(columns={"Wijken en buurten": "Gemeentenaam"}, inplace = True)
# drop column
source_1.drop(columns={"Regioaanduiding/Soort regio (omschrijving)"}, inplace=True)
source_1

KeyError: 'Wijken en buurten'

In [None]:
source_1.columns

In [None]:
# show nr of non-null values in each column
source_1.count()

In [None]:
# merge
merged_source_1 = pd.merge(df_final, source_1, on = "Gemeentenaam", how='right')
merged_source_1

In [None]:
# require 30% non-null columns, more than this drops important columns
limitPer = len(merged_source_1) * 0.3
merged_source_1 = merged_source_1.dropna(thresh=limitPer, axis=1)
merged_source_1

### Source 2 preprocessing

In [7]:
source_number = 2
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_2 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv")

# remove "" from column names and string values
source_2.columns = source_2.columns.str[1:]
source_2.columns = source_2.columns.str[:-1]
source_2["Regio's"] = source_2["Regio's"].str[1:]
source_2["Regio's"] = source_2["Regio's"].str[:-1]
source_2["Perioden"] = source_2["Perioden"].str[1:]
source_2["Perioden"] = source_2["Perioden"].str[:-1]
# rename column for merging
source_2.rename(columns={"Regio's": "Gemeentenaam"}, inplace = True)

source_2

KeyError: 'Perioden'

In [8]:
# show nr of non-null values in each column
source_2.count()

"Perioden                                                                                                      2178
Regio's                                                                                                        2178
Bevolking/Bevolkingssamenstelling op 1 januari/Totale bevolking (aantal)                                       1062
Bevolking/Bevolkingssamenstelling op 1 januari/Geslacht/Mannen (aantal)                                        1062
Bevolking/Bevolkingssamenstelling op 1 januari/Geslacht/Vrouwen (aantal)                                       1062
Bevolking/Bevolkingssamenstelling op 1 januari/Leeftijd/Leeftijdsgroepen/Jonger dan 5 jaar (aantal)            1062
Bevolking/Bevolkingssamenstelling op 1 januari/Leeftijd/Leeftijdsgroepen/5 tot 10 jaar (aantal)                1062
Bevolking/Bevolkingssamenstelling op 1 januari/Leeftijd/Leeftijdsgroepen/10 tot 15 jaar (aantal)               1062
Bevolking/Bevolkingssamenstelling op 1 januari/Leeftijd/Leeftijdsgroepen

In [9]:
merged_source_2 = pd.merge(df_final, source_2, on = "Gemeentenaam", how='right')
merged_source_2

KeyError: 'Gemeentenaam'

In [None]:
# require 30% non-null columns, more than this drops important columns
limitPer = len(merged_source_2) * 0.3
merged_source_2 = merged_source_2.dropna(thresh=limitPer, axis=1)

In [None]:
merged_source_2

In [None]:
# replace , with . in strings that are supposed to be numbers
# then convert those columns to numeric

# get the non-numeric columns, except the first 3
columns_to_convert = merged_source_2.select_dtypes(exclude=[np.number]).iloc[:,3:]
# convert
for column in columns_to_convert:
    merged_source_2[column] = merged_source_2[column].str.replace(',','.')
    merged_source_2[column] = pd.to_numeric(merged_source_2[column])

In [None]:
merged_source_2

In [None]:
# IMPUTE MISSING VALUES:

In [None]:
# get list of all provinces
provinces = merged_source_2['Provincienaam'].unique().tolist()

# for each province
for province in provinces:
    province_data = merged_source_2[merged_source_2['Provincienaam']==province]
    # get only numeric columns from province_data
    imbuted_subset = province_data.iloc[:, 3:]
    # impute
    imbuted_subset = imbuted_subset.fillna(imbuted_subset.mean())
    # replace with imputed data
    province_data.iloc[:, 3:] = imbuted_subset
    merged_source_2[merged_source_2['Provincienaam']==province] = province_data

In [None]:
merged_source_2

### Source 4 preprocessing

In [None]:
source_number = 4
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_4 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv", on_bad_lines='skip')

# remove "" from column names and string values
source_4.columns = source_4.columns.str[1:]
source_4.columns = source_4.columns.str[:-1]
source_4["Regio's"] = source_4["Regio's"].str[1:]
source_4["Regio's"] = source_4["Regio's"].str[:-1]
source_4["Perioden"] = source_4["Perioden"].str[1:]
source_4["Perioden"] = source_4["Perioden"].str[:-1]
# rename column for merging
source_4.rename(columns={"Regio's": "Gemeentenaam"}, inplace = True)
source_4

In [None]:
# drop columns with only 1 value

for col in source_4.columns:
    if len(source_4[col].unique()) == 1:
        print(col)
        source_4.drop(col,inplace=True,axis=1)
source_4

In [None]:
# show nr of non-null values in each column
source_4.count()

In [None]:
merged_source_4 = pd.merge(df_final, source_4, on = "Gemeentenaam", how='right')
merged_source_4

In [None]:
# inspect rows where province is null
merged_source_4[merged_source_4["Provincienaam"].isnull()]

In [None]:
# drop rows where province is nan
merged_source_4 = merged_source_4[merged_source_4["Provincienaam"].notnull()]
merged_source_4

In [None]:
# IMPUTE MISSING VALUES:

In [None]:
# get list of all provinces
provinces = merged_source_4['Provincienaam'].unique().tolist()

# for each province
for province in provinces:
    province_data = merged_source_4[merged_source_4['Provincienaam']==province]
    # get only numeric columns from province_data
    imbuted_subset = province_data.iloc[:, 2:]
    # impute
    imbuted_subset = imbuted_subset.fillna(imbuted_subset.mean())
    # replace with imputed data
    province_data.iloc[:, 2:] = imbuted_subset
    merged_source_4[merged_source_4['Provincienaam']==province] = province_data

In [None]:
merged_source_4

### Source 5 preprocessing

In [None]:
source_number = 5
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_5 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv", on_bad_lines='skip')

# remove "" from column names and string values
source_5.columns = source_5.columns.str[1:]
source_5.columns = source_5.columns.str[:-1]

# remove "" and " (PV)" from column values
source_5["Regio's"] = source_5["Regio's"].str[1:]
source_5["Regio's"] = source_5["Regio's"].str[:-6]
source_5["Beroepen en specialismen"] = source_5["Beroepen en specialismen"].str[1:]
source_5["Beroepen en specialismen"] = source_5["Beroepen en specialismen"].str[:-1]
source_5["Perioden"] = source_5["Perioden"].str[1:]
source_5["Perioden"] = source_5["Perioden"].str[:-1]

# remove * char from 2020
source_5["Perioden"] = source_5["Perioden"].str.replace("*","")

# drop age, social columns
source_5.drop("Leeftijd",inplace=True,axis=1)
source_5.drop("Sociaaleconomische categorie",inplace=True,axis=1)

source_5

### Source 6 preprocessing

In [12]:
source_number = 6
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_6 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv")

# remove "" from column names and string values
source_6.columns = source_6.columns.str[1:]
source_6.columns = source_6.columns.str[:-1]
source_6["Regio's"] = source_6["Regio's"].str[1:]
source_6["Regio's"] = source_6["Regio's"].str[:-1]
source_6["Perioden"] = source_6["Perioden"].str[1:]
source_6["Perioden"] = source_6["Perioden"].str[:-1]
source_6['"Donorregistratie'] = source_6['"Donorregistratie'].str[1:]
source_6['"Donorregistratie'] = source_6['"Donorregistratie'].str[:-1]

# remove * char from 2021
source_6["Perioden"] = source_6["Perioden"].str.replace("*","")

# rename column for merging
source_6.rename(columns={"Regio's": "Gemeentenaam"}, inplace = True)

source_6

  source_6["Perioden"] = source_6["Perioden"].str.replace("*","")


Unnamed: 0,"""Donorregistratie",Perioden,Gemeentenaam,"Donorregister, vastgelegde keuze (x 1 000)"
0,Totaal wel of niet geregistreerd,2019,Aa en Hunze,228
1,Totaal wel of niet geregistreerd,2020,Aa en Hunze,228
2,Totaal wel of niet geregistreerd,2021,Aa en Hunze,228
3,Totaal wel of niet geregistreerd,2019,Aalsmeer,273
4,Totaal wel of niet geregistreerd,2020,Aalsmeer,276
...,...,...,...,...
3163,Niet geregistreerd in donorregister,2020,Zwijndrecht,209
3164,Niet geregistreerd in donorregister,2021,Zwijndrecht,32
3165,Niet geregistreerd in donorregister,2019,Zwolle,591
3166,Niet geregistreerd in donorregister,2020,Zwolle,562


In [14]:
source_6.count()

"Donorregistratie                             3168
Perioden                                      3168
Gemeentenaam                                  3168
Donorregister, vastgelegde keuze (x 1 000)    3168
dtype: int64

In [15]:
# merge
merged_source_6 = pd.merge(df_final, source_6, on = "Gemeentenaam", how='right')

Unnamed: 0,Gemeentenaam,Provincienaam,"""Donorregistratie",Perioden,"Donorregister, vastgelegde keuze (x 1 000)"
0,Aa en Hunze,Drenthe,Totaal wel of niet geregistreerd,2019,228
1,Aa en Hunze,Drenthe,Totaal wel of niet geregistreerd,2020,228
2,Aa en Hunze,Drenthe,Totaal wel of niet geregistreerd,2021,228
3,Aalsmeer,Noord-Holland,Totaal wel of niet geregistreerd,2019,273
4,Aalsmeer,Noord-Holland,Totaal wel of niet geregistreerd,2020,276
...,...,...,...,...,...
3163,Zwijndrecht,Zuid-Holland,Niet geregistreerd in donorregister,2020,209
3164,Zwijndrecht,Zuid-Holland,Niet geregistreerd in donorregister,2021,32
3165,Zwolle,Overijssel,Niet geregistreerd in donorregister,2019,591
3166,Zwolle,Overijssel,Niet geregistreerd in donorregister,2020,562


In [None]:
# replace , with . in strings that are supposed to be numbers
# then convert those columns to numeric

# get the non-numeric columns, except the first 3
columns_to_convert = merged_source_6.select_dtypes(exclude=[np.number]).iloc[:,3:]
# convert
for column in columns_to_convert:
    merged_source_6[column] = merged_source_6[column].str.replace(',','.')
    merged_source_6[column] = pd.to_numeric(merged_source_6[column])
    
merged_source_6

### Source 7 preprocessing

In [39]:
source_number = 7
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_7 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv")
source_7.columns = source_7.columns.str[1:]
source_7.columns = source_7.columns.str[:-1]
source_7.rename(columns={'"Geslacht': 'Geslacht'}, inplace=True)

source_7["Geslacht"] = source_7["Geslacht"].str[1:]
source_7["Geslacht"] = source_7["Geslacht"].str[:-1]

source_7["Leeftijd"] = source_7["Leeftijd"].str[1:]
source_7["Leeftijd"] = source_7["Leeftijd"].str[:-1]

source_7["Geneesmiddelengroep (ATC)"] = source_7["Geneesmiddelengroep (ATC)"].str[1:]
source_7["Geneesmiddelengroep (ATC)"] = source_7["Geneesmiddelengroep (ATC)"].str[:-1]

source_7["Perioden"] = source_7["Perioden"].str[1:]
source_7["Perioden"] = source_7["Perioden"].str[:-1]
source_7["Perioden"] = source_7["Perioden"].str.replace("*","")

source_7["Regio's"] = source_7["Regio's"].str[1:]
source_7["Regio's"] = source_7["Regio's"].str[:-1]

source_7["Personen met geneesmiddelen, relatief (%)"] = source_7["Personen met geneesmiddelen, relatief (%)"].str[1:]
source_7["Personen met geneesmiddelen, relatief (%)"] = source_7["Personen met geneesmiddelen, relatief (%)"].str[:-1]

source_7.rename(columns={"Regio's": "Gemeentenaam"}, inplace = True)
merged_source_7 = pd.merge(df_final, source_7, on = "Gemeentenaam", how='right')

# some municipalities dont map to provinces, drop those
merged_source_7.drop(merged_source_7[merged_source_7['Provincienaam'].isnull()].index, inplace=True)

  source_7["Perioden"] = source_7["Perioden"].str.replace("*","")


In [44]:
merged_source_7['Personen met geneesmiddelen, relatief (%)'] = merged_source_7['Personen met geneesmiddelen, relatief (%)'].str.replace(',','.')
merged_source_7['Personen met geneesmiddelen, relatief (%)'] = pd.to_numeric(merged_source_7['Personen met geneesmiddelen, relatief (%)'])

In [79]:
#impute
means = {}
for provincienaam in merged_source_7['Provincienaam'].unique():
    means[provincienaam] = merged_source_7[merged_source_7['Provincienaam'] == provincienaam]['Personen met geneesmiddelen, relatief (%)'].mean()

for i, row in merged_source_7[merged_source_7['Personen met geneesmiddelen, relatief (%)'].isnull()].iterrows():
    merged_source_7.at[i, 'Personen met geneesmiddelen, relatief (%)'] = means[row['Provincienaam']]

In [104]:
source_7.count()

Geslacht                                     13664
Leeftijd                                     13664
Geneesmiddelengroep (ATC)                    13664
Perioden                                     13664
Gemeentenaam                                 13664
Personen met geneesmiddelen, relatief (%)    11360
dtype: int64

In [103]:
merged_source_7.count()

Gemeentenaam                                 12064
Provincienaam                                12064
Geslacht                                     12064
Leeftijd                                     12064
Geneesmiddelengroep (ATC)                    12064
Perioden                                     12064
Personen met geneesmiddelen, relatief (%)    12064
dtype: int64

### Source 9 preprocessing

In [105]:
source_number = 9
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_9 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv")

source_9.columns = source_9.columns.str[1:]
source_9.columns = source_9.columns.str[:-1]
source_9.rename(columns={'"Geslacht': 'Geslacht'}, inplace=True)

source_9["Geslacht"] = source_9["Geslacht"].str[1:]
source_9["Geslacht"] = source_9["Geslacht"].str[:-1]

source_9["Leeftijd"] = source_9["Leeftijd"].str[1:]
source_9["Leeftijd"] = source_9["Leeftijd"].str[:-1]

source_9["Soort opname"] = source_9["Soort opname"].str[1:]
source_9["Soort opname"] = source_9["Soort opname"].str[:-1]

source_9["Diagnose"] = source_9["Diagnose"].str[1:]
source_9["Diagnose"] = source_9["Diagnose"].str[:-1]

source_9["Perioden"] = source_9["Perioden"].str[1:]
source_9["Perioden"] = source_9["Perioden"].str[:-1]
source_9["Perioden"] = source_9["Perioden"].str.replace("*","")

source_9["Regio's"] = source_9["Regio's"].str[1:]
source_9["Regio's"] = source_9["Regio's"].str[:-1]

source_9["Opnamen per 10 000 inwoners (per 10 000 inwoners)"] = source_9["Opnamen per 10 000 inwoners (per 10 000 inwoners)"].str[1:]
source_9["Opnamen per 10 000 inwoners (per 10 000 inwoners)"] = source_9["Opnamen per 10 000 inwoners (per 10 000 inwoners)"].str[:-1]

source_9.rename(columns={"Regio's": "Gemeentenaam"}, inplace = True)
merged_source_9 = pd.merge(df_final, source_9, on = "Gemeentenaam", how='right')

# some municipalities dont map to provinces, drop those
merged_source_9.drop(merged_source_9[merged_source_9['Provincienaam'].isnull()].index, inplace=True)

  source_9["Perioden"] = source_9["Perioden"].str.replace("*","")


In [106]:
merged_source_9['Opnamen per 10 000 inwoners (per 10 000 inwoners)'] = merged_source_9['Opnamen per 10 000 inwoners (per 10 000 inwoners)'].str.replace(',','.')
merged_source_9['Opnamen per 10 000 inwoners (per 10 000 inwoners)'] = pd.to_numeric(merged_source_9['Opnamen per 10 000 inwoners (per 10 000 inwoners)'])

In [107]:
#impute
means = {}
for provincienaam in merged_source_9['Provincienaam'].unique():
    means[provincienaam] = merged_source_9[merged_source_9['Provincienaam'] == provincienaam]['Opnamen per 10 000 inwoners (per 10 000 inwoners)'].mean()

for i, row in merged_source_9[merged_source_9['Opnamen per 10 000 inwoners (per 10 000 inwoners)'].isnull()].iterrows():
    merged_source_9.at[i, 'Opnamen per 10 000 inwoners (per 10 000 inwoners)'] = means[row['Provincienaam']]

In [None]:
merged_source_9.count()

Gemeentenaam                                         4524
Provincienaam                                        4524
Geslacht                                             4524
Leeftijd                                             4524
Soort opname                                         4524
Diagnose                                             4524
Perioden                                             4524
Opnamen per 10 000 inwoners (per 10 000 inwoners)    4524
dtype: int64

### Source 11 preprocessing

In [127]:
source_number = 11
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_11 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv")

source_11.columns = source_11.columns.str[1:]
source_11.columns = source_11.columns.str[:-1]
source_11.rename(columns={'"Perioden': 'Perioden'}, inplace=True)

source_11["Perioden"] = source_11["Perioden"].str[1:]
source_11["Perioden"] = source_11["Perioden"].str[:-1]
source_11["Perioden"] = source_11["Perioden"].str.replace("*","")

source_11["Regio's"] = source_11["Regio's"].str[1:]
source_11["Regio's"] = source_11["Regio's"].str[:-1]

source_11.rename(columns={"Regio's": "Gemeentenaam"}, inplace = True)
merged_source_11 = pd.merge(df_final, source_11, on = "Gemeentenaam", how='right')

merged_source_11.drop(merged_source_11[merged_source_11['Provincienaam'].isnull()].index, inplace=True)

  source_11["Perioden"] = source_11["Perioden"].str.replace("*","")


In [132]:
merged_source_11

Unnamed: 0,Gemeentenaam,Provincienaam,Perioden,Totaal alle onderliggende doodsoorzaken (aantal),Nieuwvormingen (aantal),Ziekten van hart en vaatstelsel (aantal),Ziekten van ademhalingsstelsel (aantal)
0,Aa en Hunze,Drenthe,2018,283.0,96.0,79.0,14.0
1,Aa en Hunze,Drenthe,2019,280.0,94.0,86.0,18.0
2,Aa en Hunze,Drenthe,2020,274.0,88.0,77.0,14.0
3,Aalburg,Noord-Brabant,2018,84.0,27.0,23.0,8.0
4,Aalburg,Noord-Brabant,2019,,,,
...,...,...,...,...,...,...,...
2131,Zwijndrecht,Zuid-Holland,2019,471.0,154.0,121.0,32.0
2132,Zwijndrecht,Zuid-Holland,2020,558.0,142.0,123.0,32.0
2133,Zwolle,Overijssel,2018,988.0,287.0,266.0,74.0
2134,Zwolle,Overijssel,2019,953.0,294.0,258.0,65.0


In [138]:
columns = merged_source_11.columns[3:]
for column in columns:
    means = {}
    for provincienaam in merged_source_11['Provincienaam'].unique():
        means[provincienaam] = merged_source_11[merged_source_11['Provincienaam'] == provincienaam][column].mean()
    for i, row in merged_source_11[merged_source_11[column].isnull()].iterrows():
        merged_source_11.at[i, column] = means[row['Provincienaam']]

### Source 14 preprocessing

In [None]:
source_number = 14
source_folder_path = f"data/Source_{source_number}"

# Pandas sanity check, open the just created file
source_14 = pd.read_csv(f"{source_folder_path}/Source_{source_number}_full_converted_raw.csv")

source_14.columns = source_14.columns.str[1:]
source_14.columns = source_14.columns.str[:-1]
source_14.rename(columns={'"Perioden': 'Perioden'}, inplace=True)

source_14["Perioden"] = source_14["Perioden"].str[1:]
source_14["Perioden"] = source_14["Perioden"].str[:-1]
source_14["Perioden"] = source_14["Perioden"].str.replace("*","")

source_14["Regio's"] = source_14["Regio's"].str[1:]
source_14["Regio's"] = source_14["Regio's"].str[:-1]

source_14.rename(columns={"Regio's": "Gemeentenaam"}, inplace = True)
merged_source_14 = pd.merge(df_final, source_14, on = "Gemeentenaam", how='right')

merged_source_14.drop(merged_source_14[merged_source_14['Provincienaam'].isnull()].index, inplace=True)

  source_14["Perioden"] = source_14["Perioden"].str.replace("*","")


In [149]:
merged_source_14['Bevolkingsgroei/Bevolkingsgroei, relatief (%)'] = merged_source_14['Bevolkingsgroei/Bevolkingsgroei, relatief (%)'].str.replace(',','.')
merged_source_14['Bevolkingsgroei/Bevolkingsgroei, relatief (%)'] = pd.to_numeric(merged_source_14['Bevolkingsgroei/Bevolkingsgroei, relatief (%)'])

In [150]:
columns = merged_source_14.columns[3:]
for column in columns:
    means = {}
    for provincienaam in merged_source_14['Provincienaam'].unique():
        means[provincienaam] = merged_source_14[merged_source_14['Provincienaam'] == provincienaam][column].mean()
    for i, row in merged_source_14[merged_source_14[column].isnull()].iterrows():
        merged_source_14.at[i, column] = means[row['Provincienaam']]

In [151]:
merged_source_14

Unnamed: 0,Gemeentenaam,Provincienaam,Perioden,Bevolking aan het begin van de periode (aantal),Levend geboren kinderen (aantal),Overledenen (aantal),Vertrek uit de gemeente/Vertrek naar andere gemeente (aantal),"Bevolkingsgroei/Bevolkingsgroei, relatief (%)"
0,Aa en Hunze,Drenthe,2019,25386.000000,176.0,280.000000,1305.000000,0.230000
1,Aa en Hunze,Drenthe,2020,25445.000000,167.0,274.000000,1395.000000,-0.180000
2,Aa en Hunze,Drenthe,2021,25399.000000,198.0,284.000000,1182.000000,0.690000
3,Aalburg,Noord-Brabant,2019,41522.756757,390.8,405.832432,1743.518919,0.641135
4,Aalburg,Noord-Brabant,2020,41522.756757,390.8,405.832432,1743.518919,0.641135
...,...,...,...,...,...,...,...,...
1660,Zwijndrecht,Zuid-Holland,2020,44737.000000,406.0,558.000000,1969.000000,0.080000
1661,Zwijndrecht,Zuid-Holland,2021,44775.000000,490.0,535.000000,2289.000000,-0.010000
1662,Zwolle,Overijssel,2019,127497.000000,1460.0,953.000000,5356.000000,1.050000
1663,Zwolle,Overijssel,2020,128840.000000,1469.0,1036.000000,5574.000000,0.780000
