## Dataset Creation

In [24]:
import pandas as pd
import numpy as np
import random
import pickle

In [2]:
# Sizes of the synthetic tables created below.
number_of_teachers = 96_000  # rows in the `teachers` frame
number_of_schools = 100      # rows in the `schools` frame

In [3]:
# All-NaN teacher table with one row per teacher; only `id` is filled here.
teacher_columns = [
    'id', 'gender', 'age', 'residence', 'married', 'kids',
    'subject_1', 'subject_2', 'type_of_school', 'salary_expectation',
    'years_of_work_experience', 'preference_big_school', 'preference_rural',
]
teacher_ids = list(range(number_of_teachers))
teachers = pd.DataFrame(data=np.nan, index=teacher_ids, columns=teacher_columns)
teachers['id'] = teacher_ids

In [4]:
# All-NaN school table with one row per school; only `id` is filled here.
school_columns = [
    'id', 'location', 'size', 'type of school', 'rural', 'privat', 'equipment',
]
school_ids = list(range(number_of_schools))
schools = pd.DataFrame(data=np.nan, index=school_ids, columns=school_columns)
schools['id'] = school_ids

In [5]:
# Value pools the teacher attributes are drawn from.
gender = ['female', 'male']
age = [*range(25, 56)]  # 25..55 inclusive
subjects = [
    'Mathematics',
    'Science',
    'Art',
    'Music',
    'Sports',
    'English',
    'Geography',
]
# School types (German names; kept verbatim because they are used as data values).
type_of_school = [
    'Weiterbildungskolleg',
    'Gymnasium',
    'Gesamtschule',
    'Grundschule',
    'Waldorfschule',
    'Gemeinschaftsschule (Schulversuch)',
    'Förderschule',
    'Realschule',
    'Berufskolleg',
    'Hauptschule',
    'Volksschule',
]
salaries = [*range(48000, 64000, 2000)]  # 48000..62000 in steps of 2000

In [6]:
# Number of schools and pupils per school type, listed in the same order as
# `type_of_school`. Building the frame from a dict makes pandas raise if the
# three lists ever get out of sync, instead of relying on a hard-coded row count.
df_types_of_schools = pd.DataFrame({
    'type_of_school': type_of_school,
    'number_of_schools': [46, 623, 358, 2788, 58, 120, 500, 379, 365, 174, 2],
    'number_of_pupils': [18157, 501395, 335805, 646558, 19105, 59768, 85878, 202452, 522805, 51706, 282],
})

# Share of each school type in the total number of schools / pupils.
sum_schools = df_types_of_schools['number_of_schools'].sum()
df_types_of_schools['proportion_schools'] = df_types_of_schools['number_of_schools'] / sum_schools
sum_pupils = df_types_of_schools['number_of_pupils'].sum()
df_types_of_schools['proportion_pupils'] = df_types_of_schools['number_of_pupils'] / sum_pupils
df_types_of_schools

Unnamed: 0,type_of_school,number_of_schools,number_of_pupils,proportion_schools,proportion_pupils
0,Weiterbildungskolleg,46,18157,0.008498,0.007429
1,Gymnasium,623,501395,0.115093,0.205161
2,Gesamtschule,358,335805,0.066137,0.137405
3,Grundschule,2788,646558,0.515056,0.264559
4,Waldorfschule,58,19105,0.010715,0.007817
5,Gemeinschaftsschule (Schulversuch),120,59768,0.022169,0.024456
6,Förderschule,500,85878,0.09237,0.03514
7,Realschule,379,202452,0.070017,0.082839
8,Berufskolleg,365,522805,0.06743,0.213921
9,Hauptschule,174,51706,0.032145,0.021157


In [7]:
# Postcode table (semicolon-separated); preview the first rows.
df_plz_nrw = pd.read_csv(
    'data/plz_nrw.csv',
    delimiter=';',
    encoding='unicode_escape',
)
df_plz_nrw.head()

Unnamed: 0,Postleitzahl,Name,Landkreis
0,32049,Herford,Landkreis Herford
1,32051,Herford,Landkreis Herford
2,32052,Herford,Landkreis Herford
3,32105,Bad Salzuflen,Landkreis Lippe
4,32107,Bad Salzuflen,Landkreis Lippe


In [8]:
# Inhabitants per postcode (semicolon-separated); preview the first rows.
df_plz_einw = pd.read_csv(
    'data/plz_einwohner.csv',
    delimiter=';',
)
df_plz_einw.head()

Unnamed: 0,plz,einwohner
0,1067,11957.0
1,1069,25491.0
2,1097,14821.0
3,1099,28018.0
4,1108,5876.0


In [9]:
# Merge to get only postcodes that are in NRW and have a population figure.
# An inner join (plus dropping missing populations) is used because a left join
# keeps NRW postcodes without a match as NaN `plz`/`einwohner` rows, and NaN
# weights make the later `np.random.choice(..., p=...)` sampling fail.
df_plz = (
    df_plz_nrw
    .merge(df_plz_einw, how='inner', left_on='Postleitzahl', right_on='plz')[['plz', 'einwohner']]
    .dropna(subset=['einwohner'])
    .reset_index(drop=True)
)
df_plz.shape

(865, 2)

In [10]:
# Each postcode's share of the summed population; used as sampling weights
# when teacher residences are drawn.
sum_einw = df_plz['einwohner'].sum()
df_plz['proportion'] = df_plz['einwohner'].div(sum_einw)
df_plz

Unnamed: 0,plz,einwohner,proportion
0,32049,26686.0,6.347629e-07
1,32051,17891.0,4.255618e-07
2,32052,20781.0,4.943044e-07
3,32105,18638.0,4.433302e-07
4,32107,16407.0,3.902628e-07
...,...,...,...
860,59929,25962.0,6.175415e-07
861,59939,15077.0,3.586270e-07
862,59955,13003.0,3.092941e-07
863,59964,7998.0,1.902433e-07


In [11]:
# 5400 schools in NRW

In [12]:
# Sample every teacher attribute and write the columns into `teachers`.

# Seed both random number generators so that "Restart & Run All" reproduces
# the exact same synthetic dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

teacher_range = range(number_of_teachers)

# Uniform draws from the small value pools defined earlier in the notebook.
genders = [random.choice(gender) for _ in teacher_range]
ages = [random.choice(age) for _ in teacher_range]
married = [random.choice([0, 1]) for _ in teacher_range]
kids = [random.choice([0, 1]) for _ in teacher_range]
salary_expectations = [random.choice(salaries) for _ in teacher_range]
pref_big_school = [random.choice([0, 1]) for _ in teacher_range]
pref_rural = [random.choice([0, 1]) for _ in teacher_range]

# Two distinct subjects per teacher (sampled without replacement).
subject_pairs = [random.sample(subjects, 2) for _ in teacher_range]
subjects_1 = [first for first, _ in subject_pairs]
subjects_2 = [second for _, second in subject_pairs]

# Weighted draws, done in one vectorised call each instead of once per teacher:
# residence postcode weighted by population share, school type weighted by
# share of pupils.
residences = np.random.choice(
    df_plz['plz'], size=number_of_teachers, p=df_plz['proportion']
)
# Named `school_types` (not `schools`) so the `schools` DataFrame created
# earlier is not overwritten by this array.
school_types = np.random.choice(
    df_types_of_schools['type_of_school'],
    size=number_of_teachers,
    p=df_types_of_schools['proportion_pupils'],
)

teachers['gender'] = genders
teachers['age'] = ages
teachers['married'] = married
teachers['kids'] = kids
teachers['residence'] = residences
teachers['subject_1'] = subjects_1
teachers['subject_2'] = subjects_2
teachers['type_of_school'] = school_types
teachers['salary_expectation'] = salary_expectations
# Experience = age minus a random offset between 25 and 35 (inclusive), floored at 0.
teachers['years_of_work_experience'] = teachers['age'].apply(
    lambda teacher_age: max(0, teacher_age - random.randint(25, 35))
)
teachers['preference_big_school'] = pref_big_school
teachers['preference_rural'] = pref_rural

In [13]:
# Inspect the generated teachers table.
teachers

Unnamed: 0,id,gender,age,residence,married,kids,subject_1,subject_2,type_of_school,salary_expectation,years_of_work_experience,preference_big_school,preference_rural
0,0,female,32,48155,0,1,Art,English,Gymnasium,50000,0,1,1
1,1,male,49,47669,0,1,Science,Art,Berufskolleg,52000,18,0,0
2,2,male,53,52525,0,0,Art,Sports,Grundschule,62000,20,1,0
3,3,male,45,47807,1,1,Mathematics,English,Grundschule,50000,10,0,1
4,4,female,31,48629,1,0,Geography,English,Gesamtschule,50000,4,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,95995,female,53,59423,0,1,Music,Science,Grundschule,60000,26,0,0
95996,95996,female,37,33647,0,1,English,Science,Berufskolleg,58000,11,0,0
95997,95997,female,55,47669,0,1,Geography,Science,Gymnasium,48000,30,1,1
95998,95998,male,26,50389,0,0,Mathematics,Sports,Grundschule,50000,0,1,1


In [None]:
# 5400 schools -- how many teachers should we estimate?

In [14]:
# Write the synthetic teachers table to CSV without the index column.
# NOTE(review): a later cell reads '../data/teachers.csv' (different relative
# path) and shows 16 columns including coordinates, so that file is presumably
# produced or enriched by another step -- confirm.
teachers.to_csv('data/teachers.csv', index=False)

In [6]:
# Filter on Waldorfschule

In [7]:
# Load the teacher and school tables from CSV.
df_teachers = pd.read_csv('../data/teachers.csv')
df_schools = pd.read_csv('../data/school_dataset.csv')

In [10]:
# Inspect the loaded school table.
df_schools

Unnamed: 0,school_number,school_type,name,short_name,address,zipcode,location,student,tel,email,is_big,community,is_rural,coordinates,latitude,longitude
0,100000,Weiterbildungskolleg,Studienkolleg des Ökumenischen,"Bochum, WBK KOL Studienkolleg",Girondelle 80,44799,Bochum,0,2.349388e+09,100000@schule.nrw.de,0,"Bochum, Stadt",0,"(51.454125418916135, 7.234122444196718)",51.454125,7.234122
1,100010,Gymnasium,Gymnasium Claudia Agrippina Privat,"Köln, Gym Gymnasium Claudia Agrippina",Stolberger Str. 112,50933,Köln,98,2.213800e+10,100010@schule.nrw.de,0,"Köln, Stadt",0,"(50.94161647765391, 6.881410448090622)",50.941616,6.881410
2,100011,Gesamtschule,Städt. Gesamtschule,"Haan, GE Walder Straße",Walder Str. 15,42781,Haan,531,2.129375e+10,100011@schule.nrw.de,1,Mettmann,1,"(51.208013199999996, 7.01574931157376)",51.208013,7.015749
3,100012,Grundschule,Städt. Gemeinschaftsgrundschule,"Leverkusen, GG Am Friedenspark",Netzestr. 12,51371,Leverkusen,355,2.143108e+08,100012@schule.nrw.de,0,"Leverkusen, Stadt",0,"(51.05702510942749, 6.946612662697298)",51.057025,6.946613
4,100014,Grundschule,Kolibri-Schule,"Herne, GG Kolibri-Schule",Jean-Vogel-Str. 36,44625,Herne,436,2.323164e+09,100014@schule.nrw.de,0,"Herne, Stadt",0,"(51.52862893905526, 7.211516297688583)",51.528629,7.211516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5649,199953,Gesamtschule,KOSMOS-Bildung Münsterlandschule Tilbeck,"Havixbeck,GE KOSMOS-Bild. Münsterlandsch",Tilbeck 2,48329,Havixbeck,230,2.507539e+10,199953@schule.nrw.de,0,Coesfeld,1,"(42.68717911042164, -83.38364684146275)",42.687179,-83.383647
5650,199965,Gemeinschaftsschule (Schulversuch),Josef-Annegarn-Sekundarschule,"Ostbevern, SK Josef-Annegarn",Hanfgarten 18,48346,Ostbevern,569,2.532957e+09,199965@schule.nrw.de,1,Warendorf,1,"(42.7222014, -83.36190730922124)",42.722201,-83.361907
5651,199977,Gesamtschule,"Julia-Koppers-Gesamtschule, Gesamtschule","Borken, GE Julia-Koppers-Gesamtschule",Auf der Flüt 9,46325,Borken,723,2.861925e+10,199977@schule.nrw.de,1,Borken,1,"(51.857261740280684, 6.848522005796673)",51.857262,6.848522
5652,199989,Grundschule,Städt. Gemeinschaftsgrundschule Brakel,"Brakel, GG Brakel",Klöckerstr. 25,33034,Brakel,637,5.272360e+10,199989@schule.nrw.de,1,Höxter,1,"(51.711206424763716, 9.1818016895822)",51.711206,9.181802


In [11]:
# Restrict both tables to the 'Waldorfschule' school type.
is_waldorf_teacher = df_teachers['type_of_school'] == 'Waldorfschule'
is_waldorf_school = df_schools['school_type'] == 'Waldorfschule'
df_teachers = df_teachers.loc[is_waldorf_teacher]
df_schools = df_schools.loc[is_waldorf_school]

In [20]:
# (rows, columns) of the filtered teacher table.
df_teachers.shape

(761, 16)

In [14]:
# Teacher-side 0/1 flag that is compared against the schools' `is_big` below.
df_teachers['preference_big_school']

67       0
163      1
191      1
225      0
261      1
        ..
95096    0
95305    0
95683    0
95882    0
95938    1
Name: preference_big_school, Length: 761, dtype: int64

In [15]:
# School-side 0/1 flag that is compared against the teachers' `preference_big_school` below.
df_schools['is_big']

8       1
20      0
73      0
2959    0
3468    1
3472    0
3474    0
3478    1
3479    1
3482    1
3483    1
4440    0
4562    0
4581    0
4607    0
4641    0
4655    0
4656    1
4671    1
4674    1
4675    0
4738    1
4747    0
4748    0
4780    0
4782    1
4784    0
4818    0
4821    0
4845    0
4847    0
4882    0
4894    0
4903    1
4907    0
4944    0
4982    0
4984    1
4990    0
5002    0
5008    0
5022    0
5060    0
5100    0
5101    0
5173    0
5190    0
5199    0
5201    0
5203    0
5256    0
5290    0
5313    0
5323    0
5437    0
5469    0
5624    0
5633    0
Name: is_big, dtype: int64

In [16]:
import itertools

In [18]:
# Flat list with one boolean per (teacher, school) pair: does the teacher's
# size preference equal the school's `is_big` flag? Teachers vary slowest.
teacher_prefs = df_teachers['preference_big_school'].tolist()
school_flags = df_schools['is_big'].tolist()
comp = [pref == flag for pref, flag in itertools.product(teacher_prefs, school_flags)]
len(comp)

44138

In [21]:
# Reshape the flat comparison list into a (teachers x schools) matrix.
# The dimensions come from the filtered frames instead of being hard-coded,
# so the cell keeps working when the filter or the data changes.
comp_array = np.array(comp).reshape(len(df_teachers), len(df_schools))
comp_array.shape

(761, 58)

In [22]:
# Convert the boolean match matrix to integers (True -> 1, False -> 0).
comp_array_new = comp_array.astype(int)

In [25]:
# Persist the 0/1 teacher-by-school size-preference matrix to disk.
with open('../data/preference_big_school_Waldorfschule.pkl', 'wb') as pkl_file:
    pickle.dump(comp_array_new, pkl_file)

In [26]:
# Flat list with one boolean per (teacher, school) pair: does the teacher's
# rural preference equal the school's `is_rural` flag? Teachers vary slowest.
teacher_prefs = df_teachers['preference_rural'].tolist()
school_flags = df_schools['is_rural'].tolist()
comp = [pref == flag for pref, flag in itertools.product(teacher_prefs, school_flags)]
len(comp)

44138

In [27]:
# Reshape the flat comparison list into a (teachers x schools) matrix.
# The dimensions come from the filtered frames instead of being hard-coded,
# so the cell keeps working when the filter or the data changes.
comp_array = np.array(comp).reshape(len(df_teachers), len(df_schools))
comp_array.shape

(761, 58)

In [28]:
# Convert the boolean match matrix to integers (True -> 1, False -> 0).
comp_array_new = comp_array.astype(int)

In [29]:
# Persist the 0/1 teacher-by-school rural-preference matrix to disk.
with open('../data/preference_rural_Waldorfschule.pkl', 'wb') as pkl_file:
    pickle.dump(comp_array_new, pkl_file)

In [61]:
# Number of rows in the current comparison matrix.
# NOTE(review): the recorded output (96000) does not match the (761, 58)
# Waldorfschule matrix built above and the execution count is out of order --
# this cell appears to have been run against different kernel state; re-run to confirm.
comp_array_new.shape[0]

96000

In [63]:
# First row of the matrix added element-wise to itself.
# NOTE(review): presumably a stand-in for summing two different preference
# matrices (e.g. school size + rural) -- confirm intent.
(comp_array_new + comp_array_new)[0]

array([0, 0, 2, ..., 2, 2, 0])

In [11]:
# Zero-initialised cost matrix with one row per teacher and one column per school.
number_teachers = len(df_teachers)
number_schools = len(df_schools)
cost_matrix = np.zeros((number_teachers, number_schools))
cost_matrix.shape

(96000, 5654)

In [34]:
# Keep only teachers whose school type is 'Waldorfschule' and preview them.
waldorf_mask = df_teachers['type_of_school'].eq('Waldorfschule')
df_teachers = df_teachers.loc[waldorf_mask]
df_teachers.head()

Unnamed: 0,id,gender,age,residence,married,kids,subject_1,subject_2,type_of_school,salary_expectation,years_of_work_experience,preference_big_school,preference_rural,coordinates,latitude,longitude
67,67,female,43,52525,1,1,Music,Mathematics,Waldorfschule,62000,8,0,1,"(32.091800426914446, 34.80279714441326)",32.0918,34.802797
163,163,male,31,48629,0,0,Art,Sports,Waldorfschule,58000,4,1,1,"(44.32458327828735, -84.7452466236589)",44.324583,-84.745247
191,191,male,47,52525,0,1,Art,English,Waldorfschule,60000,20,1,1,"(32.091800426914446, 34.80279714441326)",32.0918,34.802797
225,225,female,33,48629,1,1,Geography,Science,Waldorfschule,52000,0,0,0,"(44.32458327828735, -84.7452466236589)",44.324583,-84.745247
261,261,male,54,33647,0,0,Music,Sports,Waldorfschule,54000,24,1,0,"(51.984403368678315, 8.508653894887162)",51.984403,8.508654


In [31]:
# Load the results table from CSV and preview its first rows.
df_results = pd.read_csv('../data/results.csv')
df_results.head()

Unnamed: 0,iteration,teacher,school,cost,dist,pref_school_size_unsatisfied,pref_urban_rural_unsatisfied
0,1.0,0.0,25.0,32.155062,32.155062,0.0,0.0
1,1.0,1.0,23.0,34.074822,34.074822,0.0,0.0
2,1.0,2.0,52.0,31.592944,31.592944,0.0,0.0
3,1.0,3.0,1.0,33.198391,23.198391,1.0,0.0
4,1.0,4.0,28.0,26.69636,26.69636,0.0,0.0


In [35]:
# Start with an empty frame; columns are added to it afterwards.
df_map = pd.DataFrame()

In [36]:
# Copy the `iteration` column from the results table into `df_map`.
df_map['iteration'] = df_results['iteration']

In [38]:
# Inspect the full results table.
df_results

Unnamed: 0,iteration,teacher,school,cost,dist,pref_school_size_unsatisfied,pref_urban_rural_unsatisfied
0,1.0,0.0,25.0,32.155062,32.155062,0.0,0.0
1,1.0,1.0,23.0,34.074822,34.074822,0.0,0.0
2,1.0,2.0,52.0,31.592944,31.592944,0.0,0.0
3,1.0,3.0,1.0,33.198391,23.198391,1.0,0.0
4,1.0,4.0,28.0,26.696360,26.696360,0.0,0.0
...,...,...,...,...,...,...,...
1517,2.0,756.0,41.0,29.262380,9.262380,1.0,1.0
1518,2.0,757.0,9.0,21.832316,11.832316,0.0,1.0
1519,2.0,758.0,25.0,32.155062,32.155062,0.0,0.0
1520,2.0,759.0,17.0,20.366482,20.366482,0.0,0.0


In [40]:
# Inspect the school table in its current (filtered) state.
df_schools

Unnamed: 0,school_number,school_type,name,short_name,address,zipcode,location,student,tel,email,is_big,community,is_rural,coordinates,latitude,longitude
8,100018,Waldorfschule,Freie Waldorfschule Rudolf Steiner Schu-,"Essen, FW Schellstraße",Schellstr. 47,45134,Essen,624,201435200.0,100018@schule.nrw.de,1,"Essen, Stadt",0,"(54.87534225313415, 23.945446681757748)",54.875342,23.945447
20,100030,Waldorfschule,Freie Waldorfschule Rheine,"Rheine, FW Meisenstr.",Meisenstr. 30,48429,Rheine,103,59719810000.0,100030@schule.nrw.de,0,Steinfurt,1,"(42.91461378862931, -83.999743409302)",42.914614,-83.999743
73,100087,Waldorfschule,Freie Waldorfschule Niederrhein-Aue,"Uedem, FW Niederrhein-Aue",Meursfeldstr. 8,47589,Uedem,58,28253080000.0,100087@schule.nrw.de,0,Kleve,1,"(35.1775021, 129.0896454)",35.177502,129.089645
2959,152936,Waldorfschule,Christian-Morgenstern-Schule,"Wuppertal, FW Chr.-Morgenstern-Schule",Wittensteinstr. 76,42285,Wuppertal,251,20283090.0,152936@schule.nrw.de,0,"Wuppertal, Stadt",0,"(51.25989236688539, 7.174118107464759)",51.259892,7.174118
3468,164112,Waldorfschule,Freie Waldorfschule Krefeld,"Krefeld, FW Kaiserstraße",Kaiserstr. 61,47800,Krefeld,487,215154000.0,164112@schule.nrw.de,1,"Krefeld, Stadt",0,"(51.34604272037793, 6.602901361965979)",51.346043,6.602901
3472,164150,Waldorfschule,Priv. Rudolf-Steiner-Schule Wuppertal,"Wuppertal, FW Rudolf-Steiner-Schule",Schluchtstr. 21,42285,Wuppertal,406,202280800.0,164150@schule.nrw.de,0,"Wuppertal, Stadt",0,"(51.25989236688539, 7.174118107464759)",51.259892,7.174118
3474,164173,Waldorfschule,Priv. Freie Waldorfschule Bonn,"Bonn, FW Stettiner Straße",Stettiner Str. 21,53119,Bonn,416,228668100.0,164173@schule.nrw.de,0,"Bonn, Stadt",0,"(50.74575852736701, 7.064282336079666)",50.745759,7.064282
3478,164215,Waldorfschule,Rudolf-Steiner-Schule Schloß Hamborn,"Borchen, FW Rudolf-Steiner-Schule",Schloß Hamborn 5,33178,Borchen,565,5251389000.0,164215@schule.nrw.de,1,Paderborn,1,"(25.832290045443667, -80.36994672920468)",25.83229,-80.369947
3479,164227,Waldorfschule,Rudolf Steiner Schule Bochum m. Förder-,"Bochum, FW Rudolf-Steiner-Schule",Hauptstr. 238-246,44892,Bochum,943,234922100.0,164227@schule.nrw.de,1,"Bochum, Stadt",0,"(51.46995765, 7.32260917565241)",51.469958,7.322609
3482,164252,Waldorfschule,Rudolf-Steiner-Schule,"Dortmund, FW Rudolf-Steiner-Schule",Mergelteichstr. 51,44225,Dortmund,738,2314765000.0,164252@schule.nrw.de,1,"Dortmund, Stadt",0,"(51.47619666021302, 7.44414456169628)",51.476197,7.444145
