# More cleaning

From the SQL workbook: 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [49]:
df = pd.read_csv("data_neutral.csv", sep = ";")

In [50]:
df = df[df["Onderwijssoort"] == "Wetenschappelijk onderwijs"].copy()

In [51]:
df = df.drop(["Onderwijssoort"], axis = 1)

In [52]:
df = df.drop(columns = ["Unnamed: 0"])

I'll get the above categories into a dictionary. With only 13 elements and copy-paste, it's easier to do it by hand than to write code doing all the cleaning.

In [53]:
categs = {"011 ": "Pedagogy",
          "021 ": "Art",
          "023 ": "Language",
          "022 ": "Other humanities", 
          "0311 ": "Economics",
          "0312 ": "Political science",
          "0313 ": "Psychology",
          "0314 ": "Sociology",
          "032 ": "Journalism",
          "041 ": "Business",
          "042 ": "Law",
          "05 ": "STEM",
          "06 ": "STEM",
          "07 ": "STEM",
          "09 ": "Medicine"}
# Adding the space to the key here to avoid catching substrings later on (e.g. '011' should not match with '0110')

In [54]:
categs

{'011 ': 'Pedagogy',
 '021 ': 'Art',
 '023 ': 'Language',
 '022 ': 'Other humanities',
 '0311 ': 'Economics',
 '0312 ': 'Political science',
 '0313 ': 'Psychology',
 '0314 ': 'Sociology',
 '032 ': 'Journalism',
 '041 ': 'Business',
 '042 ': 'Law',
 '05 ': 'STEM',
 '06 ': 'STEM',
 '07 ': 'STEM',
 '09 ': 'Medicine'}

In [55]:
newvalues = []

for i in range(len(df)):
    item = df['Studierichting'].iloc[i]
    if str(item)[:5] in categs.keys():
        newvalues.append(categs[str(item)[:5]])
    elif str(item)[:4] in categs.keys():
        newvalues.append(categs[str(item)[:4]])
    elif str(item)[:3] in categs.keys():
        newvalues.append(categs[str(item)[:3]])
    else: 
        newvalues.append("X")

df["Area"] = newvalues

In [56]:
df

Unnamed: 0,Studierichting,Perioden,Enrollment,Freshmen,Area
1884,Totaal,2010,242381,52453,X
1885,Totaal,2011,245428,52825,X
1886,Totaal,2012,241372,52015,X
1887,Totaal,2013,250186,55441,X
1888,Totaal,2014,255661,55451,X
...,...,...,...,...,...
3763,Onderwijsrichting onbekend,2017,476,86,X
3764,Onderwijsrichting onbekend,2018,445,77,X
3765,Onderwijsrichting onbekend,2019,416,76,X
3766,Onderwijsrichting onbekend,2020,406,103,X


In [57]:
df2 = df[df["Area"] != "X"]

In [58]:
df2

Unnamed: 0,Studierichting,Perioden,Enrollment,Freshmen,Area
1908,"011 Onderwijskunde, lerarenopleiding...",2010,9378,2039,Pedagogy
1909,"011 Onderwijskunde, lerarenopleiding...",2011,9575,1916,Pedagogy
1910,"011 Onderwijskunde, lerarenopleiding...",2012,9249,1831,Pedagogy
1911,"011 Onderwijskunde, lerarenopleiding...",2013,9561,1856,Pedagogy
1912,"011 Onderwijskunde, lerarenopleiding...",2014,9560,1831,Pedagogy
...,...,...,...,...,...
3307,09 Gezondheidszorg en welzijn,2017,30936,5099,Medicine
3308,09 Gezondheidszorg en welzijn,2018,31520,5331,Medicine
3309,09 Gezondheidszorg en welzijn,2019,32015,5422,Medicine
3310,09 Gezondheidszorg en welzijn,2020,33518,5961,Medicine


In [71]:
totals = df[df['Studierichting'] == 'Totaal']

In [72]:
totals

Unnamed: 0,Studierichting,Perioden,Enrollment,Freshmen,Area
1884,Totaal,2010,242381,52453,X
1885,Totaal,2011,245428,52825,X
1886,Totaal,2012,241372,52015,X
1887,Totaal,2013,250186,55441,X
1888,Totaal,2014,255661,55451,X
1889,Totaal,2015,261169,56587,X
1890,Totaal,2016,268027,61072,X
1891,Totaal,2017,280038,67175,X
1892,Totaal,2018,294712,71791,X
1893,Totaal,2019,306888,73026,X


In [73]:
yeardict = dict(zip(totals.Perioden.tolist(), totals.Freshmen.tolist()))

In [74]:
percentages = []
for i in range(len(df2)):
    year = df2['Perioden'].iloc[i]
    freshmen = df2['Freshmen'].iloc[i]
    percentages.append((freshmen/yeardict[year])*100)


In [76]:
df2['Percentage'] = percentages

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Percentage'] = percentages


In [78]:
df2.to_csv("newbreakdown.csv", sep = ";")