### Exploration of the socio-economic factors of parents

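In this section we explore how the parents' socio-economic status and level of education relate to their children's grades. We also clean the parents' education data and enrich it with readable descriptions and an ordinal level for each education code.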

In [None]:
import pandas as pd
import sas7bdat
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib as mpl

# --- Plot styling ------------------------------------------------------------
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names. Prefer the new name and fall back to the old one
# so the notebook runs on both old and new Matplotlib versions.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'seaborn-whitegrid',
)
mpl.style.use(_whitegrid_style)

# Typography and axes appearance (applied on top of the style sheet).
mpl.rcParams.update({
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.titleweight': 'bold',
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Set1.colors),
})

# Figure defaults: on-screen size/resolution, export resolution and background.
mpl.rcParams.update({
    'figure.figsize': [10, 6],
    'figure.dpi': 200,
    'savefig.dpi': 300,
    'figure.facecolor': '#f8f8f8',
})

# Show every column when displaying wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Load the parents' socio-economic features from the SAS export.
filepath = '../data/socio.sas7bdat'
socio = pd.read_sas(
    filepath,
    format='sas7bdat',
    encoding='iso-8859-1',
)

# Load the previously cleaned student register.
# NOTE(review): read_pickle executes arbitrary code from the file; only use
# it on pickles produced by this project.
register = pd.read_pickle('../data/clean_reg.pkl')

In [None]:
# Read the education classification table (AUDD code -> education level code).
filepath = '../data/uddreg_klassifikation_audd.sas7bdat'
udd = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the raw table (avoids SettingWithCopyWarning).
udd = udd[['AUDD', 'UDDANNELSESNIVEAU']].copy()
udd['UDDANNELSESNIVEAU'] = udd['UDDANNELSESNIVEAU'].astype(float)

# Human-readable description for each education level code.
# (Named edu_labels rather than `map` so the builtin map() is not shadowed.)
edu_labels = {
    10: 'Primary School',
    90: 'Not recorded',
    20: 'Secondary School',
    30: 'High School or Trades',
    50: "Short Higher Education",
    60: "Bachelors",
    70: "Masters",
    80: "Phd or researcher",
}
udd['edu'] = udd['UDDANNELSESNIVEAU'].map(edu_labels)

# Ordinal 1-8 rank for each education level code.
# Code 50 previously mapped to 6 (the same rank as 60), so rank 5 was never
# produced; it now maps to 5 so the eight codes get eight distinct ranks.
# NOTE(review): 'Not recorded' (90) is ranked 2, between primary and secondary
# school - confirm this placement is intended.
edu_ranks = {
    10: 1,
    90: 2,
    20: 3,
    30: 4,
    50: 5,
    60: 6,
    70: 7,
    80: 8,
}
udd['level'] = udd['UDDANNELSESNIVEAU'].map(edu_ranks)

In [None]:
# Look up the mother's education first; its description column becomes 'mor_udd'.
with_mother = (
    socio
    .merge(udd, how='left', left_on='mor_hfudd', right_on='AUDD')
    .rename(columns={'edu': 'mor_udd'})
)

# Then the father's education; the lookup's description column is named
# 'far_udd'. Columns shared by both lookups get pandas' default _x (mother)
# and _y (father) suffixes.
father_lookup = udd.rename(columns={'edu': 'far_udd'})
with_both = with_mother.merge(
    father_lookup, how='left', left_on='far_hfudd', right_on='AUDD'
)

# The two join keys are no longer needed after the lookups.
merged = with_both.drop(columns=['AUDD_x', 'AUDD_y'])

In [None]:
# Drop the raw education codes and give the remaining columns clearer names.
clearer_names = {
    'UDDANNELSESNIVEAU_x': 'audd_mor',
    'UDDANNELSESNIVEAU_y': 'audd_far',
    'level_x': 'edu_level_mother',
    'level_y': 'edu_level_father',
    'mor_brutto': 'income_mother',
    'far_brutto': 'income_father',
}
merged = (
    merged
    .drop(columns=['far_hfudd', 'mor_hfudd'])
    .rename(columns=clearer_names)
)

# Represent both social-status codes as strings.
for soc_col in ('far_soc', 'mor_soc'):
    merged[soc_col] = merged[soc_col].apply(str)

In [None]:
# Remove the odd "  ." category, which has very few instances.
# .copy() yields an independent frame so the assignments below do not write
# into a slice (avoids SettingWithCopyWarning).
merged = merged.query('mor_soc != "  ." & far_soc != "  ."').copy()

# Convert the status codes to float so they can be compared numerically.
merged['far_soc'] = merged['far_soc'].astype(float)
merged['mor_soc'] = merged['mor_soc'].astype(float)

# Keep only rows where both parents have a code above 100. NaN codes fail the
# comparison and are therefore dropped here as well.
merged = merged[(merged['far_soc'] > 100) & (merged['mor_soc'] > 100)].copy()


def _aggregate_social_status(codes):
    """Collapse numeric social-status codes into coarse categories.

    Parameters
    ----------
    codes : pandas.Series of float
        Social-status codes for one parent.

    Returns
    -------
    numpy.ndarray
        'employed' for codes in [110, 200), 'outside_of_worforce' for codes
        >= 200, and the code rendered as a string for anything else.
    """
    conditions = [
        (codes >= 110) & (codes < 200),
        codes >= 200,
    ]
    choices = ['employed', 'outside_of_worforce']
    # The default must be a string like the choices: recent NumPy versions
    # refuse to promote string choices together with a float default.
    return np.select(conditions, choices, default=codes.astype(str))


merged['social_father'] = _aggregate_social_status(merged['far_soc'])
merged['social_mother'] = _aggregate_social_status(merged['mor_soc'])

# Drop rows whose status ended up as the string 'nan' (missing code).
merged = merged[(merged['social_father'] != 'nan') & (merged['social_mother'] != 'nan')]

# The raw codes are no longer needed once the categories exist.
merged = merged.drop(columns=['far_soc', 'mor_soc', 'audd_mor', 'audd_far'])

# Save the file made so far.
merged.to_pickle('../df/edu_parents.pkl')

In [None]:
# Load the grade records from the SAS export.
filepath = '../data/karakterer.sas7bdat'
grade = pd.read_sas(
    filepath,
    format='sas7bdat',
    encoding='iso-8859-1',
)

In [None]:
def _students_by_status(status_col, income_col):
    """Count students and average the given income per social-status category.

    Reads the notebook-level `merged` frame. Returns one row per category of
    `status_col` with the columns `students` and `avg_salary`.
    """
    return (merged.groupby([status_col], sort=False)
                  .agg(students=('elev_id', 'count'),
                       avg_salary=(income_col, 'mean'))
                  .reset_index())


# Grouping by social status of fathers
soc_father = _students_by_status('social_father', 'income_father')

# Grouping by social status of mothers.
# The salary is averaged over the mother's income (it previously used the
# father's income by mistake).
soc_mother = _students_by_status('social_mother', 'income_mother')

# Both groupings use sort=False, so their row order can differ. Build one
# shared category order and align both counts to it, so that each pair of
# bars and its tick label describe the same category.
status_order = list(dict.fromkeys(
    [*soc_father['social_father'], *soc_mother['social_mother']]
))
father_students = (soc_father.set_index('social_father')['students']
                   .reindex(status_order, fill_value=0))
mother_students = (soc_mother.set_index('social_mother')['students']
                   .reindex(status_order, fill_value=0))

# Display names for the known categories; any other category is shown as-is.
display_names = {
    'employed': 'Employed',
    'outside_of_worforce': 'Outside of Workforce',
}
x_labels = [display_names.get(status, status) for status in status_order]

fig, ax = plt.subplots(figsize=(18, 8))

width = 0.35
pos1 = range(len(status_order))
pos2 = [p + width for p in pos1]

ax.bar(pos1, mother_students.to_numpy(), width, label='Number of mothers with a specified social status', color='#c44e52')
ax.bar(pos2, father_students.to_numpy(), width, label='Number of fathers with a specified social status', color='#4c72b0')

ax.set_xlabel('Social status of parents')
ax.set_ylabel('Number of students for mother and father')
ax.set_xticks(list(pos1))
ax.set_xticklabels(x_labels)
ax.legend()

# Saving the combined dataframe as a pickle file.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
temp = pd.concat([soc_father, soc_mother])
temp.to_pickle('figures/social-status-parents.pkl')

# Saving the plot as a PDF file
plt.savefig('figures/social-status-parents.pdf')

plt.show()

In [None]:
# Clean the grades table so that descriptions, means etc. can be computed.
#
# NOTE(review): the accepted values match the Danish 7-point scale, whose
# lowest mark is -3. The previous digit-only pattern discarded the minus sign,
# so -3 was kept as +3 and inflated the averages. The sign is now preserved
# and values are compared numerically (so "02" and "2.0" both count as 2).
# Confirm against the raw 'Karakter' values.
VALID_GRADES = [-3, 0, 2, 4, 7, 10, 12]

# Pull the leading (optionally signed) integer out of each grade entry;
# entries without a number become NaN.
karakter_num = pd.to_numeric(
    grade['Karakter'].astype(str).str.extract(r'(-?\d+)', expand=False),
    errors='coerce',
)

# Keep only rows holding a valid mark and store the mark as an integer.
# .assign returns a new frame, so nothing is written into a slice.
is_valid_grade = karakter_num.isin(VALID_GRADES)
grade = grade.loc[is_valid_grade].assign(
    Karakter=karakter_num.loc[is_valid_grade].astype(int).to_numpy()
)

In [None]:
# Collapse the grade records to one average grade per student.
temp = (
    grade
    .groupby('elev_id', sort=False, as_index=False)
    .agg(avg_grade=('Karakter', 'mean'))
)

# Attach each student's average grade to the parent information
# (inner join on the student id).
gm = merged.merge(temp, on='elev_id')

In [None]:
def _grades_by_education(edu_col):
    """Mean student grade and student count per category of `edu_col` in `gm`."""
    grouped = gm.groupby(edu_col, sort=False)
    summary = grouped.agg(
        avg_grade=('avg_grade', 'mean'),
        students=('elev_id', 'count'),
    )
    return summary.reset_index()


# Aggregate grades by the education level of each parent.
mom = _grades_by_education('mor_udd')
dad = _grades_by_education('far_udd')

# Combine both summaries side by side, matched on the education description.
parents = dad.merge(mom, left_on='far_udd', right_on='mor_udd')

In [None]:
# Sort the mothers' summary by average grade; this defines the bar order.
mom = mom.sort_values('avg_grade')

# Put the fathers' summary in the same education-level order as the mothers'.
# Sorting both frames independently by grade could place different education
# levels at the same bar position, while the tick labels come from `mom` only.
# Levels missing for fathers become NaN rows; levels that occur only for
# fathers are left out.
dad = (dad.set_index('far_udd')
          .reindex(mom['mor_udd'].to_numpy())
          .rename_axis('far_udd')
          .reset_index())

# Extracting data for x and y axes
x = mom['mor_udd']
y1 = mom['avg_grade']
y2 = dad['avg_grade']

# Function to split x-axis tick labels for better readability
def split_xtick_labels(labels):
    """Break each label onto two lines so long tick labels do not overlap.

    A four-word label is split two words per line; any other multi-word label
    keeps its first word on the first line and the rest on the second. A
    single-word label is returned unchanged (no trailing newline).

    Parameters
    ----------
    labels : iterable of str
        The tick labels to split.

    Returns
    -------
    list of str
        The labels with a newline inserted at the split point.
    """
    split_labels = []
    for label in labels:
        words = label.split(' ')
        first_line_len = 2 if len(words) == 4 else 1
        lines = [' '.join(words[:first_line_len])]
        remainder = words[first_line_len:]
        if remainder:
            # Only add a second line when there is something to put on it.
            lines.append(' '.join(remainder))
        split_labels.append('\n'.join(lines))
    return split_labels

# Splitting x-axis tick labels (taken from the mothers' summary, which defines
# the bar order)
x_labels = split_xtick_labels(mom['mor_udd'])

# Creating the figure and axes
fig, ax = plt.subplots(figsize=(18, 8))

# Define the width and positions for bars
width = 0.35
pos1 = range(len(mom))
pos2 = [p + width for p in pos1]

# Plotting the bar chart for mom and dad average grades
ax.bar(pos1, mom['avg_grade'], width, label='Average Grade of Students Based on Highest Education of Mother', color='#c44e52')
ax.bar(pos2, dad['avg_grade'], width, label='Average Grade of Students Based on Highest Education of Father', color='#4c72b0')

# Setting x and y labels, title, and tick labels
ax.set_xlabel('Highest completed education of parents')
ax.set_ylabel('Average Grade Received')
ax.set_title('Average Grade of Students Based on the Education Level of Their Parents')
ax.set_xticks(list(pos1))
ax.set_xticklabels(x_labels)
ax.legend()

# Add average grade annotations above each bar.
# The loop variable must not be called `grade`: that name holds the grades
# DataFrame used by later cells, and rebinding it to a float breaks them.
for bar_positions, averages in ((pos1, mom['avg_grade']), (pos2, dad['avg_grade'])):
    for bar_x, avg in zip(bar_positions, averages):
        if pd.notna(avg):  # nothing to annotate for a missing average
            ax.text(bar_x, avg + 0.05, f'{avg:.2f}', ha='center', va='bottom')

# Saving the parent_edu_grade dataframe as a pickle file
parents.to_pickle('figures/parent_edu_grade.pkl')

# Saving the plot as a PDF file
plt.savefig('figures/parents_edu_grade.pdf')

# Display the plot
plt.show()

In [None]:
# Extract the year from the 'Fra' (from) column.
# Work on the string form so that malformed entries can be filtered out first.
grade = grade.assign(Fra=grade['Fra'].astype(str))

# Keep only rows whose 'Fra' value starts with a plausible year (2xxx).
# The pattern is a raw string so that \d is not an invalid escape sequence;
# .copy() makes the result independent so the assignments below are safe.
grade = grade[grade['Fra'].str.match(r'^2\d{3}')].copy()
grade['Fra'] = pd.to_datetime(grade['Fra'])  # parse to datetime
grade['year'] = grade['Fra'].dt.year         # calendar year of the grade

# Number of grade records per year
fun = (grade.groupby('year', sort=False)
            .agg(students=('elev_id', 'count'))
            .reset_index()
       )

# Show the number of grades given for each year after 2002, in year order
fun = fun.sort_values('year')
fun = fun[fun['year'] > 2002]
x = fun.year
y = fun.students
fig, ax = plt.subplots(figsize=(16, 8))

ax.bar(x, y, color='steelblue')

ax.set_xlabel('Year')
ax.set_ylabel('Count of grades given')
ax.set_title('Sum of grades given over different years')

# One tick per year
ax.set_xticks(x)

# Save the plotted data and the figure
fun.to_pickle('figures/count_grades_data.pkl')
plt.savefig('figures/count_grades.pdf')
plt.show()