### Exploration of the income of parents
Look at mothers' and fathers' income.

In [None]:
import pandas as pd
import sas7bdat
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib as mpl

# Global plotting configuration for the notebook.
#
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names, so pick whichever name this
# installation actually provides instead of hard-coding the legacy one.
_whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in mpl.style.available
    else 'seaborn-whitegrid'
)
mpl.style.use(_whitegrid_style)

# Applied after the style sheet so these values override the style's defaults.
# `mpl.rcParams` and `plt.rcParams` are the same object, so one update suffices.
mpl.rcParams.update({
    # Typography
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.titleweight': 'bold',
    # Axes appearance: hide the top/right spines, use the Set1 palette
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Set1.colors),
    # Figure size and resolution (on screen and when saved)
    'figure.figsize': [10, 6],
    'figure.dpi': 200,
    'savefig.dpi': 300,
    'figure.facecolor': '#f8f8f8',
})

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read the `socio` table from its SAS7BDAT file into a DataFrame.
filepath = '../data/socio.sas7bdat'
socio = pd.read_sas(filepath_or_buffer=filepath, format='sas7bdat')

In [None]:
# Read the `umo_uddannelser` table from its SAS7BDAT file into a DataFrame.
filepath = '../data/umo_uddannelser.sas7bdat'
udd = pd.read_sas(filepath_or_buffer=filepath, format='sas7bdat')

In [None]:
# Histogram of mother's ('mor_brutto') and father's ('far_brutto') income after
# dropping rows with missing income and rows whose income is a z-score outlier.
#
# NOTE(review): `test` is not defined in the visible cells -- presumably it is
# built from `socio` / `udd` in an earlier cell; confirm before running.


def _income_label(parent, values):
    """Return the legend text for *parent* with mean, median and std of *values*."""
    mean, median, std = values.describe()[['mean', '50%', 'std']]
    return (
        f'Income {parent}: Mean={mean:.2f}, Median={median:.2f}, '
        f'Standard Deviation={std:.2f}'
    )


# Absolute z-score of each parent's income, stored on `test` itself
# (pandas' mean/std skip NaN, so missing incomes simply get a NaN z-score).
for z_col, income_col in (('z_mother', 'mor_brutto'), ('z_father', 'far_brutto')):
    income = test[income_col]
    test[z_col] = ((income - income.mean()) / income.std()).abs()

# Keep only rows where both incomes are present.
new = test.dropna(subset=['far_brutto', 'mor_brutto'])

z_threshold = 2

# Keep only rows where both parents are within the z-score threshold.
within_threshold = (new['z_father'] <= z_threshold) & (new['z_mother'] <= z_threshold)
df = new[within_threshold].reset_index(drop=True)

x = df['far_brutto']
y = df['mor_brutto']

fig, ax = plt.subplots(figsize=(16, 8))

# The legend statistics are computed from the series that is actually plotted.
# Previously the father's label was taken from `socio` and the mother's label
# was computed from the father's column, so neither matched its histogram.
sns.histplot(x, ax=ax, color='steelblue', label=_income_label('Father', x), bins=50)
sns.histplot(y, ax=ax, label=_income_label('Mother', y), bins=50)

# NOTE(review): the labels state the unit as millions -- confirm against the data.
ax.set_xlabel('Income of parents in millions')
ax.set_ylabel('Count of students in income bucket')
ax.set_title('Distribution of Income for Mother and Father in Millions')
ax.legend()
fig.tight_layout()

# Save the data as a pickle file. This is the full `test` frame (including the
# z-score columns added above), not the filtered `df`.
test.to_pickle('figures/mom_and_dad_income.pkl')

# Save the figure as a PDF file (before `show`, which may clear the figure).
fig.savefig('figures/mom_and_dad_income.pdf')

plt.show()