### Prepare group-based factors to be used in the matching experiment

We look at group diversity, group size, differences in socio-economic status, etc.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sas7bdat
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf

from scipy.spatial.distance import pdist, squareform
from scipy.stats import zscore

In [None]:
import matplotlib as mpl

# Start from the ggplot style sheet, then layer the notebook-wide overrides on top.
mpl.style.use('ggplot')

mpl.rcParams.update({
    # Typography
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 13,
    'axes.titlesize': 16,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'axes.titleweight': 'bold',
    # Hide the top and right spines
    'axes.spines.top': False,
    'axes.spines.right': False,
    # Colour cycle taken from the Set1 colormap
    'axes.prop_cycle': plt.cycler('color', plt.cm.Set1.colors),
    # Figure size, resolution and background colour
    'figure.figsize': [10, 6],
    'figure.dpi': 200,
    'savefig.dpi': 300,
    'figure.facecolor': '#f8f8f8',
})

In [None]:
# Load the frame with student group information.
elev_hold = pd.read_pickle('../Data/elevhold.pkl')
# Whole days elapsed between each row's start_dato and slut_dato.
elev_hold = elev_hold.assign(
    duration_days=(elev_hold['slut_dato'] - elev_hold['start_dato']).dt.days
)

In [None]:
# Load the grades table.
grades = pd.read_pickle(filepath_or_buffer='../Data/grades.pkl')

In [None]:
# Inner-join grades onto the student-group rows using the four shared keys.
test = pd.merge(
    elev_hold,
    grades,
    on=['elev_id', 'inst_nr', 'fag_nr', 'fag_niveau'],
)
# Keep only the rows that actually carry a grade.
hold_grade = test[test['Karakter'].notna()]

In [None]:
# Course catalogue: maps each course number (fag_nr) to its short name.
filepath = '../data/umo_fag.sas7bdat'
course = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# Keep the two columns needed and give them the names used as merge keys.
# Renaming the selection in one chained expression avoids an in-place rename
# on a column slice of the original frame.
course = course[['Fag', 'KortBetegnelse']].rename(
    columns={'Fag': 'fag_nr', 'KortBetegnelse': 'course'}
)

# Attach the course descriptions; this is where the hard_science column comes from.
course_description = pd.read_excel('course_description.xlsx')
course = course.merge(course_description, on='course', how='left')

# Keep only the courses that have a hard_science value.
course = course[~course['hard_science'].isna()]
print(len(course))
# drop_duplicates() returns a new frame; without assigning the result back the
# duplicates would stay in `course` and the two printed lengths would always match.
course = course.drop_duplicates()
len(course)

In [None]:
# Attach the course information to every graded student-group row.
hold_grade = hold_grade.merge(course, on='fag_nr', how='left')

# Remove rows that are exact duplicates of each other.
hold_grade = hold_grade.drop_duplicates()

# Student metadata that is joined onto the student-group rows below.
tmp = pd.read_pickle('clean_students.pkl')

# Discard the columns that are not needed for now.
tmp = tmp.drop(columns=[
    'lessons', 'absence', 'off_grade', 'off_absence', 'students',
    'pedagogical_percent', 'FOED_DATO', 'grades', 'percentage',
])

# Left-join the group/grade rows onto each student record.
tmp = tmp.merge(hold_grade, on=['elev_id', 'inst_nr'], how='left')

# Keep the rows whose start_dato..slut_dato interval lies inside
# startdato..slutdato (presumably the student's own enrolment period —
# confirm against clean_students.pkl).
filtered_df = tmp[
    tmp['start_dato'].ge(tmp['startdato']) & tmp['slut_dato'].le(tmp['slutdato'])
]

In [None]:
# Course catalogue from the umo export.
filepath = '../data/umo_fag.sas7bdat'
course = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# Reduce to the course number and short name, renamed to the merge-key names.
course = course[['Fag', 'KortBetegnelse']].rename(
    columns={'Fag': 'fag_nr', 'KortBetegnelse': 'course'}
)

# Join the course descriptions (which carry hard_science) onto the catalogue.
course_description = pd.read_excel('course_description.xlsx')
course = pd.merge(course, course_description, on='course', how='left')

# Report the row count after each clean-up step.
print(len(course))
course = course.loc[course['hard_science'].notna()]
print(len(course))
course = course.drop_duplicates()
print(len(course))

# Bring the remaining description columns onto the filtered student rows.
filtered_df = filtered_df.merge(
    course_description,
    on=['course', 'hard_science'],
    how='left',
)

In [None]:
# Course catalogue from the macom export, used to fill in the course /
# hard_science values that are still missing in filtered_df.
filepath = '../data/macom_fag.sas7bdat'
test = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# Drop the level column and rename the keys to match filtered_df.
test = test.drop(columns=['Niveau'])
test = test.rename(columns={'FagNummer': 'fag_nr', 'InstNr': 'inst_nr'})

# Rows that still lack a hard_science value, joined with the macom catalogue.
ha = filtered_df[filtered_df['hard_science'].isna()]
ha = ha.merge(test, on=['fag_nr', 'inst_nr'], how='left')

# Replace the course / hard_science columns with the macom course name and
# look the description up again under that name.
ha = ha.drop(columns=['course', 'hard_science']).rename(columns={'FagNavn': 'course'})
course_description = pd.read_excel('course_description.xlsx')
haha = ha.merge(course_description, on='course', how='left')

# One (course, hard_science) pair per fag_nr. sort_values places NaN last by
# default, so keep='first' prefers a row with a known hard_science value.
# Chaining works on fresh frames rather than mutating a column slice in place.
has = (
    haha[['fag_nr', 'course', 'hard_science']]
    .drop_duplicates()
    .sort_values('hard_science')
    .drop_duplicates(subset=['fag_nr'], keep='first')
)

# Look up the replacement values for every row of filtered_df.
new = filtered_df.merge(has, on=['fag_nr'], how='left', suffixes=('_df1', '_df2'))

# The result of drop() must be assigned back for the columns to be removed.
new = new.drop(columns=['course_df1', 'hard_science_df1'])
new = new.rename(columns={'course_df2': 'course', 'hard_science_df2': 'hard_science'})

columns_to_fill = ['course', 'hard_science']

# Fill the NaN values in filtered_df with the looked-up values. fillna aligns
# the two frames on their index, so this relies on `new` having the same row
# labels as filtered_df (`has` is unique per fag_nr, so the left merge adds no rows).
filtered_df[columns_to_fill] = filtered_df[columns_to_fill].fillna(new[columns_to_fill])

In [None]:
# School year of each row: the calendar year of slut_dato, moved back one year
# when slut_dato falls in January-June.
filtered_df['school_year'] = filtered_df['slut_dato'].dt.year.mask(
    filtered_df['slut_dato'].dt.month < 7,
    filtered_df['slut_dato'].dt.year - 1,
)

# Two-digit form of the school year.
filtered_df['school_year_suffix'] = filtered_df['school_year'].mod(100)

# Year-specific group id: the suffix is added to holdnr as two decimals.
filtered_df['new_holdnr'] = filtered_df['holdnr'].add(
    filtered_df['school_year_suffix'].div(100)
)

In [None]:
# Keep only the rows whose 'Fra' date lies within startdato..slutdato (both ends inclusive).
new_df = filtered_df[
    filtered_df['Fra'].between(filtered_df['startdato'], filtered_df['slutdato'])
]

In [None]:
# Numeric grade: 'Karakter' values that cannot be parsed become NaN and are
# then stored as 0.
# NOTE(review): those zeros cannot be told apart from a real grade of 0 and
# enter any later average of 'grade' — confirm this is intended.
# assign() builds a new frame instead of writing a column into the
# boolean-filtered `new_df`, which avoids pandas' SettingWithCopyWarning.
new_df = new_df.assign(
    grade=pd.to_numeric(new_df['Karakter'], errors='coerce').fillna(0).astype(int)
)

# Exclude the rows whose grade equals 94.
# NOTE(review): 94 looks like a non-grade code — confirm its meaning.
new_df = new_df[new_df['grade'] != 94]

In [None]:
# Integer age (the day count of age_semester) and a 0/1 gender flag recoded
# from KOEN (1 -> 0, 2 -> 1), so the group mean of 'gender' is the share of
# code-2 students. assign() returns a new frame rather than writing columns
# into the filtered `new_df`, which avoids pandas' SettingWithCopyWarning.
new_df = new_df.assign(
    age=new_df['age_semester'].dt.days.astype(int),
    gender=new_df['KOEN'].astype(int).replace({1: 0, 2: 1}),
)

# One row per group (new_holdnr): number of distinct students plus the group
# means of the student-level variables. The output column names are kept
# exactly as they were because they are written to disk further down.
group_meta = (
    new_df.groupby(['new_holdnr'], sort=False)
    .agg(
        students=('elev_id', 'nunique'),
        gender_spread=('gender', 'mean'),
        income_mothers=('income_mother', 'mean'),
        income_fathers=('income_father', 'mean'),
        edu_mothers=('edu_level_mother', 'mean'),
        edu_fathers=('edu_level_father', 'mean'),
        group_grade=('grade', 'mean'),
        group_abssence=('percent', 'mean'),
        average_age=('age', 'mean'),
        students_avg_grde=('avg_grade', 'mean'),
    )
    .reset_index()
)


In [None]:
# Load the stored per-group lecture data.
lecture_data = pd.read_pickle(filepath_or_buffer='../df/group_meta.pkl')

In [None]:
# Combine the student rows with the lecture data (left join, so every student
# row is kept) and then with the group construction data (inner join).
merged = (
    new_df
    .merge(lecture_data, on='new_holdnr', how='left')
    .merge(group_meta, on='new_holdnr')
)

In [None]:
# Recode KOEN into a 0/1 gender flag (1 -> 0, 2 -> 1).
merged['gender'] = merged['KOEN'].astype(int).replace({1: 0, 2: 1})

# Remove the date, course-number and helper columns.
merged = merged.drop(columns=[
    'start_dato', 'slut_dato', 'fag_nr', 'Fra', 'Til', 'duration_days',
    'school_year_suffix', 'school_year', 'holdnr', 'KOEN',
])

In [None]:
# Persist the merged frame to disk.
merged.to_pickle(path='../df/group_regression.pkl')

In [None]:
# Course catalogue from the macom export, without its level column.
filepath = '../data/macom_fag.sas7bdat'
test = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1').drop(
    columns=['Niveau']
)

In [None]:
# Attach the macom course rows to every student-group row, matching on
# course number and institution number; unmatched rows are kept (left join).
merge_course = pd.merge(
    elev_hold,
    test,
    how='left',
    left_on=['fag_nr', 'inst_nr'],
    right_on=['FagNummer', 'InstNr'],
)

In [None]:
# School year of each row: the calendar year of slut_dato, moved back one year
# when slut_dato falls in January-June.
merge_course['school_year'] = merge_course['slut_dato'].dt.year.mask(
    merge_course['slut_dato'].dt.month < 7,
    merge_course['slut_dato'].dt.year - 1,
)

# Two-digit form of the school year.
merge_course['school_year_suffix'] = merge_course['school_year'].mod(100)

# Year-specific group id: the suffix is added to holdnr as two decimals.
merge_course['new_holdnr'] = merge_course['holdnr'].add(
    merge_course['school_year_suffix'].div(100)
)

In [None]:
# One Danish ("Dansk") group per student, institution and school year.
# sort_values() returns a new frame, so it has to be part of the chain (or be
# assigned back) for keep='first' to pick the row with the earliest start_dato.
# Chaining also avoids in-place edits on the result of query().
danish = (
    merge_course.query('FagNavn == "Dansk"')
    .sort_values('start_dato')
    .drop_duplicates(subset=['elev_id', 'inst_nr', 'school_year'], keep='first')
    # Drop the course/date columns that are no longer needed after the selection.
    .drop(columns=[
        'aarsag', 'holdnr', 'start_dato', 'slut_dato', 'fag_nr', 'fag_niveau',
        'duration_days', 'FagNummer', 'FagNavn', 'InstNr', 'school_year_suffix',
    ])
)


In [None]:
# Student-level socio-economic table.
student_socio = pd.read_pickle('../df/normalized_student.pkl')

# Integer age: the day count of age_semester.
student_socio['age'] = student_socio['age_semester'].dt.days.astype(int)

# School year: the calendar year of slutdato, moved back one year when
# slutdato falls in January-June.
student_socio['school_year'] = student_socio['slutdato'].dt.year.mask(
    student_socio['slutdato'].dt.month < 7,
    student_socio['slutdato'].dt.year - 1,
)

# Columns carried into the merge with the Danish class rows.
to_merge = student_socio.loc[:, [
    'elev_id', 'inst_nr', 'step', 'gender', 'school_year',
    'income_mother', 'income_father',
    'edu_level_father', 'edu_level_mother',
    'avg_grade', 'percent', 'age',
]]

In [None]:
# Inner-join the student socio-economic columns onto the Danish class rows.
dan_merge = pd.merge(
    danish,
    to_merge,
    how='inner',
    on=['elev_id', 'school_year', 'inst_nr'],
)

In [None]:
# Variables screened for outliers.
columns_of_interest = ['age', 'income_mother', 'income_father']

# Column-wise z-scores. With scipy's default nan_policy='propagate' a single
# missing value makes every z-score of that column NaN, which would blank
# all rows below; 'omit' computes mean and std from the available values.
income_zscores = dan_merge[columns_of_interest].apply(zscore, nan_policy='omit')

# A row is kept when none of its z-scores reaches 2.5 in absolute value.
# Missing z-scores compare False and therefore do not mark a row as an outlier.
# Without missing values this is the same as requiring all |z| < 2.5.
income_mask = ~(income_zscores.abs() >= 2.5).any(axis=1)

# Assigning only the kept rows back aligns on the index, so the rows rejected
# by the mask receive NaN in these columns while staying in the frame.
dan_merge[columns_of_interest] = dan_merge.loc[income_mask, columns_of_interest]

# Per-group metadata: number of distinct students and the group means.
dan_meta = (
    dan_merge.groupby(['new_holdnr', 'inst_nr'], sort=False)
    .agg(
        students=('elev_id', 'nunique'),
        income_mothers=('income_mother', 'mean'),
        income_fathers=('income_father', 'mean'),
        edu_mothers=('edu_level_mother', 'mean'),
        edu_fathers=('edu_level_father', 'mean'),
        students_avg_grade=('avg_grade', 'mean'),
        students_absence=('percent', 'mean'),
        avg_age=('age', 'mean'),
    )
    .reset_index()
)

# Put the group-level values on every student row.
dan_merge = dan_merge.merge(dan_meta, on=['new_holdnr', 'inst_nr'], how='left')

# Keep groups with 11 to 36 distinct students.
dan_merge = dan_merge[(dan_merge['students'] > 10) & (dan_merge['students'] < 37)]

In [None]:
# Columns rescaled below.
cols_to_normalize = ['age', 'avg_age']

# Columns whose within-group spread makes up the diversity score.
cols_of_interest = ['edu_level_mother', 'edu_level_father', 'income_mother', 'income_father']

# Target range of the rescaling.
min_value = -3
max_value = 3

# Rescale each listed column to [min_value, max_value].
# NOTE(review): each column is scaled by its own min/max, so 'age' and
# 'avg_age' end up on different scales — confirm before comparing them.
scaler = MinMaxScaler(feature_range=(min_value, max_value))
dan_merge[cols_to_normalize] = scaler.fit_transform(dan_merge[cols_to_normalize])

# Column identifying a group.
# NOTE(review): the group means above are computed per (new_holdnr, inst_nr);
# here groups are formed on new_holdnr alone — confirm ids are unique across
# institutions.
group_col = 'new_holdnr'

# Standard deviation of every variable within each group, plus their row-wise
# mean as the group's diversity score.
std_devs = dan_merge.groupby(group_col)[cols_of_interest].std()
std_devs = std_devs.assign(diversity_score=std_devs.mean(axis=1))

# Two-column frame: group identifier and diversity score.
diversity_scores = std_devs.loc[:, ['diversity_score']].reset_index()

In [None]:
# Attach each group's diversity score to the rows of the main Danish class.
dan_merge = pd.merge(dan_merge, diversity_scores, how='inner', on='new_holdnr')

In [None]:
# Each student's deviation from the class mean for every relevant variable.
dan_merge = dan_merge.assign(
    diff_income_mother=dan_merge['income_mother'] - dan_merge['income_mothers'],
    diff_income_father=dan_merge['income_father'] - dan_merge['income_fathers'],
    diff_edu_mother=dan_merge['edu_level_mother'] - dan_merge['edu_mothers'],
    diff_edu_father=dan_merge['edu_level_father'] - dan_merge['edu_fathers'],
    diff_age=dan_merge['age'] - dan_merge['avg_age'],
)

# Remove the raw and group-level columns, then collapse identical rows.
dan_merge = dan_merge.drop(columns=[
    'income_mother', 'income_father', 'edu_level_father', 'edu_level_mother',
    'income_fathers', 'income_mothers', 'edu_fathers', 'edu_mothers',
    'age', 'avg_age', 'new_holdnr', 'gender', 'students',
]).drop_duplicates()


In [None]:
# Clean student registry.
students = pd.read_pickle('../Data/clean_reg.pkl')

# School year: the calendar year of slutdato, moved back one year when
# slutdato falls in January-June.
students['school_year'] = students['slutdato'].dt.year.mask(
    students['slutdato'].dt.month < 7,
    students['slutdato'].dt.year - 1,
)

# Align the institution key name with dan_merge and drop the date columns.
students = (
    students
    .rename(columns={'INSTNR': 'inst_nr'})
    .drop(columns=['startdato', 'slutdato'])
)

# Attach the registry columns to the main-class rows; every main-class row is kept.
group_dynamics = pd.merge(
    dan_merge,
    students,
    how='left',
    on=['elev_id', 'step', 'inst_nr', 'school_year'],
)

# Store the frame with diversity scores and socio-economic differences.
group_dynamics.to_pickle('../distance/diversity_info.pkl')