### This notebook matches students to their lessons
The code that uses the resulting data can be found in the notebook 'Exploring Absence'.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sas7bdat
import seaborn as sns

In [None]:
# Start from the ggplot style
plt.style.use('ggplot')

# Layer the seaborn whitegrid style on top. Matplotlib 3.6 renamed the bundled
# seaborn styles to 'seaborn-v0_8-*' and deprecated the old names, so prefer
# the new name when this matplotlib version provides it and fall back to the
# old name otherwise.
whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
plt.style.use(whitegrid_style)

# Fonts, label sizes, spines and the colour cycle
plt.rcParams.update({
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 13,
    'axes.titlesize': 16,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'axes.titleweight': 'bold',
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Set1.colors),
})

# Figure size, resolution (on screen and when saved) and background colour
plt.rcParams.update({
    'figure.figsize': [10, 6],
    'figure.dpi': 200,
    'savefig.dpi': 300,
    'figure.facecolor': '#f8f8f8',
})

In [None]:
# Base student data for the period, loaded from clean_reg.pkl; the institution
# column 'INSTNR' is renamed to lower-case 'instnr'
elev = pd.read_pickle('../data/clean_reg.pkl').rename(columns={'INSTNR': 'instnr'})

In [None]:
# Every lesson record in the study period (one table, loaded from all_lessons.pkl)
absence = pd.read_pickle('../df/all_lessons.pkl')

In [None]:
# Keep only rows whose lesson start hour is 8 through 15, i.e. start times from
# 08:00 up to 15:59. The .copy() makes the result independent of `absence`, so
# adding a column below does not raise pandas' SettingWithCopyWarning.
start_hour = absence['start_time'].dt.hour
filtered_absence = absence[(start_hour >= 8) & (start_hour <= 15)].copy()

# Indicator column: 1 if FravaersProcent is above zero, otherwise 0.
# Missing values compare as False and therefore become 0.
filtered_absence['absence_above_zero'] = (filtered_absence['FravaersProcent'] > 0).astype(int)

# Mean of the indicator per start hour = share of rows with absence above zero
prob_df = filtered_absence.groupby(filtered_absence['start_time'].dt.hour)['absence_above_zero'].mean()

In [None]:
# Line plot of the probability of absence above zero, one point per start hour
fig, ax = plt.subplots()
ax.plot(prob_df.index, prob_df.values)

# Axis labels and title
ax.set(
    xlabel='Time of Day',
    ylabel='Probability of Absence Above Zero',
    title='Probability of Absence Above Zero by Time of Day',
)

# Write the figure to disk as a PDF
fig.savefig('figures/prob_absence_start_day.pdf')

# Store the plotted values next to the figure as a pickle file
prob_df.to_pickle('figures/prob_absence_start_day.pkl')

# Display the plot
plt.show()


In [None]:
# Lesson table, reduced to the merge keys plus the end timestamp
lessons = pd.read_pickle('../data/lektioner.pkl')
lessons = lessons[['inst_nr', 'lektions_nr', 'start_time', 'end_time']]

# Restrict the absence records to students that appear in elev
in_study = absence['elev_id'].isin(elev['elev_id'])
absence = absence[in_study]

# Left join on lesson number, institution and start time: every absence row is
# kept and gains the lesson's end_time
absence = absence.merge(lessons, how='left', on=['lektions_nr', 'inst_nr', 'start_time'])

# Use the same institution column name as the absence table, then attach the
# absence rows to each student on (elev_id, inst_nr)
elev = elev.rename(columns={'instnr': 'inst_nr'})
elev = elev.merge(absence, on=['elev_id', 'inst_nr'])

In [None]:
# Keep a student-lesson row only if both the lesson start and the lesson end
# lie within the student's [startdato, slutdato] window (bounds inclusive)
starts_in_window = elev['start_time'].between(elev['startdato'], elev['slutdato'])
ends_in_window = elev['end_time'].between(elev['startdato'], elev['slutdato'])
elev = elev[starts_in_window & ends_in_window]

# Divide FravaersProcent by 100 (presumably turning a 0-100 percentage into a
# 0-1 fraction -- confirm against the source data)
elev['FravaersProcent'] = elev['FravaersProcent'] / 100

# Remove rows that are exact duplicates
elev.drop_duplicates(inplace=True)

# Persist the complete student-lesson absence table
elev.to_pickle('../df/semester_absence_complete.pkl')

In [None]:
# For every (student, institution, startdato, slutdato, step) combination:
# count the lessons and sum the absence, keeping groups in first-seen order
enrolment_keys = ['elev_id', 'inst_nr', 'startdato', 'slutdato', 'step']
grouped = (
    elev.groupby(enrolment_keys, sort=False)
    .agg(lessons=('lektions_nr', 'count'), absence=('FravaersProcent', 'sum'))
    .reset_index()
)

# Average absence per lesson; per the original note this is used as an offset
# in the regression
grouped['percentage'] = grouped['absence'] / grouped['lessons']

# Persist the per-enrolment absence summary
grouped.to_pickle('../df/final_percentage.pkl')

In [None]:
# For every (lesson, institution): number of distinct students and summed absence
lesson_keys = ['lektions_nr', 'inst_nr']
students_lesson = (
    elev.groupby(lesson_keys, sort=False)
    .agg(students=('elev_id', 'nunique'), absence=('FravaersProcent', 'sum'))
    .reset_index()
)

# Keep lessons with a realistic group size: more than 10 and fewer than 35 students
size_ok = (students_lesson['students'] > 10) & (students_lesson['students'] < 35)
temp = students_lesson[size_ok]

# Persist the per-lesson student counts
temp.to_pickle('../df/lessons_number_students.pkl')

In [None]:
# Drop the columns that are not needed for the group-size calculation
elev = elev.drop(
    columns=['join_reason', 'depart_reason', 'FravaersProcent', 'start_time', 'end_time']
)

# Attach the per-lesson student count to each student-lesson row; the inner
# join keeps only lessons present in temp (the size-filtered lessons)
group_size = elev.merge(temp, on=['lektions_nr', 'inst_nr'])

# Mean number of students per lesson for every
# (student, institution, startdato, slutdato, step) combination
group_sizes = (
    group_size.groupby(['elev_id', 'inst_nr', 'startdato', 'slutdato', 'step'], sort=False)['students']
    .mean()
    .reset_index()
)

# Persist the per-student average group size
group_sizes.to_pickle('../df/groups_size.pkl')