### Here we examine how many lessons students attend and how much absence they are registered with

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sas7bdat
import seaborn as sns
from scipy.signal import find_peaks

# Apply the seaborn "whitegrid" look. matplotlib 3.6 renamed its bundled
# seaborn styles to 'seaborn-v0_8-*' and later releases dropped the old names,
# so prefer the new name and fall back to the legacy one on older installs.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
if _whitegrid_style not in mpl.style.available:
    _whitegrid_style = 'seaborn-whitegrid'
mpl.style.use(_whitegrid_style)

# Defaults shared by every figure in this notebook (mpl.rcParams and
# plt.rcParams are the same object, so one update covers both).
mpl.rcParams.update({
    # Typography
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 13,
    'axes.titlesize': 16,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'axes.titleweight': 'bold',
    # Hide the top and right axes borders
    'axes.spines.top': False,
    'axes.spines.right': False,
    # Colour cycle taken from the Set1 colormap
    'axes.prop_cycle': plt.cycler('color', plt.cm.Set1.colors),
    # Figure size, resolution and background
    'figure.figsize': [10, 6],
    'figure.dpi': 200,
    'savefig.dpi': 300,
    'figure.facecolor': '#f8f8f8',
})

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# Read the percentages calculated in "Absence Attendance Calculation".
# The columns 'lessons', 'absence' and 'percentage' are used by the cells below.
df=pd.read_pickle('../df/final_percentage.pkl')

In [None]:
# Histogram of the number of lessons per student, with the two most populated
# local maxima of the histogram highlighted.

# Keep only rows with fewer than 1200 lessons. df itself is narrowed here and
# is reused in that narrowed form by the cells below.
df = df[df['lessons'] < 1200]

# The column being plotted
x = df['lessons']

fig, ax = plt.subplots(figsize=(16, 8))

# Draw the 60-bin histogram explicitly on the axes created above
sns.histplot(x, kde=False, bins=60, color="steelblue", ax=ax)

# Recompute a 60-bin histogram with NumPy to obtain the counts per bin and the
# bin centres (intended to mirror the plotted binning).
counts, bin_edges = np.histogram(x, bins=60)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# find_peaks is scipy.signal.find_peaks: it returns the indices of the local
# maxima of counts. Sort those by count and keep the two highest.
peaks, _ = find_peaks(counts)
peaks = peaks[np.argsort(counts[peaks])][-2:]

# Mark the selected peaks on top of the bars
ax.scatter(bin_centers[peaks], counts[peaks], marker='o', color='red', zorder=10)

# Axis labels and a title carrying the summary statistics of 'lessons'
ax.set_xlabel('Number of lessons that students have in a period')
ax.set_ylabel('Number of students with a given number of lessons')
ax.set_title('Plot showing the number of activities that students should be present for in a year:\nMean={:.2f}, Median={:.2f}, Standard Deviation={:.2f}'.format(*df['lessons'].describe()[['mean', '50%', 'std']]))

# Fixed x-axis tick positions
t_list = [0, 220, 440, 660, 880, 1200]
ax.set_xticks(t_list)

# Save the plotted series next to the figure
x.to_pickle('figures/lessons_pr_year.pkl')

# Tighten the layout before saving, as the other histogram cells do, so the
# two-line title and the labels are not clipped in the PDF.
fig.tight_layout()
fig.savefig('figures/lessons_pr_year.pdf')

plt.show()

In [None]:
# Share of students with fewer than 1200 lessons.
# df has already been restricted to lessons < 1200 in the histogram cell, so
# comparing df with itself would always give 1.0. Re-read the unfiltered table
# to report the share that the filter actually keeps.
unfiltered = pd.read_pickle('../df/final_percentage.pkl')
(unfiltered['lessons'] < 1200).mean()

In [None]:
# Distribution of the absence registered per student in a period.

# Restrict to rows with absence below 250; mg is reused by the cells below.
absence_mask = df['absence'] < 250
mg = df[absence_mask]

# Series that goes into the histogram
x = mg['absence']

# Summary statistics quoted in the title
absence_stats = mg['absence'].describe()
absence_title = 'Plot showing the number of activities that students have registered absence for in a period:\nMean={:.2f}, Median={:.2f}, Standard Deviation={:.2f}'.format(
    absence_stats['mean'], absence_stats['50%'], absence_stats['std']
)

# 85-bin histogram on a fresh 16x8 figure
fig, ax = plt.subplots(figsize=(16, 8))
sns.histplot(x, bins=85, color='steelblue', ax=ax)

# Labels and title in one call
ax.set(
    xlabel='Total lessons missed through absence in a period',
    ylabel='Number of students with said absence',
    title=absence_title,
)

# Store the filtered frame alongside the figure
mg.to_pickle('figures/fraver_pr_year.pkl')

# Tidy the spacing, write the PDF, then show the plot
fig.tight_layout()
fig.savefig('figures/fraver_pr_year.pdf')
plt.show()


In [None]:
# Distribution of the percentage absence, for the rows kept in mg.
x = mg['percentage']

# Summary statistics quoted in the title
percentage_stats = mg['percentage'].describe()
percentage_title = 'The  percentage of absence that students have in a period: Mean={:.2f}, Median={:.2f}, Standard Deviation={:.2f}'.format(
    percentage_stats['mean'], percentage_stats['50%'], percentage_stats['std']
)

# 85-bin histogram on a fresh 16x8 figure
fig, ax = plt.subplots(figsize=(16, 8))
sns.histplot(x, bins=85, color='steelblue', ax=ax)

# Labels and title in one call
ax.set(
    xlabel='Percentage absence in a period of study',
    ylabel='Number of students with said percentage absence',
    title=percentage_title,
)

# Store the underlying frame alongside the figure
mg.to_pickle('figures/percent_absence.pkl')

# Tidy the spacing, write the PDF, then show the plot
fig.tight_layout()
fig.savefig('figures/percent_absence.pdf')
plt.show()

In [None]:
# Write mg (the rows of df with absence < 250) to a pickle under ../df
mg.to_pickle('../df/lecture_absence.pkl')

In [None]:
# Load the pickle at ../data/clean_reg.pkl (presumably the cleaned
# registration data, judging by the file name).
# NOTE(review): `elev` is not referenced by any later cell visible here —
# confirm it is still needed.
elev=pd.read_pickle('../data/clean_reg.pkl')

In [None]:
# Load the student table from its SAS export
filepath = '../data/macom_elever.sas7bdat'
elever = pd.read_sas(filepath, format='sas7bdat')

# The Coesa columns are not used, and Optagelses_kriterie only has one value
unused_columns = ['CoesaFormaal', 'CoesaFormaalVersion', 'Optagelses_kriterie']
elever = elever.drop(columns=unused_columns)

# Calendar year of each student's start date
elever['start_year'] = elever['startdato'].dt.year

# Load the departure-reason table, used to relate not finishing to absence.
# This file is decoded as ISO-8859-1.
filepath = '../data/umo_afgangsaarsager.sas7bdat'
aa = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# Attach the reason to every student (inner join on the reason code) and
# keep only the relevant columns
dep = elever.merge(aa, on='AfgangsAarsag')[['elev_id', 'Betegnelse', 'AfgangsAarsag']]

# Left-join the reasons onto the absence overview, so every row of mg is kept
new_merge = mg.merge(dep, how='left', on='elev_id')

In [None]:
# Only rows that carry a departure reason
has_reason = new_merge['AfgangsAarsag'].notna()
review = new_merge[has_reason].copy()
review['Betegnelse'] = review['Betegnelse'].astype(str)

# One row per student: order by 'elev_id' and then 'startdato', and keep the
# row that sorts last within each 'elev_id' (the latest 'startdato')
review = (
    review
    .sort_values(['elev_id', 'startdato'])
    .drop_duplicates(subset='elev_id', keep='last')
)

In [None]:
# Binary target: 1 when 'AfgangsAarsag' is neither 17 nor 29, which this
# analysis treats as not finishing; 0 otherwise.
review['target'] = (~review['AfgangsAarsag'].isin([17, 29])).astype(int)

# Logistic regression of the target on percentage absence, drawn with seaborn.
# NOTE(review): ci=100 requests a 100% bootstrap interval (seaborn's default
# is 95) — confirm this is intended.
plt.figure(figsize=(12, 8))
sns.regplot(x='percentage', y='target', data=review, logistic=True, ci=100, color='steelblue', label='Logistic Regression', line_kws={'linewidth': 5})
plt.xlabel('Percentage Absence')
# target == 1 marks NOT finishing, so the fitted curve is the chance of not
# finishing; the label previously read "Chance of chance finishing education".
plt.ylabel('Chance of not finishing education')
plt.title('Logistic Regression of Absence Percentage on Finishing Education')

# Save the figure and the data behind it, then display the plot
plt.savefig('figures/logistic_absence.pdf')
review.to_pickle('figures/logistic_absence.pkl')
plt.show()
