### This notebook explores the student table

It looks at things such as the number of students, the year they started, the year they finished and their reason for departure.

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sas7bdat
import seaborn as sns
from scipy.signal import find_peaks

# Plot styling shared by every figure in this notebook.
#
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use whichever whitegrid variant this
# installation provides (falling back to the default style if neither exists).
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in mpl.style.available),
    'default',
)
mpl.style.use(whitegrid_style)

# Overrides applied on top of the style sheet; they must come after style.use(),
# which would otherwise overwrite them. plt.rcParams and mpl.rcParams are the
# same object, so a single update covers both.
mpl.rcParams.update({
    # Fonts
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 13,
    'axes.titlesize': 16,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'axes.titleweight': 'bold',
    # Axes decoration and colour cycle
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.prop_cycle': plt.cycler('color', plt.cm.Set1.colors),
    # Figure size and resolution (on screen and when saved)
    'figure.figsize': [10, 6],
    'figure.dpi': 200,
    'savefig.dpi': 300,
    'figure.facecolor': '#f8f8f8',
})

# Show every column when displaying a DataFrame.
pd.set_option('display.max_columns', None)

In [None]:
# Load the student register from its SAS file.
filepath = '../data/elevregister.sas7bdat'
df = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# Drop the 'UDD' column and give the two ELEV3_* date columns the names
# ('startdato' / 'slutdato') that the rest of the notebook refers to.
register = (
    df.drop(columns=['UDD'])
      .rename(columns={'ELEV3_VFRA': 'startdato', 'ELEV3_VTIL': 'slutdato'})
)

In [None]:
# Load the lookup table of reasons for departure.
filepath = '../data/umo_afgangsaarsager.sas7bdat'
aa = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# Only the reason code (later used as merge key) and its short description
# (later used for grouping) are needed.
relevant_columns = ['AfgangsAarsag', 'KortBetegnelse']
aa = aa[relevant_columns]

In [None]:
# Second student table, with the detailed reason for departure (not split by semester).
filepath = '../data/macom_elever.sas7bdat'
df = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# The Coesa columns are not used and Optagelses_kriterie only has one value,
# so none of them carry information for this analysis.
unused_columns = ['CoesaFormaal', 'CoesaFormaalVersion', 'Optagelses_kriterie']
elever = df.drop(columns=unused_columns)

# Add the calendar year of the start date to both student tables.
for table in (elever, register):
    table['start_year'] = table['startdato'].dt.year

In [None]:
# Attach the short description of each reason for departure to the students.
# This is an inner join (the pd.merge default, spelled out here), so students
# whose reason code has no match in `aa` are left out.
new_elever = pd.merge(elever, aa, how='inner',
                      left_on='AfgangsAarsag', right_on='AfgangsAarsag')

# Count the students per reason for departure (and keep the reason code).
fun = (new_elever.groupby('KortBetegnelse', sort=False)
                 .agg(students=('elev_id', 'count'),
                      reason=('AfgangsAarsag', 'median'))
                 .reset_index()
       )

# English display labels for the Danish short descriptions.
# Deliberately not called `map`: that would shadow the builtin for every
# cell executed afterwards.
reason_labels = {
    'Fuldført GYM': 'Completed Education',
    "Ej bestået": 'Did Not Pass',
    "Skiftet skole": 'Changed School',
    "Fortrudt udd.": 'Regretted Education',
    "Person. forhold": "Personal reasons",
    "Ej påbegyndt": "Not started",
    "Bortvist": "Expelled",
    "Afb. udveksling": "Cancelled Exchange",
    "Flyttet": "Moved",
    "Niveau for højt": "Level Too High"
}

# Keep only the ten most common reasons and translate their labels.
# assign() builds a new frame, so nothing is written into the head(10) slice
# (which would raise SettingWithCopyWarning). A description that is missing
# from reason_labels keeps its original text instead of becoming NaN.
test = (
    fun.sort_values('students', ascending=False)
       .head(10)
       .assign(KortBetegnelse=lambda top: top['KortBetegnelse']
                                             .map(reason_labels)
                                             .fillna(top['KortBetegnelse']))
)

### Visualization of the reasons for departure among students
The figure is saved as a PDF; the underlying DataFrame is not currently exported.

In [None]:
# Bar chart of the ten most common reasons for departure.

# Labels and student counts. The labels are cast to str so that a missing
# (NaN) label cannot break the word splitting below.
reasons = test['KortBetegnelse'].astype(str)
counts = test['students']

# Break three-word labels onto two lines ("A B\nC") so that neighbouring
# tick labels do not run into each other; other labels are left untouched.
xtick_labels = []
for label in reasons:
    words = label.split(' ')
    if len(words) == 3:
        label = f'{words[0]} {words[1]}\n{words[2]}'
    xtick_labels.append(label)

fig, ax = plt.subplots(figsize=(18, 10))

# Draw the bars once, at explicit positions 0..n-1, and pin the ticks to those
# positions before labelling them, so every label is tied to its own bar.
bar_color = 'steelblue'
positions = list(range(len(counts)))
ax.bar(positions, counts, color=bar_color)
ax.set_xticks(positions)
ax.set_xticklabels(xtick_labels)

ax.set_xlabel('Reason for Departure', fontsize=14)
ax.set_ylabel('Count of Students Departing', fontsize=14)
ax.set_title('Number of Students Departing per Reason', fontsize=16)

plt.tight_layout()
plt.savefig('figures/Reason_for_departure.pdf', dpi=600)
plt.show()

In [None]:
# Plot how many times individual students appear in the student register.

# Rows per student. size() counts every register row; counting the non-null
# 'slutdato' values instead would skip rows that have no end date.
app = (register.groupby('elev_id', sort=False)
               .size()
               .reset_index(name='Apperances'))

# Number of students for each appearance count. The default sort=True orders
# the counts ascending, which is the order the bars should be drawn in.
app = (app.groupby('Apperances')
          .agg(**{'Students': ('elev_id', 'count')})
          .reset_index())

# Combine values of 5 or above into a single category.
app['Apperances'] = app['Apperances'].apply(lambda n: '>= 5' if n >= 5 else str(n))

# Sum the students per category; sort=False keeps the numeric order from above
# (sorting the string labels would not be reliable).
grouped_app = app.groupby('Apperances', sort=False).agg({'Students': 'sum'}).reset_index()

# Create plot
fig, ax = plt.subplots()

bar_color = 'steelblue'
ax.bar(grouped_app['Apperances'], grouped_app['Students'], color=bar_color)

ax.set_xlabel('Number of times present in the student register')
ax.set_ylabel('Count of students with number of appearances')
ax.set_title('How many times do different students appear in the student register')

plt.tight_layout()
plt.savefig('figures/students_appereances_register.pdf')
plt.show()


In [None]:
# Compare how many students start and how many finish in each year.

# Students per start year, from 2008 onwards.
fun = (elever.groupby('start_year', sort=False)
             .agg(students=('elev_id', 'count'))
             .reset_index()
             .sort_values('start_year')
             .query('start_year>2007')
       )

# Calendar year of each student's end date.
elever['final_year'] = elever['slutdato'].dt.year

# Students per final year, restricted to 2011-2023.
non = (elever.groupby('final_year', sort=False)
             .agg(students=('elev_id', 'count'))
             .reset_index()
             .query('final_year < 2024 & final_year > 2010')
       )

fig, ax = plt.subplots(figsize=(16, 8))
bar_color = 'steelblue'
bar_width = 0.4

# Put the two bars of a year side by side, centred on that year's tick:
# start-year bar to the left of the tick, final-year bar to the right.
ax.bar(fun['start_year'] - bar_width / 2, fun['students'],
       width=bar_width, label='Start Year', color=bar_color)
ax.bar(non['final_year'] + bar_width / 2, non['students'],
       width=bar_width, label='Final Year')

# One tick per whole year that is plotted (automatic ticks may land on
# fractional years).
years = sorted({int(v) for v in fun['start_year']} | {int(v) for v in non['final_year']})
ax.set_xticks(years)

ax.set_xlabel('Year')
ax.set_ylabel('Count of students with start year or finish year')
ax.set_title('Start year and final year for each student')
ax.legend()

plt.tight_layout()
plt.savefig('figures/start_end_year.pdf')
plt.show()

In [None]:
# Load the student/group table, used to see how many students each group has.
filepath = '../data/macom_elevhold.sas7bdat'
df = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')
hold = df.drop(columns=['ElevInternId'])

# Keep only students, register rows and groups whose start date lies in
# `year` or later; rows without a start date are dropped as well.
year = 2011

elever = elever.loc[elever['startdato'].dt.year.ge(year)]
register = register.loc[register['startdato'].dt.year.ge(year)]
hold = hold.loc[hold['StartDato'].dt.year.ge(year)]

In [None]:
# Load the lesson/group table; it is used further down to keep only the groups
# that actually have a lesson.
# NOTE: read_pickle can execute arbitrary code - only load files you trust.
lh = pd.read_pickle('../data/lektionhold.pkl')

# String version of the group number, in a column named like the one in `hold`.
lh = lh.assign(holdnr=lh['hold_nr'].astype(str))

In [None]:
# Histogram of the number of students per group, for groups that have lessons.

# Group numbers as strings so they can be matched against lh['holdnr'].
# assign() returns a new frame: `hold` is a filtered selection, and writing a
# column into it in place would raise SettingWithCopyWarning.
hold = hold.assign(holdnr=hold['holdnr'].astype(str))

# Number of distinct students in each group.
fun = (hold.groupby('holdnr', sort=False)
           .agg(students=('elev_id', 'nunique'))
           .reset_index())

# Keep only the groups that are present in the lessons table.
test = fun[fun['holdnr'].isin(lh['holdnr'])]
group_sizes = test['students']

fig, ax = plt.subplots(figsize=(16, 8))

# One bin per group size, with bin edges 0..59. Groups larger than 59 students
# are not drawn, but they still count towards the statistics in the title.
n, bins, patches = ax.hist(group_sizes, bins=range(0, 60), color='steelblue')

# Local maxima of the bin counts that are at least one bin wide ...
peaks, _ = find_peaks(n, width=1)

# ... of which the two highest are marked in the figure.
top_peaks = sorted(peaks, key=lambda i: n[i], reverse=True)[:2]

for peak in top_peaks:
    left_edge = bins[peak]  # left edge of the bin = the group size it holds
    ax.axvline(x=left_edge, color='red', linestyle='--')
    # int() so the label reads e.g. "28" whatever dtype the bin edges have.
    ax.annotate(f'{int(left_edge)}', xy=(left_edge, 0), xytext=(5, 5), textcoords='offset points')

ax.set_xlabel('Number of students in a group')
ax.set_ylabel('Count of groups having a specific number of students')
ax.set_title('Number of students per group: Mean={:.2f}, Median={:.2f}, Standard Deviation={:.2f}'.format(
    group_sizes.mean(), group_sizes.median(), group_sizes.std()))

# Save the filtered data as a pickle file
test.to_pickle('figures/students_pr_group.pkl')

# Save the figure as a PDF file, then display it
plt.tight_layout()
plt.savefig('figures/students_pr_group.pdf')
plt.show()
