### Show the number of grades students received per school year
Also save the cleaned student register

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sas7bdat
import seaborn as sns


import matplotlib as mpl

# Global plotting style.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*', so
# use whichever whitegrid name this installation actually provides. If neither
# is listed, fall back to the original name so the failure stays visible.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'seaborn-whitegrid',
)
mpl.style.use(_whitegrid_style)

# Fonts, axes and colour cycle.
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.size'] = 14
mpl.rcParams['axes.labelsize'] = 13
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['xtick.labelsize'] = 11
mpl.rcParams['ytick.labelsize'] = 11
mpl.rcParams['axes.titleweight'] = 'bold'
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.prop_cycle'] = plt.cycler('color', plt.cm.Set1.colors)

# Figure size and resolution (on screen and when saved).
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['figure.dpi'] = 200  # was the invalid literal `200f`
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.facecolor'] = '#f8f8f8'

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load the student register from its SAS file (text decoded as ISO-8859-1).
filepath = '../data/elevregister.sas7bdat'
df = pd.read_sas(filepath, format='sas7bdat', encoding='iso-8859-1')

# Drop columns that are not used in the rest of the notebook.
df = df.drop(columns=['UDD', 'UFORM', 'AUDD', 'KOMP'])

# Rename to shorter, more descriptive column names.
df = df.rename(columns={
    'ELEV3_VFRA': 'startdato',
    'ELEV3_VTIL': 'slutdato',
    'UDEL': 'step',
    'AFG_ART': 'depart_reason',
    'TILG_ART': 'join_reason',
})

# Translate the raw step codes to a year number (1-3). Codes that are not
# listed here become NaN. The lookup is deliberately not called `map`, which
# would shadow the builtin for every later cell.
step_to_year = {
    21: 1,
    22: 2,
    23: 3,
}

df['step'] = df['step'].map(step_to_year)

In [None]:
# Read the grades table from its SAS file (text decoded as ISO-8859-1).
filepath = '../data/karakterer.sas7bdat'
grade = pd.read_sas(
    filepath,
    format='sas7bdat',
    encoding='iso-8859-1',
)

In [None]:
# Restrict the register to students whose earliest start date falls in 2011
# or later, and to rows starting no later than 2020.
df['depart_reason'] = df['depart_reason'].astype(int)  # depart_reason (AFG_ART) as int

# Earliest start date per student, indexed by elev_id.
earliest_start = df.groupby('elev_id', sort=False)['startdato'].min()
cohort_ids = earliest_start.index[earliest_start.dt.year >= 2011]

# Rows of students in the cohort ...
in_cohort = df['elev_id'].isin(cohort_ids)
# ... that start no later than 2020 (later rows are dropped because of data
# quality issues).
starts_by_2020 = df['startdato'].dt.year <= 2020

df = df[in_cohort & starts_by_2020]

# Persist the cleaned register.
df.to_pickle('../data/clean_reg.pkl')

In [None]:
# Make sure the register end date is a datetime. `assign` builds a new frame
# instead of writing into `df`, which is the result of a boolean filter.
df = df.assign(slutdato=pd.to_datetime(df['slutdato']))

# Keep only the grade columns used below. The explicit copies make the column
# assignments write to an independent frame rather than to a slice of `grade`
# (avoids SettingWithCopyWarning / silently lost writes).
new_grade = grade[['elev_id', 'Fra', 'Karakter', 'FagNiveau', 'FagNummer']].copy()

# Work on the text form of the date so malformed values can be filtered out.
new_grade['Fra'] = new_grade['Fra'].astype(str)

# Keep values that start with a four-digit year 2000-2999. Raw string, so
# `\d` reaches the regex engine without an invalid-escape warning.
new_grade = new_grade[new_grade['Fra'].str.match(r'^2\d{3}')].copy()

new_grade['Fra'] = pd.to_datetime(new_grade['Fra'])  # back to datetime
new_grade['year'] = new_grade['Fra'].dt.year         # calendar year of the grade

# Only grades from 2011 onwards.
new_grade = new_grade[new_grade['year'] >= 2011]

In [None]:
# Attach every grade record to the register rows of the same student.
merged = df.merge(new_grade, how='inner', left_on='elev_id', right_on='elev_id')

# A grade is kept when its date lies within [startdato, slutdato], both ends
# included.
on_or_after_start = merged['Fra'] >= merged['startdato']
on_or_before_end = merged['Fra'] <= merged['slutdato']
mask = on_or_after_start & on_or_before_end

merge = merged[mask]

# Persist the register joined with its grades.
merge.to_pickle('../df/register_grade.pkl')

In [None]:
# Prepare a weighted grade average: every course level gets a weight, and
# only courses at level A, B or C are kept.
level_weights = {'A': 2, 'B': 1.5, 'C': 1}

# `.copy()` because `merge` is itself a filtered frame; assigning a new column
# to an uncopied selection of it triggers SettingWithCopyWarning.
new_df = merge.loc[merge['FagNiveau'].isin(list(level_weights))].copy()

# Weight per row, looked up from the course level (float column).
new_df['weight'] = new_df['FagNiveau'].map(level_weights).astype(float)

In [None]:
# Keep only rows whose grade is one of the listed values and store the grade
# as an integer.
# NOTE(review): a numeric Karakter column would stringify as e.g. '7.0' and
# match nothing here, and zero-padded codes such as '02' are not in the list —
# confirm the raw format of Karakter.
valid_grades = ["-3", "0", "2", "4", "7", "10", "12"]

# Compare on the text form so non-grade entries are simply filtered out.
karakter_text = new_df['Karakter'].astype(str)
is_valid = karakter_text.isin(valid_grades)

# `assign` on the filtered rows builds a new frame, so no value is written
# into a slice of the previous `new_df`.
new_df = new_df.loc[is_valid].assign(Karakter=karakter_text[is_valid].astype(int))

new_df.to_pickle('../df/students_year_grade.pkl')

In [None]:
# Weighted average grade for each elev_id, startdato, slutdato and step:
#
#     avg_grade = sum(Karakter * weight) / sum(weight)
#
# This replaces the earlier approach of repeating each grade `weight` times
# with np.repeat and taking a plain mean. Repeat counts have to be integers,
# so the fractional weight 1.5 could not be represented that way.
group_keys = ['elev_id', 'startdato', 'slutdato', 'step']

# Grade multiplied by its weight, one value per row.
weighted = new_df.assign(weighted_grade=new_df['Karakter'] * new_df['weight'])

# Per group: total of the weighted grades and total of the weights.
totals = (weighted.groupby(group_keys, sort=False)[['weighted_grade', 'weight']]
                  .sum())

# Groups without any positive weight have no defined average; skip them
# (the repeat-based version produced no row for them either).
totals = totals[totals['weight'] > 0]

w_avg = ((totals['weighted_grade'] / totals['weight'])
         .rename('avg_grade')
         .reset_index())


In [None]:
# Number of grades per student, start date and step.
count_keys = ['elev_id', 'startdato', 'step']
test = (new_df.groupby(count_keys, sort=False)
              .agg(grades=('Karakter', 'count'))
              .reset_index())

# Add the grade count to the weighted averages.
w_avg = w_avg.merge(test, on=count_keys, how='left')

# Only rows with at most 22 grades are plotted.
w_avg1 = w_avg[w_avg['grades'] <= 22]

# One histogram of grade counts per step (year), drawn on shared axes.
fig, ax = plt.subplots(figsize=(12, 6))

label_template = ('Grades given for year {} students: '
                  'Mean={:.2f}, Median={:.2f}, Standard Deviation={:.2f}')
step_colours = ((1, 'darkslategray'), (2, 'darkcyan'), (3, 'turquoise'))

for step_year, colour in step_colours:
    counts = w_avg1.loc[w_avg1['step'] == step_year, 'grades']
    summary = counts.describe()  # provides 'mean', '50%' (median) and 'std'
    sns.histplot(
        counts,
        color=colour,
        label=label_template.format(step_year, summary['mean'],
                                    summary['50%'], summary['std']),
        bins=19,
        ax=ax,
    )

ax.set_xlabel('Grades awarded in a grade-level')
ax.set_ylabel('Count of students with given grades received')
ax.legend()
plt.tight_layout()

# Save the (unfiltered) table and the figure, then display the figure.
w_avg.to_pickle('figures/grades_steps.pkl')
plt.savefig('figures/grades_steps.pdf')

plt.show()
