### Here we calculate the Euclidean Distance Matrix

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf

import matplotlib as mpl

# Start from the ggplot base style, then layer the project-wide overrides on top.
mpl.style.use('ggplot')

# plt.rcParams and mpl.rcParams are the same object, so all overrides are
# applied through a single update call.
mpl.rcParams.update({
    # Fonts and text sizes
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 13,
    'axes.titlesize': 16,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'axes.titleweight': 'bold',
    # Hide the top and right axes borders
    'axes.spines.top': False,
    'axes.spines.right': False,
    # Default colour cycle
    'axes.prop_cycle': plt.cycler('color', plt.cm.Set1.colors),
    # Figure geometry, resolution and background
    'figure.figsize': [10, 6],
    'figure.dpi': 200,
    'savefig.dpi': 300,
    'figure.facecolor': '#f8f8f8',
})

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# Load the group-level regression data set
group_regression_path = '../df/group_regression.pkl'
students = pd.read_pickle(group_regression_path)

In [None]:
# Keep only rows that have teacher-change information and whose group has
# more than 8 and fewer than 36 students. The explicit copy makes df
# independent of students, so the column edits below work on a real frame
# rather than on a filtered slice (which raises SettingWithCopyWarning).
has_teacher_info = students['teacher_changes'].notna()
realistic_size = (students['students'] > 8) & (students['students'] < 36)
df = students[has_teacher_info & realistic_size].copy()

# Turn the grade type into a binary exam flag: EKS/IPR -> 1, ÅRS/STA -> 0.
# Values outside the mapping are left as their string form by replace().
df['exam'] = df['KarakterType'].astype(str).replace({'ÅRS': 0, 'STA': 0, 'EKS': 1, 'IPR': 1})
df = df.drop(columns='KarakterType')

In [None]:
# One indicator column per evaluation form (values are cast to str first, so a
# missing value gets its own 'nan' column), with the known codes renamed.
evaluation_form = df['EvaleringsForm'].astype(str)
dummy_cols = pd.get_dummies(evaluation_form).rename(
    columns={'MDT': 'Oral', 'SKR': 'Written', 'SAM': 'Colab'}
)

# Append the indicators and discard the original evaluation form column
df = pd.concat([df, dummy_cols], axis=1).drop(columns=['EvaleringsForm'])

# Missing reasons are filled with the most frequent one (there are basically
# no NaN values), then the reason is mapped to a binary flag: 1 = mandatory.
most_common_reason = df['aarsag'].value_counts().index[0]
reason = df['aarsag'].fillna(most_common_reason).astype(str)
df['mandatory'] = reason.replace({'FRIVALG': 0, 'OBLIFAG': 1,'OBLFAG':1, 'STUDRET': 1})
df = df.drop(columns=['aarsag'])

In [None]:
# The subject level is one-hot encoded because no linear relationship between
# levels is assumed.
one_hot_encoded = pd.get_dummies(df['fag_niveau'], prefix='level')

# Append the level indicators, give the '-' level a readable name and discard
# the original column.
df = (
    pd.concat([df, one_hot_encoded], axis=1)
    .rename(columns={'level_-': 'level_none'})
    .drop(columns=['fag_niveau'])
)

# Log-transform the teacher changes; the +1 keeps a count of zero defined
df['teacher_changes'] = np.log(df['teacher_changes'] + 1)

In [None]:
import math
from scipy.stats import zscore 

# Work on an independent copy so that df itself stays untouched
new_df = df.copy(deep=True)

#Define function to remove outliers for certain columns
def z_score_normalize(df, column_names, threshold):
    """Drop rows whose value in any listed column is a z-score outlier.

    Despite the name, no values are rescaled: the function only filters rows.
    Columns are processed one after another, so the z-scores of each column
    are computed on the rows that survived the previous columns.

    Args:
        df: DataFrame to filter. It is not modified.
        column_names: Iterable of numeric column labels to screen.
        threshold: A row is dropped when ``abs(z) > threshold`` in a screened
            column.

    Returns:
        A DataFrame without the outlier rows. Rows whose value is missing in
        a screened column are kept.
    """
    for column_name in column_names:
        values = df[column_name].astype(float).to_numpy()

        # nan_policy='omit' leaves missing values out of the mean/std. With
        # the default 'propagate', a single NaN turns every z-score into NaN
        # and the column would silently not be filtered at all.
        z_scores = zscore(values, nan_policy='omit')

        # NaN z-scores compare as False, so rows with missing values stay.
        is_outlier = abs(z_scores) > threshold

        # Select by position rather than dropping by index label: dropping by
        # label would also remove non-outlier rows that share a label. The
        # copy hands the caller an independent frame that is safe to modify.
        df = df.loc[~is_outlier].copy()

    return df

# Columns screened for outliers, and the |z| cut-off applied to each of them
columns_to_normalize = [
    'income_father',
    'income_mother',
    'avg_hours',
    'sum_hours',
    'percent',
    'income_mothers',
    'income_fathers',
    'age',
]
outlier_threshold = 2
new_df = z_score_normalize(new_df, columns_to_normalize, outlier_threshold)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns that are rescaled to a common range
cols_to_normalize = [
    'avg_hours', 'avg_start', 'income_mother', 'income_father',
    'edu_mothers', 'edu_fathers', 'edu_level_mother', 'edu_level_father',
    'avg_grade', 'group_grade', 'percent', 'sum_hours',
    'students_avg_grde', 'income_mothers', 'income_fathers', 'age',
]

# Lower and upper bound of the target range
min_value, max_value = -3, 3

# Fit the scaler on the selected columns and overwrite them with the scaled values
scaler = MinMaxScaler(feature_range=(min_value, max_value))
new_df[cols_to_normalize] = scaler.fit_transform(new_df[cols_to_normalize])

# Split by the gender code (0 / 1)
male = new_df[new_df['gender'] == 0]
female = new_df[new_df['gender'] == 1]

In [None]:
#First pair calculation, student pairs without average grade matching

from sklearn.metrics.pairwise import pairwise_distances

# Largest Euclidean distance at which two students are still kept as a pair
max_distance = 0.5

# Keep only the rows that belong to step 3
student_df = new_df[new_df['step'] == 3]

# Features used for the distance
columns_of_interest = ['edu_level_mother', 'gender', 'income_mother', 'age', 'income_father', 'edu_level_father']

# One row per student, indexed by elev_id. A new frame is built instead of
# mutating the column slice in place, which raises SettingWithCopyWarning.
sub_df = (
    student_df[['elev_id'] + columns_of_interest]
    .drop_duplicates(subset='elev_id')
    .set_index('elev_id')
)

# Distances are computed separately per edu_level_mother value to save memory:
# only students sharing that value are compared with each other.
edu_levels = sub_df['edu_level_mother'].unique()
result_df_list = []
for i, edu_level in enumerate(edu_levels):
    print(f'Processing edu_level_mother {edu_level}. {i+1}/{len(edu_levels)}')

    # Students with the current edu_level_mother
    edu_level_df = sub_df[sub_df['edu_level_mother'] == edu_level]

    # Square matrix of pairwise Euclidean distances
    print('  Calculating distance matrix...')
    dist_matrix = pairwise_distances(edu_level_df[columns_of_interest], metric='euclidean')

    # Positions (in row-major order) of all entries within the cut-off, minus
    # the diagonal, where a student would be paired with themselves. This
    # replaces building one small DataFrame per student.
    print('  Creating pairs DataFrame...')
    row_pos, col_pos = np.nonzero(dist_matrix <= max_distance)
    off_diagonal = row_pos != col_pos
    row_pos, col_pos = row_pos[off_diagonal], col_pos[off_diagonal]
    student_ids = edu_level_df.index

    # Long-format table of (student, neighbour, distance), closest pairs first
    print('  Concatenating pairs DataFrames...')
    edu_level_df_pairs = pd.DataFrame({
        'elev_id': student_ids.take(row_pos),
        'neighbour': student_ids.take(col_pos),
        'distance': dist_matrix[row_pos, col_pos],
    })
    edu_level_df_pairs.sort_values('distance', inplace=True)

    result_df_list.append(edu_level_df_pairs)

# Stack the per-level tables into one result
no_grade = pd.concat(result_df_list, ignore_index=True)

In [None]:
# Persist the pairs that were matched without the average grade
pd.to_pickle(no_grade, '../distance/no_grade.pkl')

In [None]:
#Second pair calculation, student pairs with average grade, step 3

from sklearn.metrics.pairwise import pairwise_distances

# Largest Euclidean distance at which two students are still kept as a pair
max_distance = 0.5

# Keep only the rows that belong to step 3
student_df = new_df[new_df['step'] == 3]

# Features used for the distance (this time including the average grade)
columns_of_interest = ['edu_level_mother', 'gender', 'income_mother', 'age', 'income_father', 'edu_level_father', 'avg_grade']

# One row per student, indexed by elev_id. A new frame is built instead of
# mutating the column slice in place, which raises SettingWithCopyWarning.
sub_df = (
    student_df[['elev_id'] + columns_of_interest]
    .drop_duplicates(subset='elev_id')
    .set_index('elev_id')
)

# Distances are computed separately per edu_level_mother value: only students
# sharing that value are compared with each other.
edu_levels = sub_df['edu_level_mother'].unique()
result_df_list = []
for i, edu_level in enumerate(edu_levels):
    print(f'Processing edu_level_mother {edu_level}. {i+1}/{len(edu_levels)}')

    # Students with the current edu_level_mother
    edu_level_df = sub_df[sub_df['edu_level_mother'] == edu_level]

    # Square matrix of pairwise Euclidean distances
    print('  Calculating distance matrix...')
    dist_matrix = pairwise_distances(edu_level_df[columns_of_interest], metric='euclidean')

    # Positions (in row-major order) of all entries within the cut-off, minus
    # the diagonal, where a student would be paired with themselves. This
    # replaces building one small DataFrame per student.
    print('  Creating pairs DataFrame...')
    row_pos, col_pos = np.nonzero(dist_matrix <= max_distance)
    off_diagonal = row_pos != col_pos
    row_pos, col_pos = row_pos[off_diagonal], col_pos[off_diagonal]
    student_ids = edu_level_df.index

    # Long-format table of (student, neighbour, distance), closest pairs first
    print('  Concatenating pairs DataFrames...')
    edu_level_df_pairs = pd.DataFrame({
        'elev_id': student_ids.take(row_pos),
        'neighbour': student_ids.take(col_pos),
        'distance': dist_matrix[row_pos, col_pos],
    })
    edu_level_df_pairs.sort_values('distance', inplace=True)

    result_df_list.append(edu_level_df_pairs)

# Stack the per-level tables into one result
result_df3 = pd.concat(result_df_list, ignore_index=True)


In [None]:
#Third pair calculation, student pairs with average grade, step 2
from sklearn.metrics.pairwise import pairwise_distances

# Largest Euclidean distance at which two students are still kept as a pair
max_distance = 0.5

# Keep only the rows that belong to step 2
student_df = new_df[new_df['step'] == 2]

# Features used for the distance (including the average grade)
columns_of_interest = ['edu_level_mother', 'gender', 'income_mother', 'age', 'income_father', 'edu_level_father', 'avg_grade']

# One row per student, indexed by elev_id. A new frame is built instead of
# mutating the column slice in place, which raises SettingWithCopyWarning.
sub_df = (
    student_df[['elev_id'] + columns_of_interest]
    .drop_duplicates(subset='elev_id')
    .set_index('elev_id')
)

# Distances are computed separately per edu_level_mother value: only students
# sharing that value are compared with each other.
edu_levels = sub_df['edu_level_mother'].unique()
result_df_list = []
for i, edu_level in enumerate(edu_levels):
    print(f'Processing edu_level_mother {edu_level}. {i+1}/{len(edu_levels)}')

    # Students with the current edu_level_mother
    edu_level_df = sub_df[sub_df['edu_level_mother'] == edu_level]

    # Square matrix of pairwise Euclidean distances
    print('  Calculating distance matrix...')
    dist_matrix = pairwise_distances(edu_level_df[columns_of_interest], metric='euclidean')

    # Positions (in row-major order) of all entries within the cut-off, minus
    # the diagonal, where a student would be paired with themselves. This
    # replaces building one small DataFrame per student.
    print('  Creating pairs DataFrame...')
    row_pos, col_pos = np.nonzero(dist_matrix <= max_distance)
    off_diagonal = row_pos != col_pos
    row_pos, col_pos = row_pos[off_diagonal], col_pos[off_diagonal]
    student_ids = edu_level_df.index

    # Long-format table of (student, neighbour, distance), closest pairs first
    print('  Concatenating pairs DataFrames...')
    edu_level_df_pairs = pd.DataFrame({
        'elev_id': student_ids.take(row_pos),
        'neighbour': student_ids.take(col_pos),
        'distance': dist_matrix[row_pos, col_pos],
    })
    edu_level_df_pairs.sort_values('distance', inplace=True)

    result_df_list.append(edu_level_df_pairs)

# Stack the per-level tables into one result
result_df2 = pd.concat(result_df_list, ignore_index=True)

In [None]:
#Fourth pair calculation, student pairs with average grade, step 1

from sklearn.metrics.pairwise import pairwise_distances

# Largest Euclidean distance at which two students are still kept as a pair
max_distance = 0.5

# Keep only the rows that belong to step 1
student_df = new_df[new_df['step'] == 1]

# Features used for the distance (including the average grade)
columns_of_interest = ['edu_level_mother', 'gender', 'income_mother', 'age', 'income_father', 'edu_level_father', 'avg_grade']

# One row per student, indexed by elev_id. A new frame is built instead of
# mutating the column slice in place, which raises SettingWithCopyWarning.
sub_df = (
    student_df[['elev_id'] + columns_of_interest]
    .drop_duplicates(subset='elev_id')
    .set_index('elev_id')
)

# Distances are computed separately per edu_level_mother value: only students
# sharing that value are compared with each other.
edu_levels = sub_df['edu_level_mother'].unique()
result_df_list = []
for i, edu_level in enumerate(edu_levels):
    print(f'Processing edu_level_mother {edu_level}. {i+1}/{len(edu_levels)}')

    # Students with the current edu_level_mother
    edu_level_df = sub_df[sub_df['edu_level_mother'] == edu_level]

    # Square matrix of pairwise Euclidean distances
    print('  Calculating distance matrix...')
    dist_matrix = pairwise_distances(edu_level_df[columns_of_interest], metric='euclidean')

    # Positions (in row-major order) of all entries within the cut-off, minus
    # the diagonal, where a student would be paired with themselves. This
    # replaces building one small DataFrame per student.
    print('  Creating pairs DataFrame...')
    row_pos, col_pos = np.nonzero(dist_matrix <= max_distance)
    off_diagonal = row_pos != col_pos
    row_pos, col_pos = row_pos[off_diagonal], col_pos[off_diagonal]
    student_ids = edu_level_df.index

    # Long-format table of (student, neighbour, distance), closest pairs first
    print('  Concatenating pairs DataFrames...')
    edu_level_df_pairs = pd.DataFrame({
        'elev_id': student_ids.take(row_pos),
        'neighbour': student_ids.take(col_pos),
        'distance': dist_matrix[row_pos, col_pos],
    })
    edu_level_df_pairs.sort_values('distance', inplace=True)

    result_df_list.append(edu_level_df_pairs)

# Stack the per-level tables into one result
result_df1 = pd.concat(result_df_list, ignore_index=True)


In [None]:
# Persist the pair tables of steps 1-3 for the Regression Matched Pair Analysis
for pairs_frame, target_path in (
    (result_df1, '../distance/results1.pkl'),
    (result_df2, '../distance/results2.pkl'),
    (result_df3, '../distance/results3.pkl'),
):
    pairs_frame.to_pickle(target_path)

In [None]:
# Persist the filtered and scaled frame that the pair tables were built from
pd.to_pickle(new_df, '../distance/gen_df.pkl')