### Results from the student pair analysis 

The dataframes used in this notebook are created in the "Prep Group Grade" and "Prep Lecture Grade" files.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

import matplotlib as mpl

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer ship the old names, so use whichever name this
# installation provides (if neither exists the default style stays active).
for _style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style in mpl.style.available:
        mpl.style.use(_style)
        break

# Typography and axes appearance shared by every figure in the notebook
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.size'] = 14
mpl.rcParams['axes.labelsize'] = 13
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['xtick.labelsize'] = 11
mpl.rcParams['ytick.labelsize'] = 11
mpl.rcParams['axes.titleweight'] = 'bold'
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.prop_cycle'] = plt.cycler('color', plt.cm.Set1.colors)

# Figure size / resolution for on-screen display and for saved files
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['figure.dpi'] = 200
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.facecolor'] = '#f8f8f8'

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# General dataframe with information on the students
gen_df = pd.read_pickle('../distance/gen_df.pkl')
# Row-wise maximum of the mother's and father's education level
gen_df['highest_edu'] = gen_df.loc[:, ["edu_level_mother", "edu_level_father"]].max(axis=1)

In [None]:
# Dataframe holding the diversity scores
div_df = pd.read_pickle(
    '../distance/diversity_info.pkl'
)

In [None]:
# Matched pairs for year 1, restricted to pairs with distance below 0.1
sim_first = (
    pd.read_pickle('../distance/results1.pkl')
    .loc[lambda pairs: pairs['distance'] < 0.1]
)

In [None]:
# Matched pairs loaded from results3.pkl, restricted to pairs with distance below 0.5
# NOTE(review): the file and variable names suggest year 3 rather than year 2 - confirm
sim_third = (
    pd.read_pickle('../distance/results3.pkl')
    .loc[lambda pairs: pairs['distance'] < 0.5]
)

In [None]:
# tqdm only draws the progress bar; fall back to a no-op wrapper when it is
# not installed so the de-duplication itself still runs.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable=None, **kwargs):
        """Stand-in for tqdm.tqdm: hand back the iterable without a progress bar."""
        return iterable


def remove_duplicate_identifiers(df, target):
    """Greedily keep pairs so that every student appears in at most one kept pair.

    Rows are ordered by ``target`` (largest first) and reduced to one row per
    ``elev_id``. They are then visited in that order; a row is kept only when
    neither its ``elev_id`` nor its ``neighbour`` already belongs to a kept row.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with at least the columns ``elev_id``, ``neighbour`` and
        ``target``. It is not modified.
    target : str
        Column used to rank the pairs; higher values are preferred.

    Returns
    -------
    pandas.DataFrame
        The kept rows, in descending ``target`` order, with their original
        index labels and all original columns (also when the input is empty).
    """
    # sort_values returns a new frame, so the caller's DataFrame is left untouched
    ranked = (
        df.sort_values(target, ascending=False)
          .drop_duplicates(subset=['elev_id'], keep='first')
    )

    used_ids = set()
    keep = []

    # Iterating the two id columns directly avoids building a Series per row
    id_pairs = zip(ranked['elev_id'], ranked['neighbour'])
    for elev_id, neighbour in tqdm(id_pairs, total=len(ranked),
                                   desc="Processing Rows", unit="row"):
        is_new_pair = elev_id not in used_ids and neighbour not in used_ids
        if is_new_pair:
            used_ids.update((elev_id, neighbour))
        keep.append(is_new_pair)

    # Explicit bool dtype: an empty list would otherwise become a float array,
    # which pandas does not treat as a row mask
    return ranked.loc[np.array(keep, dtype=bool)]

### First experiment
Test whether teaching competencies have an impact on student performance.

In [None]:
#Method to calculate the standard error of a ratio of two means
def calculate_ratio_se(data_x, data_y):
    """Return the standard error of ``mean(data_x) / mean(data_y)``.

    Uses the first-order delta-method approximation and treats the two
    samples as independent (the covariance term is ignored):

        SE(R)^2 = SE(mean_x)^2 / mean_y^2 + mean_x^2 * SE(mean_y)^2 / mean_y^4

    where ``SE(mean)^2 = var / n`` and ``var`` is the population variance
    (``ddof=0``).

    Parameters
    ----------
    data_x, data_y : array-like of numbers
        Numerator and denominator samples. NaN entries are dropped before
        any statistic is computed.

    Returns
    -------
    float
        The standard error, or NaN when either sample has no valid values
        or the denominator mean is zero (the ratio is then undefined).
    """
    x = np.asarray(data_x, dtype=float)
    y = np.asarray(data_y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    if x.size == 0 or y.size == 0:
        return np.nan

    mean_x = x.mean()
    mean_y = y.mean()
    if mean_y == 0:
        return np.nan

    # Squared standard errors of the two sample means
    se_sq_x = x.var() / x.size
    se_sq_y = y.var() / y.size

    # Written in this form so a zero numerator mean is handled without division by it
    se = np.sqrt(se_sq_x / mean_y**2 + (mean_x**2) * se_sq_y / mean_y**4)

    return se

In [None]:
#Define function for evaluating the paired difference in a dependent variable
def eval_regression_grade(df,col_x,col_y,dependent_x,dependent_y,threshold):
    """Paired t-test of a dependent variable across matched student pairs.

    Each row of ``df`` is one pair. Rows are first re-ordered so that the
    ``*_x`` member has the larger value of the treatment column, then only
    pairs whose treatment gap exceeds ``threshold`` (in units of the standard
    deviation of ``col_x``) are tested.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``col_x``, ``col_y``, ``dependent_x``,
        ``dependent_y``, ``elev_id`` and ``neighbour``. It is not modified.
    col_x, col_y : str
        Treatment columns for the two members of a pair.
    dependent_x, dependent_y : str
        Outcome columns for the two members of a pair.
    threshold : float
        Minimum ``|col_x - col_y| / std(col_x)`` for a pair to be included.

    Returns
    -------
    tuple
        ``(p_value, ratio, ratio_error)``: the paired t-test p-value, the
        ratio ``mean(dependent_x) / mean(dependent_y)`` and the value returned
        by ``calculate_ratio_se`` for the two outcome columns, all computed on
        the filtered pairs. A summary is printed as a side effect.
    """
    df = df.copy()
    # Spread of the treatment column as passed in (before any re-ordering);
    # it is the unit in which the threshold is expressed
    std_diff = np.std(df[col_x])

    # Rows where the *_y member has the larger treatment value get flipped.
    # Only the outcome, treatment and id columns are swapped; any other
    # *_x / *_y columns in df keep their original side.
    swap = df[col_x] < df[col_y]
    for left, right in ((dependent_x, dependent_y),
                        (col_x, col_y),
                        ('elev_id', 'neighbour')):
        df.loc[swap, [left, right]] = df.loc[swap, [right, left]].values

    # Keep only the pairs whose standardized treatment gap exceeds the threshold
    df = df[np.abs(df[col_x] - df[col_y]) / std_diff > threshold]

    # All reported statistics refer to the filtered pairs, including the
    # mean difference, so it matches the t-test and the ratio
    mean_x = df[dependent_x].mean()
    mean_y = df[dependent_y].mean()
    mean_diff = mean_x - mean_y
    ratio = mean_x / mean_y

    # Perform the paired t-test
    t_stat, p_value = stats.ttest_rel(df[dependent_x], df[dependent_y])
    print('\n _______________________________________________________ \n')
    print('Standard Deviation: ', std_diff)
    print('Number of comparisons', len(df))
    print('t-statistic:', t_stat)
    print('p-value:', p_value)
    print('Mean difference in Dependent: ', mean_diff)
    print('Ratio: ',ratio)
    ratio_error = calculate_ratio_se(df[dependent_x],df[dependent_y])
    print('Ratio se: ',ratio_error)

    return p_value,ratio,ratio_error

In [None]:
#Function to iterate the eval regression function and create plots for low and high parental education 
def iterate_thresholds(df, col_x, col_y, dependent_x, dependent_y,filename):
    """Plot the outcome ratio against an increasing treatment threshold.

    The pairs are split by the sign of ``highest_edu_x``. Starting at a
    threshold of 0.2 and moving in steps of 0.2, ``eval_regression_grade`` is
    run for each group; a group is evaluated until its first p-value that is
    not <= 0.05, and only thresholds with p <= 0.05 are plotted. The figure is
    saved as ``figures/<filename>.pdf`` and shown.
    """
    alpha = 0.05
    step = 0.2

    # Segment the pairs into high and low parental education
    subsets = {
        'high': df[df['highest_edu_x'] > 0],
        'low': df[df['highest_edu_x'] < 0],
    }
    latest_p = {'high': 0, 'low': 0}
    kept = {name: {'thresholds': [], 'ratios': [], 'errors': []} for name in subsets}

    threshold = step
    while latest_p['high'] <= alpha or latest_p['low'] <= alpha:
        print("The threshold is == __________",threshold,"_____________________")
        for name in ('high', 'low'):
            # Once a group produced a p-value that is not <= alpha it is no longer evaluated
            if not latest_p[name] <= alpha:
                continue
            latest_p[name], ratio, ratio_error = eval_regression_grade(
                subsets[name], col_x, col_y, dependent_x, dependent_y, threshold)
            if latest_p[name] <= alpha:
                kept[name]['thresholds'].append(threshold)
                kept[name]['ratios'].append(ratio)
                kept[name]['errors'].append(ratio_error)

        threshold += step

    high, low = kept['high'], kept['low']

    # Plain lines for both groups
    plt.plot(high['thresholds'], high['ratios'], label='High Parental Education')
    plt.plot(low['thresholds'], low['ratios'], label='Low Parental Education')

    # Same curves again, now with error bars and fixed colors
    plt.errorbar(high['thresholds'], high['ratios'], yerr=high['errors'], label=' ', color='blue')
    plt.errorbar(low['thresholds'], low['ratios'], yerr=low['errors'], label=' ', color='orange')

    # The legend is built from proxy lines that match the error-bar colors
    handles = [
        plt.Line2D([], [], color='blue', label='High Parental Education (with error bars)'),
        plt.Line2D([], [], color='orange', label='Low Parental Education (with error bars)'),
    ]

    plt.xlabel('Threshold')
    plt.ylabel('Ratio Between Means in Dependent Variable')
    plt.title('Ratio of Dependent Variable for Different Thresholds')
    plt.legend(handles=handles)
    plt.savefig('figures/{}.pdf'.format(filename), format='pdf')
    plt.show()



In [None]:
#Attach course-level information to both members of each matched pair to calculate teacher effects
merger = gen_df[[
    'elev_id', 'step', 'grade', 'course', 'exam', 'Written', 'hard_science',
    'competencies', 'students', 'avg_start', 'highest_edu', 'avg_start_scaled',
    'level_A', 'level_B',
]].copy()
#merger=merger[merger['income_father'] < 0]
#Define the type of course we want to test
merger = merger.query('hard_science == 0 & step >= 3 & exam == 1')
merger = merger.drop(columns=['hard_science'])

# First merge attaches the elev_id member, second merge the neighbour member;
# overlapping columns receive the default _x / _y suffixes
compare = sim_third.merge(merger, on='elev_id')
merger = merger.rename(columns={'elev_id': 'neighbour'})
compare = compare.merge(merger, on='neighbour')

# Keep only rows where both members refer to the same course, format, exam flag and levels
same_setting = np.logical_and.reduce([
    compare[field + '_x'] == compare[field + '_y']
    for field in ('course', 'Written', 'exam', 'level_A', 'level_B')
])
compare = compare.loc[same_setting]
print(len(compare))

# Absolute within-pair differences
compare['comp_diff'] = (compare['competencies_x'] - compare['competencies_y']).abs()
compare['size_diff'] = (compare['students_x'] - compare['students_y']).abs()
compare['grade_diff'] = (compare['grade_x'] - compare['grade_y']).abs()
# Start-time differences are taken between the absolute values of the two start columns
compare['start_diff'] = (compare['avg_start_x'].abs() - compare['avg_start_y'].abs()).abs()
compare['start_diff_scaled'] = (
    compare['avg_start_scaled_x'].abs() - compare['avg_start_scaled_y'].abs()
).abs()
compare['avg_start_x'] = compare['avg_start_x'].abs()
compare['avg_start_y'] = compare['avg_start_y'].abs()
#compare['impact_diff']=abs(compare['teacher_impact_x']-compare['teacher_impact_y'])


In [None]:
#Impact of having course competent teacher
# Pairs whose teachers differ in competencies by more than 0.2, one pair per student
competencies=compare.query('comp_diff > 0.2')
competencies=remove_duplicate_identifiers(competencies,'comp_diff')

# Split by the sign of the first member's parental education
low_edu=competencies[competencies['highest_edu_x'] < 0]
high_edu=competencies[competencies['highest_edu_x'] > 0]


print('Low Parental Education: ')
low_edu_comp =eval_regression_grade(low_edu,'competencies_x','competencies_y','grade_x','grade_y',0.2)

print('\n\n\n\nHigh Parental Education: ')
high_edu_comp =eval_regression_grade(high_edu,'competencies_x','competencies_y','grade_x','grade_y',0.2)

print('\n\n\nTogether: ')
# Stored under its own name so the high-education result above is not overwritten
all_edu_comp =eval_regression_grade(competencies,'competencies_x','competencies_y','grade_x','grade_y',0.2)

In [None]:
#Impact of late or early starttime 
# Pairs whose scaled start-time difference exceeds 1, one pair per student
time=compare.query('start_diff_scaled > 1')

time=remove_duplicate_identifiers(time,'start_diff_scaled')

# Split by the sign of the first member's parental education
low_edu=time[time['highest_edu_x'] < 0]
high_edu=time[time['highest_edu_x'] > 0]

#Run for high and low edu and everyone; each result gets its own name so none is lost
start_time_low=eval_regression_grade(low_edu,'avg_start_scaled_x','avg_start_scaled_y','grade_x','grade_y',0.4)
start_time_high=eval_regression_grade(high_edu,'avg_start_scaled_x','avg_start_scaled_y','grade_x','grade_y',0.4)
# start_time still ends up holding the result for all pairs together
start_time=eval_regression_grade(time,'avg_start_scaled_x','avg_start_scaled_y','grade_x','grade_y',0.4)

In [None]:
##### Impact of larger group sizes: pairs with one member above and the other below 28 students
group_size = compare.query('(students_x > 28 & students_y < 28) | (students_x < 28 & students_y > 28)')

# One pair per student, preferring the pairs with the largest size difference
group_size = remove_duplicate_identifiers(df=group_size, target='size_diff')

# Split by the sign of the first member's parental education
low_edu = group_size.loc[group_size['highest_edu_x'] < 0]
high_edu = group_size.loc[group_size['highest_edu_x'] > 0]

# Paired test for the low group, then for the high group
eval_regression_grade(low_edu, 'students_x', 'students_y', 'grade_x', 'grade_y', 0.4)
eval_regression_grade(high_edu, 'students_x', 'students_y', 'grade_x', 'grade_y', 0.4)

### Second round of experiments
Examine the impact of classroom diversity on student retention.

In [None]:
#Variant of the grade evaluation for a binary outcome (e.g. changing schools), using McNemar's test
from statsmodels.stats.contingency_tables import mcnemar
def eval_binary_grade(df, col_x, col_y, dependent_x, dependent_y, threshold):
    """McNemar test of a 0/1 outcome across matched student pairs.

    Each row of ``df`` is one pair. Rows are re-ordered so the ``*_x`` member
    has the larger treatment value, then only pairs with
    ``|col_x - col_y| > threshold`` are kept. Unlike ``eval_regression_grade``
    the gap is compared on its raw scale, not divided by a standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``col_x``, ``col_y``, ``dependent_x``,
        ``dependent_y``, ``elev_id`` and ``neighbour``. It is not modified.
    col_x, col_y : str
        Treatment columns for the two members of a pair.
    dependent_x, dependent_y : str
        Outcome columns; values are expected to be 0/1 (or booleans).
    threshold : float
        Minimum absolute treatment gap for a pair to be included.

    Returns
    -------
    tuple
        ``(p_value, ratio, ratio_error)``: the exact McNemar p-value, the
        ratio ``mean(dependent_x) / mean(dependent_y)`` and the value returned
        by ``calculate_ratio_se``. A summary is printed as a side effect.
    """
    df = df.copy()

    # Flip the rows where the *_y member has the larger treatment value
    swap = df[col_x] < df[col_y]
    for left, right in ((dependent_x, dependent_y),
                        (col_x, col_y),
                        ('elev_id', 'neighbour')):
        df.loc[swap, [left, right]] = df.loc[swap, [right, left]].values

    df = df[np.abs(df[col_x] - df[col_y]) > threshold]

    # Count the four outcome combinations explicitly so the table is always
    # 2x2, even when one outcome value does not occur in the filtered pairs
    # (a smaller table cannot be indexed by mcnemar). Pairs with a missing
    # outcome are left out of the table.
    outcomes = df[[dependent_x, dependent_y]].dropna().astype(int)
    out_x = outcomes[dependent_x].to_numpy()
    out_y = outcomes[dependent_y].to_numpy()
    table = pd.DataFrame(
        [[int(((out_x == i) & (out_y == j)).sum()) for j in (0, 1)] for i in (0, 1)],
        index=pd.Index([0, 1], name=dependent_x),
        columns=pd.Index([0, 1], name=dependent_y),
    )

    result = mcnemar(table, exact=True)

    mean_x = df[dependent_x].mean()
    mean_y = df[dependent_y].mean()
    ratio = mean_x / mean_y

    print('\n _______________________________________________________ \n')
    print('Number of comparisons', len(df))
    print('statistic:', result.statistic)
    print('p-value:', result.pvalue)
    print('Proportion difference in dependent: ', mean_x - mean_y)
    print('Ratio: ',ratio)

    print('\nContingency Table:')
    print(table.to_string())

    ratio_error = calculate_ratio_se(df[dependent_x],df[dependent_y])
    print('Ratio se: ',ratio_error)

    return result.pvalue, ratio, ratio_error

#new iterate graph function for binary
def iterate_binary (df, col_x, col_y, dependent_x, dependent_y,filename):
    """Plot the binary-outcome ratio against an increasing treatment threshold.

    The pairs are split by the sign of ``highest_edu_x``. Starting at a
    threshold of 0.05 and moving in steps of 0.02, ``eval_binary_grade`` is
    run for each group; a group is evaluated until its first p-value that is
    not <= 0.05, and only thresholds with p <= 0.05 are plotted. The figure is
    saved as ``figures/<filename>.pdf`` and shown.
    """
    alpha = 0.05

    # Segment the pairs into high and low parental education
    subsets = {
        'high': df[df['highest_edu_x'] > 0],
        'low': df[df['highest_edu_x'] < 0],
    }
    latest_p = {'high': 0, 'low': 0}
    kept = {name: {'thresholds': [], 'ratios': [], 'errors': []} for name in subsets}

    threshold = 0.05
    while latest_p['high'] <= alpha or latest_p['low'] <= alpha:
        print("The threshold is == __________",threshold,"_____________________")
        for name in ('high', 'low'):
            # Once a group produced a p-value that is not <= alpha it is no longer evaluated
            if not latest_p[name] <= alpha:
                continue
            latest_p[name], ratio, ratio_error = eval_binary_grade(
                subsets[name], col_x, col_y, dependent_x, dependent_y, threshold)
            if latest_p[name] <= alpha:
                kept[name]['thresholds'].append(threshold)
                kept[name]['ratios'].append(ratio)
                kept[name]['errors'].append(ratio_error)

        # Move to the next threshold
        threshold += 0.02

    high, low = kept['high'], kept['low']

    # Plain lines for both groups
    plt.plot(high['thresholds'], high['ratios'], label='High Parental Education')
    plt.plot(low['thresholds'], low['ratios'], label='Low Parental Education')

    # Same curves again, now with error bars and fixed colors
    plt.errorbar(high['thresholds'], high['ratios'], yerr=high['errors'], label=' ', color='blue')
    plt.errorbar(low['thresholds'], low['ratios'], yerr=low['errors'], label=' ', color='orange')

    # The legend is built from proxy lines that match the error-bar colors
    handles = [
        plt.Line2D([], [], color='blue', label='High Parental Education'),
        plt.Line2D([], [], color='orange', label='Low Parental Education'),
    ]

    plt.xlabel('Threshold')
    plt.ylabel('Ratio Between Means in Dependent Variable')
    plt.title('Ratio of Changing Schools for Different Thresholds')
    plt.legend(handles=handles)
    plt.savefig('figures/{}.pdf'.format(filename), format='pdf')
    plt.show()


In [None]:
#Create encoding for depart reason for every row whose step is not 3
non_final = div_df.query('step != 3')

# One-hot encode 'depart_reason' and give the numeric codes readable column names
one_hot = pd.get_dummies(non_final['depart_reason']).rename(columns={
    21: 'next_step',
    11: 'drop_out',
    20: "break_same_school",
    31: 'change_school',
    30: 'break_change_school',
})

# Append the indicator columns; the original 'depart_reason' column is kept
non_final = pd.concat([non_final, one_hot], axis=1)
#non_final.drop(columns=['depart_reason'],inplace=True)

# 1 when any of the four listed indicators (everything mapped above except 'next_step') is set
non_final['any_dummy'] = (
    non_final[['drop_out', 'break_same_school', 'change_school', 'break_change_school']]
    .any(axis=1)
    .astype(int)
)

#Create separate frames for step 1 and step 2
df_first = non_final.query('step == 1').copy()
df_second = non_final.query('step == 2').copy()

### Relationship between changing school and diversity for pairs

In [None]:
def _mean_socio_diff(frame):
    """Return the mean of the four parental diff_* columns (education and income, father and mother)."""
    total = frame['diff_edu_father'] + frame['diff_edu_mother']
    total = total + frame['diff_income_father']
    total = total + frame['diff_income_mother']
    return total / 4


#Create mean difference on socio economic variables for step 1 and step 2
df_first['mean_diff'] = _mean_socio_diff(df_first)
df_second['mean_diff'] = _mean_socio_diff(df_second)

In [None]:
#Merge metadata to the student pairs to calc difference between them
test=df_first[['elev_id','diversity_score','drop_out','break_change_school','change_school','diff_age','mean_diff']].copy()

# Attach the metadata of the elev_id member of each pair
reg_first=sim_first.merge(test,on='elev_id',how='left')
print(len(sim_first),len(reg_first))
# Share of pairs whose elev_id member has no diversity score
print(reg_first['diversity_score'].isna().mean())

# Attach the same metadata for the neighbour member (columns get _x / _y suffixes)
test.rename(columns={'elev_id':'neighbour'},inplace=True)
reg_first=reg_first.merge(test,on='neighbour',how='left')
print(len(reg_first))

reg_first['diversity_diff']=abs(reg_first['diversity_score_x']-reg_first['diversity_score_y'])

# Drop every pair with a missing value in any column
reg_first.dropna(inplace=True)

#Merge parents socio economic status for both members of the pair
parents_socio=gen_df[['elev_id','highest_edu']].copy()
parents_socio.drop_duplicates(inplace=True)
print(len(reg_first))
reg_first=reg_first.merge(parents_socio,on='elev_id',how='left')
# Report the size of the frame that was just merged; the name printed here
# before is only defined in a later cell
print(len(reg_first))
parents_socio.rename(columns={'elev_id':'neighbour'},inplace=True)
reg_first=reg_first.merge(parents_socio,on='neighbour',how='left')
print(len(reg_first))

In [None]:
# Keep one pair per student, preferring the pairs with the largest diversity difference
test = remove_duplicate_identifiers(df=reg_first, target='diversity_diff')

In [None]:
# Evaluate the impact of diversity on changing school (pairs with a score gap above 0.1)
changing = eval_binary_grade(
    df=test,
    col_x='diversity_score_x',
    col_y='diversity_score_y',
    dependent_x='change_school_x',
    dependent_y='change_school_y',
    threshold=0.1,
)

In [None]:
# Plot the school-change ratio for increasing diversity thresholds
plt.rcParams.update({'text.usetex': False})
iterate_binary(
    df=test,
    col_x='diversity_score_x',
    col_y='diversity_score_y',
    dependent_x='change_school_x',
    dependent_y='change_school_y',
    filename='diversity_retention',
)

### See the general tendency of not progressing to next school year

In [None]:
# Define the number of bins
num_bins = 20

# Restrict to -1.4 < mean_diff < 1.4; .copy() gives an independent frame so
# adding the bin column below does not write into a slice of df_first
first_retention = df_first.query('mean_diff < 1.4 & mean_diff > -1.4').copy()

# Share of step-1 rows that fall inside the plotted range
print(len(first_retention)/len(df_first))

# Bin the 'mean_diff' values into equal-width intervals
first_retention['mean_diff_bins'] = pd.cut(first_retention['mean_diff'], bins=num_bins)

# Group once and reuse; observed=False states the current default explicitly,
# so bins without any rows stay in the result (as NaN)
binned = first_retention.groupby('mean_diff_bins', observed=False)

# Mean of 'mean_diff' per bin, used as the x position of the bars
bin_means = binned['mean_diff'].mean()

# Percentage of 'change_school' and 'drop_out' per bin
bin_change_school_perc = binned['change_school'].mean() * 100
bin_dropout_perc = binned['drop_out'].mean() * 100

# Calculate the width of the bars
bar_width = 1.5 / num_bins  # Adjust the value as needed

# Place the two bars of a bin side by side around the bin mean
bar_positions_change_school = bin_means - bar_width/2
bar_positions_dropout = bin_means + bar_width/2

# Bars for 'change_school'
plt.bar(bar_positions_change_school, bin_change_school_perc, align='center', alpha=0.8, color='steelblue', width=bar_width, label='Change School')

# Bars for 'drop_out'
plt.bar(bar_positions_dropout, bin_dropout_perc, align='center', alpha=0.8, color='orange',width=bar_width, label='Dropout')

# Add labels and title
plt.xlabel('Difference in socio-economic status compared to classmates')
plt.ylabel('Percentage of students')
plt.title('Percentage of Students not Continuing in Class by Mean Socio-economic Difference')

# Add a legend
plt.legend()
plt.tight_layout()
plt.savefig('figures/retention_socio_difference.pdf',dpi=400,bbox_inches='tight')
# Show the plot
plt.show()

### Work on exploring how peers might lift students to overachieve 

Using student pairs, examine the impact of one student having a higher or lower social status than their peers.

In [None]:
# Load the pairs that were matched without using the average grade
df_grade = pd.read_pickle(
    '../distance/no_grade.pkl'
)

In [None]:
# Keep only near-identical pairs (distance below 0.1)
df_grade = df_grade.loc[df_grade['distance'] < 0.1]

In [None]:
# Attach the step-3 rows of the diversity dataframe to both members of each pair
merge_grade = div_df.loc[div_df['step'] == 3].drop(
    columns=['step', 'join_reason', 'depart_reason', 'inst_nr']
)
print(len(df_grade))

# elev_id member: left merge, so every pair is kept at this point
grade_compare = df_grade.merge(merge_grade, on='elev_id', how='left')
print(len(grade_compare))

# neighbour member: default (inner) merge, so pairs without a matching neighbour row drop out
merge_grade = merge_grade.rename(columns={'elev_id': 'neighbour'})
grade_compare = grade_compare.merge(merge_grade, on='neighbour')
print(len(grade_compare))

# Remove every pair that still has a missing value in any column
grade_compare = grade_compare.dropna()

In [None]:
# Signed within-pair difference (x minus y) for each socio-economic column and the class average grade
for _new_col, _base in (
    ('mom_income_diff', 'diff_income_mother'),
    ('dad_income_diff', 'diff_income_father'),
    ('mom_edu_diff', 'diff_edu_mother'),
    ('dad_edu_diff', 'diff_edu_father'),
    ('students_avg_grade_diff', 'students_avg_grade'),
):
    grade_compare[_new_col] = grade_compare[_base + '_x'] - grade_compare[_base + '_y']

# Socio is the sum of the four parental diff_* columns for each member of the pair
grade_compare['socio_x'] = (
    grade_compare['diff_edu_father_x']
    + grade_compare['diff_edu_mother_x']
    + grade_compare['diff_income_mother_x']
    + grade_compare['diff_income_father_x']
)
grade_compare['socio_y'] = (
    grade_compare['diff_edu_father_y']
    + grade_compare['diff_edu_mother_y']
    + grade_compare['diff_income_mother_y']
    + grade_compare['diff_income_father_y']
)

# Flip the sign so that, per the original author's note, positive values mean
# the student is surrounded by peers of higher socio-economic status
grade_compare['socio_inverse_x'] = -grade_compare['socio_x']
grade_compare['socio_inverse_y'] = -grade_compare['socio_y']
grade_compare['socio_diff'] = (grade_compare['socio_x'] - grade_compare['socio_y']).abs()

# Select only relevant columns for the experiment
soc_diff = grade_compare[['elev_id', 'neighbour', 'socio_x', 'socio_y']]

In [None]:
# Attach the parents' highest education for the elev_id member only; the
# original author's note says it is the same for both members of a pair
# NOTE(review): later cells read 'highest_edu_x'; a single merge only yields
# that suffixed name if grade_compare already carries a 'highest_edu' column - confirm
parents_socio = gen_df[['elev_id', 'highest_edu']].drop_duplicates()
print(len(grade_compare))
socio_compare = grade_compare.merge(parents_socio, how='left', on='elev_id')
print(len(socio_compare))

In [None]:
# Keep one pair per student, preferring the pairs with the largest socio difference
socio = remove_duplicate_identifiers(df=socio_compare, target='socio_diff')

In [None]:
# Evaluate the impact of being in a higher social status group, separately by the sign of highest_edu_x
low_socio_diff = eval_regression_grade(
    socio.loc[socio['highest_edu_x'] < 0],
    'socio_inverse_x', 'socio_inverse_y', 'avg_grade_x', 'avg_grade_y', 0.2,
)
high_socio_diff = eval_regression_grade(
    socio.loc[socio['highest_edu_x'] > 0],
    'socio_inverse_x', 'socio_inverse_y', 'avg_grade_x', 'avg_grade_y', 0.2,
)

In [None]:
# Plot the average-grade ratio for increasing thresholds in the socio difference
iterate_thresholds(
    df=socio,
    col_x='socio_inverse_x',
    col_y='socio_inverse_y',
    dependent_x='avg_grade_x',
    dependent_y='avg_grade_y',
    filename='socio_diff',
)

In [None]:
#Re-order each pair so the *_x member has the larger value of the sorting column
def return_sorted(df,col_x,col_y,dependent_x,dependent_y):
    """Return a copy of ``df`` in which every pair has ``col_x >= col_y``.

    For rows where ``col_x < col_y`` the two members are exchanged: the
    sorting columns, the dependent columns and the ``elev_id`` / ``neighbour``
    identifiers are swapped. Other columns are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``col_x``, ``col_y``, ``dependent_x``,
        ``dependent_y``, ``elev_id`` and ``neighbour``. It is not modified.
    col_x, col_y : str
        Columns that decide the ordering within a pair.
    dependent_x, dependent_y : str
        Outcome columns that follow their pair member when it is swapped.

    Returns
    -------
    pandas.DataFrame
        The re-ordered copy, with the same rows and index as ``df``.
    """
    df = df.copy()

    # The mask is fixed before any column changes, so the swap order below does not matter
    swap = df[col_x] < df[col_y]

    for left, right in ((dependent_x, dependent_y),
                        (col_x, col_y),
                        ('elev_id', 'neighbour')):
        # .values drops the column labels so the assignment is positional
        df.loc[swap, [left, right]] = df.loc[swap, [right, left]].values
    return df

In [None]:
# Order each pair by socio_inverse, then keep only pairs where the x member is
# above zero and the y member is below zero
sorted_socio = return_sorted(
    df=socio,
    col_x='socio_inverse_x',
    col_y='socio_inverse_y',
    dependent_x='avg_grade_x',
    dependent_y='avg_grade_y',
)
sorted_socio = sorted_socio[
    (sorted_socio['socio_inverse_x'] > 0) & (sorted_socio['socio_inverse_y'] < 0)
]

In [None]:
# Mean average grade of the x members and of the y members of the selected pairs.
# Two separate statements: joining the prints with a comma builds a tuple,
# which the notebook then echoes as (None, None).
print(sorted_socio['avg_grade_x'].mean())
print(sorted_socio['avg_grade_y'].mean())

In [None]:
# Test whether the difference is statistically significant.
# Each row of sorted_socio is one matched pair, so the two grade columns are
# dependent samples; use the paired test, as eval_regression_grade does,
# rather than the independent-samples test.
t_stat, p_val = stats.ttest_rel(sorted_socio['avg_grade_x'], sorted_socio['avg_grade_y'])

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")