Read the dataset

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from fairlearn import metrics
from fairlearn.metrics import MetricFrame, demographic_parity_difference, equal_opportunity_difference, MetricFrame


# Load the dataset
df = pd.read_csv('Cleaned_Students_Performance.csv')

# Display the first few rows to understand the structure
print(df.head())


   gender race_ethnicity parental_level_of_education  lunch  \
0       0        group B           bachelor's degree      1   
1       0        group C                some college      1   
2       0        group B             master's degree      1   
3       1        group A          associate's degree      0   
4       1        group C                some college      1   

   test_preparation_course  math_score  reading_score  writing_score  \
0                        0          72             72             74   
1                        1          69             90             88   
2                        0          90             95             93   
3                        0          47             57             44   
4                        0          76             78             75   

   total_score  average_score  
0          218      72.666667  
1          247      82.333333  
2          278      92.666667  
3          148      49.333333  
4          229      76.33333

In [2]:
pd.set_option('display.max_columns', None)

Encode categorical values

In [20]:
# One-Hot Encoding for categorical variables
df_encoded = pd.get_dummies(df, columns=['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course'])

# Display the first few rows after encoding
print(df_encoded.head())


   math_score  reading_score  writing_score  total_score  average_score  \
0          72             72             74          218      72.666667   
1          69             90             88          247      82.333333   
2          90             95             93          278      92.666667   
3          47             57             44          148      49.333333   
4          76             78             75          229      76.333333   

   gender_0  gender_1  race_ethnicity_group A  race_ethnicity_group B  \
0      True     False                   False                    True   
1      True     False                   False                   False   
2      True     False                   False                    True   
3     False      True                    True                   False   
4     False      True                   False                   False   

   race_ethnicity_group C  race_ethnicity_group D  race_ethnicity_group E  \
0                   False        

In [4]:

# Split the data into training and testing sets (80% for training, 20% for testing)
X = df_encoded.drop(['math_score', 'reading_score', 'writing_score'], axis=1)  # Features
y_math = df_encoded['math_score']  # Target: Math Score
y_reading = df_encoded['reading_score']  # Target: Reading Score
y_writing = df_encoded['writing_score']  # Target: Writing Score

# Splitting the data
X_train, X_test, y_train_math, y_test_math = train_test_split(X, y_math, test_size=0.2, random_state=42)
X_train, X_test, y_train_reading, y_test_reading = train_test_split(X, y_reading, test_size=0.2, random_state=42)
X_train, X_test, y_train_writing, y_test_writing = train_test_split(X, y_writing, test_size=0.2, random_state=42)


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the model
model_math = LinearRegression()
model_reading = LinearRegression()
model_writing = LinearRegression()

# Fit the model on the training data
model_math.fit(X_train, y_train_math)
model_reading.fit(X_train, y_train_reading)
model_writing.fit(X_train, y_train_writing)

# Predict on the test data
y_pred_math = model_math.predict(X_test)
y_pred_reading = model_reading.predict(X_test)
y_pred_writing = model_writing.predict(X_test)

# Evaluate the model performance
mse_math = mean_squared_error(y_test_math, y_pred_math)
mse_reading = mean_squared_error(y_test_reading, y_pred_reading)
mse_writing = mean_squared_error(y_test_writing, y_pred_writing)

print(f'MSE for Math: {mse_math}')
print(f'MSE for Reading: {mse_reading}')
print(f'MSE for Writing: {mse_writing}')

# R^2 Score to evaluate how well the models fit
r2_math = r2_score(y_test_math, y_pred_math)
r2_reading = r2_score(y_test_reading, y_pred_reading)
r2_writing = r2_score(y_test_writing, y_pred_writing)

print(f'R^2 for Math: {r2_math}')
print(f'R^2 for Reading: {r2_reading}')
print(f'R^2 for Writing: {r2_writing}')


MSE for Math: 12.90906562548371
MSE for Reading: 9.10763187285127
MSE for Writing: 7.235685984032025
R^2 for Math: 0.9469501499743415
R^2 for Reading: 0.9597513886106751
R^2 for Writing: 0.9699785212886685


In [6]:
# Coefficients for each feature
coefficients_math = model_math.coef_
coefficients_reading = model_reading.coef_
coefficients_writing = model_writing.coef_

# Display coefficients
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Math Coefficients': coefficients_math,
    'Reading Coefficients': coefficients_reading,
    'Writing Coefficients': coefficients_writing
})

print(coefficients)


                                           Feature  Math Coefficients  \
0                                      total_score      -2.507979e+12   
1                                    average_score       7.523937e+12   
2                                         gender_0      -1.188721e+10   
3                                         gender_1      -1.188721e+10   
4                           race_ethnicity_group A      -1.150925e+10   
5                           race_ethnicity_group B      -1.150925e+10   
6                           race_ethnicity_group C      -1.150925e+10   
7                           race_ethnicity_group D      -1.150925e+10   
8                           race_ethnicity_group E      -1.150925e+10   
9   parental_level_of_education_associate's degree      -7.994419e+09   
10   parental_level_of_education_bachelor's degree      -7.994419e+09   
11         parental_level_of_education_high school      -7.994419e+09   
12     parental_level_of_education_master's degree 

In [38]:

#USED FOR CLASSIFICATION

# # Fairness analysis using Fairlearn
# def calculate_fairness(y_true, y_pred, sensitive_feature, feature_name):
#     metric_frame = MetricFrame(
#         metrics={'MSE': mean_squared_error, 'R^2': r2_score},
#         y_true=y_true,
#         y_pred=y_pred,
#         sensitive_features=sensitive_feature
#     )
#     print(f"\nFairness metrics by {feature_name}:")
#     print(metric_frame.by_group)

#     dp_diff = demographic_parity_difference(y_true, y_pred, sensitive_features=sensitive_feature)
#     eo_diff = equal_opportunity_difference(y_true, y_pred, sensitive_features=sensitive_feature)

#     print(f"Demographic Parity Difference for {feature_name}: {dp_diff}")
#     print(f"Equal Opportunity Difference for {feature_name}: {eo_diff}")

# # Analyze bias for Math scores with gender
# print("\nFairness analysis for Math scores by Gender:")
# calculate_fairness(y_test_math, y_pred_math, sensitive_features_test['gender'], "Gender")

# # Analyze bias for Math scores with race
# print("\nFairness analysis for Math scores by Race/Ethnicity:")
# calculate_fairness(y_test_math, y_pred_math, sensitive_features_test['race_ethnicity'], "Race/Ethnicity")




def calculate_fairness_regression(y_true, y_pred, sensitive_feature, feature_name):
    # Create MetricFrame for regression metrics
    metric_frame = MetricFrame(
        metrics={'MSE': mean_squared_error, 'R^2': r2_score},
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=sensitive_feature
    )
    print(f"\nFairness metrics by {feature_name}:")
    print(metric_frame.by_group)

    # Group differences for MSE and R^2
    mse_diff = metric_frame.difference(method='between_groups', metric='MSE')
    r2_diff = metric_frame.difference(method='between_groups', metric='R^2')

    print(f"\nGroup difference in MSE for {feature_name}: {mse_diff}")
    print(f"Group difference in R^2 for {feature_name}: {r2_diff}")


Use Fairlearn


In [None]:
def evaluate_regression_fairness(y_true, y_pred, sensitive_features, feature_name):
    # MetricFrame for regression metrics
    metric_frame = MetricFrame(
        metrics={
            'MSE': mean_squared_error,
            'RMSE': lambda y, p: mean_squared_error(y, p, squared=False),  # Root MSE
            'Mean Prediction': lambda y, p: p.mean(),  # Average predictions
        },
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=sensitive_features
    )
    print(f"Regression fairness metrics by {feature_name}:")
    print(metric_frame.by_group)
    print("\n")
    print("-" * 50)



Fairness analysis for Math scores by Gender (Regression):
Regression fairness metrics by Gender:
             MSE      RMSE  Mean Prediction
gender                                     
0       31.32848  5.597185        64.451082
1       38.04621  6.168161        64.716951


--------------------------------------------------

Fairness analysis for Math scores by Race/Ethnicity (Regression):
Regression fairness metrics by Race/Ethnicity:
                      MSE      RMSE  Mean Prediction
race_ethnicity                                      
group A         28.313307  5.321025        69.556054
group B         38.675812  6.218988        63.236688
group C         27.678674  5.261053        64.421657
group D         42.490722  6.518491        62.672163
group E         32.888292  5.734831        67.671100


--------------------------------------------------




Fairness for math


In [42]:
print("\nFairness analysis for Math scores by Gender (Regression):")
evaluate_regression_fairness(y_test_math, y_pred_math, df['gender'][:200], "Gender")

print("\nFairness analysis for Math scores by Race/Ethnicity (Regression):")
evaluate_regression_fairness(y_test_math, y_pred_math, df['race_ethnicity'][:200], "Race/Ethnicity")


Fairness analysis for Math scores by Gender (Regression):
Regression fairness metrics by Gender:
             MSE      RMSE  Mean Prediction
gender                                     
0       31.32848  5.597185        64.451082
1       38.04621  6.168161        64.716951


--------------------------------------------------

Fairness analysis for Math scores by Race/Ethnicity (Regression):
Regression fairness metrics by Race/Ethnicity:
                      MSE      RMSE  Mean Prediction
race_ethnicity                                      
group A         28.313307  5.321025        69.556054
group B         38.675812  6.218988        63.236688
group C         27.678674  5.261053        64.421657
group D         42.490722  6.518491        62.672163
group E         32.888292  5.734831        67.671100


--------------------------------------------------




Fairness for Writing

In [44]:
print("\nFairness analysis for Math scores by Gender (Regression):")
evaluate_regression_fairness(y_test_writing, y_pred_writing, df['gender'][:200], "Gender")

print("\nFairness analysis for Math scores by Race/Ethnicity (Regression):")
evaluate_regression_fairness(y_test_writing , y_pred_writing, df['race_ethnicity'][:200], "Race/Ethnicity")


Fairness analysis for Math scores by Gender (Regression):
Regression fairness metrics by Gender:
              MSE      RMSE  Mean Prediction
gender                                      
0       14.651368  3.827711        66.754855
1       15.951315  3.993910        66.039765


--------------------------------------------------

Fairness analysis for Math scores by Race/Ethnicity (Regression):
Regression fairness metrics by Race/Ethnicity:
                      MSE      RMSE  Mean Prediction
race_ethnicity                                      
group A          9.897432  3.146018        71.081474
group B         14.761787  3.842107        65.268540
group C         13.078969  3.616486        66.043946
group D         17.795313  4.218449        64.743517
group E         19.830833  4.453182        69.238391


--------------------------------------------------




Fairness for reading

In [45]:
print("\nFairness analysis for Math scores by Gender (Regression):")
evaluate_regression_fairness(y_test_reading, y_pred_reading, df['gender'][:200], "Gender")

print("\nFairness analysis for Math scores by Race/Ethnicity (Regression):")
evaluate_regression_fairness(y_test_reading, y_pred_reading, df['race_ethnicity'][:200], "Race/Ethnicity")


Fairness analysis for Math scores by Gender (Regression):
Regression fairness metrics by Gender:
              MSE      RMSE  Mean Prediction
gender                                      
0       12.384970  3.519229        67.833475
1       13.315126  3.648990        67.236812


--------------------------------------------------

Fairness analysis for Math scores by Race/Ethnicity (Regression):
Regression fairness metrics by Race/Ethnicity:
                      MSE      RMSE  Mean Prediction
race_ethnicity                                      
group A         10.735036  3.276436        72.411150
group B         17.913186  4.232397        65.952786
group C         11.914604  3.451754        67.473948
group D         10.450389  3.232706        65.815752
group E         12.354612  3.514913        70.486057


--------------------------------------------------


