# Case One: Caucasian

In [1]:
import warnings
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,f1_score,accuracy_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Read the "biased" Caucasian dataset: dx is the feature table (it still carries
# the `race` column at this point), dy is the matching label table.
dx, dy = map(
    pd.read_csv,
    ("data/X_caucasian_biased.csv", "data/y_caucasian_biased.csv"),
)

In [3]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    dx, dy, test_size=0.3, random_state=42
)

# Keep the test rows' sensitive attribute for the per-row results table, then
# remove it from both feature sets so the models are not trained on it.
testRace = xtest["race"]
xtrain = xtrain.drop(columns="race")
xtest = xtest.drop(columns="race")

# Standardise the features with statistics learned from the training rows only.
scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

## Logistic Regression

In [4]:
# Fit a default-configured logistic regression on the standardised training features.
# `ytrain` is a one-column DataFrame, while scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning (hidden here by the global warnings
# filter), so the column is flattened explicitly.
lr = LogisticRegression()
lr.fit(xtrain, ytrain.values.ravel())

# Hard class predictions for the held-out rows, summarised per class.
ypred = lr.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5480
           1       0.88      0.59      0.71       491

    accuracy                           0.96      5971
   macro avg       0.92      0.79      0.84      5971
weighted avg       0.96      0.96      0.96      5971



In [5]:
# Per-class probabilities from the logistic regression for every test row.
yprob = lr.predict_proba(X=xtest)

# Cell output: container type of the hard predictions made in the previous cell.
type(ypred)

numpy.ndarray

In [6]:
# Eyeball that everything going into the results table describes the same rows:
# shapes first, then lengths, then container types.
print(
    *(
        f"Shape of {name}: {arr.shape}"
        for name, arr in (("yprob", yprob), ("ytest", ytest), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Length of {name}: {len(arr)}"
        for name, arr in (("ytest", ytest), ("ypred", ypred), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Type of {name}: {type(arr)}"
        for name, arr in (("testRace", testRace), ("ytest", ytest), ("ypred", ypred))
    ),
    sep="\n",
)



Shape of yprob: (5971, 2)
Shape of ytest: (5971, 1)
Shape of testRace: (5971,)
Length of ytest: 5971
Length of ypred: 5971
Length of testRace: 5971
Type of testRace: <class 'pandas.core.series.Series'>
Type of ytest: <class 'pandas.core.frame.DataFrame'>
Type of ypred: <class 'numpy.ndarray'>


In [7]:
# One row per test sample: sensitive attribute, ground truth and model output.
# Every column is a plain positional array, so rows line up by position in the
# test split and the shuffled pandas index left by train_test_split is ignored.
results_df = pd.DataFrame({
    # Sensitive attribute set aside before the features were scaled.
    'race': testRace.values,
    # ytest is an (n, 1) frame; flatten it to a 1-D column.
    'True_Label': ytest.values.ravel(),
    'Predicted_Label': ypred,

    # predict_proba columns are ordered as in the fitted classifier's `classes_`:
    # column 0 -> class 0 (negative), column 1 -> class 1 (positive).
    'Prob_Class_0': yprob[:, 0],
    'Prob_Class_1': yprob[:, 1],
})

print("✅ Results DataFrame Head:")
print(results_df.head())

print("-" * 30)

# --- Save to CSV ---
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout without `results/` fails here.
file_name = 'results/caucasian_biased_LR.csv'
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(file_name, index=False)

print(f"✅ Data successfully saved to '{file_name}'")

✅ Results DataFrame Head:
              race  True_Label  Predicted_Label  Prob_Class_0  Prob_Class_1
0  AfricanAmerican           0                0      0.999999  7.732421e-07
1        Caucasian           0                0      0.986956  1.304411e-02
2        Caucasian           0                0      0.990643  9.356559e-03
3        Caucasian           0                0      0.589178  4.108222e-01
4        Caucasian           0                0      0.934920  6.508047e-02
------------------------------
✅ Data successfully saved to 'results/caucasian_biased_LR.csv'


## Random Forest

In [8]:
# Random forest training is randomised; pin the seed (same value as the split)
# so the report below and the saved predictions are identical on every
# Restart & Run All.
rf = RandomForestClassifier(random_state=42)

# `ytrain` is a one-column DataFrame; pass it as the 1-D target scikit-learn
# expects instead of relying on a silent conversion (the warning is suppressed
# by the global warnings filter).
rf.fit(xtrain, ytrain.values.ravel())

# Hard class predictions for the held-out rows, summarised per class.
ypred = rf.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5480
           1       0.94      0.66      0.78       491

    accuracy                           0.97      5971
   macro avg       0.96      0.83      0.88      5971
weighted avg       0.97      0.97      0.97      5971



In [9]:
# Per-class probabilities from the random forest for every test row.
yprob = rf.predict_proba(X=xtest)

# Cell output: container type of the hard predictions made in the previous cell.
type(ypred)

numpy.ndarray

In [10]:
# Eyeball that everything going into the results table describes the same rows:
# shapes first, then lengths, then container types.
print(
    *(
        f"Shape of {name}: {arr.shape}"
        for name, arr in (("yprob", yprob), ("ytest", ytest), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Length of {name}: {len(arr)}"
        for name, arr in (("ytest", ytest), ("ypred", ypred), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Type of {name}: {type(arr)}"
        for name, arr in (("testRace", testRace), ("ytest", ytest), ("ypred", ypred))
    ),
    sep="\n",
)



Shape of yprob: (5971, 2)
Shape of ytest: (5971, 1)
Shape of testRace: (5971,)
Length of ytest: 5971
Length of ypred: 5971
Length of testRace: 5971
Type of testRace: <class 'pandas.core.series.Series'>
Type of ytest: <class 'pandas.core.frame.DataFrame'>
Type of ypred: <class 'numpy.ndarray'>


In [11]:
# One row per test sample: sensitive attribute, ground truth and model output.
# Every column is a plain positional array, so rows line up by position in the
# test split and the shuffled pandas index left by train_test_split is ignored.
results_df = pd.DataFrame({
    # Sensitive attribute set aside before the features were scaled.
    'race': testRace.values,
    # ytest is an (n, 1) frame; flatten it to a 1-D column.
    'True_Label': ytest.values.ravel(),
    'Predicted_Label': ypred,

    # predict_proba columns are ordered as in the fitted classifier's `classes_`:
    # column 0 -> class 0 (negative), column 1 -> class 1 (positive).
    'Prob_Class_0': yprob[:, 0],
    'Prob_Class_1': yprob[:, 1],
})

print("✅ Results DataFrame Head:")
print(results_df.head())

print("-" * 30)

# --- Save to CSV ---
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout without `results/` fails here.
file_name = 'results/caucasian_biased_RF.csv'
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(file_name, index=False)

print(f"✅ Data successfully saved to '{file_name}'")

✅ Results DataFrame Head:
              race  True_Label  Predicted_Label  Prob_Class_0  Prob_Class_1
0  AfricanAmerican           0                0          1.00          0.00
1        Caucasian           0                0          1.00          0.00
2        Caucasian           0                0          1.00          0.00
3        Caucasian           0                0          0.82          0.18
4        Caucasian           0                0          0.84          0.16
------------------------------
✅ Data successfully saved to 'results/caucasian_biased_RF.csv'


# Caucasian + SMOTE

In [12]:
# Read the SMOTE variant of the Caucasian dataset, replacing the previous
# dx / dy: feature table (still carrying `race`) and matching label table.
dx, dy = map(
    pd.read_csv,
    ("data/X_caucasian_smote.csv", "data/y_caucasian_smote.csv"),
)

In [13]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the input file names suggest resampling (SMOTE) was applied before
# this split; if so, synthetic rows can land in the test set — confirm upstream.
xtrain, xtest, ytrain, ytest = train_test_split(
    dx, dy, test_size=0.3, random_state=42
)

# Keep the test rows' sensitive attribute for the per-row results table, then
# remove it from both feature sets so the models are not trained on it.
testRace = xtest["race"]
xtrain = xtrain.drop(columns="race")
xtest = xtest.drop(columns="race")

# Standardise the features with statistics learned from the training rows only.
scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

## Logistic Regression

In [14]:
# Fit a default-configured logistic regression on the standardised training features.
# `ytrain` is a one-column DataFrame, while scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning (hidden here by the global warnings
# filter), so the column is flattened explicitly.
lr = LogisticRegression()
lr.fit(xtrain, ytrain.values.ravel())

# Hard class predictions for the held-out rows, summarised per class.
ypred = lr.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99     19737
           1       0.86      0.64      0.73      1163

    accuracy                           0.97     20900
   macro avg       0.92      0.82      0.86     20900
weighted avg       0.97      0.97      0.97     20900



In [15]:
# Per-class probabilities from the logistic regression for every test row.
yprob = lr.predict_proba(X=xtest)

# Cell output: container type of the hard predictions made in the previous cell.
type(ypred)

numpy.ndarray

In [16]:
# Eyeball that everything going into the results table describes the same rows:
# shapes first, then lengths, then container types.
print(
    *(
        f"Shape of {name}: {arr.shape}"
        for name, arr in (("yprob", yprob), ("ytest", ytest), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Length of {name}: {len(arr)}"
        for name, arr in (("ytest", ytest), ("ypred", ypred), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Type of {name}: {type(arr)}"
        for name, arr in (("testRace", testRace), ("ytest", ytest), ("ypred", ypred))
    ),
    sep="\n",
)



Shape of yprob: (20900, 2)
Shape of ytest: (20900, 1)
Shape of testRace: (20900,)
Length of ytest: 20900
Length of ypred: 20900
Length of testRace: 20900
Type of testRace: <class 'pandas.core.series.Series'>
Type of ytest: <class 'pandas.core.frame.DataFrame'>
Type of ypred: <class 'numpy.ndarray'>


In [17]:
# One row per test sample: sensitive attribute, ground truth and model output.
# Every column is a plain positional array, so rows line up by position in the
# test split and the shuffled pandas index left by train_test_split is ignored.
results_df = pd.DataFrame({
    # Sensitive attribute set aside before the features were scaled.
    'race': testRace.values,
    # ytest is an (n, 1) frame; flatten it to a 1-D column.
    'True_Label': ytest.values.ravel(),
    'Predicted_Label': ypred,

    # predict_proba columns are ordered as in the fitted classifier's `classes_`:
    # column 0 -> class 0 (negative), column 1 -> class 1 (positive).
    'Prob_Class_0': yprob[:, 0],
    'Prob_Class_1': yprob[:, 1],
})

print("✅ Results DataFrame Head:")
print(results_df.head())

print("-" * 30)

# --- Save to CSV ---
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout without `results/` fails here.
file_name = 'results/caucasian_smote_LR.csv'
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(file_name, index=False)

print(f"✅ Data successfully saved to '{file_name}'")

✅ Results DataFrame Head:
              race  True_Label  Predicted_Label  Prob_Class_0  Prob_Class_1
0         Hispanic           0                0      0.999966      0.000034
1  AfricanAmerican           0                0      0.999266      0.000734
2            Other           0                0      0.993058      0.006942
3            Asian           0                0      0.999986      0.000014
4            Other           0                0      0.994949      0.005051
------------------------------
✅ Data successfully saved to 'results/caucasian_smote_LR.csv'


## Random Forest

In [18]:
# Random forest training is randomised; pin the seed (same value as the split)
# so the report below and the saved predictions are identical on every
# Restart & Run All.
rf = RandomForestClassifier(random_state=42)

# `ytrain` is a one-column DataFrame; pass it as the 1-D target scikit-learn
# expects instead of relying on a silent conversion (the warning is suppressed
# by the global warnings filter).
rf.fit(xtrain, ytrain.values.ravel())

# Hard class predictions for the held-out rows, summarised per class.
ypred = rf.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     19737
           1       0.95      0.76      0.84      1163

    accuracy                           0.98     20900
   macro avg       0.97      0.88      0.92     20900
weighted avg       0.98      0.98      0.98     20900



In [19]:
# Per-class probabilities from the random forest for every test row.
yprob = rf.predict_proba(X=xtest)

# Cell output: container type of the hard predictions made in the previous cell.
type(ypred)

numpy.ndarray

In [20]:
# Eyeball that everything going into the results table describes the same rows:
# shapes first, then lengths, then container types.
print(
    *(
        f"Shape of {name}: {arr.shape}"
        for name, arr in (("yprob", yprob), ("ytest", ytest), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Length of {name}: {len(arr)}"
        for name, arr in (("ytest", ytest), ("ypred", ypred), ("testRace", testRace))
    ),
    sep="\n",
)

print(
    *(
        f"Type of {name}: {type(arr)}"
        for name, arr in (("testRace", testRace), ("ytest", ytest), ("ypred", ypred))
    ),
    sep="\n",
)



Shape of yprob: (20900, 2)
Shape of ytest: (20900, 1)
Shape of testRace: (20900,)
Length of ytest: 20900
Length of ypred: 20900
Length of testRace: 20900
Type of testRace: <class 'pandas.core.series.Series'>
Type of ytest: <class 'pandas.core.frame.DataFrame'>
Type of ypred: <class 'numpy.ndarray'>


In [21]:
# One row per test sample: sensitive attribute, ground truth and model output.
# Every column is a plain positional array, so rows line up by position in the
# test split and the shuffled pandas index left by train_test_split is ignored.
results_df = pd.DataFrame({
    # Sensitive attribute set aside before the features were scaled.
    'race': testRace.values,
    # ytest is an (n, 1) frame; flatten it to a 1-D column.
    'True_Label': ytest.values.ravel(),
    'Predicted_Label': ypred,

    # predict_proba columns are ordered as in the fitted classifier's `classes_`:
    # column 0 -> class 0 (negative), column 1 -> class 1 (positive).
    'Prob_Class_0': yprob[:, 0],
    'Prob_Class_1': yprob[:, 1],
})

print("✅ Results DataFrame Head:")
print(results_df.head())

print("-" * 30)

# --- Save to CSV ---
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout without `results/` fails here.
file_name = 'results/caucasian_smote_RF.csv'
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(file_name, index=False)

print(f"✅ Data successfully saved to '{file_name}'")

✅ Results DataFrame Head:
              race  True_Label  Predicted_Label  Prob_Class_0  Prob_Class_1
0         Hispanic           0                0           1.0           0.0
1  AfricanAmerican           0                0           1.0           0.0
2            Other           0                0           1.0           0.0
3            Asian           0                0           1.0           0.0
4            Other           0                0           1.0           0.0
------------------------------
✅ Data successfully saved to 'results/caucasian_smote_RF.csv'
