# Case Two: Base Case

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,f1_score,accuracy_score,precision_score,recall_score
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings
# scikit-learn would emit while fitting the models in the cells below.
warnings.filterwarnings('ignore')

In [2]:
# Read the four pre-split "balanced" tables (train/test features and labels).
# Unpacking order matches the order of the paths in the tuple.
xtrain, ytrain, xtest, ytest = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/balanced_xtrain.csv',
        'data/balanced_ytrain.csv',
        'data/balanced_xtest.csv',
        'data/balanced_ytest.csv',
    )
]

In [3]:
# Keep the test-set 'race' column aside: it is reported next to the predictions
# later on, but it is removed from the features so the models never see it.
races = xtest['race']

# Standardise the remaining features. The scaler is fitted on the training
# split only and then re-used, unchanged, on the test split.
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain.drop(columns='race'))
xtest = scaler.transform(xtest.drop(columns='race'))

## Logistic Regression

In [4]:
# Fit a logistic regression (default hyper-parameters) on the scaled training
# features and report per-class metrics on the test split.
lr = LogisticRegression()
# ytrain was loaded with pd.read_csv, so it is a single-column DataFrame.
# scikit-learn expects a 1-D target; hand it one explicitly instead of relying
# on the implicit column-vector conversion (whose warning is silenced by the
# global warnings filter set in the import cell).
lr.fit(xtrain, ytrain.to_numpy().ravel())
ypred = lr.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     19211
           1       0.84      0.62      0.71      1786

    accuracy                           0.96     20997
   macro avg       0.90      0.80      0.84     20997
weighted avg       0.95      0.96      0.95     20997



In [5]:
# Per-class probabilities from the fitted logistic regression; these feed the
# results table assembled further down.
yprob = lr.predict_proba(X=xtest)
# Cell output: the container type of the hard predictions.
type(ypred)

numpy.ndarray

In [6]:
# Sanity checks before building the results table: every piece must describe
# the same number of test rows.

# Array / frame shapes.
print(
    f"Shape of yprob: {yprob.shape}",
    f"Shape of ytest: {ytest.shape}",
    f"Shape of races: {races.shape}",
    sep="\n",
)

# Row counts.
print(
    f"Length of ytest: {len(ytest)}",
    f"Length of ypred: {len(ypred)}",
    f"Length of races: {len(races)}",
    sep="\n",
)

# Container types (Series / DataFrame / ndarray).
print(
    f"Type of races: {type(races)}",
    f"Type of ytest: {type(ytest)}",
    f"Type of ypred: {type(ypred)}",
    sep="\n",
)



Shape of yprob: (20997, 2)
Shape of ytest: (20997, 1)
Shape of races: (20997,)
Length of ytest: 20997
Length of ypred: 20997
Length of races: 20997
Type of races: <class 'pandas.core.series.Series'>
Type of ytest: <class 'pandas.core.frame.DataFrame'>
Type of ypred: <class 'numpy.ndarray'>


In [7]:
# Assemble one row per test sample: the sensitive attribute, the true label,
# the logistic-regression prediction and both class probabilities.
results_df = pd.DataFrame({
    # Sensitive attribute that was held out of the feature matrix. Converting
    # the Series to a bare array discards its pandas index, so rows are matched
    # with the other columns purely by position.
    'race': races.to_numpy(),
    # ytest is a single-column DataFrame; flatten it to 1-D.
    'True_Label': ytest.to_numpy().ravel(),
    'Predicted_Label': ypred,

    # predict_proba columns are ordered like lr.classes_ (labels 0 and 1 here).
    # Probability of class 0 (negative class)
    'Prob_Class_0': yprob[:, 0],
    # Probability of class 1 (positive class)
    'Prob_Class_1': yprob[:, 1]
})

print("✅ Results DataFrame Head:")
print(results_df.head())

print("-" * 30)

# --- Save the per-sample results to CSV ---
# The name follows the same 'balanced_<MODEL>' pattern as the saved model below
# and the random-forest export ('balanced_RF').
file_name = 'results/balanced_LR.csv'
results_df.to_csv(file_name, index=False)

print(f"✅ Data successfully saved to '{file_name}'")

# --- Save the fitted model ---
filename = 'models/balanced_LR.joblib'
joblib.dump(lr, filename)
print(f"✅ Model successfully saved to {filename}")

✅ Results DataFrame Head:
        race  True_Label  Predicted_Label  Prob_Class_0  Prob_Class_1
0   Hispanic           0                0      0.999447      0.000553
1  Caucasian           0                0      0.999753      0.000247
2   Hispanic           0                0      0.999938      0.000062
3      Other           0                0      0.999833      0.000167
4  Caucasian           0                0      0.998908      0.001092
------------------------------
✅ Data successfully saved to 'results/balaned_LR.csv'
✅ Model successfully saved to models/balanced_LR.joblib


## Random Forest

In [8]:
# Fit a random forest on the scaled training features and report per-class
# metrics on the test split.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the metrics below and the saved model are the same on every
# Restart & Run All.
rf = RandomForestClassifier(random_state=42)
# ytrain was loaded with pd.read_csv, so it is a single-column DataFrame.
# scikit-learn expects a 1-D target; pass one explicitly instead of relying on
# the implicit column-vector conversion (whose warning is silenced by the
# global warnings filter set in the import cell).
rf.fit(xtrain, ytrain.to_numpy().ravel())
ypred = rf.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     19211
           1       0.93      0.69      0.79      1786

    accuracy                           0.97     20997
   macro avg       0.95      0.84      0.89     20997
weighted avg       0.97      0.97      0.97     20997



In [9]:
# Per-class probabilities from the fitted random forest; these feed the
# results table assembled further down.
yprob = rf.predict_proba(X=xtest)
# Cell output: the container type of the hard predictions.
type(ypred)

numpy.ndarray

In [10]:
# Sanity checks before building the random-forest results table: every piece
# must describe the same number of test rows.

# Array / frame shapes.
print(
    f"Shape of yprob: {yprob.shape}",
    f"Shape of ytest: {ytest.shape}",
    f"Shape of races: {races.shape}",
    sep="\n",
)

# Row counts.
print(
    f"Length of ytest: {len(ytest)}",
    f"Length of ypred: {len(ypred)}",
    f"Length of races: {len(races)}",
    sep="\n",
)

# Container types (Series / DataFrame / ndarray).
print(
    f"Type of races: {type(races)}",
    f"Type of ytest: {type(ytest)}",
    f"Type of ypred: {type(ypred)}",
    sep="\n",
)



Shape of yprob: (20997, 2)
Shape of ytest: (20997, 1)
Shape of races: (20997,)
Length of ytest: 20997
Length of ypred: 20997
Length of races: 20997
Type of races: <class 'pandas.core.series.Series'>
Type of ytest: <class 'pandas.core.frame.DataFrame'>
Type of ypred: <class 'numpy.ndarray'>


In [11]:
# One row per test sample: sensitive attribute, true label, random-forest
# prediction and the two class-probability columns (in that column order).
results_df = pd.DataFrame(dict(
    # Sensitive attribute kept out of the features; the index is reset and the
    # values taken as a flat array so rows line up by position.
    race=races.reset_index(drop=True).values.ravel(),
    # ytest is a single-column DataFrame, hence the flatten.
    True_Label=ytest.values.flatten(),
    Predicted_Label=ypred,
    # First probability column (class 0, the negative class).
    Prob_Class_0=yprob[:, 0],
    # Second probability column (class 1, the positive class).
    Prob_Class_1=yprob[:, 1],
))

# Preview the table, followed by a separator rule.
print(
    "✅ Results DataFrame Head:",
    results_df.head(),
    "-" * 30,
    sep="\n",
)

# --- Write the per-sample results to CSV ---
file_name = 'results/balanced_RF.csv'
results_df.to_csv(file_name, index=False)
print(f"✅ Data successfully saved to '{file_name}'")

# --- Serialise the fitted random forest ---
filename = 'models/balanced_RF.joblib'
joblib.dump(rf, filename)
print(f"✅ Model successfully saved to {filename}")

✅ Results DataFrame Head:
        race  True_Label  Predicted_Label  Prob_Class_0  Prob_Class_1
0   Hispanic           0                0           1.0           0.0
1  Caucasian           0                0           1.0           0.0
2   Hispanic           0                0           1.0           0.0
3      Other           0                0           1.0           0.0
4  Caucasian           0                0           1.0           0.0
------------------------------
✅ Data successfully saved to 'results/balanced_RF.csv'
✅ Model successfully saved to models/balanced_RF.joblib
