In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


data = pd.read_csv("job_applicant_data.csv")

In [29]:
data

Unnamed: 0.1,Unnamed: 0,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills,Employed
0,0,<35,No,Master,1,Man,No,Dev,7,4,Sweden,51552.0,C++;Python;Git;PostgreSQL,4,0
1,1,<35,No,Undergraduate,1,Man,No,Dev,12,5,Spain,46482.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,12,1
2,2,<35,No,Master,1,Man,No,Dev,15,6,Germany,77290.0,C;C++;Java;Perl;Ruby;Git;Ruby on Rails,7,0
3,3,<35,No,Undergraduate,1,Man,No,Dev,9,6,Canada,46135.0,Bash/Shell;HTML/CSS;JavaScript;PHP;Ruby;SQL;Gi...,13,0
4,4,>35,No,PhD,0,Man,No,NotDev,40,30,Singapore,160932.0,C++;Python,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73457,73457,<35,No,Undergraduate,1,Man,No,Dev,7,2,Germany,41058.0,C#;HTML/CSS;JavaScript;TypeScript;Docker;Kuber...,13,1
73458,73458,>35,No,Undergraduate,1,Man,No,Dev,21,16,United States of America,115000.0,C#;HTML/CSS;Java;JavaScript;npm;ASP.NET Core ;...,11,1
73459,73459,<35,No,Undergraduate,1,Man,No,Dev,4,3,Nigeria,57720.0,HTML/CSS;JavaScript;TypeScript;Docker;Express;...,12,1
73460,73460,<35,Yes,Undergraduate,1,Man,Yes,Dev,5,1,United States of America,70000.0,C#;HTML/CSS;JavaScript;SQL;TypeScript;npm;Yarn...,15,1


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
data = pd.read_csv('job_applicant_data.csv')

# Drop irrelevant columns
data = data.drop(columns=['Unnamed: 0'])

# Create a new feature based on number of skills
data['NumTechWorkedWith'] = data['HaveWorkedWith'].str.count(';') + 1

# Drop HaveWorkedWith
data = data.drop(columns=['HaveWorkedWith'])


# Encode categorical variables
label_encoders = {}
for column in ['Age', 'Accessibility', 'EdLevel', 'Gender', 'MentalHealth', 'MainBranch', 'Country']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))
    label_encoders[column] = le


# Split the data into features and target
X = data.drop(columns=['Employed'])
y = data['Employed']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define hyperparameter grid
param_dist = {
    'n_estimators': [100, 250, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2'],
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=param_dist,
    n_iter=10,          # Run 10 iterations of the randomized search
    cv=5,               # Cross validation split of 5
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,          # Use all processors
)

# Fit the model
random_search.fit(X_train, y_train)

# Print the best parameters and corresponding cross validation accuracy
print("Best Hyperparameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)


Best Hyperparameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}
Best Cross-Validation Accuracy: 0.7854651210011133


In [26]:
y_pred = random_search.best_estimator_.predict(X_test)

In [27]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [28]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2 * precision * recall / (precision + recall)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")


Precision Score: 0.7870
Recall Score: 0.8285
F1 Score: 0.8072
Mean Squared Error (MSE): 0.2119
Mean Absolute Error (MAE): 0.2119


In [29]:
# Treating Age as the sensitive attribute
unique_Age_groups = data['Age'].unique()

from collections import defaultdict
Age_counts = defaultdict(int)  
positive_label_counts = defaultdict(int) 

X

for Age, lbl in zip(X_test['Age'], y_test):  
    Age_counts[Age] += 1 
    if lbl == 1:
        positive_label_counts[Age] += 1  

# DEMOGRAPHIC PARITY / ACCURACY PARITY??? Slides have the same formula for both
demographic_parity = []
for i in unique_Age_groups:
    demographic_parity.append(positive_label_counts[i]/Age_counts[i])
    print(f"Demographic parity value of Age Group {i}: {demographic_parity}")
print()


# p% rule
count_rule_violation = 0

for i in range(len(demographic_parity)):
    for j in range(i + 1, len(demographic_parity)):
        if demographic_parity[i] / demographic_parity[j] < 0.8:
            count_rule_violation += 1

print(f"Number of p% rule violations: {count_rule_violation}")

# PPV and NPV

negative_outcomes = defaultdict(int)
positive_outcomes = defaultdict(int)
negative_predn = defaultdict(int)
positive_predn = defaultdict(int)

for idx, label in enumerate(y_test):
    prediction = y_pred[idx]
    Age_group = X_test.iloc[idx]['Age']
    if prediction == label and label == 0:
        negative_outcomes[Age_group] += 1
    elif prediction == label and label == 1:
        positive_outcomes[Age_group] += 1
    if prediction == 0:
        negative_predn[Age_group] += 1
    elif prediction == 1:
        positive_predn[Age_group] += 1

print()

for group in positive_outcomes:
    if positive_predn[group] > 0:
        ratio = positive_outcomes[group] / positive_predn[group]
    print(f"Positive parity value of Age Group {group}: {ratio}")

print()

for group in negative_outcomes:
    if negative_predn[group] > 0:
        ratio = negative_outcomes[group] / negative_predn[group]
    print(f"Negative parity value of Age Group {group}: {ratio}")


# Equal Opportunity

positive_outcomes = defaultdict(int)
positive_outcomes_for_Age_group = defaultdict(int)

for idx, label in enumerate(y_test):
    prediction = y_pred[idx]
    Age_group = X_test.iloc[idx]['Age']
    if prediction == 1 and label == 1:
        positive_outcomes[Age_group] += 1
    if prediction == 1:
        positive_outcomes_for_Age_group[Age_group] += 1

print()
for group in positive_outcomes:
    ratio = positive_outcomes[group] / positive_outcomes_for_Age_group[group]
    print(f"Equal opportunity value of Age Group {group}: {ratio}")



Demographic parity value of Age Group 0: [0.5456635318704284]
Demographic parity value of Age Group 1: [0.5456635318704284, 0.5162990435291821]

Number of p% rule violations: 0

Positive parity value of Age Group 0.0: 0.7918725974739155
Positive parity value of Age Group 1.0: 0.7775807023767294

Negative parity value of Age Group 1.0: 0.8033854166666666
Negative parity value of Age Group 0.0: 0.781835889943998

Equal opportunity value of Age Group 0.0: 0.7918725974739155
Equal opportunity value of Age Group 1.0: 0.7775807023767294


In [30]:
# Treating Gender as the sensitive attribute
unique_Gender_groups = data['Gender'].unique()

from collections import defaultdict
Gender_counts = defaultdict(int)  
positive_label_counts = defaultdict(int) 

X

for Gender, lbl in zip(X_test['Gender'], y_test):  
    Gender_counts[Gender] += 1 
    if lbl == 1:
        positive_label_counts[Gender] += 1  

# DEMOGRAPHIC PARITY / ACCURACY PARITY??? Slides have the same formula for both
demographic_parity = []
for i in unique_Gender_groups:
    demographic_parity.append(positive_label_counts[i]/Gender_counts[i])
    print(f"Demographic parity value of Gender Group {i}: {demographic_parity}")
print()


# p% rule
count_rule_violation = 0

for i in range(len(demographic_parity)):
    for j in range(i + 1, len(demographic_parity)):
        if demographic_parity[i] / demographic_parity[j] < 0.8:
            count_rule_violation += 1

print(f"Number of p% rule violations: {count_rule_violation}")

# PPV and NPV

negative_outcomes = defaultdict(int)
positive_outcomes = defaultdict(int)
negative_predn = defaultdict(int)
positive_predn = defaultdict(int)

for idx, label in enumerate(y_test):
    prediction = y_pred[idx]
    Gender_group = X_test.iloc[idx]['Gender']
    if prediction == label and label == 0:
        negative_outcomes[Gender_group] += 1
    elif prediction == label and label == 1:
        positive_outcomes[Gender_group] += 1
    if prediction == 0:
        negative_predn[Gender_group] += 1
    elif prediction == 1:
        positive_predn[Gender_group] += 1

print()

for group in positive_outcomes:
    if positive_predn[group] > 0:
        ratio = positive_outcomes[group] / positive_predn[group]
    print(f"Positive parity value of Gender Group {group}: {ratio}")

print()

for group in negative_outcomes:
    if negative_predn[group] > 0:
        ratio = negative_outcomes[group] / negative_predn[group]
    print(f"Negative parity value of Gender Group {group}: {ratio}")


# Equal Opportunity

positive_outcomes = defaultdict(int)
positive_outcomes_for_Gender_group = defaultdict(int)

for idx, label in enumerate(y_test):
    prediction = y_pred[idx]
    Gender_group = X_test.iloc[idx]['Gender']
    if prediction == 1 and label == 1:
        positive_outcomes[Gender_group] += 1
    if prediction == 1:
        positive_outcomes_for_Gender_group[Gender_group] += 1

print()
for group in positive_outcomes:
    ratio = positive_outcomes[group] / positive_outcomes_for_Gender_group[group]
    print(f"Equal opportunity value of Gender Group {group}: {ratio}")



Demographic parity value of Gender Group 0: [0.5401326820733396]
Demographic parity value of Gender Group 2: [0.5401326820733396, 0.47017045454545453]
Demographic parity value of Gender Group 1: [0.5401326820733396, 0.47017045454545453, 0.46691176470588236]

Number of p% rule violations: 0

Positive parity value of Gender Group 0.0: 0.78879917980264
Positive parity value of Gender Group 2.0: 0.7822085889570553
Positive parity value of Gender Group 1.0: 0.7058823529411765

Negative parity value of Gender Group 0.0: 0.7879607710517417
Negative parity value of Gender Group 2.0: 0.798941798941799
Negative parity value of Gender Group 1.0: 0.8403361344537815

Equal opportunity value of Gender Group 0.0: 0.78879917980264
Equal opportunity value of Gender Group 2.0: 0.7822085889570553
Equal opportunity value of Gender Group 1.0: 0.7058823529411765


In [31]:
# Treating MentalHealth as the sensitive attribute
unique_MentalHealth_groups = data['MentalHealth'].unique()

from collections import defaultdict
MentalHealth_counts = defaultdict(int)  
positive_label_counts = defaultdict(int) 

X

for MentalHealth, lbl in zip(X_test['MentalHealth'], y_test):  
    MentalHealth_counts[MentalHealth] += 1 
    if lbl == 1:
        positive_label_counts[MentalHealth] += 1  

# DEMOGRAPHIC PARITY / ACCURACY PARITY??? Slides have the same formula for both
demographic_parity = []
for i in unique_MentalHealth_groups:
    demographic_parity.append(positive_label_counts[i]/MentalHealth_counts[i])
    print(f"Demographic parity value of MentalHealth Group {i}: {demographic_parity}")
print()


# p% rule
count_rule_violation = 0

for i in range(len(demographic_parity)):
    for j in range(i + 1, len(demographic_parity)):
        if demographic_parity[i] / demographic_parity[j] < 0.8:
            count_rule_violation += 1

print(f"Number of p% rule violations: {count_rule_violation}")

# PPV and NPV

negative_outcomes = defaultdict(int)
positive_outcomes = defaultdict(int)
negative_predn = defaultdict(int)
positive_predn = defaultdict(int)

for idx, label in enumerate(y_test):
    prediction = y_pred[idx]
    MentalHealth_group = X_test.iloc[idx]['MentalHealth']
    if prediction == label and label == 0:
        negative_outcomes[MentalHealth_group] += 1
    elif prediction == label and label == 1:
        positive_outcomes[MentalHealth_group] += 1
    if prediction == 0:
        negative_predn[MentalHealth_group] += 1
    elif prediction == 1:
        positive_predn[MentalHealth_group] += 1

print()

for group in positive_outcomes:
    if positive_predn[group] > 0:
        ratio = positive_outcomes[group] / positive_predn[group]
    print(f"Positive parity value of MentalHealth Group {group}: {ratio}")

print()

for group in negative_outcomes:
    if negative_predn[group] > 0:
        ratio = negative_outcomes[group] / negative_predn[group]
    print(f"Negative parity value of MentalHealth Group {group}: {ratio}")


# Equal Opportunity

positive_outcomes = defaultdict(int)
positive_outcomes_for_MentalHealth_group = defaultdict(int)

for idx, label in enumerate(y_test):
    prediction = y_pred[idx]
    MentalHealth_group = X_test.iloc[idx]['MentalHealth']
    if prediction == 1 and label == 1:
        positive_outcomes[MentalHealth_group] += 1
    if prediction == 1:
        positive_outcomes_for_MentalHealth_group[MentalHealth_group] += 1

print()
for group in positive_outcomes:
    ratio = positive_outcomes[group] / positive_outcomes_for_MentalHealth_group[group]
    print(f"Equal opportunity value of MentalHealth Group {group}: {ratio}")



Demographic parity value of MentalHealth Group 0: [0.5291897111754894]
Demographic parity value of MentalHealth Group 1: [0.5291897111754894, 0.5569351907934585]

Number of p% rule violations: 0

Positive parity value of MentalHealth Group 0.0: 0.7833570187904627
Positive parity value of MentalHealth Group 1.0: 0.7988712160082093

Negative parity value of MentalHealth Group 0.0: 0.7890470541716094
Negative parity value of MentalHealth Group 1.0: 0.7915742793791575

Equal opportunity value of MentalHealth Group 0.0: 0.7833570187904627
Equal opportunity value of MentalHealth Group 1.0: 0.7988712160082093
