In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import lime
from lime.lime_tabular import LimeTabularExplainer

# Load the data
# Replace with your actual file
df = pd.read_csv('datasets/bias_detection_in_hiring.csv')

# Feature Engineering
# Coerce both experience columns to numeric in one pass; entries that cannot be
# parsed become NaN (errors='coerce').
# NOTE(review): nothing in this cell imputes those NaNs -- confirm the columns
# parse cleanly or that missing values are handled before model fitting.
experience_columns = ['YearsCode', 'YearsCodePro']
df[experience_columns] = df[experience_columns].apply(pd.to_numeric, errors='coerce')

# Binary target: 1 where Employed == 1, otherwise 0 (missing values compare
# False and therefore map to 0). Vectorized comparison instead of a row-wise
# Python lambda; the explicit int64 cast keeps the dtype the lambda produced.
df['Hired'] = df['Employed'].eq(1).astype('int64')

# Separate features from the target. 'Employed' is dropped together with
# 'Hired' because the target is derived directly from it.
X = df.drop(columns=['Hired', 'Employed'])
y = df['Hired']

# Split the dataset: 80/20 hold-out, stratified on the target so both splits
# keep the same class balance; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column groups, chosen purely by dtype of the feature frame.
# NOTE(review): only 'object', 'int64' and 'float64' columns are selected; with
# ColumnTransformer left at its default `remainder`, a column of any other
# dtype would not reach the model -- confirm that is intended.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Per-group transformers: standardize numeric columns, one-hot encode
# categorical ones (categories unseen during fit are ignored at predict time).
# NOTE(review): no imputation step is present -- verify the numeric columns
# contain no NaN before fitting.
numeric_scaler = StandardScaler()
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_scaler, numerical_features),
    ('cat', categorical_encoder, categorical_features),
])

# Full model: preprocessing followed by a seeded logistic regression with a
# raised iteration cap.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', log_reg),
])

# Fit preprocessing and classifier together on the training split.
pipeline.fit(X_train, y_train)

# Hold-out predictions: hard labels plus the probability of the positive
# class (column 1 of predict_proba).
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Compute the evaluation metrics on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

# Report the metrics, then the solver's iteration count (useful for checking
# it against the max_iter cap set on the classifier).
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)
print("AUC-ROC Score:", test_auc)
fitted_classifier = pipeline.named_steps['classifier']
print(fitted_classifier.n_iter_)


Accuracy: 0.790988906281903
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.78      0.78      6814
           1       0.81      0.80      0.80      7879

    accuracy                           0.79     14693
   macro avg       0.79      0.79      0.79     14693
weighted avg       0.79      0.79      0.79     14693

AUC-ROC Score: 0.8794484139382448
[227]


In [10]:
import joblib

# Persist the whole pipeline object (preprocessing + classifier) with joblib.
# The path is relative, so the file lands one directory above the current
# working directory.
model_path = '../hiring_model_pipeline.pkl'
joblib.dump(pipeline, model_path)
print("Pipeline saved successfully!")


Pipeline saved successfully!
