In [18]:
from sklearn.discriminant_analysis import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from fairlearn.metrics import MetricFrame, demographic_parity_difference, equalized_odds_difference
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score



In [2]:
df = pd.read_csv('Korea Income and Welfare.csv')

In [3]:
for col in df.columns:
    # Replace invalid or NaN values with the column mean
    if df[col].dtype in [np.int64, np.float64]:  # Ensure the column is numeric
        df[col] = df[col].fillna(df[col].mean())

In [4]:
avg_income = df['income'].mean()


In [5]:
avg_income

3441.1223268686776

In [6]:
df['above_avg_income'] = df['income'] >= avg_income


In [11]:

X = df.drop(columns=["id", "income", 'above_avg_income'])  # drop non-predictive columns
y = df['above_avg_income'].astype(int)  # binary classification (above average income)


X = pd.get_dummies(X, drop_first=False)


# Initialize the StandardScaler
scaler = StandardScaler()
# Fit and transform the features
X_scaled = scaler.fit_transform(X)

# Now X_scaled contains the scaled features
# You can now proceed with splitting the data into train/test and fitting your models

# Example: splitting the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
# One-hot encode categorical variables

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train the logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.3f}")


Overall Accuracy: 0.840


In [14]:
[col for col in X_test.columns]

['year',
 'wave',
 'region',
 'family_member',
 'gender',
 'year_born',
 'education_level',
 'marriage',
 'religion',
 'occupation_ ',
 'occupation_1001',
 'occupation_1002',
 'occupation_1009',
 'occupation_1011',
 'occupation_1012',
 'occupation_111',
 'occupation_112',
 'occupation_113',
 'occupation_12',
 'occupation_120',
 'occupation_121',
 'occupation_122',
 'occupation_131',
 'occupation_132',
 'occupation_133',
 'occupation_134',
 'occupation_135',
 'occupation_139',
 'occupation_141',
 'occupation_142',
 'occupation_143',
 'occupation_144',
 'occupation_145',
 'occupation_149',
 'occupation_151',
 'occupation_152',
 'occupation_153',
 'occupation_154',
 'occupation_156',
 'occupation_157',
 'occupation_159',
 'occupation_161',
 'occupation_162',
 'occupation_163',
 'occupation_164',
 'occupation_165',
 'occupation_171',
 'occupation_172',
 'occupation_173',
 'occupation_182',
 'occupation_183',
 'occupation_184',
 'occupation_21',
 'occupation_211',
 'occupation_212',
 'occup

In [15]:
# Filter male and female entries
X_test_male = X_test[X_test['gender'] == 1]
y_test_male = y_test[X_test['gender'] == 1]


X_test_female = X_test[X_test['gender'] == 2]
y_test_female = y_test[X_test['gender'] == 2]

# Get accuracy for each gender group
y_pred_male = model.predict(X_test_male)
accuracy_male = accuracy_score(y_test_male, y_pred_male)
print(f"Accuracy for Male: {accuracy_male:.3f}")

y_pred_female = model.predict(X_test_female)
accuracy_female = accuracy_score(y_test_female, y_pred_female)
print(f"Accuracy for Female: {accuracy_female:.3f}")


Accuracy for Male: 0.809
Accuracy for Female: 0.913


In [16]:

y_pred_male_proba = model.predict_proba(X_test_male)[:, 1]
roc_auc_male = roc_auc_score(y_test_male, y_pred_male_proba)
print(f"ROC AUC for Male: {roc_auc_male:.3f}")

y_pred_female_proba = model.predict_proba(X_test_female)[:, 1]
roc_auc_female = roc_auc_score(y_test_female, y_pred_female_proba)
print(f"ROC AUC for Female: {roc_auc_female:.3f}")


ROC AUC for Male: 0.893
ROC AUC for Female: 0.901


In [17]:

# For Male
y_pred_male = model.predict(X_test_male)
precision_male = precision_score(y_test_male, y_pred_male)
recall_male = recall_score(y_test_male, y_pred_male)
f1_male = f1_score(y_test_male, y_pred_male)

# For Female
y_pred_female = model.predict(X_test_female)
precision_female = precision_score(y_test_female, y_pred_female)
recall_female = recall_score(y_test_female, y_pred_female)
f1_female = f1_score(y_test_female, y_pred_female)

print(f"Male Precision: {precision_male:.3f}, Recall: {recall_male:.3f}, F1-Score: {f1_male:.3f}")
print(f"Female Precision: {precision_female:.3f}, Recall: {recall_female:.3f}, F1-Score: {f1_female:.3f}")


Male Precision: 0.808, Recall: 0.795, F1-Score: 0.801
Female Precision: 0.637, Recall: 0.359, F1-Score: 0.459


In [15]:
X_test.columns

Index(['year', 'wave', 'region', 'family_member', 'gender', 'year_born',
       'education_level', 'marriage', 'religion', 'occupation_ ',
       ...
       'reason_none_worker_11', 'reason_none_worker_2', 'reason_none_worker_3',
       'reason_none_worker_4', 'reason_none_worker_5', 'reason_none_worker_6',
       'reason_none_worker_7', 'reason_none_worker_8', 'reason_none_worker_9',
       'reason_none_worker_99'],
      dtype='object', length=280)

In [34]:
equalized_odds_difference(
    y_true = y_test,
    y_pred =y_pred,
    sensitive_features= X_test['gender']
)

0.43548361483648074

In [36]:
demographic_parity_difference(
    y_true = y_test,
    y_pred = y_pred,
    sensitive_features= X_test['gender']
)

0.4188141657966568