In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.utils import resample

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime. force_remount=True requests a
# fresh mount even when the drive is already mounted.
GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

# Read the CSV from the mounted drive into the working DataFrame.
DATA_CSV_PATH = '/content/gdrive/MyDrive/data.csv'
df = pd.read_csv(DATA_CSV_PATH)

Mounted at /content/gdrive/


In [None]:
def calculate_probabilities(test_data, test_labels, sensitive_values, predicted_labels,
                            sensitive_column='gender'):
    """Return the positive-prediction rate of each sensitive group.

    For every value in ``sensitive_values`` this computes
    P(predicted label == 1 | sensitive_column == value) over ``test_data``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Rows to evaluate; must contain ``sensitive_column``.
    test_labels : object
        Unused. Kept in the signature so existing callers that pass the true
        labels keep working.
    sensitive_values : iterable
        Group values to report, in the order the rates are returned.
    predicted_labels : pandas.Series or array-like
        Predicted labels for the rows of ``test_data``; the value ``1`` is
        counted as a positive prediction.
    sensitive_column : str, default 'gender'
        Name of the column holding the sensitive attribute.

    Returns
    -------
    list of float
        One rate per entry of ``sensitive_values``. A group with no rows
        yields ``nan`` instead of raising ``ZeroDivisionError``.
    """
    group_column = test_data[sensitive_column]
    # Loop-invariant: which rows received a positive prediction.
    predicted_positive = predicted_labels == 1

    results = []
    for value in sensitive_values:
        in_group = group_column == value
        group_size = int(in_group.sum())
        if group_size == 0:
            # The rate is undefined for an absent group.
            results.append(float('nan'))
            continue
        positives_in_group = int((in_group & predicted_positive).sum())
        results.append(positives_in_group / group_size)
    return results

def zemel_fairness(test_data, predicted_labels):
    """Difference in positive-prediction rate between gender 0 and gender 1.

    Delegates to ``calculate_probabilities`` with the sensitive values
    ``[0, 1]`` and returns rate(gender == 0) - rate(gender == 1), so a
    negative result means gender 1 receives positive predictions more often.

    ``test_data`` must contain an ``'incom'`` column; it is forwarded as the
    labels argument of ``calculate_probabilities``.
    """
    group_rates = calculate_probabilities(test_data, test_data['incom'], [0, 1], predicted_labels)
    rate_gender_0, rate_gender_1 = group_rates
    return rate_gender_0 - rate_gender_1

def disparate_impact(test_data, predicted_labels):
    """Ratio of positive-prediction rates: rate(gender == 1) / rate(gender == 0).

    Delegates to ``calculate_probabilities`` with the sensitive values
    ``[0, 1]``. A value of 1.0 means both groups receive positive predictions
    at the same rate; values above 1.0 mean gender 1 receives them more often.

    ``test_data`` must contain an ``'incom'`` column; it is forwarded as the
    labels argument of ``calculate_probabilities``.

    Returns
    -------
    float
        The ratio. When the gender-0 rate is zero the ratio is undefined, so
        ``inf`` is returned if gender 1 has any positive predictions and
        ``nan`` otherwise, instead of raising ``ZeroDivisionError``.
    """
    rate_gender_0, rate_gender_1 = calculate_probabilities(
        test_data, test_data['incom'], [0, 1], predicted_labels
    )
    if rate_gender_0 == 0:
        # Guard the division: no positive predictions at all for gender 0.
        return float('inf') if rate_gender_1 > 0 else float('nan')
    return rate_gender_1 / rate_gender_0

In [None]:
# Preview the first five rows to check the loaded columns and their encodings.
df.head(n=5)

Unnamed: 0,age,fnlwgt,educational-num,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,...,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife
0,25,226802,7,0,1,0,0,40,0,0,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,1,1,0,0,50,0,0,...,False,False,False,False,True,False,False,False,False,False
2,28,336951,12,1,1,0,0,40,0,1,...,True,False,False,False,True,False,False,False,False,False
3,44,160323,10,0,1,7688,0,40,0,1,...,False,False,False,False,True,False,False,False,False,False
4,18,103497,10,1,0,0,0,30,0,0,...,False,False,False,False,False,False,False,True,False,False


In [None]:
# Encode the sensitive attribute BEFORE building the feature matrix, so that X
# actually contains the encoded values. (Previously X was extracted first, so
# the encoding only touched df and never reached the model's features.)
le_gender = LabelEncoder()
df['gender'] = le_gender.fit_transform(df['gender'])

# Features are every column except the target.
# NOTE(review): df.head() shows an 'Unnamed: 0' column, which looks like a saved
# row index; it is currently fed to the model as a feature - confirm whether it
# should be dropped.
X = df.drop(columns=['income'])
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# probability=True enables predict_proba, which the relabelling step below
# uses to rank candidate rows.
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')



Accuracy: 0.8042039172865625


In [None]:
# Evaluation frame for the baseline model: the held-out features plus the true
# label ('incom') and the model's prediction ('Predicted_incom').
df2 = X_test.assign(incom=y_test, Predicted_incom=y_pred)

# Compute metrics
baseline_predictions = df2['Predicted_incom']
zemel_fairness_value = zemel_fairness(df2, baseline_predictions)
disparate_impact_value = disparate_impact(df2, baseline_predictions)

print(f'Zemel Fairness: {zemel_fairness_value}')
print(f'Disparate Impact: {disparate_impact_value}')

Zemel Fairness: -0.02844889942275316
Disparate Impact: 2.3269770298055343


In [None]:
# Massaging (relabelling) setup: score every row with the baseline model, pick
# the candidate rows whose labels may be flipped, and compute how many flips
# are needed.
probabilities = svm_model.predict_proba(X)
# Column of predict_proba that corresponds to the positive class (income == 1).
positive_class_col = list(svm_model.classes_).index(1)

df3 = df.copy()
df3['Predicted_C'] = svm_model.predict(X)
# Ranking score = P(income == 1). The column keeps its historical name because
# the following cells sort on, and later drop, 'Max_Probability'. Ranking by the
# maximum over both classes would order rows by model confidence instead of by
# closeness to the positive class.
df3['Max_Probability'] = probabilities[:, positive_class_col]

# gender == 0 is treated as the deprived group and gender == 1 as the favoured
# one (the reported metrics show the lower positive-prediction rate for gender 0).
# Promotion candidates: deprived rows currently labelled negative.
cp = df3[(df3['gender'] == 0) & (df3['income'] == 0)].copy()
# Demotion candidates: favoured rows currently labelled positive.
cd = df3[(df3['gender'] == 1) & (df3['income'] == 1)].copy()

Ss = len(df3[df3['gender'] == 0])
S_s = len(df3[df3['gender'] == 1])
Ss_plus = len(df3[(df3['gender'] == 0) & (df3['Predicted_C'] == 1)])
S_s_plus = len(df3[(df3['gender'] == 1) & (df3['Predicted_C'] == 1)])

# Number of label pairs to flip so both groups end up with the same positive
# rate: (rate_favoured - rate_deprived) * |deprived| * |favoured| / N.
# NOTE(review): the rates here come from the model's predictions (Predicted_C),
# not from the dataset labels - confirm this is the intended variant.
n = (Ss * S_s_plus - S_s * Ss_plus) / (Ss + S_s)
n = int(np.round(n))
# Clamp so the next cells never receive a negative count (head(-k) would keep
# almost everything) or unequal-length candidate sets (the label swap needs
# exactly as many promotions as demotions).
n = max(0, min(n, len(cp), len(cd)))

In [None]:
# Keep the first n candidates of each set: cp ranked from highest to lowest
# 'Max_Probability', cd ranked from lowest to highest.
cp_sorted = cp.sort_values(by='Max_Probability', ascending=False).iloc[:n]
cd_sorted = cd.sort_values(by='Max_Probability').iloc[:n]
cp_indices, cd_indices = cp_sorted.index, cd_sorted.index


In [None]:
# Work on a copy so df3 keeps the original labels.
df33 = df3.copy()

# Exchange the 'income' labels of the two selected row sets. Both sides are
# read before either is written, and values are assigned positionally.
cp_rows = cp_sorted.index
cd_rows = cd_sorted.index
temp_cp_income = df33.loc[cp_rows, 'income'].copy()
temp_cd_income = df33.loc[cd_rows, 'income'].to_numpy()
df33.loc[cp_rows, 'income'] = temp_cd_income
df33.loc[cd_rows, 'income'] = temp_cp_income.to_numpy()

In [None]:


# Remove the helper columns so they are not used as features.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df33 = df33.drop(columns=['Predicted_C', 'Max_Probability'], errors='ignore')
X_new = df33.drop(columns=['income'])
y_new = df33['income']

# NOTE(review): the baseline split uses test_size=0.3 while this one uses 0.2,
# so the two test sets differ - confirm the metrics are meant to be compared.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.2, random_state=42
)

# probability=True is not needed here: only hard predictions are used below.
svm_model_new = SVC()
svm_model_new.fit(X_train_new, y_train_new)


In [None]:
# Predict once on the held-out rows and score those predictions against the
# (relabelled) test labels.
y_pred = svm_model_new.predict(X_test_new)
accuracy_new = accuracy_score(y_test_new, y_pred)
print(f'New Model Accuracy: {accuracy_new}')


New Model Accuracy: 0.7843177397891289


In [None]:
# Evaluation frame for the retrained model: held-out features plus the test
# label ('incom') and the model's prediction ('Predicted_incom').
df4 = X_test_new.assign(incom=y_test_new, Predicted_incom=y_pred)

retrained_predictions = df4['Predicted_incom']
zemel_fairness_value = zemel_fairness(df4, retrained_predictions)
disparate_impact_value = disparate_impact(df4, retrained_predictions)

print(f'Zemel Fairness: {zemel_fairness_value}')
print(f'Disparate Impact: {disparate_impact_value}')

Zemel Fairness: -0.018751427791038614
Disparate Impact: 2.035460199037353


In [None]:
# Split the rows by gender value; the printed counts show which group is larger.
gender_values = df['gender']
df_majority = df.loc[gender_values == 1]
df_minority = df.loc[gender_values == 0]
print(len(df_majority), len(df_minority))

32650 16192


In [None]:
# Upsample the smaller gender group (sampling with replacement) until it has
# as many rows as the larger one, then train a fresh SVC on the balanced data.
# NOTE(review): upsampling happens before the train/test split, so copies of
# the same minority row can land in both sets - confirm this leakage is
# acceptable for the reported accuracy.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

X_upsampled = df_upsampled.drop(columns=['income'])
y_upsampled = df_upsampled['income']
X_train, X_test, y_train, y_test = train_test_split(
    X_upsampled, y_upsampled, test_size=0.2, random_state=42
)

svm_model = SVC()
svm_model.fit(X_train, y_train)
# Predict with the model trained in this cell. The previous code called
# svm_model_new (the relabelled-data model) here, so the fairness metrics
# computed from y_pred described the wrong model.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy after Resampling: {accuracy}')

Model Accuracy after Resampling: 0.82687595712098


In [None]:
# Evaluation frame for the current X_test / y_test / y_pred: held-out features
# plus the test label ('incom') and the prediction ('Predicted_incom').
df4 = X_test.assign(incom=y_test, Predicted_incom=y_pred)

current_predictions = df4['Predicted_incom']
zemel_fairness_value = zemel_fairness(df4, current_predictions)
disparate_impact_value = disparate_impact(df4, current_predictions)

print(f'Zemel Fairness: {zemel_fairness_value}')
print(f'Disparate Impact: {disparate_impact_value}')

Zemel Fairness: -0.014339098490655278
Disparate Impact: 1.74449509782434
