In [98]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier

In [155]:
def preprocess(dfr,target):
  df=dfr
  cols = list(df.columns)
  cols.remove(target)
  num_cols = list(df.drop(columns=[target])._get_numeric_data().columns)
  cat_cols=list(set(cols) - set(num_cols))
  #return(num_cols,cat_cols)

  for column in num_cols:
    if len(list(df[column].unique()))<10:
      df[column]=df[column].astype('str')
      df[column].fillna('missing',inplace=True)

    else:
      df[column].fillna(np.mean(df[column]),inplace=True)

  for column in cat_cols:
    df[column].fillna('missing',inplace=True)
    if len(list(df[column].unique()))>10:
      df=df.drop(columns=[column])


  

  return(df)

    



 




In [157]:
def modelling(dfr,target,model,test_size):
  df=dfr
  cols = list(df.columns)
  cols.remove(target)
  num_cols = list(df.drop(columns=[target])._get_numeric_data().columns)
  cat_cols=list(set(cols) - set(num_cols))
  if model =='xgb' or model=='dt' or model=='rf' or model=='lgbm' or model=='catboost':
    for column in cat_cols:
      encoder=LabelEncoder()

      df[column] = (encoder.fit_transform(df[column]))

    y=df[target]
    df_train,df_test=train_test_split(df,test_size=test_size,stratify=y) #stratified sampling
    X,Y=df_train.drop(columns=[target]),df_train[target]
    #print(X)

    if model=='dt':
      clf = tree.DecisionTreeClassifier()
      clf = clf.fit(X, Y)
      df_test['prediction']=clf.predict(df_test.drop(columns=[target]))
      data_crosstab = pd.crosstab(df_test[target],
                          df_test['prediction'], 
                              margins = False)
      
      #return(data_crosstab)
      return(df_test)

    if model=='xgb':
      params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

      xgb = XGBClassifier(learning_rate=0.08, n_estimators=500, objective='binary:logistic',
                    silent=True, nthread=1)
      folds = 3
      param_comb = len(params)

      skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)

      random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='accuracy', n_jobs=4, cv=skf.split(X,Y), random_state=42 )

      random_search.fit(X,Y)

      df_test['prediction']=random_search.predict(df_test.drop(columns=[target]))
      data_crosstab_xgb = pd.crosstab(df_test[target],
                          df_test['prediction'], 
                              margins = False)
      
      #return(data_crosstab_xgb)
      return(df_test)

      


  elif model=='logreg' or model=='svm':
    df=pd.get_dummies(df,columns=cat_cols,drop_first=True)
    y=df[target]
    df_train,df_test=train_test_split(df,test_size=test_size,stratify=y) #stratified sampling
    X,Y=df_train.drop(columns=[target]),df_train[target]

    if model=='logreg':
      clf=LogisticRegression()
      clf.fit(X,Y)
      df_test['prediction']=clf.predict(df_test.drop(columns=[target]))
      data_crosstab = pd.crosstab(df_test[target],
                          df_test['prediction'], 
                              margins = False)
      
      #return(data_crosstab)
      return(df_test)


    

  #return(df)
  







In [158]:
def prop_parity(cross,tp,fp,tn,fn):
  val=(tp+fp)/(tp+fp+tn+fn)
  return(val)

In [159]:
def compare_groups(dataframe,column,metric,actual,preds,pos,neg):
  dff=pd.DataFrame()
  levells=[]
  levels=dataframe[column].unique()
  vals=[]
  for level in levels:
    print(level)
    try:
      sub=dataframe[dataframe[column]==level]
      data_crosstab = pd.crosstab(sub[actual],
                              sub[preds], 
                                margins = False)
      print(data_crosstab)
      tp=data_crosstab[pos][pos]
      tn=data_crosstab[neg][neg]
      fp=data_crosstab[pos][neg]
      fn=data_crosstab[neg][pos]
      vals.append(metric(data_crosstab,tp,tn,fp,fn))
      levells.append(level)
    except:
      pass
  dff['level'],dff['metric']=levells,vals
    
  return(dff)

In [160]:
def threshold_eval(tdat,threshold):
  tdat=tdat.sort_values('metric',ascending=False).reset_index(drop=True)
  comp=tdat.loc[0,'metric']
  tdat['difference']=(comp/tdat['metric'])-1
  sub=tdat[tdat['difference']>threshold]
  return(tdat)

In [147]:
dat=preprocess(data,'Attrition')

In [148]:
dataf=model(dat,'Attrition','dt',0.1)

In [None]:
tdata=compare_groups(dataf,'Department',prop_parity,'Attrition','prediction','Yes','No')

In [None]:
tdata

In [None]:
threshold_eval(tdata,0.2)

In [166]:
def build_model_check_fairness(dfr,target,model,test_size,column,metric,pos,neg,threshold):
  dat=preprocess(dfr,target)
  dataf=modelling(dat,target,model,test_size)
  tdata=compare_groups(dataf,column,metric,target,'prediction',pos,neg)
  sum_dat=threshold_eval(tdata,threshold)
  return(sum_dat)






In [170]:
build_model_check_fairness(data,'Attrition','dt',0.1,'Department',prop_parity,'Yes','No',0)

1
prediction  No  Yes
Attrition          
No          68    7
Yes          7    4
2
prediction  No  Yes
Attrition          
No          37    5
Yes          6    5
0
prediction  No  Yes
Attrition          
No           5    1
Yes          1    1


Unnamed: 0,level,metric,difference
0,1,0.837209,0.0
1,2,0.792453,0.056478
2,0,0.75,0.116279


In [165]:
data=pd.read_csv('/content/HR-Employee-Attrition.csv')

In [96]:
x=data[['Age','Fare']]
y=data['Sex']