## Preliminaries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot  as plt 

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

%matplotlib inline

## Prepare Data

<div class="alert alert-block alert-info"> Problem 0.1 </div>
Read the file 'fair_ml_data.csv' provided with this homework set into a panda's data frame

In [26]:
data_dir="../../data/fair_ml"


In [27]:
filename=data_dir+"/fair_ml.csv"

In [28]:
df=pd.read_csv(filename)

In [33]:
df.head()

Unnamed: 0,A,Y,X
0,0,0,-1.192736
1,0,1,-0.03314
2,1,1,-0.737864
3,0,1,0.52263
4,0,0,-0.138411


<div class="alert alert-block alert-info"> Problem 0.2 </div>
Split data in a 50% training  and 50% valuation sets

In [29]:
train, val=train_test_split(df, test_size=0.5)

In [30]:
train.shape, val.shape

((500000, 3), (500000, 3))

<div class="alert alert-block alert-info"> Problem 0.3 </div>

In this problem set you will need to fit LDA models to 
1. the majority group $a=0$ 
2. minority group $a=1$
3. to the whole data set irrespective of group affiliation. 

Prepare `numpy` arrays for training and validation so  that you can fit those three models:
1. Arrays `X0_train`,`X0_val`,`Y0_train`,`Y0_val` for members of the majority group `a=0`.
2. Arrays `X1_train`,`X1_val`,`Y1_train`,`Y1_val` for members of the minority group `a=1`.
3. Arrays `X_train`,`X_val`,`Y_train`,`Y_val` for all members irrespective of group affiliation.


### 1.

In [36]:
train0=train[train['A']==0]
val0=val[val['A']==0]

In [44]:
X0_train=train0[['X']]
X0_val=val0[['X']]
Y0_train=train0[['Y']]
Y0_val=val0[['Y']]

### 2.

In [45]:
train1=train[train['A']==1]
val1=val[val['A']==1]

X1_train=train1[['X']]
X1_val=val1[['X']]
Y1_train=train1[['Y']]
Y1_val=val1[['Y']]

### 3.

In [46]:
X_train=train[['X']]
X_val=val[['X']]
Y_train=train[['Y']]
Y_val=val[['Y']]

##  Per Group Classifiers

### Majority Group model

<div class="alert alert-block alert-info"> Problem 1.0 </div>
Fit an LDA model to the training examples where $a=1$

In [48]:
model0=LDA()
model0.fit(X0_train,Y0_train)

  y = column_or_1d(y, warn=True)


LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

<div class="alert alert-block alert-info"> Problem 1.1 </div>
Compute the model accuracy on the majority population $a=0$.

Because of finite sample (we only have N=2,000 samples) you should get results close, but not identical to what you derived on the written homework.

In [66]:
Y0_train_value=Y0_train.values.ravel()
Y0_val_value=Y0_val.values.ravel()

In [67]:
y0_pred=model0.predict(X0_val)
accu_0=np.average(y0_pred==Y0_val_value)

In [68]:
print('The accuracy of group a=0 is: ', accu_0)

The accuracy of group a=0 is:  0.843544775746429


<div class="alert alert-block alert-info"> Problem 1.2 </div>
Estimate  the rate of true positives and false negatives on the majority population $a=0$ using the validation data
you set aside before.

[HINT: The function `sklear.metrix.confusion_matrix may be useful]

In [69]:
cm0=metrics.confusion_matrix(Y0_val_value,y0_pred)
cm0

array([[195685,  29356],
       [ 33253, 141878]])

In [70]:
TN0=cm0[0,0]
TP0=cm0[1,1]
FP0=cm0[0,1]
FN0=cm0[1,0]

In [72]:
print('The rate of true positives is: ', TP0/(TP0+FN0))
print('The rate of false negative is: ', FN0/(TP0+FN0))

The rate of true positives is:  0.8101249921487343
The rate of false negative is:  0.18987500785126563


### Minority Group Model

<div class="alert alert-block alert-info"> Problem 2.0 </div>
Fit an LDA model to the training examples where $a=0$

In [73]:
model1=LDA()
model1.fit(X1_train,Y1_train)

  y = column_or_1d(y, warn=True)


LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

<div class="alert alert-block alert-info"> Problem 2.1 </div>
Compute the model accuracy on the minority population $a=1$.

In [83]:
Y1_val_value=Y1_val.values.ravel()
Y0_val_value=Y0_val.values.ravel()

In [77]:
Y1_pred=model1.predict(X1_val)
np.mean(Y1_pred==Y1_val_value)

0.9813679528789518

<div class="alert alert-block alert-info"> Problem 2.2 </div>
Estimate  the rate of true positives and false negatives on the minority population $a=1$ using the validation data
you set aside before.

In [79]:
cm1=metrics.confusion_matrix(Y1_val,Y1_pred)
cm1

array([[24073,  1005],
       [  855, 73895]])

In [80]:
TN1=cm1[0,0]
TP1=cm1[1,1]
FP1=cm1[0,1]
FN1=cm1[1,0]

In [81]:
print('The rate of true positives is: ', TP1/(TP1+FN1))
print('The rate of false negative is: ', FN1/(TP1+FN1))

The rate of true positives is:  0.988561872909699
The rate of false negative is:  0.011438127090301004


<div class="alert alert-block alert-info"> Problem 2.3 </div>
Calculate the accuracy on the overall population (including $a=0,1$) if we use different classifiers for $a=0$ and $a=1$

In [87]:
accu_blend=(np.sum(y0_pred==Y0_val_value)+np.sum(Y1_pred==Y1_val_value))/len(Y_val)

In [88]:
print('The overall population accuracy is: ', accu_blend)

The overall population accuracy is:  0.871062


## Fair (Anti-Classification) Model

<div class="alert alert-block alert-info"> Problem 3.1 </div>
Train an LDA model using the whole dataset to predict $y$ based on $x$ without making use of the $a$ attribute.

In [89]:
lda=LDA()

In [90]:
lda.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

<div class="alert alert-block alert-info"> Problem 3.1 </div>
Estimate the accuracy of the fair model

In [94]:
Y_val_value=Y_val.values.ravel()

In [95]:
y_pred_fair=lda.predict(X_val)
accu_fair=np.average(Y_val_value==y_pred_fair)

In [96]:
print('The accuracy of the fair model is: ', accu_fair)

The accuracy of the fair model is:  0.665934


<div class="alert alert-block alert-info"> Problem 3.2 </div>
Compute the accuracy the fair model when applied only to the majority population
where $a=0$

In [99]:
Y0_pred=lda.predict(X0_val)
np.average(Y0_pred==Y0_val_value)

0.8229661245664365

<div class="alert alert-block alert-info"> Problem 3.3 </div>
Compute true positive and false positive rates for the majority population  ($a=0$).

In [100]:
cm0=metrics.confusion_matrix(Y0_val,Y0_pred)
cm0

array([[171208,  53833],
       [ 17011, 158120]])

In [101]:
TN0=cm0[0,0]
TP0=cm0[1,1]
FP0=cm0[0,1]
FN0=cm0[1,0]

In [102]:
print('The rate of true positives is: ', TP0/(TP0+FN0))
print('The rate of false negative is: ', FN0/(TP0+FN0))

The rate of true positives is:  0.9028669967053234
The rate of false negative is:  0.09713300329467656


<div class="alert alert-block alert-info"> Problem 3.3 </div>
Compute the accuracy the fair model when applied only to the majority population
where $a=1$

In [104]:
Y1_pred=lda.predict(X1_val)
np.mean(Y1_pred==Y1_val_value)

0.03645269864166366

<div class="alert alert-block alert-info"> Problem 3.4 </div>
Compute true positive and false positive rates for the minority population  ($a=1$).

In [105]:
cm1=metrics.confusion_matrix(Y1_val,Y1_pred)
cm1

array([[  255, 24823],
       [71366,  3384]])

In [106]:
TN1=cm1[0,0]
TP1=cm1[1,1]
FP1=cm1[0,1]
FN1=cm1[1,0]

In [107]:
print('The rate of true positives is: ', TP1/(TP1+FN1))
print('The rate of false negative is: ', FN1/(TP1+FN1))

The rate of true positives is:  0.04527090301003345
The rate of false negative is:  0.9547290969899666
