### Getting started with feature importance and fairness using ML & SHAP


* We'll look at feature importance according to ML models. 
* We can use multiple methods - e.g. permutation importance, Shapley values, and the difference in feature importance between different models

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

import shap
import gc

from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
warnings.simplefilter(action='ignore', category=FutureWarning)


# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the three COMPAS CSV tables from the Kaggle "compass" input dataset,
# in the same order as the names on the left-hand side.
compas_scores_raw, cox_violent_parsed, cox_violent_parsed_filt = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/compass/compas-scores-raw.csv",
        "../input/compass/cox-violent-parsed.csv",
        "../input/compass/cox-violent-parsed_filt.csv",
    )
)


In [None]:
# Show the container type of the raw scores table, then preview its first rows
# (the trailing expression is rendered as the cell output).
print(type(compas_scores_raw))
compas_scores_raw.head()

In [None]:
# Name of the label column: recidivism (any) within two years.
TARGET_COL = "Two_yr_Recidivism"

## We'll start with the naive fairML subset of the data. Very simple
* **Target column**: `Two_yr_Recidivism` = recidivism (any) within 2 years
* Note that we have fewer variables and features here.


In [None]:
# Load the simplified ProPublica extract prepared for FairML.
fairml_csv_path = "../input/compass/propublicaCompassRecividism_data_fairml.csv/propublica_data_for_fairml.csv"
df = pd.read_csv(fairml_csv_path)

# Quick look: dimensions, column names, then the first rows as the cell output.
print(df.shape)
display(df.columns)
df.head()

In [None]:
# Separate the label from the predictors: every column except the target
# becomes a feature. (The next cell reassigns both names with a narrower,
# hand-picked feature set.)
y = df[TARGET_COL]
data = df.drop(columns=[TARGET_COL])

In [None]:
# Select the model inputs explicitly by column name.
#
# Alternative feature set that also includes the demographic indicator columns;
# swap it in to compare results with and without them:
#feature_columns = ['Number_of_Priors', 'score_factor','Age_Above_FourtyFive', 'Age_Below_TwentyFive', 'African_American','Asian', 'Hispanic', 'Native_American', 'Other', 'Female',       'Misdemeanor']
feature_columns = ['Number_of_Priors', 'score_factor','Age_Above_FourtyFive', 'Age_Below_TwentyFive', 'Misdemeanor']

# Convert to plain NumPy arrays for scikit-learn. The label column is taken from
# TARGET_COL so that it is defined in exactly one place.
data = df[feature_columns].values
y = df[TARGET_COL].values

### Gradient Boosting Classifier
The original code used LightGBM (`LGBMClassifier`) to classify the data.  
https://towardsdatascience.com/understanding-gradient-boosting-machines-9be756fe76ab

We can substitute other learning methods in place of this. I added KNN, SVM and Decision Trees below.


In [None]:
# Hold out 25% of the rows as a validation set. The split is shuffled,
# stratified on the label, and seeded so it is the same on every run.
train_x, valid_x, train_y, valid_y = train_test_split(
    data,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Sanity-check the training split: container type and array shapes.
# (The stray debug print of the literal "test" has been removed.)
print(type(train_x))
print("Shape of training input : ", train_x.shape)
print("Shape of training output : ", train_y.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier; n_neighbors is the "k" to experiment with.
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit on the training split, then report the model's score (mean accuracy)
# on the held-out validation split.
knn_model.fit(train_x, train_y)
score = knn_model.score(valid_x, valid_y)
print("The score for this model is ", score)

# Section 2: Support Vector Machine (SVM)

SVM is another algorithm for classifying data. It tries to divide the data up using lines, sometimes straight linear lines and sometimes curved lines.

SVM tries to find the best lines - actually, a plane in multiple dimensions - to divide the data up into the known categories.
## Linear SVM Classification




In [None]:
from sklearn.svm import SVC

# Support vector classifier: choose the kernel and the regularisation strength C.
# Uncomment the second line (and comment out the first) to try a non-linear kernel.
svm_model = SVC(kernel='linear', C=1.0)           # linear SVM
#svm_model = SVC(C=10.0, kernel='rbf')           # non-linear SVM

# Fit on the training split, then report the model's score (mean accuracy)
# on the held-out validation split.
svm_model.fit(train_x, train_y)
score = svm_model.score(valid_x, valid_y)
print("The score for this model is ", score)

In [None]:
# ROC AUC on the validation split (the metric used in the original notebook).
#
# roc_auc_score ranks examples by a continuous score, so it is given the SVM's
# signed distance to the decision boundary rather than hard class predictions.
# With hard labels the ROC curve collapses to a single threshold and the result
# no longer reflects how well the model ranks examples.
y_pred = svm_model.predict(valid_x)  # hard class labels, kept in case later cells use them
y_score = svm_model.decision_function(valid_x)
score = roc_auc_score(valid_y, y_score)
print("Overall AUC on validation: {:.3f}".format(score))

# Section 3: Decision Trees 
This classification method tries to break the classification task into a series of decisions structured as a tree.
> Like SVMs, Decision Trees are versatile Machine Learning algorithms that can perform both classification and regression tasks, and even multioutput tasks. They are powerful algorithms, capable of fitting complex datasets.
- From *Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems*, 2nd Edition by Aurélien Géron, ISBN-13: 978-1492032649

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the fitted model and its score are reproducible across runs,
# matching the seeded train/validation split.
DT_model = DecisionTreeClassifier(random_state=42)

# Fit on the training split, then report the model's score (mean accuracy)
# on the held-out validation split.
DT_model.fit(train_x, train_y)
score = DT_model.score(valid_x, valid_y)
print("The score for this model is ", score)

In [None]:
from sklearn import tree

# Draw the top three levels of the fitted tree. The model was trained on a bare
# NumPy array, so feature_names is passed explicitly; otherwise the nodes are
# labelled X[0], X[1], ... An explicit, larger axes keeps the node text legible.
fig, ax = plt.subplots(figsize=(16, 8))
tree.plot_tree(DT_model, max_depth=3, feature_names=feature_columns, ax=ax);