# Algorithm Bias
###  Objective
<span style="color:blue">
The objective of this assignment is to assess the impact of algorithm bias on machine learning algorithms trained with imbalanced data. Submissions should also propose and evaluate strategies for overcoming this bias. 
</span>

<span style="color:blue">
Load the Survival dataset and assess the bias of classifiers trained on this dataset, i.e. are classifiers biased towards the majority class. 
</span>

I'll be comparing hold-out testing and cross-validation techniques to analyze the bias exhibited by different machine learning algorithms: k-NN, Decision Trees, Logistic Regression and Gradient Boosting.

I'll show the counts of the majority and minority classes after splitting the data into training and test sets, and then compare how many minority-class instances each ML model predicts. If a model does not predict the minority-class count accurately, we can conclude that it exhibits algorithm bias.


In [None]:
import numpy as np
import pandas as pd
from collections import Counter
# Read the survival dataset from CSV into a DataFrame.
surv = pd.read_csv(
    'survival.csv',
)
# Preview the first eight rows (echoed as the cell's output).
surv.head(n=8)

In [None]:
# Number of rows per value of the 'Class' column.
targetcount = surv['Class'].value_counts()
# Report the count for each of the two class labels in turn.
for class_label in (1, 2):
    print('total count of class type %d in the survival dataset:' % class_label,
          targetcount[class_label])

In [None]:
# y holds the target ('Class'); X holds every remaining column as features.
y = surv['Class']
X = surv.drop(columns='Class')
# Echo the dimensions of both as a sanity check.
X.shape, y.shape

In [None]:
# Fraction of rows labelled 2 across the whole dataset.
minority_fraction = Counter(y)[2] / len(y)
print("Minority class type 2 in the entire dataset(percentage wise): %0.2f" % minority_fraction)

# Applying Hold-Out Testing to Models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Hold out half of the data for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Actual class distribution of the test set: the reference the predictions are compared to.
test_counts = Counter(y_test)
test_neg = test_counts[2]               # number of type-2 (minority) rows in the test set
Minority_test = test_neg / len(y_test)  # same quantity as a fraction of the test set
print('Actual Class type [1 and 2] feaures in test set: ', test_counts)
print('Minority Class Type [2] in test set: ', test_neg)
print("Minority class in test set percentage wise : %0.2f" % (Minority_test))
print('*' * 20)

# Classifiers under comparison, keyed by a short display name.
MLalgos = {}
MLalgos['KNN'] = KNeighborsClassifier(n_neighbors=3)
MLalgos['DecisionTree'] = DecisionTreeClassifier(criterion='entropy')
MLalgos['LogRegression'] = LogisticRegression()
MLalgos['GradBoosting'] = GradientBoostingClassifier()

bias_calculated = {}       # name -> number of test rows predicted as type 2
accuracy_calculated = {}   # name -> accuracy on the held-out half

for algo in MLalgos:
    print(type(MLalgos[algo]).__name__)
    y_predicted = MLalgos[algo].fit(X_train, y_train).predict(X_test)

    confusion = confusion_matrix(y_test, y_predicted)
    print("Confusion matrix is :\n{}".format(confusion))

    acc = accuracy_score(y_test, y_predicted)
    print('Accuracy:  %0.2f' % acc)
    accuracy_calculated[algo] = acc

    # Count the type-2 predictions explicitly. The previous `sum() - len()`
    # shortcut is only correct while the labels are exactly 1 and 2.
    pred_neg = Counter(y_predicted)[2]
    bias_calculated[algo] = pred_neg
    print("Predicted minority class type 2 :", pred_neg)
    print("Predicted minority class type 2 percentage wise : %0.2f" % (pred_neg / len(y_predicted)))
    print('*' * 20)
    

# Plotting the Bias-Values using Matplotlib 


- Values in maroon are the minority-class counts predicted by the different models
- Value in yellow is the actual Minority Class value in test set

In [None]:
from matplotlib import pyplot as plt

# One bar per classifier plus a final bar for the actual test-set count.
algorithms = [*MLalgos.keys(), 'Test-Set']
predicted_values = [*bias_calculated.values(), test_neg]
# Model bars are maroon; the reference bar is yellow.
bar_colors = ['maroon'] * 4 + ['yellow']

fig = plt.figure(figsize=(8, 8))

# Draw the bar chart and label it.
plt.bar(algorithms, predicted_values, width=0.5, color=bar_colors)
plt.title("HOLD-OUT TESTING BIAS PLOT")
plt.xlabel("CLASSIFICATION MODELS")
plt.ylabel("TYPE 2 COUNT IN TEST")
plt.show()

# Hold Out Testing Outcomes
- The data was split into two equal halves for training and testing.
- Gradient Boosting does not show bias and is more accurate.
- KNN and Decision Tree show bias: 25 and 41 instances respectively are predicted as the minority class, compared with 36 in the test set.
- Logistic Regression shows a large bias: only 10 instances are predicted as the minority class.
- The accuracy of KNN and Logistic Regression is higher than that of the other models, yet both still show large bias. Hence we can say that high accuracy does not imply low bias.

# Applying Cross Validation to Models

In [None]:
from sklearn.model_selection import cross_validate       #importing the necessary libraries from SKLearn
from sklearn.metrics import make_scorer

In [None]:
#defining the confusion matrix and folds for the Cross validation techniques
def _confusion_cell(y_true, y_predicted, row, col):
    """Return the confusion-matrix entry at (row, col) for the given labels."""
    return confusion_matrix(y_true, y_predicted)[row, col]


def TruePos(y_true, y_predicted):
    """Confusion-matrix cell [1, 1]."""
    return _confusion_cell(y_true, y_predicted, 1, 1)


def TrueNeg(y_true, y_predicted):
    """Confusion-matrix cell [0, 0]."""
    return _confusion_cell(y_true, y_predicted, 0, 0)


def FalsePos(y_true, y_predicted):
    """Confusion-matrix cell [0, 1]."""
    return _confusion_cell(y_true, y_predicted, 0, 1)


def FalseNeg(y_true, y_predicted):
    """Confusion-matrix cell [1, 0]."""
    return _confusion_cell(y_true, y_predicted, 1, 0)


# Scorers keyed by short name, wrapped with make_scorer for use as a scoring dict.
# NOTE(review): the cell indices presumably treat class 2 as "positive" for the
# survival data (second label in sorted order) -- confirm against the
# confusion_matrix documentation.
scoring = {
    key: make_scorer(cell_fn)
    for key, cell_fn in (
        ('tp', TruePos),
        ('tn', TrueNeg),
        ('fp', FalsePos),
        ('fn', FalseNeg),
    )
}


In [None]:
# Number of cross-validation folds and the per-model predicted minority totals.
kFolds = 22
bias_calculated_CV = {}
print('Minority Class Type [2] in cross-validation set:',targetcount[2] )
print('*' * 20)

for algo, model in MLalgos.items():
    print(type(model).__name__)
    fold_scores = cross_validate(model, X, y, cv=kFolds, scoring=scoring)

    # Sum each confusion-matrix cell over all folds.
    tp_total = fold_scores['test_tp'].sum()
    fp_total = fold_scores['test_fp'].sum()
    tn_total = fold_scores['test_tn'].sum()

    # tp + fp is the total number of cell-[*, 1] predictions across the folds.
    predicted_minority = tp_total + fp_total
    # Correct predictions (tp + tn) as a share of all rows.
    cv_accuracy = (tp_total + tn_total) / len(y)

    print("Accuracy:{:.2f}".format(cv_accuracy) )
    print("{} x CV sets".format(kFolds))
    bias_calculated_CV[algo] = predicted_minority
    print("Predicted minority Class Type [2] in cross-validation set:",predicted_minority )
    print('*' * 20)

In [None]:
# One bar per classifier plus a final bar for the actual type-2 count in the dataset.
algorithms = [*MLalgos.keys(), 'Test-Set']
predicted_values = [*bias_calculated_CV.values(), targetcount[2]]
# Model bars are maroon; the reference bar is yellow.
bar_colors = ['maroon'] * 4 + ['yellow']

fig = plt.figure(figsize=(6.5, 6.5))

# Draw the bar chart and label it.
plt.bar(algorithms, predicted_values, width=0.5, color=bar_colors)
plt.title("CROSS-VALIDATION BIAS PLOT")
plt.xlabel("CLASSIFICATION MODELS")
plt.ylabel("TYPE 2 COUNT IN TEST")
plt.show()

# Cross-Validation Outcomes
- This technique lets us use all of the minority-class data, i.e. 81 instances, compared with 36 in hold-out testing. Hence predictions are made over all of the available data.
- Gradient Boosting shows little bias compared with the other three algorithms.
- Changing the value of kFolds can significantly change the bias count and the accuracy score.
- A more accurate model, such as Logistic Regression in this case, might be expected to show less bias, but on the contrary it shows more.




<span style="color:blue">
Proposing a strategy to rectify this bias. Evaluating the effect of this strategy in terms of classification bias and overall accuracy. 
</span>

- Using the Synthetic Minority Over-sampling Technique (SMOTE) to upsample the minority class, i.e. type 2.

In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN

In [None]:
# Rebuild the target and feature matrix from the survival DataFrame.
y = surv['Class']
X = surv.drop(columns='Class')
# Echo the dimensions of both as a sanity check.
X.shape, y.shape

In [None]:
# Same 50/50 split and seed as the earlier hold-out experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Oversample the training half only; the test half is left untouched.
# sampling_strategy=0.7 is passed straight through to SMOTE.
sm = SMOTE(random_state=20, sampling_strategy=0.7)
# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported method name.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
# Training-set size before and after resampling. Printed explicitly because a
# notebook cell only echoes its final expression, so this pair was previously
# computed and silently discarded.
print((len(y_train), len(y_train_res)))
# Sum of the class labels before and after resampling (echoed as the cell output).
y_train.sum(), y_train_res.sum()

In [None]:
# Tally the labels once for the original and once for the resampled training set.
original_counts = Counter(y_train)
resampled_counts = Counter(y_train_res)

print("Majority class in previous training technique with hold-out testing:", original_counts[1])
print("Minority class in previous training technique with hold-out testing:", original_counts[2])
print("Majority class in SMOTE technique with hold-out testing:", resampled_counts[1])
print("Minority class in SMOTE training technique with hold-out testing:", resampled_counts[2])

In [None]:
# Actual number of type-2 rows in the test half. Counted explicitly rather than
# via `sum() - len()`, which is only correct while the labels are exactly 1 and 2.
print("Class Type-2 in test set : %d" % Counter(y_test)[2])
print('*' * 20)
res_smote = {}   # name -> number of test rows predicted as type 2 after SMOTE training
acc_smote = {}   # name -> accuracy on the test half after SMOTE training

for algo in MLalgos:
    print(type(MLalgos[algo]).__name__)
    # Train on the resampled training data, evaluate on the untouched test half.
    y_pred = MLalgos[algo].fit(X_train_res, y_train_res).predict(X_test)
    pred_count = Counter(y_pred)[2]
    print("Predicted minority Class Type [2]:", pred_count)
    res_smote[algo] = pred_count
    acc = accuracy_score(y_test, y_pred)
    acc_smote[algo] = acc
    print("Accuracy:{:.2f}".format(acc))
    print('*' * 20)

## Plotting predicted values using SMOTE technique using Matplotlib

In [None]:
import matplotlib.pyplot as plt 
import numpy as np
%matplotlib inline 
fig, ax = plt.subplots(figsize = (8, 8))
width = 0.35

algorithms = list(MLalgos.keys()) + ['Test-Set']
predicted_values = list(bias_calculated.values()) + [test_neg]
predicted_values1= list(res_smote.values()) + [test_neg]
y_pos = np.arange(len(algorithms))
p1 = ax.bar(algorithms, predicted_values, width, align='center', 
            color=['maroon', 'maroon','maroon','maroon','yellow'])

p2 = ax.bar(y_pos+width, predicted_values1, width, align='center', 
            color=['red', 'red','red','red','yellow'])

ax.legend((p1[1], p2[1]), ('Original Hold-out', 'Upsampled Hold-out'))

plt.ylabel('Minority Count')
plt.title('Upsampling Count')
 
plt.show()

## Outcomes

- Minority class values have been predicted really well with this technique.
- The minority class was upsampled in the training data so that the predicted minority counts match the actual test data more closely.
- Accuracy has been largely maintained with this technique.
- KNN, Decision Tree and Logistic Regression show a drastic change in the predicted values; however, Gradient Boosting shows little or no effect.


## Effect on Accuracy

In [None]:
fig, ax = plt.subplots(figsize = (8, 8))
width = 0.35

# Accuracy per classifier: plain hold-out vs. hold-out after SMOTE upsampling.
algorithms = list(MLalgos.keys())
predicted_values = list(accuracy_calculated.values())
predicted_values1 = list(acc_smote.values())

# Numeric positions for both series, with the model names applied as tick labels,
# so categorical and numeric x data are not mixed on one axis.
y_pos = np.arange(len(algorithms))
p1 = ax.bar(y_pos - width / 2, predicted_values, width, align='center',
            color=['maroon', 'maroon', 'maroon', 'maroon'])

p2 = ax.bar(y_pos + width / 2, predicted_values1, width, align='center',
            color=['red', 'red', 'red', 'red'])

ax.set_xticks(y_pos)
ax.set_xticklabels(algorithms)

ax.legend((p1[1], p2[1]), ('Accuracy Hold-out', 'Accuracy Upsampled'))

# This chart shows accuracy; the previous labels were copied from the count plot.
plt.ylabel('Accuracy')
plt.title('Accuracy: Hold-out vs. Upsampled')

plt.show()

## Outcome
- The accuracy metric shows no effect for Gradient Boosting, while it shows little effect for KNN, Decision Tree and Logistic Regression.


<span style="color:blue">
Testing the impact of this strategy on another dataset, and discussing the effectiveness of the strategy on this second dataset.  
</span>


In [None]:
# Read the second dataset (diabetes) from CSV into a DataFrame.
diabetes_pd = pd.read_csv(
    'diabetes.csv',
)
# Preview the first rows (echoed as the cell's output).
diabetes_pd.head()

In [None]:
# Remove the 'neg_pos' column from the DataFrame and use it as the target array.
target_column = diabetes_pd.pop('neg_pos')
y = target_column.values
# Whatever columns remain form the feature matrix.
X = diabetes_pd.values
X.shape

In [None]:
# y.sum() is reported as the minority count and the remainder as the majority.
minority_total = y.sum()
majority_total = len(y) - minority_total

print("Original Dataset")
print("Majority class:", majority_total)
print("Minority class:", minority_total)
print("Minority class: {:.2f}%".format(minority_total/len(y)*100))

In [None]:
# 50/50 hold-out split of the diabetes data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)
bias_diabetes = {}   # name -> sum of the predicted labels on the test half
acc_diabetes = {}    # name -> accuracy on the test half

print("Diabetes positive in test set : %d" % (y_test.sum()))
for algo, model in MLalgos.items():
    # Refit each classifier on the diabetes training half.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    pred_count = y_pred.sum()
    acc = accuracy_score(y_test, y_pred)
    bias_diabetes[algo] = pred_count
    acc_diabetes[algo] = acc

    summary = "{:22} Pred. Diabetes positive: {:d} Accuracy: {:.2f}"
    print(summary.format(type(model).__name__, pred_count, acc))

In [None]:
from matplotlib import pyplot as plt 
# One bar per classifier plus a final bar for the actual count in the test set.
algorithms = list(MLalgos.keys()) + ['Test-Set']
# The reference bar must come from y_test: the predictions were made on X_test,
# so comparing them with the training-set total (as before) was incorrect.
predicted_values = list(bias_diabetes.values()) + [y_test.sum()]

fig = plt.figure(figsize = (8, 8)) 

# creating the bar plot 
plt.bar(algorithms, predicted_values, color =['maroon','maroon','maroon','maroon','yellow'],
        width = 0.5)   
plt.xlabel("CLASSIFICATION MODELS") 
# The diabetes data has no "type 2" class; the bars count diabetes-positive rows.
plt.ylabel("DIABETES-POSITIVE COUNT IN TEST") 
plt.title("HOLD-OUT TESTING BIAS PLOT") 
plt.show() 

In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN

# Same 50/50 split and seed as the diabetes hold-out experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)


In [None]:
# Oversample the diabetes training half only; the test half is left untouched.
# sampling_strategy=0.7 is passed straight through to SMOTE.
sm = SMOTE(random_state=20, sampling_strategy=0.7)
# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported method name.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
# Training-set size before and after resampling. Printed explicitly because a
# notebook cell only echoes its final expression, so this pair was previously
# computed and silently discarded.
print((len(y_train), len(y_train_res)))
# Sum of the labels before and after resampling (echoed as the cell output).
y_train.sum(), y_train_res.sum()

In [None]:
# Tally the labels once for the original and once for the resampled training set.
original_counts = Counter(y_train)
resampled_counts = Counter(y_train_res)

print("Majority class in previous training technique with hold-out testing:", original_counts[0])
print("Minority class in previous training technique with hold-out testing:", original_counts[1])
print("Majority class in SMOTE technique with hold-out testing:", resampled_counts[0])
print("Minority class in SMOTE training technique with hold-out testing:", resampled_counts[1])

In [None]:
# The messages previously said "Class Type-2", copied from the survival section.
# For this dataset the quantity reported is the sum of the labels, which the
# earlier diabetes cells report as "Diabetes positive".
print("Diabetes positive in test set : %d" % (y_test.sum()))
print('*' * 20)
res_smote = {}   # name -> sum of the predicted labels after SMOTE training
acc_smote = {}   # name -> accuracy on the test half after SMOTE training

for algo in MLalgos:
    print(type(MLalgos[algo]).__name__)
    # Train on the resampled training data, evaluate on the untouched test half.
    y_pred = MLalgos[algo].fit(X_train_res, y_train_res).predict(X_test)
    pred_count = y_pred.sum()
    print("Predicted diabetes positive:", pred_count)
    res_smote[algo] = pred_count
    acc = accuracy_score(y_test, y_pred)
    acc_smote[algo] = acc
    print("Accuracy:{:.2f}".format(acc))
    print('*' * 20)

In [None]:
import matplotlib.pyplot as plt 
import numpy as np
%matplotlib inline 
fig, ax = plt.subplots(figsize = (8, 8))
width = 0.35

algorithms = list(MLalgos.keys()) + ['Test-Set']
predicted_values = list(bias_diabetes.values()) + [y_test.sum()]
predicted_values1= list(res_smote.values()) + [y_test.sum()]
y_pos = np.arange(len(algorithms))
p1 = ax.bar(algorithms, predicted_values, width, align='center', 
            color=['maroon', 'maroon','maroon','maroon','yellow'])

p2 = ax.bar(y_pos+width, predicted_values1, width, align='center', 
            color=['red', 'red','red','red','yellow'])

ax.legend((p1[1], p2[1]), ('Original Hold-out', 'Upsampled Hold-out'))

plt.ylabel('Minority Count')
plt.title('Upsampling Count')
 
plt.show()

In [None]:
fig, ax = plt.subplots(figsize = (8, 8))
width = 0.35

# Accuracy per classifier on the diabetes data: plain hold-out vs. after SMOTE.
algorithms = list(MLalgos.keys())
predicted_values = list(acc_diabetes.values())
predicted_values1 = list(acc_smote.values())

# Numeric positions for both series, with the model names applied as tick labels,
# so categorical and numeric x data are not mixed on one axis.
y_pos = np.arange(len(algorithms))
p1 = ax.bar(y_pos - width / 2, predicted_values, width, align='center',
            color=['maroon', 'maroon', 'maroon', 'maroon'])

p2 = ax.bar(y_pos + width / 2, predicted_values1, width, align='center',
            color=['red', 'red', 'red', 'red'])

ax.set_xticks(y_pos)
ax.set_xticklabels(algorithms)

ax.legend((p1[1], p2[1]), ('Accuracy Hold-out', 'Accuracy Upsampled'))

# This chart shows accuracy; the previous labels were copied from the count plot.
plt.ylabel('Accuracy')
plt.title('Accuracy: Hold-out vs. Upsampled')

plt.show()

## Outcome
- After applying SMOTE, the bias of Decision Tree, Logistic Regression and Gradient Boosting has decreased significantly.
- There are no significant changes in accuracy, except for a minor change for the Decision Tree.
- Logistic Regression seems to be more effective in terms of low bias and high accuracy.