# Data Mining Final Project
##           - Rajendra Prasad Patil

### Glossary:
* Import libraries
* Load dataset
* Analysis on dataset
* Splitting the dataset into labels and features
* Performing normalization on dataset
* Splitting dataset using K fold 
* Running the model
    * SVM Model
    * K Nearest Neighbors
    * Random Forest Classifier
* Output Performance Metrics



### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# SVM classifier
from sklearn import svm

# KNN classifier
from sklearn.neighbors import KNeighborsClassifier 

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# for checking the model accuracy
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Loading the dataset

In [2]:
dataset = load_breast_cancer()

### Preliminary analysis

In [3]:
class_names = dataset['target_names']
print('Target variables  : ', class_names)

(unique, counts) = np.unique(dataset['target'], return_counts=True)

print('Unique values of the target variable', unique)
print('Counts of the target variable :', counts)

Target variables  :  ['malignant' 'benign']
Unique values of the target variable [0 1]
Counts of the target variable : [212 357]


* The dataset is suited for binary classification
* The dataset has no skewed nature

### The data is split into features and labels

In [4]:
X = dataset['data']
y = dataset['target']

### Apply normalization operation for numerical stability

In [5]:
standardizer = StandardScaler()
X = standardizer.fit_transform(X)

# Performance Metrics
Function to calculate all the available performance metrics

In [6]:
performance_metrics = ['True Negative', 'False Positive', 'False Negative', 'True Positivity', 'Sensitivity', 'Specificity', 
                       'Precision', 'Accuracy', 'F1 Score', 'Error Rate', 'Negative Predicted Value', 'False Positve Rate', 
                       'False Discovery Rate', 'False Negative Rate', 'Balanced Accuracy', 'True Skill Statistics', 
                       'Heidke Skill Score']

def compute_performance_metrics(prediction, y_test, df):
    TN, FP, FN, TP = confusion_matrix(y_test, prediction).ravel()
    
    sensitivity = TP / (TP + FN)
    specificity = TN / (FP + TN)
    precision = TP / (TP + FP)
    accuracy =  (TP+TN) /(TP+FP+TN+FN)
    f1_score = 2 * TP / ((2 * TP) + FP + FN)
    error_rate = (FP + FN) / (TP + FP + FN + TN)
    negative_predicted_value = TN / (TN + FN)
    false_positive_rate = FP / (FP + TN)
    false_discovery_rate = FP / (FP + TP)
    false_negative_rate = FN / (FN + TP)
    balanced_accuracy = 0.5 * ((TP / (TP + FN)) + (TN / (TN + FP)))
    true_skill_statistics = ((TP / (TP + FN)) - (FP / (TN + FP)))
    heidke_skill_score = 2 * ((TP * TN) - (FP * FN)) / (((ùëáùëÉ + ùêπùëÅ) * (ùêπùëÅ + ùëáùëÅ)) +((TP+FP) * (ùêπùëÉ + ùëáùëÅ)))
    
    df = df.append({performance_metrics[0]: TN, performance_metrics[1]: FP, performance_metrics[2]: FN, 
                    performance_metrics[3]: TP, performance_metrics[4]: sensitivity, performance_metrics[5]: specificity, 
                    performance_metrics[6]: precision, performance_metrics[7]: accuracy, performance_metrics[8]: f1_score, 
                    performance_metrics[9]: error_rate, performance_metrics[10]: negative_predicted_value, 
                    performance_metrics[11]: false_positive_rate, performance_metrics[12]: false_discovery_rate, 
                    performance_metrics[13]: false_negative_rate, performance_metrics[14]: 
                    balanced_accuracy, performance_metrics[15]: true_skill_statistics,
                    performance_metrics[16]: heidke_skill_score}, ignore_index=True)
    
    return df    

# K-fold cross validation

In [7]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

### Dataframes for performance metrics

In [8]:
svm_metrics_df = pd.DataFrame(columns=performance_metrics)
kn_metrics_df = pd.DataFrame(columns=performance_metrics)
rf_metrics_df = pd.DataFrame(columns=performance_metrics)

## SVM Model

In [9]:
svm_model = svm.SVC()
for train_index, test_index in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # we train the algorithm with training data and training output
    svm_model.fit(X_train, y_train)

    # we pass the testing data to the stored algorithm to predict the outcome
    prediction = svm_model.predict(X_test)

    # print metrics
    svm_metrics_df = compute_performance_metrics(prediction, y_test, svm_metrics_df)

svm_metrics_df.index += 1
svm_metrics_df.loc['Average'] = svm_metrics_df.mean()

In [10]:
svm_metrics_df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
1,22.0,0.0,0.0,35.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,23.0,2.0,1.0,31.0,0.96875,0.92,0.939394,0.947368,0.953846,0.052632,0.958333,0.08,0.060606,0.03125,0.944375,0.88875,0.892655
3,15.0,1.0,1.0,40.0,0.97561,0.9375,0.97561,0.964912,0.97561,0.035088,0.9375,0.0625,0.02439,0.02439,0.956555,0.91311,0.91311
4,20.0,0.0,0.0,37.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
5,17.0,1.0,0.0,39.0,1.0,0.944444,0.975,0.982456,0.987342,0.017544,1.0,0.055556,0.025,0.0,0.972222,0.944444,0.958785
6,19.0,3.0,0.0,35.0,1.0,0.863636,0.921053,0.947368,0.958904,0.052632,1.0,0.136364,0.078947,0.0,0.931818,0.863636,0.886076
7,22.0,1.0,1.0,33.0,0.970588,0.956522,0.970588,0.964912,0.970588,0.035088,0.956522,0.043478,0.029412,0.029412,0.963555,0.92711,0.92711
8,23.0,0.0,2.0,32.0,0.941176,1.0,1.0,0.964912,0.969697,0.035088,0.92,0.0,0.0,0.058824,0.970588,0.941176,0.928121
9,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
10,25.0,0.0,0.0,31.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


### K-Nearest Neighbors

In [11]:
model = KNeighborsClassifier(n_neighbors=3) # this examines 3 neighbors for putting the data into class

for train_index, test_index in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # we train the algorithm with training data and training output
    model.fit(X_train, y_train)

    # we pass the testing data to the stored algorithm to predict the outcome
    prediction = model.predict(X_test)

    # print metrics
    kn_metrics_df = compute_performance_metrics(prediction, y_test, kn_metrics_df)

kn_metrics_df.index += 1
kn_metrics_df.loc['Average'] = kn_metrics_df.mean()

In [12]:
kn_metrics_df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
1,21.0,1.0,0.0,35.0,1.0,0.954545,0.972222,0.982456,0.985915,0.017544,1.0,0.045455,0.027778,0.0,0.977273,0.954545,0.962672
2,21.0,4.0,0.0,32.0,1.0,0.84,0.888889,0.929825,0.941176,0.070175,1.0,0.16,0.111111,0.0,0.92,0.84,0.854962
3,14.0,2.0,1.0,40.0,0.97561,0.875,0.952381,0.947368,0.963855,0.052632,0.933333,0.125,0.047619,0.02439,0.925305,0.85061,0.867133
4,19.0,1.0,0.0,37.0,1.0,0.95,0.973684,0.982456,0.986667,0.017544,1.0,0.05,0.026316,0.0,0.975,0.95,0.961039
5,17.0,1.0,1.0,38.0,0.974359,0.944444,0.974359,0.964912,0.974359,0.035088,0.944444,0.055556,0.025641,0.025641,0.959402,0.918803,0.918803
6,19.0,3.0,0.0,35.0,1.0,0.863636,0.921053,0.947368,0.958904,0.052632,1.0,0.136364,0.078947,0.0,0.931818,0.863636,0.886076
7,22.0,1.0,0.0,34.0,1.0,0.956522,0.971429,0.982456,0.985507,0.017544,1.0,0.043478,0.028571,0.0,0.978261,0.956522,0.963297
8,23.0,0.0,0.0,34.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
9,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
10,23.0,2.0,0.0,31.0,1.0,0.92,0.939394,0.964286,0.96875,0.035714,1.0,0.08,0.060606,0.0,0.96,0.92,0.927178


### Random Forest Classifier

In [13]:
#Create a Gaussian Classifier
model = RandomForestClassifier(n_estimators=100)

for train_index, test_index in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # we train the algorithm with training data and training output
    model.fit(X_train, y_train)

    # we pass the testing data to the stored algorithm to predict the outcome
    prediction = model.predict(X_test)

    # print metrics
    rf_metrics_df = compute_performance_metrics(prediction, y_test, rf_metrics_df)

rf_metrics_df.index += 1
rf_metrics_df.loc['Average'] = rf_metrics_df.mean()

In [14]:
rf_metrics_df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
1,21.0,1.0,3.0,32.0,0.914286,0.954545,0.969697,0.929825,0.941176,0.070175,0.875,0.045455,0.030303,0.085714,0.934416,0.868831,0.854406
2,24.0,1.0,0.0,32.0,1.0,0.96,0.969697,0.982456,0.984615,0.017544,1.0,0.04,0.030303,0.0,0.98,0.96,0.964218
3,16.0,0.0,2.0,39.0,0.95122,1.0,1.0,0.964912,0.975,0.035088,0.888889,0.0,0.0,0.04878,0.97561,0.95122,0.9163
4,19.0,1.0,0.0,37.0,1.0,0.95,0.973684,0.982456,0.986667,0.017544,1.0,0.05,0.026316,0.0,0.975,0.95,0.961039
5,17.0,1.0,2.0,37.0,0.948718,0.944444,0.973684,0.947368,0.961039,0.052632,0.894737,0.055556,0.026316,0.051282,0.946581,0.893162,0.88
6,17.0,5.0,2.0,33.0,0.942857,0.772727,0.868421,0.877193,0.90411,0.122807,0.894737,0.227273,0.131579,0.057143,0.857792,0.715584,0.734177
7,22.0,1.0,0.0,34.0,1.0,0.956522,0.971429,0.982456,0.985507,0.017544,1.0,0.043478,0.028571,0.0,0.978261,0.956522,0.963297
8,21.0,2.0,1.0,33.0,0.970588,0.913043,0.942857,0.947368,0.956522,0.052632,0.954545,0.086957,0.057143,0.029412,0.941816,0.883632,0.889891
9,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
10,24.0,1.0,0.0,31.0,1.0,0.96,0.96875,0.982143,0.984127,0.017857,1.0,0.04,0.03125,0.0,0.98,0.96,0.963731


### Observation
I consider balanced accuracy to be the optimal metric to find the best model.
The case being, SVM is the model which is giving the highest balanced accuracy.