In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# importing necessary libraries, required for analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import math

In [None]:
# Load the raw CSV into a DataFrame; later cells work on `data_df`.
file_name = '../input/breast-cancer-wisconsin-data/data.csv'
data_df = pd.read_csv(filepath_or_buffer=file_name)

In [None]:
# Preview the first five rows of the freshly loaded data.
data_df.head(n=5)

In [None]:
# Summarise column names, dtypes and non-null counts to spot unusable columns.
data_df.info()

In [None]:
# Remove the 'Unnamed: 32' column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
data_df = data_df.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Preview the first five rows again now that 'Unnamed: 32' has been removed.
data_df.head(n=5)

In [None]:
# Show the (rows, columns) dimensions of the working frame.
print(data_df.shape)

In [None]:
# List the distinct target labels present before they are encoded numerically.
data_df['diagnosis'].unique()

In [None]:
# Encode the target numerically: malignant ("M") -> 0, benign ("B") -> 1.
# NOTE: with this encoding the "positive" class (1) used by f1/recall/precision
# further down is *benign*.
diagnosis_mapping = {"M": 0, "B": 1}

# Only map while raw labels are still present. Re-running the cell on the
# already-encoded 0/1 column would otherwise turn every value into NaN.
raw_labels = data_df['diagnosis']
if raw_labels.isin(list(diagnosis_mapping)).any():
    data_df['diagnosis'] = raw_labels.map(diagnosis_mapping)

In [None]:
# Bar chart of how many rows fall into each encoded diagnosis class.
sns.countplot(data=data_df, x='diagnosis')

In [None]:
# Pairwise feature relationships, coloured by diagnosis so class separation is
# visible. The 'id' column is an identifier, not a measurement, so it is left
# out of the grid.
sns.pairplot(data_df.drop(columns='id'), hue='diagnosis')

In [None]:
# Correlation heatmap of every column, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(24, 8))
correlations = data_df.corr()
sns.heatmap(correlations, ax=ax)

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE # Oversampling
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, make_scorer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

import re

# Seed reused by the train/test split and by SMOTE so results are repeatable.
random_state= 101


In [None]:
# Target vector: the encoded diagnosis column.
y = data_df['diagnosis']
# Feature matrix: everything except the row identifier and the target.
X = data_df.drop(columns=['id', 'diagnosis'])

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# identical in both splits, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state, stratify=y
)


In [None]:
# The dataset is imbalanced, so SMOTE is used to synthesise additional
# minority-class rows. Only the training split is resampled; the test split
# must keep its real class distribution.
features = X_train.columns
sm = SMOTE(random_state=random_state)
X_train, y_train = sm.fit_resample(X_train, y_train)
# Re-wrap so the column names survive even if fit_resample returns a plain
# array (presumably the case on older imbalanced-learn releases).
X_train = pd.DataFrame(X_train, columns=features)

In [None]:
# Class counts of the training labels after SMOTE. Building the frame from the
# raw values with an explicit column name works whether y_train is a named
# Series or an unnamed array.
a = pd.DataFrame({'diagnosis': np.asarray(y_train)})
a['diagnosis'].value_counts()

In [None]:
# Learn the per-feature min/max from the training data only, then apply that
# same scaling to both splits so nothing about the test set leaks into training.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Baselining

In [None]:
# Baseline: a "model" that always predicts class 1. It is scored on the test
# split so the numbers are directly comparable with the real models, which are
# also evaluated on y_test. Scoring it on the SMOTE-balanced training labels
# would always give 0.5 accuracy.
naive_predictions = np.ones(len(y_test))
naive_predictor_accuracy = accuracy_score(y_test, naive_predictions)
naive_predictor_f1score = f1_score(y_test, naive_predictions)

print("Naive predictor accuracy: %.3f" % (naive_predictor_accuracy))
print("Naive predictor f1-score: %.3f" % (naive_predictor_f1score))

The naive predictor above, which always predicts class 1, gives the baseline scores that every model below should beat.

# Run different ML models to find the best fit

In [None]:
# Fit each candidate classifier with default hyper-parameters and collect its
# test-set metrics, one record (dict) per model.
model_performance = []

classifier_type = [LogisticRegression,
                    KNeighborsClassifier,
                    DecisionTreeClassifier,
                    RandomForestClassifier,
                    GaussianNB,
                    SVC, 
                    XGBClassifier]

for mName in classifier_type:
    # The class's own name is the display label; no string parsing needed.
    model_name = mName.__name__
    model = mName()
    # Seed the estimators that expose a random_state so results are repeatable.
    if 'random_state' in model.get_params():
        model.set_params(random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(mName,':- ') 
    print(classification_report(y_test,predictions))
    print('-------------------------------------------------------------')
    clf_accuracy = accuracy_score(y_test,predictions)
    clf_f1_score = f1_score(y_test,predictions)
    clf_recall_score = recall_score(y_test,predictions)
    clf_precision_score = precision_score(y_test,predictions)

    model_performance.append({'Model Name': model_name,
                              'Accuracy': clf_accuracy,
                              'F1 Score': clf_f1_score,
                              'Recall': clf_recall_score,
                              'Precision': clf_precision_score})

# Build the summary table once from the collected records. DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
df = pd.DataFrame(model_performance,
                  columns=['Model Name', 'Accuracy', 'F1 Score', 'Recall', 'Precision'])
df = df.sort_values('Accuracy', ascending=False)
df = df.reset_index(drop=True)
df

# XGBClassifier looks to be the best fit, so let's try some basic modeling with it

In [None]:
# Hand-tuned XGBoost model: shallow trees, many estimators, small learning rate.
gbm = XGBClassifier(n_estimators=400, max_depth=3, learning_rate=0.05)
gbm.fit(X_train, y_train)

predictions = gbm.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Rows are the true classes and columns the predicted ones, in sorted label
# order (0 = malignant, 1 = benign per diagnosis_mapping).
confusion_matrix(y_true=y_test, y_pred=predictions)

# Hyperparameter tuning with RandomizedSearchCV

In [None]:
# Randomised hyper-parameter search over XGBoost settings, ranked by ROC AUC.
# Both the estimator and the search are seeded so the sampled candidates (and
# therefore the chosen model) are the same on every run.
clf_xgb = XGBClassifier(objective = 'binary:logistic', random_state = random_state)
param_dist = {'n_estimators': [400],
              # Duplicate .001 entry removed: list values are sampled uniformly,
              # so a repeated value was silently drawn twice as often.
              'learning_rate': [.10, .01, .001],
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'min_child_weight': [1, 2, 3, 4]
             }

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         n_iter = 5, 
                         scoring = 'roc_auc', 
                         # A candidate whose fit fails scores 0 instead of aborting the search.
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1,
                         random_state = random_state)

clf.fit(X_train, y_train)
# predict() uses the best estimator refit on the full training data.
predictions = clf.predict(X_test)

print(classification_report(y_test,predictions))

In [None]:
# Feature importances of the best estimator found by the search, normalised so
# they sum to 1.
relative_importance = clf.best_estimator_.feature_importances_
relative_importance = relative_importance / np.sum(relative_importance)

# One row per feature, most important first. `features` holds the training
# column names captured before scaling turned X_train into a bare array.
feature_importance = (
    pd.DataFrame({'feature': list(features),
                  'relativeimportance': relative_importance})
    .sort_values('relativeimportance', ascending=False)
    .reset_index(drop=True)
)

palette = sns.color_palette("coolwarm", feature_importance.shape[0])

plt.figure(figsize=(8, 8))
sns.barplot(x='relativeimportance',
            y='feature',
            data=feature_importance,
            palette=palette)
# The x-axis shows the normalised importance values, so it is labelled as such
# rather than with the model name.
plt.xlabel('Relative Importance')
plt.ylabel('Feature')
plt.title('XGBClassifier Estimated Feature Importance')

# End