In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from sklearn.model_selection import train_test_split 

# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score, roc_curve

import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn import tree


import graphviz
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file under the Kaggle input directory and echo its full path,
# one per line, so the available datasets are visible in the cell output.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# The Data

In [None]:
# Load the breast-cancer CSV and discard the "Unnamed: 32" column in one chained
# expression (no in-place mutation), so re-running the cell always yields the same frame.
# NOTE(review): "Unnamed: 32" is presumably an empty artifact column of the CSV — confirm.
df = (
    pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
    .drop(columns="Unnamed: 32")
)

# Preview the first rows.
df.head()

In [None]:
# Encode the target as integers (malignant 'M' -> 1, benign 'B' -> 0) and
# separate it from the feature frame.
y = df['diagnosis'].map({'M': 1, 'B': 0})
df = df.drop(columns='diagnosis')

# Column-name groups. These are positional slices, so they must be taken AFTER
# `diagnosis` has been removed; slicing beforehand shifts every group by one
# column (and puts `diagnosis` itself into the "mean" group).
# Assumes the layout: column 0 = id, then 10 mean, 10 SE and 10 worst columns
# — TODO confirm with `df.columns`.
features_mean = df.columns[1:11]
features_se = df.columns[11:21]    # 10 columns (the previous 11:20 slice held only 9)
features_worst = df.columns[21:31]

# Dataset metrics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of df.
df.describe()

In [None]:
# Class balance of the target. The integer codes (0 = benign, 1 = malignant, as
# encoded when `y` was built) are replaced by readable tick labels so the figure
# can be understood on its own.
class_counts = y.value_counts().rename(index={0: 'Benign (0)', 1: 'Malignant (1)'})

fig, ax = plt.subplots()
class_counts.plot(kind='bar', rot=0, ax=ax)
ax.set_title("Count of Cases")
ax.set_xlabel('Diagnosis')
ax.set_ylabel('Number of cases')
plt.show()

# AdaBoost Classifier with DT

**All Features**

In [None]:
# "All features" = every measurement column. The `id` column (if present) is
# presumably just a record identifier, so it is excluded from the model inputs.
X_all = df.drop(columns=['id'], errors='ignore')

# Hold out 30% of the samples for testing. A fixed seed makes the split (and
# therefore the reported AUC) reproducible across runs; stratifying on `y`
# keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.3, random_state=1, stratify=y
)

# Weak learner: a depth-2 decision tree.
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Boosted ensemble of 180 such trees. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2; the positional form works with both.
ada = AdaBoostClassifier(dt, n_estimators=180, random_state=1)

# Fit ada to the training set
ada.fit(X_train, y_train)

# Predicted probability of the positive class (1 = malignant) on the test set
y_pred_proba = ada.predict_proba(X_test)[:, 1]

# Evaluate test-set ROC AUC once and reuse it for both the printout and the plot
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
auc = ada_roc_auc
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - AdaBoost (all features)')
plt.legend(loc=4)
plt.show()

In [None]:
# Fit the stand-alone depth-2 tree and draw it. AdaBoost fits clones of its base
# learner, so `dt` itself is still unfitted until this call; the figure shows a
# single tree, not the boosted ensemble.
dt.fit(X_train, y_train)
fig = plt.figure(figsize=(25, 20))

# class_names must follow ascending class order: 0 = benign ('B'), 1 = malignant ('M').
# The previous 'MB' labelled the classes the wrong way round.
# Plain lists are passed because newer scikit-learn releases validate these
# arguments and reject a bare string / pandas Index.
_ = tree.plot_tree(
    dt,
    feature_names=list(X_train.columns),
    class_names=['B', 'M'],
)

**Mean Features**

In [None]:
# Columns 1..10 of the feature frame, used here as the "mean" measurement group.
# Assumes column 0 is the id and `diagnosis` has already been dropped — TODO confirm.
features_mean = df.iloc[:, 1:11]

# Hold out 30% of the samples for testing. A fixed seed makes the split (and
# therefore the reported AUC) reproducible; stratifying on `y` keeps the class
# ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features_mean, y, test_size=0.3, random_state=1, stratify=y
)

# Weak learner: a depth-2 decision tree.
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Boosted ensemble of 180 such trees. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2; the positional form works with both.
ada = AdaBoostClassifier(dt, n_estimators=180, random_state=1)

# Fit ada to the training set
ada.fit(X_train, y_train)

# Predicted probability of the positive class (1 = malignant) on the test set
y_pred_proba = ada.predict_proba(X_test)[:, 1]

# Evaluate test-set ROC AUC once and reuse it for both the printout and the plot
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
auc = ada_roc_auc
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - AdaBoost (mean features)')
plt.legend(loc=4)
plt.show()

In [None]:
# Fit the stand-alone depth-2 tree on the current training split and draw it.
# AdaBoost fits clones of its base learner, so the figure shows a single tree,
# not the boosted ensemble.
dt.fit(X_train, y_train)
fig = plt.figure(figsize=(25, 20))

# class_names follow ascending class order: 0 = benign ('B'), 1 = malignant ('M').
# Plain lists are passed because newer scikit-learn releases validate these
# arguments and reject a bare string / pandas Index.
_ = tree.plot_tree(
    dt,
    feature_names=list(X_train.columns),
    class_names=['B', 'M'],
)

**Standard Error (SE) Features**

In [None]:
# Columns 11..20 of the feature frame, used here as the standard-error (SE) group.
# The slice end is 21 so that all 10 SE columns are included (the previous
# 11:20 slice silently dropped the last one), and the frame is stored under its
# own name instead of overwriting `features_mean`.
# Assumes the layout id, 10 mean, 10 SE, 10 worst columns — TODO confirm.
features_se = df.iloc[:, 11:21]

# Hold out 30% of the samples for testing. A fixed seed makes the split (and
# therefore the reported AUC) reproducible; stratifying on `y` keeps the class
# ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features_se, y, test_size=0.3, random_state=1, stratify=y
)

# Weak learner: a depth-2 decision tree.
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Boosted ensemble of 180 such trees. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2; the positional form works with both.
ada = AdaBoostClassifier(dt, n_estimators=180, random_state=1)

# Fit ada to the training set
ada.fit(X_train, y_train)

# Predicted probability of the positive class (1 = malignant) on the test set
y_pred_proba = ada.predict_proba(X_test)[:, 1]

# Evaluate test-set ROC AUC once and reuse it for both the printout and the plot
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
auc = ada_roc_auc
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - AdaBoost (SE features)')
plt.legend(loc=4)
plt.show()

In [None]:
# Fit the stand-alone depth-2 tree on the current training split and draw it.
# AdaBoost fits clones of its base learner, so the figure shows a single tree,
# not the boosted ensemble.
dt.fit(X_train, y_train)
fig = plt.figure(figsize=(25, 20))

# class_names follow ascending class order: 0 = benign ('B'), 1 = malignant ('M').
# Plain lists are passed because newer scikit-learn releases validate these
# arguments and reject a bare string / pandas Index.
_ = tree.plot_tree(
    dt,
    feature_names=list(X_train.columns),
    class_names=['B', 'M'],
)