In [None]:
#importing necessary libraries
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import axes3d
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline

%matplotlib inline

warnings.filterwarnings("ignore")

In [None]:
# Read the breast-cancer CSV into a DataFrame and show its (rows, columns) shape.
data_path = "../input/breastcancerwisconsin/data.csv"
df = pd.read_csv(data_path)
df.shape

In [None]:
# Preview the first five rows; the last column contains NaN values that have to be dealt with.
df.head(n = 5)

In [None]:
# Count the missing values in every column.
# 'Unnamed: 32' has no non-null entries at all, so it is useless for this problem.
df.isna().sum()

In [None]:
# Drop the features that are irrelevant for modelling: the record identifier
# and the empty 'Unnamed: 32' column.
# errors = 'ignore' keeps the cell re-runnable: executing it again after the
# columns are already gone no longer raises a KeyError.
df.drop(columns = ['id', 'Unnamed: 32'], errors = 'ignore', inplace = True)

In [None]:
# Summary statistics of the data set, to get more insight into the value ranges.
df.describe()

In [None]:
# Number of observations per class: Malignant (M) versus Benign (B).
df.diagnosis.value_counts()

In [None]:
# Bar chart of the number of observations in each diagnosis class.
class_counts = df['diagnosis'].value_counts()
class_counts.plot(kind = "bar")
plt.title("People Diagnosed Benign and Malignant")

In [None]:
# Correlation heatmap of the features (and the target) to guide the later steps.
# Encode the target numerically (M -> 1, B -> 0) so it takes part in the correlation.
# The guard keeps the cell re-runnable: mapping an already encoded column a
# second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B':0})
corr = df.corr()
cmap = sns.diverging_palette(220, 10, as_cmap = True)

# Large canvas so that the annotated cells stay readable.
f, ax = plt.subplots(figsize = (21, 19))
sns.heatmap(corr, cmap = cmap, center = 0, annot = True, square = True, linewidths = .5, cbar_kws = {"shrink": .5})

In [None]:
# Pairwise scatter plots of all columns, coloured by diagnosis.
sns.pairplot(data = df, hue = 'diagnosis')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Separate the target (first column) from the predictors (all remaining columns).
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Encode the diagnosis column as integers: 1 = M, 0 = B.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.3)

# Dimensionality Reduction using PCA

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then standardise that split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
# Scree plot: share of the total variance captured by each principal component.
pca = PCA()
pca.fit(X_train)  # only the fitted model is needed here, not the projected data
# Plot the variance *ratio* so the values match the 'Variance ratio' axis label
# (explained_variance_ holds absolute variances, not ratios).
pca_var = pca.explained_variance_ratio_
plt.figure(figsize = (8, 6))
# Take the number of bars from the fitted model instead of hard-coding 30 features.
plt.bar(range(len(pca_var)), pca_var, alpha = 0.5, align = 'center', label = 'Variance retrieved by component')
plt.legend()
plt.ylabel('Variance ratio')
plt.xlabel('Principal components')
plt.show()

In [None]:
# Reduce the scaled training data to its first 10 principal components
# and show the share of variance each retained component explains.
pca = PCA(n_components = 10).fit(X_train)
X_train_pca = pca.transform(X_train)
pca.explained_variance_ratio_

In [None]:
# Percentage of the total variance retained by the 10 components
# (the original note reports close to 95% of the information in the 30 original features).
pca.explained_variance_ratio_.sum() * 100

In [None]:
X_train.shape # shape of the scaled training data before PCA

In [None]:
X_train_pca.shape # shape after PCA: the features have been reduced from 30 to 10

In [None]:
# Wrap the projected training data in a DataFrame with one named column per component (PC1..PC10).
pc_names = ["PC" + str(i) for i in range(1, 11)]
X1 = pd.DataFrame(X_train_pca, columns = pc_names)
X1.head()

In [None]:
# Correlation heatmap of the principal components: the extracted features
# show zero correlation with each other.
plt.figure(figsize = (7, 7))
pc_corr = X1.corr()
sns.heatmap(pc_corr, annot = True, fmt = '.1f')
plt.show()

In [None]:
# Training labels as a one-column DataFrame, plus its shape for a quick sanity check.
y1 = pd.DataFrame({"diagnosis": y_train})
print(y1.shape)

In [None]:
# Combine the principal components with their labels into one frame for plotting.
# Both frames describe the same training rows in the same order, so a column-wise
# concat on a fresh 0..n-1 index lines them up row by row.  This replaces the
# helper-key merge, which hard-coded the training-set size (398) and left a
# stray 'common' column behind in X1 and y1.
dataset = pd.concat([X1.reset_index(drop = True), y1.reset_index(drop = True)], axis = 1)
dataset.shape

In [None]:
# Pairwise scatter plots of the principal components, coloured by diagnosis.
sns.pairplot(data = dataset, hue = 'diagnosis')

In [None]:
# 3D scatter of the first three principal components, one colour per diagnosis class.
# The notebook uses the static inline backend, so the figure is drawn once from a
# fixed viewpoint; the former 360-step plt.pause() rotation loop could not animate
# there.  To rotate the plot interactively, enable an interactive backend first:
# %matplotlib notebook
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')

for x in dataset.diagnosis.unique():
    in_class = dataset.diagnosis == x  # rows that belong to the current class
    ax.scatter(dataset.PC1[in_class], dataset.PC2[in_class], dataset.PC3[in_class], label = x)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
# The scatter labels are only visible once a legend is drawn.
ax.legend(title = "diagnosis")

# Elevation 30 degrees (as before), azimuth 45 degrees; change the azimuth to inspect other angles.
ax.view_init(30, 45)

plt.show()

The 3D visualization above shows that the derived principal components separate the two target classes.

In [None]:
# Apply the scaling and the PCA projection learned on the training split to the test split.
# The scaled data gets its own name instead of overwriting X_test, so re-running
# this cell does not scale the test set a second time.
X_test_scaled = scaler.transform(X_test)
# transforming test data with the PCA parameters obtained above
X_test_pca = pca.transform(X_test_scaled)

# Training and Validating Models

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings, trained on the PCA features.
logreg = LogisticRegression()
logreg.fit(X_train_pca, y_train)

# Predict the held-out test split and report the usual classification metrics.
yl_pred = logreg.predict(X_test_pca)

logreg_report = (
    ('Confusion Matrix:\n', confusion_matrix(y_test, yl_pred)),
    ("Accuracy score:", accuracy_score(y_test, yl_pred)),
    ('Classification Report:\n', classification_report(y_test, yl_pred)),
)
for heading, value in logreg_report:
    print(heading, value)

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for logistic regression: L2 penalty with several regularisation
# strengths, solvers and iteration limits.
param_grid = [{
    'penalty': ['l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'sag', 'newton-cg', 'lbfgs'],
    'max_iter': [100, 1000, 2500, 5000],
}]

# Exhaustive search scored by accuracy with 10-fold cross-validation on the training split.
grid = GridSearchCV(estimator = logreg, param_grid = param_grid, scoring = 'accuracy', cv = 10, verbose = True)
grid.fit(X_train_pca, y_train)

In [None]:
# hyper-parameter combination with the best cross-validated accuracy in the search above
print(grid.best_params_)

### After Hyperparameter Tuning

In [None]:
# Logistic regression with the tuned settings, re-evaluated on the test split.
# NOTE(review): these values are hard-coded, presumably copied from grid.best_params_ —
# confirm they still match after re-running the search.
logreg_params = dict(C = 0.1, max_iter = 100, penalty = 'l2', solver = 'liblinear')
logreg = LogisticRegression(**logreg_params)
logreg.fit(X_train_pca, y_train)

yl_pred = logreg.predict(X_test_pca)

tuned_logreg_report = (
    ('Confusion Matrix:\n', confusion_matrix(y_test, yl_pred)),
    ("Accuracy score:", accuracy_score(y_test, yl_pred)),
    ('Classification Report:\n', classification_report(y_test, yl_pred)),
)
for heading, value in tuned_logreg_report:
    print(heading, value)

In [None]:
%matplotlib inline
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(logreg, X_train_pca, y_train, cv=10, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 50))
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.subplots(1, figsize=(10,10))
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Accuracy Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
# Stratified 10-fold cross-validation of the logistic regression on the full data set.

X1 = pd.DataFrame(X)
y1 = pd.DataFrame(y)

accuracyl = []

skf = StratifiedKFold(n_splits = 10, random_state = None)
for train_index, test_index in skf.split(X1, y1):
    X1_train, X1_test = X1.iloc[train_index], X1.iloc[test_index]
    # ravel() hands the estimator a 1-D label vector instead of an (n, 1) frame
    y1_train = y1.iloc[train_index].values.ravel()
    y1_test = y1.iloc[test_index].values.ravel()

    # Scaling, PCA and the classifier are rebuilt for every fold and fitted on the
    # training part of that fold only.  Working on clones leaves the notebook-level
    # `scaler`, `pca` and `logreg` untouched, so the evaluation cells further down
    # keep using the objects that were fitted on the hold-out training split.
    fold_model = make_pipeline(clone(scaler), PCA(n_components = 10), clone(logreg))
    fold_model.fit(X1_train, y1_train)
    fold_pred = fold_model.predict(X1_test)
    accuracyl.append(accuracy_score(y1_test, fold_pred))  # (y_true, y_pred) order


# Mean accuracy over the 10 folds
print(np.array(accuracyl).mean())

## SVM 

In [None]:
from sklearn import svm

# Baseline: support vector classifier with default settings, trained on the PCA features.
sclf = svm.SVC()
sclf.fit(X_train_pca, y_train)

# Predict the held-out test split and report the usual classification metrics.
ys_pred = sclf.predict(X_test_pca)

svm_report = (
    ('Confusion Matrix:\n', confusion_matrix(y_test, ys_pred)),
    ("Accuracy score:", accuracy_score(y_test, ys_pred)),
    ('Classification Report:\n', classification_report(y_test, ys_pred)),
)
for heading, value in svm_report:
    print(heading, value)

### Hyperparameter Tuning

In [None]:
# Search space for the SVM: regularisation strength, kernel coefficient and kernel type.
param_grid = [{
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear', 'poly'],
}]

# Exhaustive search scored by accuracy with 10-fold cross-validation on the training split.
grid = GridSearchCV(estimator = sclf, param_grid = param_grid, scoring = 'accuracy', cv = 10, verbose = True)
grid.fit(X_train_pca, y_train)

In [None]:
# hyper-parameter combination with the best cross-validated accuracy in the SVM search above
print(grid.best_params_)

### After Hyperparameter Tuning

In [None]:
# SVM with the tuned settings, re-evaluated on the test split.
# probability = True enables predict_proba, which the ROC evaluation needs.
# NOTE(review): these values are hard-coded, presumably copied from grid.best_params_ —
# confirm they still match after re-running the search.
svm_params = dict(C = 1000, gamma = 0.0001, kernel = 'rbf', probability = True)
sclf = svm.SVC(**svm_params)
sclf.fit(X_train_pca, y_train)

ys_pred = sclf.predict(X_test_pca)

tuned_svm_report = (
    ('Confusion Matrix:\n', confusion_matrix(y_test, ys_pred)),
    ("Accuracy score:", accuracy_score(y_test, ys_pred)),
    ('Classification Report:\n', classification_report(y_test, ys_pred)),
)
for heading, value in tuned_svm_report:
    print(heading, value)

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve of the SVM: 10-fold CV accuracy for 50 training-set sizes
# between 1% and 100% of the available training data.
train_sizes, train_scores, test_scores = learning_curve(
    sclf, X_train_pca, y_train,
    train_sizes = np.linspace(0.01, 1.0, 50),
    cv = 10, scoring = 'accuracy', n_jobs = -1,
)

# Mean and spread across the folds for every training-set size.
train_mean, train_std = train_scores.mean(axis = 1), train_scores.std(axis = 1)
test_mean, test_std = test_scores.mean(axis = 1), test_scores.std(axis = 1)

lc_fig, lc_ax = plt.subplots(1, figsize = (10, 10))
lc_ax.plot(train_sizes, train_mean, '--', color = "#111111", label = "Training score")
lc_ax.plot(train_sizes, test_mean, color = "#111111", label = "Cross-validation score")

# Shaded bands: one standard deviation around each mean curve.
lc_ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color = "#DDDDDD")
lc_ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color = "#DDDDDD")

lc_ax.set_title("Learning Curve")
lc_ax.set_xlabel("Training Set Size")
lc_ax.set_ylabel("Accuracy Score")
lc_ax.legend(loc = "best")
plt.tight_layout()
plt.show()

In [None]:
# Stratified 10-fold cross-validation of the SVM on the full data set (X1, y1).
accuracys = []

skf = StratifiedKFold(n_splits = 10, random_state = None)
for train_index, test_index in skf.split(X1, y1):
    X1_train, X1_test = X1.iloc[train_index], X1.iloc[test_index]
    # ravel() hands the estimator a 1-D label vector instead of an (n, 1) frame
    y1_train = y1.iloc[train_index].values.ravel()
    y1_test = y1.iloc[test_index].values.ravel()

    # Scaling, PCA and the classifier are rebuilt for every fold and fitted on the
    # training part of that fold only.  Working on clones leaves the notebook-level
    # `scaler`, `pca` and `sclf` untouched, so the evaluation cells further down
    # keep using the objects that were fitted on the hold-out training split.
    fold_model = make_pipeline(clone(scaler), PCA(n_components = 10), clone(sclf))
    fold_model.fit(X1_train, y1_train)
    fold_pred = fold_model.predict(X1_test)
    accuracys.append(accuracy_score(y1_test, fold_pred))  # (y_true, y_pred) order


# Mean accuracy over the 10 folds
print(np.array(accuracys).mean())

## Naive Bayes 

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Baseline: Bernoulli naive Bayes with default settings, trained on the PCA features.
BernNb = BernoulliNB()
BernNb.fit(X_train_pca, y_train)

# Predict the held-out test split and report the usual classification metrics.
yn_pred = BernNb.predict(X_test_pca)

nb_report = (
    ('Confusion Matrix:\n', confusion_matrix(y_test, yn_pred)),
    ("Accuracy score:", accuracy_score(y_test, yn_pred)),
    ('Classification Report:\n', classification_report(y_test, yn_pred)),
)
for heading, value in nb_report:
    print(heading, value)

### Hyperparameter Tuning

In [None]:
# Search space for Bernoulli naive Bayes: binarisation threshold, smoothing
# strength and how the class priors are obtained.
param_grid = [{
    'binarize': [0.0, 0.001, 0.01, 0.1, 1, 10, 100],
    'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0],
    'fit_prior': [True, False],
    'class_prior': [None, [.1, .9], [.2, .8]],
}]

# Exhaustive search scored by accuracy with 10-fold cross-validation on the training split.
grid = GridSearchCV(estimator = BernNb, param_grid = param_grid, scoring = 'accuracy', cv = 10, verbose = True)
grid.fit(X_train_pca, y_train)

In [None]:
# hyper-parameter combination with the best cross-validated accuracy in the naive Bayes search above
print(grid.best_params_)

### After Hyperparameter Tuning

In [None]:
# Bernoulli naive Bayes with the tuned settings, re-evaluated on the test split.
# NOTE(review): these values are hard-coded, presumably copied from grid.best_params_ —
# confirm they still match after re-running the search.
nb_params = dict(alpha = 10, binarize = 0.0, fit_prior = True)
BernNb = BernoulliNB(**nb_params)
BernNb.fit(X_train_pca, y_train)

yn_pred = BernNb.predict(X_test_pca)

tuned_nb_report = (
    ('Confusion Matrix:\n', confusion_matrix(y_test, yn_pred)),
    ("Accuracy score:", accuracy_score(y_test, yn_pred)),
    ('Classification Report:\n', classification_report(y_test, yn_pred)),
)
for heading, value in tuned_nb_report:
    print(heading, value)

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve of the naive Bayes model: 10-fold CV accuracy for 50 training-set
# sizes between 1% and 100% of the available training data.
train_sizes, train_scores, test_scores = learning_curve(
    BernNb, X_train_pca, y_train,
    train_sizes = np.linspace(0.01, 1.0, 50),
    cv = 10, scoring = 'accuracy', n_jobs = -1,
)

# Mean and spread across the folds for every training-set size.
train_mean, train_std = train_scores.mean(axis = 1), train_scores.std(axis = 1)
test_mean, test_std = test_scores.mean(axis = 1), test_scores.std(axis = 1)

lc_fig, lc_ax = plt.subplots(1, figsize = (10, 10))
lc_ax.plot(train_sizes, train_mean, '--', color = "#111111", label = "Training score")
lc_ax.plot(train_sizes, test_mean, color = "#111111", label = "Cross-validation score")

# Shaded bands: one standard deviation around each mean curve.
lc_ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color = "#DDDDDD")
lc_ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color = "#DDDDDD")

lc_ax.set_title("Learning Curve")
lc_ax.set_xlabel("Training Set Size")
lc_ax.set_ylabel("Accuracy Score")
lc_ax.legend(loc = "best")
plt.tight_layout()
plt.show()

In [None]:
# Stratified 10-fold cross-validation of the naive Bayes model on the full data set (X1, y1).
accuracyn = []

skf = StratifiedKFold(n_splits = 10, random_state = None)
for train_index, test_index in skf.split(X1, y1):
    X1_train, X1_test = X1.iloc[train_index], X1.iloc[test_index]
    # ravel() hands the estimator a 1-D label vector instead of an (n, 1) frame
    y1_train = y1.iloc[train_index].values.ravel()
    y1_test = y1.iloc[test_index].values.ravel()

    # Scaling, PCA and the classifier are rebuilt for every fold and fitted on the
    # training part of that fold only.  Working on clones leaves the notebook-level
    # `scaler`, `pca` and `BernNb` untouched, so the evaluation cells further down
    # keep using the objects that were fitted on the hold-out training split.
    fold_model = make_pipeline(clone(scaler), PCA(n_components = 10), clone(BernNb))
    fold_model.fit(X1_train, y1_train)
    fold_pred = fold_model.predict(X1_test)
    accuracyn.append(accuracy_score(y1_test, fold_pred))  # (y_true, y_pred) order


# Mean accuracy over the 10 folds
print(np.array(accuracyn).mean())

# Box Plot of Accuracies achieved from 10 Fold Stratified K Fold

In [None]:
# Per-fold accuracies of the three classifiers, one column per model.
# NOTE: this rebinds `df`, which held the data set earlier in the notebook.
fold_scores = {
    'Logistic Regression': accuracyl,
    'SVM': accuracys,
    'Naive Bayes': accuracyn,
}
df = pd.DataFrame(fold_scores)
df

In [None]:
# Box plot comparing the spread of the per-fold accuracies of the three classifiers.
classifier_names = ['Logistic Regression', 'SVM', 'Naive Bayes']
plt.figure()
plt.boxplot([df[name] for name in classifier_names]);
# Box positions are 1-based, one tick per classifier.
plt.xticks(range(1, len(classifier_names) + 1), classifier_names)
plt.xlabel('Classifier');
plt.ylabel('Mean Accuracy score')
plt.title('Comparing 10 fold CV accuracies of classifiers')

# Evaluation of Models using ROC

In [None]:
# Predicted probabilities and ROC AUC of the three tuned models on the test split.
# Fresh clones are fitted on the hold-out training data right here, so the scores
# do not depend on whatever data the shared estimator objects were last fitted on
# in earlier cells; X_test_pca lives in the feature space of X_train_pca.
roc_logreg = clone(logreg).fit(X_train_pca, y_train)
roc_svc = clone(sclf).fit(X_train_pca, y_train)
roc_nb = clone(BernNb).fit(X_train_pca, y_train)

# predict_proba returns one column per class; column 1 is the probability of the
# positive class (label 1 = malignant), which is what the ROC analysis needs.
lr_probs = roc_logreg.predict_proba(X_test_pca)[:, 1]
svc_probs = roc_svc.predict_proba(X_test_pca)[:, 1]
nb_probs = roc_nb.predict_proba(X_test_pca)[:, 1]

# calculating roc auc score to evaluate each model's performance
lr_auc = roc_auc_score(y_test, lr_probs)
svc_auc = roc_auc_score(y_test, svc_probs)
nb_auc = roc_auc_score(y_test, nb_probs)

In [None]:
# ROC AUC of each classifier on the test split
print("Logistic Regression: ", lr_auc)
print("SVM: ", svc_auc)
print("Naive Bayes: ", nb_auc)
# compare the printed scores to see which model ranks best (original note: Logistic Regression scored highest)

In [None]:
# False-positive rate, true-positive rate and decision thresholds of each model's ROC curve.
lr_fpr, lr_tpr, thr_l = roc_curve(y_true = y_test, y_score = lr_probs)
svc_fpr, svc_tpr, thr_s = roc_curve(y_true = y_test, y_score = svc_probs)
nb_fpr, nb_tpr, thr_n = roc_curve(y_true = y_test, y_score = nb_probs)

In [None]:
# ROC curves of the three classifiers in one figure.
plt.figure(figsize = (10, 10))

# (false-positive rates, true-positive rates, legend label, line colour) per model
roc_lines = [
    (lr_fpr, lr_tpr, 'Logistic Regression', 'green'),
    (svc_fpr, svc_tpr, 'SVM', 'blue'),
    (nb_fpr, nb_tpr, 'Naive Bayes', 'red'),
]
for curve_fpr, curve_tpr, curve_label, curve_colour in roc_lines:
    plt.plot(curve_fpr, curve_tpr, marker = ".", label = curve_label, color = curve_colour)

plt.title('ROC Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()