 ## IMPORTING LIBRARIES

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


## CREATING PIPELINES

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single-step preprocessing pipeline: standardise every feature column.
my_pipeline = Pipeline(steps=[("std_scaler", StandardScaler())])

In [None]:
# Load the raw CSV into a DataFrame. The relative path presumably targets a
# Kaggle-style `../input` directory -- adjust it when running elsewhere.
dataset = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

## Exploratory Data Analysis

In [None]:
# Preview the first rows to check column names and value formats.
dataset.head()

In [None]:
# Remove the 'Unnamed: 32' column. errors='ignore' keeps this cell re-runnable:
# executing it a second time no longer raises KeyError once the column is gone.
dataset = dataset.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
dataset.describe()

In [None]:
# True if at least one cell anywhere in the frame is missing.
dataset.isna().to_numpy().any()

No null values were detected in the dataset, so we can move ahead.

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

In [None]:
# (number of rows, number of columns) of the frame.
dataset.shape

In [None]:
# Class balance: how many rows carry each value of the `diagnosis` column.
dataset['diagnosis'].value_counts()

## Plotting Features in the form of Histograms

In [None]:
# One 50-bin histogram per numeric column, all drawn on a single 20x15 figure.
dataset.hist(
    bins=50,
    figsize=(20, 15),
    color='violet',
    lw=0,
)

## Splitting the data into test & train set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Positional layout used here: column 1 holds the target label and every
# column from index 2 onwards is a feature.
X = dataset.iloc[:, 2:].to_numpy()

# Encode the string labels as integers; LabelEncoder numbers the classes in
# sorted order of their names.
le = LabelEncoder()
Y = le.fit_transform(dataset.iloc[:, 1].to_numpy())

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train_set, X_test_set, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
print(f"Rows in X train set: {len(X_train_set)}\nRows in X test set: {len(X_test_set)}")

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both splits so no test-set information leaks in.
X_train = my_pipeline.fit(X_train_set).transform(X_train_set)
X_test = my_pipeline.transform(X_test_set)

In [None]:
# Correlate the numeric columns only: DataFrame.corr() raises on string columns
# (such as `diagnosis`) from pandas 2.0 onwards. The matrix is computed once
# and reused for both the mask and the plot.
corr_matrix = dataset.select_dtypes(include="number").corr()

# Boolean mask hiding the upper triangle (diagonal included), which only
# mirrors the lower one. A boolean mask does not depend on the correlation
# values themselves (an exact 0.0 would otherwise slip through the mask).
matrix = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(20, 12))
sns.heatmap(corr_matrix, annot=True, linewidths=1, mask=matrix, cmap="magma");

#**We can see that some features have very strong positive correlations and some have very strong negative correlations, so we plot these features. The features showing these correlations are perimeter_mean, radius_mean, area_mean, perimeter_worst, radius_worst, fractal_dimension_worst, fractal_dimension_mean, smoothness_se and symmetry_se. We now plot these features against each other.**

In [None]:
# (x, y) column pairs, listed in row-major order of the 2x4 panel grid.
feature_pairs = [
    ('perimeter_mean', 'radius_mean'),
    ('area_mean', 'radius_mean'),
    ('area_mean', 'perimeter_mean'),
    ('perimeter_worst', 'radius_worst'),
    ('fractal_dimension_mean', 'area_mean'),
    ('fractal_dimension_worst', 'area_worst'),
    ('smoothness_se', 'radius_worst'),
    ('symmetry_se', 'radius_worst'),
]

fig, ax = plt.subplots(2, 4, figsize=(18, 12))
# ax.flat walks the axes row by row, matching the order of feature_pairs.
for panel, (x_col, y_col) in zip(ax.flat, feature_pairs):
    sns.scatterplot(x=x_col, y=y_col, hue="diagnosis",
                    data=dataset, ax=panel, palette='magma')


## EVALUATING EACH MODEL:

 ## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Fit logistic regression on the scaled training split.
model1 = LogisticRegression(random_state=0)
model1.fit(X_train, Y_train)

print('''Prediciting Test Set Result for Logistic Regression''')
Y_pred = model1.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
result = np.concatenate((Y_pred.reshape(len(Y_pred), 1), Y_test.reshape(len(Y_test), 1)), 1)
print(result, '\n')

print('''Making Confusion Matrix''')
# Rows are true labels, columns are predicted labels. LabelEncoder numbers the
# classes in sorted order, so 0 = 'B' and 1 = 'M'; 'M' is treated as the
# positive class. labels=[0, 1] guarantees a 2x2 matrix.
cm = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
print(cm, '\n')
print('True Positives :', cm[1][1])
print('False Positives :', cm[0][1])
print('False Negatives :', cm[1][0])
print('True Negatives :', cm[0][0], '\n')

print('''Classification Report''')
# target_names must follow the encoded label order (0 -> 'B', 1 -> 'M').
print(classification_report(Y_test, Y_pred, target_names=['B', 'M'], zero_division=1))

print('''Evaluating Logistic Regression Model Performance''')
accuracy = accuracy_score(Y_test, Y_pred)
print(accuracy, '\n')

print('''Applying Cross validation''')
# 10-fold cross-validated accuracy on the training split, computed once.
# `scores1` is kept as an alias because the original cell defined both names.
accuracies = cross_val_score(model1, X_train, Y_train, scoring="accuracy", cv=10)
scores1 = accuracies
print("Accuracy for Logistic Regression: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation for Logistic Regression: {:.2f} %".format(accuracies.std()*100), '\n')

## Decision Tree Classifier Model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the scaled training split. A fixed random_state makes
# the tree (and every score below) reproducible across runs.
model2 = DecisionTreeClassifier(random_state=0)
model2.fit(X_train, Y_train)

print('''Prediciting Test Set Result for Decision Tree Classifier''')
Y_pred2 = model2.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
result2 = np.concatenate((Y_pred2.reshape(len(Y_pred2), 1), Y_test.reshape(len(Y_test), 1)), 1)
print(result2, '\n')

print('''Making Confusion Matrix''')
# Rows are true labels, columns are predicted labels. LabelEncoder numbers the
# classes in sorted order, so 0 = 'B' and 1 = 'M'; 'M' is treated as the
# positive class. labels=[0, 1] guarantees a 2x2 matrix.
cm2 = confusion_matrix(Y_test, Y_pred2, labels=[0, 1])
print(cm2, '\n')
print('True Positives :', cm2[1][1])
print('False Positives :', cm2[0][1])
print('False Negatives :', cm2[1][0])
print('True Negatives :', cm2[0][0], '\n')

print('''Classification Report''')
# target_names must follow the encoded label order (0 -> 'B', 1 -> 'M').
print(classification_report(Y_test, Y_pred2, target_names=['B', 'M'], zero_division=1))

print('''Evaluating Decision Tree Classifier Model Performance''')
accuracy2 = accuracy_score(Y_test, Y_pred2)
print(accuracy2, '\n')

print('''Applying Cross validation''')
# 10-fold cross-validated accuracy on the training split, computed once.
# `scores2` is kept as an alias because the original cell defined both names.
accuracies2 = cross_val_score(model2, X_train, Y_train, scoring="accuracy", cv=10)
scores2 = accuracies2
print("Accuracy for Decision Tree: {:.2f} %".format(accuracies2.mean()*100))
print("Standard Deviation for Decision Tree: {:.2f} %".format(accuracies2.std()*100), '\n')

## Random Forest Classifier Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Fit a random forest on the scaled training split. A fixed random_state makes
# the forest (and every score below) reproducible across runs.
model3 = RandomForestClassifier(random_state=0)
model3.fit(X_train, Y_train)

print('''Prediciting Test Set Result for Random Forest Classifier''')
Y_pred3 = model3.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
result3 = np.concatenate((Y_pred3.reshape(len(Y_pred3), 1), Y_test.reshape(len(Y_test), 1)), 1)
print(result3, '\n')

print('''Making Confusion Matrix''')
# Rows are true labels, columns are predicted labels. LabelEncoder numbers the
# classes in sorted order, so 0 = 'B' and 1 = 'M'; 'M' is treated as the
# positive class. labels=[0, 1] guarantees a 2x2 matrix.
cm3 = confusion_matrix(Y_test, Y_pred3, labels=[0, 1])
print(cm3, '\n')
print('True Positives :', cm3[1][1])
print('False Positives :', cm3[0][1])
print('False Negatives :', cm3[1][0])
print('True Negatives :', cm3[0][0], '\n')

print('''Classification Report''')
# target_names must follow the encoded label order (0 -> 'B', 1 -> 'M').
print(classification_report(Y_test, Y_pred3, target_names=['B', 'M'], zero_division=1))

print('''Evaluating Random Forest Classifier Model Performance''')
accuracy3 = accuracy_score(Y_test, Y_pred3)
print(accuracy3, '\n')

print('''Applying Cross validation''')
# 10-fold cross-validated accuracy on the training split, computed once (the
# forest is the most expensive model here, so the duplicate run is dropped).
# `scores3` is kept as an alias because the original cell defined both names.
accuracies3 = cross_val_score(model3, X_train, Y_train, scoring="accuracy", cv=10)
scores3 = accuracies3
print("Accuracy for Random Forest Classifier: {:.2f} %".format(accuracies3.mean()*100))
print("Standard Deviation for Random Forrest Classifier: {:.2f} %".format(accuracies3.std()*100), '\n')

## Support Vector Machines Model

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Fit a support vector classifier with a linear kernel on the scaled training split.
model4 = svm.SVC(kernel='linear')
model4.fit(X_train, Y_train)

print('''Prediciting Test Set Result for SVM''')
Y_pred4 = model4.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
result4 = np.concatenate((Y_pred4.reshape(len(Y_pred4), 1), Y_test.reshape(len(Y_test), 1)), 1)
print(result4, '\n')

print('''Making Confusion Matrix''')
# Rows are true labels, columns are predicted labels. LabelEncoder numbers the
# classes in sorted order, so 0 = 'B' and 1 = 'M'; 'M' is treated as the
# positive class. labels=[0, 1] guarantees a 2x2 matrix.
cm4 = confusion_matrix(Y_test, Y_pred4, labels=[0, 1])
print(cm4, '\n')
print('True Positives :', cm4[1][1])
print('False Positives :', cm4[0][1])
print('False Negatives :', cm4[1][0])
print('True Negatives :', cm4[0][0], '\n')

print('''Classification Report''')
# target_names must follow the encoded label order (0 -> 'B', 1 -> 'M').
print(classification_report(Y_test, Y_pred4, target_names=['B', 'M'], zero_division=1))

print('''Evaluating SVM Performance''')
accuracy4 = accuracy_score(Y_test, Y_pred4)
print(accuracy4, '\n')

print('''Applying Cross validation''')
# 10-fold cross-validated accuracy on the training split, computed once.
# `scores4` is kept as an alias because the original cell defined both names.
accuracies4 = cross_val_score(model4, X_train, Y_train, scoring="accuracy", cv=10)
scores4 = accuracies4
print("Accuracy for SVM: {:.2f} %".format(accuracies4.mean()*100))
print("Standard Deviation for SVM: {:.2f} %".format(accuracies4.std()*100), '\n')

## K Neighbors Classifier Model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Fit an 8-nearest-neighbours classifier on the scaled training split.
model5 = KNeighborsClassifier(n_neighbors=8)
model5.fit(X_train, Y_train)

print('''Prediciting Test Set Result for KNeighbors Classifier''')
Y_pred5 = model5.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
result5 = np.concatenate((Y_pred5.reshape(len(Y_pred5), 1), Y_test.reshape(len(Y_test), 1)), 1)
print(result5, '\n')

print('''Making Confusion Matrix''')
# Rows are true labels, columns are predicted labels. LabelEncoder numbers the
# classes in sorted order, so 0 = 'B' and 1 = 'M'; 'M' is treated as the
# positive class. labels=[0, 1] guarantees a 2x2 matrix.
cm5 = confusion_matrix(Y_test, Y_pred5, labels=[0, 1])
print(cm5, '\n')
print('True Positives :', cm5[1][1])
print('False Positives :', cm5[0][1])
print('False Negatives :', cm5[1][0])
print('True Negatives :', cm5[0][0], '\n')

print('''Classification Report''')
# target_names must follow the encoded label order (0 -> 'B', 1 -> 'M').
print(classification_report(Y_test, Y_pred5, target_names=['B', 'M'], zero_division=1))

print('''Evaluating K Neighbours Classifier Model Performance''')
accuracy5 = accuracy_score(Y_test, Y_pred5)
print(accuracy5, '\n')

print('''Applying Cross validation''')
# 10-fold cross-validated accuracy on the training split, computed once.
# `scores5` is kept as an alias because the original cell defined both names.
accuracies5 = cross_val_score(model5, X_train, Y_train, scoring="accuracy", cv=10)
scores5 = accuracies5
print("Accuracy for K  Neighbours Classifier: {:.2f} %".format(accuracies5.mean()*100))
print("Standard Deviation for K neighbours Classifiers: {:.2f} %".format(accuracies5.std()*100), '\n')

## Naive Bayes Classifier Model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the scaled training split.
model6 = GaussianNB()
model6.fit(X_train, Y_train)

print('''Prediciting Test Set Result for Naive Bayes''')
Y_pred6 = model6.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
result6 = np.concatenate((Y_pred6.reshape(len(Y_pred6), 1), Y_test.reshape(len(Y_test), 1)), 1)
print(result6, '\n')

print('''Making Confusion Matrix''')
# The original cell re-ran predict() into `Y_pred` here, overwriting the
# logistic-regression predictions; only `Y_pred6` is used now.
# Rows are true labels, columns are predicted labels. LabelEncoder numbers the
# classes in sorted order, so 0 = 'B' and 1 = 'M'; 'M' is treated as the
# positive class. labels=[0, 1] guarantees a 2x2 matrix.
cm6 = confusion_matrix(Y_test, Y_pred6, labels=[0, 1])
print(cm6, '\n')
print('True Positives :', cm6[1][1])
print('False Positives :', cm6[0][1])
print('False Negatives :', cm6[1][0])
print('True Negatives :', cm6[0][0], '\n')

print('''Classification Report''')
# target_names must follow the encoded label order (0 -> 'B', 1 -> 'M').
print(classification_report(Y_test, Y_pred6, target_names=['B', 'M'], zero_division=1))

print('''Evaluating Naive Bayes Model Performance''')
accuracy6 = accuracy_score(Y_test, Y_pred6)
print(accuracy6, '\n')

print('''Applying Cross validation''')
# 10-fold cross-validated accuracy on the training split, computed once.
# `scores6` is kept as an alias because the original cell defined both names.
accuracies6 = cross_val_score(model6, X_train, Y_train, scoring="accuracy", cv=10)
scores6 = accuracies6
print("Accuracy for Naive Bayes: {:.2f} %".format(accuracies6.mean()*100))
print("Standard Deviation for Naive Bayes: {:.2f} %".format(accuracies6.std()*100), '\n')

# Finding out which model works best for our dataset

In [None]:
# Display names and mean 10-fold CV accuracies (in percent), both listed in
# the order the models were fitted.
model_names = ['LogisticRegression','Decisiontree','RandomForest','SVM', 'KNN','Naive Bayes']
model_accuracies = [
    cv_scores.mean()*100
    for cv_scores in (accuracies, accuracies2, accuracies3,
                      accuracies4, accuracies5, accuracies6)
]

# Horizontal bar chart: one bar per model, bar length = mean CV accuracy.
plt.figure(figsize=(10, 6))
sns.barplot(x=model_accuracies, y=model_names, palette='magma');

In [None]:
# Print each model's mean cross-validated accuracy next to its name.
for name, mean_accuracy in zip(model_names, model_accuracies):
    print(name, 'Model Accuracy is:', mean_accuracy, '%')

**The Logistic Regression Model works best here for our dataset**