In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## load the dataset

In [None]:
# Read the breast-cancer CSV from the Kaggle input folder and preview the first rows.
csv_path = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
## `id` and `Unnamed: 32` are not useful features, so drop them from the data

In [None]:
# Remove the `id` and `Unnamed: 32` columns, which this notebook does not use as features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the cell
# idempotent: re-running it no longer raises KeyError for columns that are already gone.
data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the frame's columns.
data.describe()

In [None]:
## inspect the target feature: number of samples per diagnosis class
data['diagnosis'].value_counts()

## Exploratory data analysis

In [None]:
# Bar chart of the number of samples in each diagnosis class.
# The variable is passed as `x=` explicitly: current seaborn releases no longer
# interpret the first positional argument as the x variable.
sns.countplot(x=data['diagnosis'])

In [None]:
## check the distribution of some features
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a density
# scale draws the equivalent histogram-plus-curve figure.
sns.histplot(data['radius_mean'], kde=True, stat='density')
# The y-axis is a density (it can exceed 1), so label it as such.
plt.ylabel("probability density")

In [None]:
# Distribution of texture_mean: histogram on a density scale with a KDE overlay.
# `histplot` replaces the deprecated `distplot` call.
sns.histplot(data['texture_mean'], kde=True, stat='density')

In [None]:
# Distribution of concavity_mean: histogram on a density scale with a KDE overlay.
# `histplot` replaces the deprecated `distplot` call.
sns.histplot(data['concavity_mean'], kde=True, stat='density')

In [None]:
# Scatter radius_mean against texture_mean with one colour per diagnosis class.
grid = sns.FacetGrid(data, hue='diagnosis')
grid.map(plt.scatter, 'radius_mean', 'texture_mean')
grid.add_legend()  ## the diagnosis classes overlap a lot in this view

## identify the correlated feature 

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric features.
# Non-numeric columns (presumably the `diagnosis` labels) cannot be correlated, and
# recent pandas raises on them instead of silently skipping them, so restrict the
# frame to numeric columns before calling corr().
plt.figure(figsize=(20,15))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap='viridis')

## Check for highly correlated features

In [None]:
# Absolute pairwise correlations, computed on the numeric columns only
# (corr() on a frame containing non-numeric columns raises on recent pandas).
corr_matrix = data.select_dtypes(include='number').corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair of
# features is looked at once. The builtin `bool` is used because the `np.bool`
# alias was removed from NumPy.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Flag every column whose correlation with some earlier column exceeds 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [None]:
# Display the columns that were flagged as highly correlated.
data.loc[:, to_drop]

In [None]:
## compare the flagged features against the target feature (diagnosis)
# Only one variable is handed to plt.plot, so area_mean supplies the y-values;
# each diagnosis class gets its own colour.
grid = sns.FacetGrid(data, hue='diagnosis')
grid.map(plt.plot, 'area_mean')
grid.add_legend()

In [None]:
# Line plot of perimeter_mean (as the y-values), one colour per diagnosis class.
grid = sns.FacetGrid(data, hue='diagnosis')
grid.map(plt.plot, 'perimeter_mean')
grid.add_legend()

In [None]:
# Line plot of perimeter_worst (as the y-values), one colour per diagnosis class.
grid = sns.FacetGrid(data, hue='diagnosis')
grid.map(plt.plot, 'perimeter_worst')
grid.add_legend()

In [None]:
# Line plot of area_worst (as the y-values), one colour per diagnosis class.
grid = sns.FacetGrid(data, hue='diagnosis')
grid.map(plt.plot, 'area_worst')
grid.add_legend()

#### Each feature in `to_drop` has an absolute correlation above 0.95 with another feature, so it carries largely redundant information; drop these features from the dataset

In [None]:
# Remove the highly correlated columns collected in `to_drop`.
# Rebinding with errors='ignore' (rather than inplace=True) makes the cell safe to
# re-run: columns that were already removed no longer raise KeyError.
data = data.drop(columns=to_drop, errors='ignore')

In [None]:
# (rows, columns) of the frame after the correlated columns were dropped.
data.shape

In [None]:
# Pairwise plots of the remaining features, coloured by diagnosis.
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was deprecated and is rejected by newer seaborn releases.
sns.pairplot(data, hue='diagnosis', height=7)

### Split the data into the independent features (x) and the dependent target (y)

In [None]:
# Target vector: the diagnosis label; feature matrix: every other column.
y = data['diagnosis']
x = data.drop(columns=['diagnosis'])


In [None]:
# Print the shape first and leave the preview as the cell's last expression:
# a bare `x.head()` that is followed by another statement is never displayed.
print(x.shape)
x.head()

## Apply the machine learning model

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

## apply the preprocessing
# Fit the scaler on the training split only and reuse its mean / standard deviation
# for the test split. Calling fit_transform on the test data would re-estimate the
# statistics from the test set, scaling the two splits inconsistently and leaking
# test-set information into the evaluation.
scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(x_train)
x_test_scaler = scaler.transform(x_test)


#### Logistic regression

In [None]:
# Fit a default logistic-regression classifier on the scaled training features,
# predict the test split, and report the accuracy on both splits.
logistic = LogisticRegression().fit(x_train_scaler, y_train)
predict1 = logistic.predict(x_test_scaler)

train_accuracy = logistic.score(x_train_scaler, y_train)
test_accuracy = logistic.score(x_test_scaler, y_test)
print("training set score:", train_accuracy)
print('testing set score :', test_accuracy)

In [None]:
# Evaluate the logistic-regression predictions on the test split:
# confusion counts first, then the per-class precision / recall / F1 report.
logistic_cm = confusion_matrix(y_test, predict1)
logistic_report = classification_report(y_test, predict1)
print("confusion matrix : ", logistic_cm)
print("classification report :", logistic_report)

## KNeighborsClassifier

In [None]:
# Fit a 3-nearest-neighbours classifier on the scaled training features,
# predict the test split, and report the accuracy on both splits.
neighbor = KNeighborsClassifier(n_neighbors=3).fit(x_train_scaler, y_train)
predict2 = neighbor.predict(x_test_scaler)

for caption, features, labels in (
    ("training set score:", x_train_scaler, y_train),
    ('testing set score :', x_test_scaler, y_test),
):
    print(caption, neighbor.score(features, labels))

In [None]:
# Evaluate the k-nearest-neighbours predictions on the test split:
# confusion counts first, then the per-class precision / recall / F1 report.
knn_cm = confusion_matrix(y_test, predict2)
knn_report = classification_report(y_test, predict2)
print("confusion matrix : ", knn_cm)
print("classification report :", knn_report)

## Decision tree classifier

In [None]:
# Fit a depth-limited decision tree on the scaled training features, predict the
# test split, and report the accuracy on both splits.
# random_state is fixed (matching the seed used for the train/test split) because
# the tree builder shuffles candidate features at each split; without a seed the
# fitted tree, and therefore the scores, can differ between runs.
decision = DecisionTreeClassifier(max_depth=3, random_state=0)
decision.fit(x_train_scaler, y_train)
predict3 = decision.predict(x_test_scaler)
print("training set score:", decision.score(x_train_scaler, y_train))
print('testing set score :', decision.score(x_test_scaler, y_test))

In [None]:
# Evaluate the decision-tree predictions on the test split:
# confusion counts first, then the per-class precision / recall / F1 report.
tree_cm = confusion_matrix(y_test, predict3)
tree_report = classification_report(y_test, predict3)
print("confusion matrix : ", tree_cm)
print("classification report :", tree_report)

## random forest Classifier

In [None]:
# Fit a random forest of depth-limited trees on the scaled training features,
# predict the test split, and report the accuracy on both splits.
# random_state is fixed (matching the seed used for the train/test split): the
# forest relies on bootstrap sampling and random feature subsets, so without a seed
# the reported scores change on every run and cannot be compared between models.
random_forest = RandomForestClassifier(max_depth=3, random_state=0)
random_forest.fit(x_train_scaler, y_train)
predict4 = random_forest.predict(x_test_scaler)
print("training set score:", random_forest.score(x_train_scaler, y_train))
print('testing set score :', random_forest.score(x_test_scaler, y_test))

In [None]:
# Evaluate the random-forest predictions on the test split:
# confusion counts first, then the per-class precision / recall / F1 report.
forest_cm = confusion_matrix(y_test, predict4)
forest_report = classification_report(y_test, predict4)
print("confusion matrix : ", forest_cm)
print("classification report :", forest_report)

## support vector classifier

In [None]:
# Fit a support-vector classifier (C=10) on the scaled training features,
# predict the test split, and report the accuracy on both splits.
svm = SVC(C=10).fit(x_train_scaler, y_train)
predict5 = svm.predict(x_test_scaler)

svm_train_accuracy = svm.score(x_train_scaler, y_train)
svm_test_accuracy = svm.score(x_test_scaler, y_test)
print("training set score:", svm_train_accuracy)
print('testing set score :', svm_test_accuracy)

In [None]:
# Evaluate the SVC predictions on the test split:
# confusion counts first, then the per-class precision / recall / F1 report.
svc_cm = confusion_matrix(y_test, predict5)
svc_report = classification_report(y_test, predict5)
print("confusion matrix : ", svc_cm)
print("classification report :", svc_report)

In [None]:
## The best accuracy score on the testing data is 97.07%, achieved by the SVC,
## so it is the best model

## best model 

In [None]:
## SVC
# Print the labels the SVC predicted for the test split.
print('prediction of SVC is :', predict5)