In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# Disable pandas' display truncation: None means "no limit", so every column
# and every row is rendered whenever a DataFrame is displayed.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file under the Kaggle input directory, printing the
# full path of each one.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")


In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
print("shape ",df.shape)

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Remove the 'Unnamed: 32' column (presumably an empty artifact column from the
# CSV export -- TODO confirm). errors='ignore' keeps this cell idempotent: the
# original in-place drop raised KeyError when the cell was run a second time.
df = df.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Summary statistics for every column, including non-numeric ones.
df.describe(include="all")

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

No null values here!!


In [None]:
# Use the conventional name 'target' for the label column.
df = df.rename(columns={'diagnosis': 'target'})

In [None]:
# Confirm the rename by previewing the first rows again.
df.head()

In [None]:
# Number of rows per diagnosis label.
df['target'].value_counts()

#### 357 benign, 212 malignant

In [None]:
# Encode the diagnosis label as an integer: malignant ("M") -> 1, benign ("B") -> 0.
#
# The encoding is skipped when the column is already numeric, so re-running this
# cell is a no-op. The original comprehension mapped everything that was not "M"
# to 0, which silently zeroed the whole column on a second run.
if not pd.api.types.is_numeric_dtype(df['target']):
    encoded_target = df['target'].map({"M": 1, "B": 0})
    if encoded_target.isnull().any():
        # Fail loudly on an unexpected label instead of treating it as benign.
        unexpected = df.loc[encoded_target.isnull(), 'target'].unique().tolist()
        raise ValueError("Unexpected diagnosis label(s): %r" % (unexpected,))
    df['target'] = encoded_target.astype(int)

In [None]:
# Plot each column in its own subplot against the row index, with a shared
# x-axis, on one tall (20 x 50) figure.
df.plot(subplots=True, sharex=True ,figsize=(20,50))

## Observation 
The data looks reasonably balanced, so we can move on to visualization.

## Data Visualization

In [None]:
# Bar chart of how many samples fall in each diagnosis class.
fig, ax = plt.subplots()
sns.countplot(data=df, x="target", ax=ax)
ax.set_title("Diagnosis M=1 , B=0")
plt.show()

In [None]:
# Drop the 'id' column before computing correlations. errors='ignore' makes the
# cell safe to re-run: the original in-place drop raised KeyError the second time.
df = df.drop(columns='id', errors='ignore')

# Pairwise correlation matrix of the remaining columns (features + target).
corr = df.corr()

In [None]:
import matplotlib.style as style

# Plot styling: matplotlib's ggplot sheet first, then seaborn's whitegrid on top.
style.use("ggplot")
sns.set_style('whitegrid')

# Annotated heatmap of the full correlation matrix on a 16 x 9 canvas.
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
# Correlation of every column with the target, largest value first.
df.corr()['target'].sort_values(ascending=False)

In [None]:
# Single-column heatmap: each column's correlation with the target, sorted
# from highest to lowest.
fig, ax = plt.subplots(figsize=(9, 13))
target_corr = df.corr()[['target']].sort_values(by='target', ascending=False)
sns.heatmap(target_corr, annot=True, cmap='BrBG', ax=ax)

concave points_worst, perimeter_worst, concave points_mean, radius_worst, and perimeter_mean
show the highest correlation with the target.

In [None]:
def Box_plots(df):
    """Draw a horizontal box plot for one column of data.

    Parameters
    ----------
    df : pandas.Series
        The values to plot. Despite the parameter name, the caller below
        passes a single column (a Series), not a whole DataFrame.
    """
    plt.figure(figsize=(10, 4))
    plt.title("Box Plot")
    # Pass the values by keyword. A bare positional argument is interpreted
    # differently across seaborn releases (as `x` in some, as `data` in
    # others), which changes the plot orientation; `x=` pins it to horizontal.
    sns.boxplot(x=df)
    plt.show()


# One box plot per column of the frame.
for i in df.columns:
    Box_plots(df[i])

We can see that there are some outliers in our data; we will have to remove them for better results.

In [None]:
# Feature matrix: every column except the label.
X = df.drop(columns="target")
# Label vector.
y = df["target"]

In [None]:
# Keep the feature column names as a plain Python list.
col = X.columns.tolist()

In [None]:
# (rows, columns) of the frame at this point.
df.shape

## Outliers 

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Fit a Local Outlier Factor detector with default settings on the features.
clf = LocalOutlierFactor()
# fit_predict returns one label per row (-1 = outlier, 1 = inlier per the
# scikit-learn docs). NOTE(review): y_pred is not used by any later cell
# visible here; the filtering below relies on negative_outlier_factor_ instead.
y_pred = clf.fit_predict(X)

In [None]:
# Per-row LOF score from the fitted detector.
X_score = clf.negative_outlier_factor_

# Index the scores by X's own row labels. The next cell removes rows with
# X.drop(outlier_index), which matches on labels; a default 0..n-1 index would
# only line up with X by coincidence (i.e. while X still has a pristine
# RangeIndex).
outlier_score = pd.DataFrame({"score": X_score}, index=X.index)

# Rows scoring below this cut-off are treated as outliers.
threshold = -2.0
filtre = outlier_score["score"] < threshold
outlier_index = outlier_score[filtre].index.tolist()

In [None]:
# Remove the flagged rows from both the features and the labels.
#
# Both are re-derived from `df` rather than from the previous X / y, so the
# cell can be re-run safely. The original overwrote X and turned y into a
# NumPy array, so a second run failed (ndarray has no .drop, and the labels
# were already gone from X).
X = df.drop(columns="target").drop(index=outlier_index)
y = df["target"].drop(index=outlier_index).values

In [None]:
# (rows, columns) of the feature matrix after outlier removal.
X.shape

## Modeling

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline candidates as (short name, estimator) pairs, all with default
# hyper-parameters.
#
# The decision tree and random forest are randomised, so they get a fixed
# random_state; without it their cross-validation scores change on every run
# and the comparison below is not reproducible.
allAlgo = [
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('dclf', DecisionTreeClassifier(random_state=42)),
    ('svm', SVC()),
    ('nb', GaussianNB()),
    ('rf', RandomForestClassifier(random_state=42)),
]

In [None]:
# 10-fold cross-validated accuracy for each baseline model on the training
# split. `res` and `algoName` are filled in parallel for the comparison plot.
res, algoName = [], []
kfold = KFold(n_splits=10)
for name, model in allAlgo:
    cv_results = cross_val_score(model, X_train, Y_train, scoring="accuracy", cv=kfold)
    res.append(cv_results)
    algoName.append(name)
    # name: mean accuracy (standard deviation across folds)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))


We can see that logistic regression and the random forest classifier show good accuracy even without scaling the data.


In [None]:
# Box plot of the per-fold accuracies, one box per algorithm.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(res)
ax.set_xticklabels(algoName)
plt.show()

In [None]:
# Same candidates as before, each preceded by a StandardScaler inside a
# Pipeline so the scaler is re-fitted on the training part of every CV fold.
#
# The tree and forest get a fixed random_state so the scores are reproducible
# from run to run.
scaled_candidates = [
    ('ScaledLR', 'lr', LogisticRegression()),
    ('ScaledKNN', 'knn', KNeighborsClassifier()),
    ('ScaledCART', 'dclf', DecisionTreeClassifier(random_state=42)),
    ('ScaledNB', 'nb', GaussianNB()),
    ('ScaledSVM', 'svm', SVC()),
    ('ScaledRF', 'rf', RandomForestClassifier(random_state=42)),
]
pipelines = [
    (label, Pipeline([('Scaler', StandardScaler()), (step_name, estimator)]))
    for label, step_name, estimator in scaled_candidates
]

# 10-fold cross-validated accuracy for each scaled pipeline; `res` and
# `algoName` are filled in parallel for the comparison plot.
res = []
algoName = []
kfold = KFold(n_splits=10)
for name, model in pipelines:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring="accuracy")
    res.append(cv_results)
    algoName.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies for the scaled pipelines.
fig, ax = plt.subplots()
fig.suptitle('Scaled Algorithm Comparison')
ax.boxplot(res)
ax.set_xticklabels(algoName)
plt.show()

We can see the difference after scaling the data! LR, SVM, and RF show quite good accuracy.

## Tuning SVM

In [None]:
# Grid-search C and the kernel for an SVM using cross-validated accuracy.
#
# The scaler lives inside the pipeline so it is re-fitted on the training part
# of each fold. Scaling all of X_train up front (as before) lets every
# validation fold influence the scaling statistics, which leaks information
# and makes the CV scores optimistic.
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
model = Pipeline([('Scaler', StandardScaler()), ('svm', SVC())])
# Pipeline parameters are addressed as <step name>__<parameter>.
param_grid = {'svm__C': c_values, 'svm__kernel': kernel_values}
# NOTE(review): 11 folds here vs. 10 everywhere else in the notebook -- confirm intended.
kfold = KFold(n_splits=11)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=kfold)
grid_result = grid.fit(X_train, Y_train)

# Best combination first, then the mean (std) accuracy of every combination.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## SVM

In [None]:
# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X_train)

# Linear SVM with C=0.3, trained on the scaled training data.
model = SVC(C=0.3, kernel='linear')
model.fit(rescaledX, Y_train)

# Score on the held-out test split, scaled with the training-set scaler.
rescaledValidationX = scaler.transform(X_test)
predictions = model.predict(rescaledValidationX)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_test, predictions))

## GaussianNB

In [None]:
# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X_train)

# Gaussian Naive Bayes trained on the scaled training data.
nb_classifier = GaussianNB()
nb_classifier.fit(rescaledX, Y_train)

# Score on the held-out test split, scaled with the training-set scaler.
rescaledValidationX = scaler.transform(X_test)
predictions = nb_classifier.predict(rescaledValidationX)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_test, predictions))

## RandomForestClassifier

In [None]:
# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X_train)

# Random forest with a fixed seed, trained on the scaled training data.
rf = RandomForestClassifier(random_state=101)
rf.fit(rescaledX, Y_train)

# Score on the held-out test split, scaled with the training-set scaler.
rescaledValidationX = scaler.transform(X_test)
predictions = rf.predict(rescaledValidationX)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_test, predictions))

## Logistic Regression

In [None]:
# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# L2-regularised logistic regression fitted with the saga solver.
# The former l1_ratio=0.6 argument was dropped: l1_ratio only applies to
# penalty='elasticnet', so with penalty='l2' it had no effect on the model
# (scikit-learn warns about this, but warnings are silenced in this notebook).
lr = LogisticRegression(solver='saga', penalty='l2', random_state=41)
lr.fit(rescaledX, Y_train)

# Estimate accuracy on the held-out test split, scaled with the training-set scaler.
rescaledValidationX = scaler.transform(X_test)
predictions = lr.predict(rescaledValidationX)
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X_train)

# XGBoost classifier with default settings, trained on the scaled training data.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(rescaledX, Y_train)

# Score on the held-out test split, scaled with the training-set scaler.
rescaledValidationX = scaler.transform(X_test)
predictions = xgb_classifier.predict(rescaledValidationX)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_test, predictions))

### Cross-Validation

#### RandomforestClassifier

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest on the training split.
# The printed labels previously said "SVC model" although the estimator is `rf`.
cross_validation = cross_val_score(estimator = rf, X = X_train, y =Y_train, cv =10)
print("Cross validation of RandomForestClassifier model = ",cross_validation)
print("Cross validation of RandomForestClassifier model (in mean) = ",cross_validation.mean())

#### Logistic Regression

In [None]:
# 10-fold cross-validation of the logistic regression on the training split.
#
# The model is wrapped in a scaling pipeline so each fold is standardised the
# same way as in the hold-out evaluation; cross-validating on the raw features
# is not comparable to that result. cross_val_score clones the pipeline, so the
# already-fitted `lr` object itself is left untouched.
# The printed labels previously said "SVC model" although the estimator is `lr`.
scaled_lr = Pipeline([('Scaler', StandardScaler()), ('lr', lr)])
cross_validation = cross_val_score(estimator = scaled_lr, X = X_train, y =Y_train, cv =10)
print("Cross validation of Logistic Regression model = ",cross_validation)
print("Cross validation of Logistic Regression model (in mean) = ",cross_validation.mean())