In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
import warnings
warnings.simplefilter('ignore')
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1) Load and Check Data

In [None]:
# Load the breast-cancer-wisconsin-data CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# List the column names of the loaded frame.
data.columns

In [None]:
# Delete columns: "id", "Unnamed: 32"
# (presumably a row identifier and an empty trailing column - confirm with data.head()).
# Rebinding `data` with errors="ignore" instead of dropping in place keeps the
# cell re-runnable: a second run is a no-op rather than a KeyError.
data = data.drop(columns=["id", "Unnamed: 32"], errors="ignore")

In [None]:
# Print the dtype and non-null count of each remaining column.
data.info()

In [None]:
# Encode the diagnosis label as an integer:
#   "M" (malignant)                    -> 1
#   anything else, i.e. "B" (benign)   -> 0
# Whitespace around the raw label is stripped before the comparison.
data["diagnosis"] = [int(label.strip() == "M") for label in data["diagnosis"]]

# 2) Variable Analysis

In [None]:
# Feature frame: every column except the target label "diagnosis".
numeric_variable = data.drop(["diagnosis"], axis = 1)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Rank the features by the importances of an extra-trees model fitted on the
# encoded diagnosis.
# NOTE(review): the target is a 0/1 class label but a regressor is fitted -
# confirm this is intended.
x = numeric_variable
y = data.diagnosis
ext = ExtraTreesRegressor(random_state = 42)
model = ext.fit(x, y)

# (importance, column name) pairs, ordered ascending by importance, then name.
importance_pairs = list(zip(ext.feature_importances_, numeric_variable.columns))
importance_pairs.sort()
df = pd.DataFrame(importance_pairs, columns=['Value', 'Variable'])

In [None]:
# Visualization: horizontal bar chart of the importances, largest first.
plt.figure(figsize = (15,6))
importance_desc = df.sort_values(["Value"], ascending = False)
sns.barplot(x = "Value", y = "Variable", data = importance_desc)

In [None]:
# Numeric variable visualization
def numeric(col):
    """Plot one column of the global ``data`` frame against the diagnosis.

    Draws a figure with two side-by-side panels:

    * left  - scatter plot of ``data[col]`` (x) versus ``data.diagnosis`` (y);
    * right - histogram of ``data[col]`` (no KDE curve).

    Parameters
    ----------
    col : column label
        Name of the column in ``data`` to visualise.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    fig, ax = plt.subplots(1, 2, figsize = (15,6))
    sns.scatterplot(x = data[col], y = data.diagnosis, ax = ax[0])
    # histplot is the supported replacement for distplot, which seaborn has
    # deprecated and scheduled for removal.
    sns.histplot(x = data[col], kde = False, ax = ax[1])
    fig.suptitle(str(col) + ' analysis')

### Concave points_worst

In [None]:
# Scatter against the diagnosis plus histogram for "concave points_worst".
numeric("concave points_worst")

### Concave points_mean

In [None]:
# Scatter against the diagnosis plus histogram for "concave points_mean".
numeric("concave points_mean")

### Perimeter_worst

In [None]:
# Scatter against the diagnosis plus histogram for "perimeter_worst".
numeric("perimeter_worst")

### Radius_worst

In [None]:
# Scatter against the diagnosis plus histogram for "radius_worst".
numeric("radius_worst")

### Area_worst

In [None]:
# Scatter against the diagnosis plus histogram for "area_worst".
numeric("area_worst")

### Perimeter_mean

In [None]:
# Scatter against the diagnosis plus histogram for "perimeter_mean".
numeric("perimeter_mean")

In [None]:
# Correlation Matrix
# Correlation of every column with the encoded diagnosis (pandas' default
# method), sorted from the largest value to the smallest.
data.corr()["diagnosis"].sort_values(ascending = False)

In [None]:
# Keep only the columns whose absolute correlation with the diagnosis exceeds
# the threshold, then draw their pairwise correlations as a clustered heatmap.
corr_matrix = data.corr()
threshold = 0.7
filtre = corr_matrix["diagnosis"].abs() > threshold
corr_features = list(corr_matrix.columns[filtre])
sns.clustermap(data[corr_features].corr(), annot=True, fmt=".2f")

# 3) Outlier Detection

In [None]:
from collections import Counter
def outliers(data, columns, iqr_factor=1.5, max_outlier_columns=2):
    """Return the index labels of rows that are outliers in several columns.

    A value counts as an outlier for its column when it lies outside the
    fences ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]``, where Q1 and Q3
    are the column's 0.25 and 0.75 quantiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to inspect.
    columns : iterable of column labels
        Columns of ``data`` to check; each must support ``quantile``.
    iqr_factor : float, default 1.5
        Width of the fences as a multiple of the IQR.
    max_outlier_columns : int, default 2
        A row is reported only when it is an outlier in strictly more than
        this many of the inspected columns.

    Returns
    -------
    list
        Index labels of the reported rows, in the order in which each label
        was first flagged while iterating over ``columns``.
    """
    # Number of inspected columns in which each index label was flagged.
    outlier_counts = Counter()
    for column in columns:
        values = data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - iqr_factor * iqr
        upper_fence = q3 + iqr_factor * iqr
        is_outlier = (values < lower_fence) | (values > upper_fence)
        outlier_counts.update(values[is_outlier].index)

    return [label for label, count in outlier_counts.items()
            if count > max_outlier_columns]

In [None]:
# Detect outliers on the feature columns only. The target "diagnosis" is a 0/1
# label, so IQR fences on it are meaningless - and if its IQR were 0 (fewer
# than 25% of one class) every row of the minority class would be counted as
# an outlier.
feature_columns = data.columns.drop("diagnosis")
delete_index = outliers(data, feature_columns)
# Drop the flagged rows and renumber the index from 0.
data = data.drop(delete_index, axis=0).reset_index(drop=True)

# 4) Create Model

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Target as a 1-D array: scikit-learn estimators expect y of shape (n_samples,)
# and only accept an (n_samples, 1) column vector with a DataConversionWarning.
y = data.diagnosis.values
# Features: every column except the target.
x = data.drop(["diagnosis"], axis = 1)

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): no `stratify=` is passed, so the class ratio may differ between
# the two splits - confirm this is acceptable.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [None]:
# Standardise the features. The scaler is fitted on the training split only;
# the test split is transformed with those same fitted statistics.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
#ML Models
classifier = [DecisionTreeClassifier(random_state = 42),
             SVC(random_state = 42),
             RandomForestClassifier(random_state = 42),
             LogisticRegression(random_state = 42),
             KNeighborsClassifier()]

In [None]:
# Hyper-parameter grids, one per candidate estimator.

# Decision tree: minimum samples needed to split a node, and maximum depth.
dt_param_grid = {"min_samples_split" : range(10,500,20),
                "max_depth": range(1,20,2)}

# SVM: RBF kernel only; search kernel width (gamma) and regularisation (C).
svc_param_grid = {"kernel" : ["rbf"],
                 "gamma": [0.001, 0.01, 0.1, 1],
                 "C": [1,10,50,100,200,300,1000]}

# Random forest: trees are built without bootstrap sampling, gini criterion.
rf_param_grid = {"max_features": [1,3,10],
                "min_samples_split":[2,3,10],
                "min_samples_leaf":[1,3,10],
                "bootstrap":[False],
                "n_estimators":[100,300],
                "criterion":["gini"]}

# Logistic regression: C from 1e-3 to 1e3 on a log scale, L1 and L2 penalties.
# The solver is pinned to liblinear because the default solver (lbfgs) does not
# support penalty="l1"; without it every l1 candidate fails to fit and half of
# the grid is wasted.
logreg_param_grid = {"C":np.logspace(-3,3,7),
                    "penalty": ["l1","l2"],
                    "solver": ["liblinear"]}

# k-NN: odd neighbour counts 1, 3, ..., 19; two weightings; two metrics.
knn_param_grid = {"n_neighbors": np.linspace(1,19,10, dtype = int).tolist(),
                 "weights": ["uniform","distance"],
                 "metric":["euclidean","manhattan"]}

In [None]:
# Parameter grids listed in the same order as the `classifier` list
# (decision tree, SVM, random forest, logistic regression, k-NN), because the
# grid-search loop pairs the two lists by position.
classifier_params = [dt_param_grid,
                    svc_param_grid,
                    rf_param_grid,
                    logreg_param_grid,
                    knn_param_grid]

In [None]:
# Grid-search every candidate estimator with 10-fold stratified
# cross-validation scored by accuracy. For each one, record and print the best
# mean CV score and the corresponding refitted estimator.
cv_results = []
best_estimators = []
for estimator, param_grid in zip(classifier, classifier_params):
    grid = GridSearchCV(
        estimator,
        param_grid = param_grid,
        cv = StratifiedKFold(n_splits = 10),
        scoring = "accuracy",
        n_jobs = -1,
        verbose = 1,
    )
    model = grid.fit(x_train, y_train)
    cv_results.append(model.best_score_)
    best_estimators.append(model.best_estimator_)
    print(model.best_score_)
    print(model.best_estimator_)

In [None]:
# One row per model: best cross-validation score next to a short model label.
# The labels follow the order of the `classifier` list.
model_labels = ["DT", "SVM", "RF", "LR", "KN"]
df = pd.DataFrame({"Cross Validation": cv_results, "ML Models": model_labels})

In [None]:
# Visualization: best cross-validation score per model, highest first.
cv_ranking = df.sort_values(["Cross Validation"], ascending = False)
sns.barplot(x = "Cross Validation", y = "ML Models", data = cv_ranking)

The best model is the SVM, so it is refit below and evaluated on the test set.

In [None]:
# Refit the SVM with a single fixed hyper-parameter combination, still going
# through GridSearchCV so that a 10-fold cross-validated score is reported.
grid = {"kernel": ["rbf"],
        "gamma": [0.001],
        "C": [100]}
svm = SVC(random_state=42)
svm_cv = GridSearchCV(svm, grid, cv=10, scoring="accuracy")
model = svm_cv.fit(x_train, y_train)
# best_score_ is the mean accuracy over the held-out CV folds, not the
# accuracy on the training data, so it is labelled accordingly.
print("cross-validation accuracy:", model.best_score_)

In [None]:
# Predict the held-out test split and report its accuracy.
y_head = model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_head)
print("test accuracy:", test_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, y_head)

In [None]:
# Confusion-matrix heatmap. fmt="d" prints the integer counts as-is; the
# default annotation format (".2g") would show counts of 100 or more in
# scientific notation.
sns.heatmap(cm, annot = True, fmt = "d")