In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Starter

In [None]:
# Read the breast-cancer CSV into df and preview its first rows.
data_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Count how many rows carry each diagnosis label.
df['diagnosis'].value_counts()

In [None]:
# Column names of df as a NumPy array.
df.columns.to_numpy()

In [None]:
# Pairwise correlation between the numeric columns of df.
# `diagnosis` is only label-encoded further down (Preprocessing section), so at
# this point it presumably still holds non-numeric labels; pandas >= 2.0 raises
# on such columns unless numeric_only=True is passed.
df.corr(numeric_only=True)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the 'Unnamed: 32' column from df.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time does not raise KeyError.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Display the column index of df.
df.columns

In [None]:
# Print df's summary: column dtypes, non-null counts and memory usage.
df.info()

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the diagnosis labels with integer codes; the fitted encoder stays
# available as `le`.
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

In [None]:
# Preview the encoded frame; only the first rows are shown instead of dumping
# the whole DataFrame into the cell output.
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated correlation heatmap of df's columns on a 15x10 inch figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [None]:
# Subset of df: the target column plus a hand-picked list of feature columns.
# NOTE(review): df_new is not referenced by any later cell visible here; the
# feature matrix X is built from df instead. Confirm which one is intended.
selected_columns = [
    'diagnosis',
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'radius_se',
    'perimeter_se',
    'area_se',
    'compactness_se',
    'concavity_se',
    'concave points_se',
    'symmetry_se',
    'fractal_dimension_se',
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
    'fractal_dimension_worst',
]
df_new = df[selected_columns]

In [None]:
# Feature matrix: every column of df except the row identifier and the target.
X = df.drop(columns=['id', 'diagnosis'])
# Target as a 1-D Series. Selecting with df[['diagnosis']] yields a single-column
# DataFrame, which makes scikit-learn estimators emit a DataConversionWarning
# ("a column-vector y was passed when a 1d array was expected") on every fit.
y = df['diagnosis']

In [None]:
# Preview the feature matrix; only the first rows are shown instead of dumping
# the whole DataFrame into the cell output.
X.head()

# Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 25% of the rows; the remaining 75% becomes the training set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# Step 2: split the held-out rows 70/30 into validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.3, random_state=42
)

In [None]:
# Print the shapes of the train, validation and test feature frames on one line.
print(*(part.shape for part in (X_train, X_valid, X_test)))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


# Baseline k-nearest-neighbours classifier with k = 5, fitted on the training split.
knc = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Mean accuracy on the training and validation splits; the test split is not
# scored in this cell.
train_accuracy = knc.score(X_train, y_train)
valid_accuracy = knc.score(X_valid, y_valid)
print("Train Accuracy : ", train_accuracy)
print("Validation Accuracy : ", valid_accuracy)

In [None]:
# Predict the test split with the baseline KNN model and print per-class metrics.
y_pred_knc = knc.predict(X_test)
# classification_report expects (y_true, y_pred); with the arguments reversed the
# precision and recall columns are swapped and support counts the predictions.
print(classification_report(y_test, y_pred_knc))

In [None]:
# Compare several neighbourhood sizes for KNN.
# Candidates are scored on the validation split: choosing k by test-set accuracy
# would leak the test data into model selection.
k_values = range(50, 100, 10)
accuracy_list = []
for k in k_values:
    model_knn = KNeighborsClassifier(n_neighbors=k)
    model_knn.fit(X_train, y_train)
    knn_pred = model_knn.predict(X_valid)
    accuracy_list.append(accuracy_score(y_valid, knn_pred))

plt.figure(figsize=(12,8))
plt.plot(k_values, accuracy_list)
# The title states the range of k actually evaluated (50 to 90).
plt.title("Hasil n_neighbors dari 50 - 90")
plt.xlabel("n_neighbors")
plt.ylabel("Validation accuracy")
plt.show()

In [None]:
# Refit KNN with n_neighbors = 90 and measure its accuracy on the test split.
knn_best = KNeighborsClassifier(n_neighbors=90)
knn_best.fit(X_train, y_train)

pred_knn_best = knn_best.predict(X_test)
# Metric functions take (y_true, y_pred). Accuracy is symmetric, so the value is
# unchanged, but the documented order keeps the call safe to swap for an
# asymmetric metric later.
print("Akurasi knn n_neighbors = 90 \n", accuracy_score(y_test, pred_knn_best))

# SVM

In [None]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default settings, fitted on the training split.
svc = SVC().fit(X_train, y_train)

# Mean accuracy on the training and validation splits.
svc_train_acc = svc.score(X_train, y_train)
svc_val_acc = svc.score(X_valid, y_valid)
print("Train acc : ", svc_train_acc)
print("Val acc : ", svc_val_acc)

In [None]:
# Predict the test split with the baseline SVC and print per-class metrics.
svc_pred = svc.predict(X_test)
# classification_report expects (y_true, y_pred); with the arguments reversed the
# precision and recall columns are swapped and support counts the predictions.
print(classification_report(y_test, svc_pred))

# GridSearchCV for SVM

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over kernel type and regularisation strength C for an SVC,
# fitted on the training split.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10],
}
svc1 = SVC()
clf = GridSearchCV(estimator=svc1, param_grid=parameters)
clf.fit(X_train, y_train)

In [None]:
# Parameter combination that the grid search scored best.
clf.best_params_

In [None]:
# Mean cross-validated score achieved by the best parameter combination.
clf.best_score_

In [None]:
# Refit an SVC with the hyper-parameters selected by the grid search (clf)
# instead of hard-coding them, so this cell cannot drift from the search result.
svc_best = SVC(**clf.best_params_)
svc_best.fit(X_train, y_train)

print("Train acc : ", svc_best.score(X_train, y_train))
print("Val acc : ", svc_best.score(X_valid, y_valid))

pred_svc_best = svc_best.predict(X_test)
# classification_report expects (y_true, y_pred); reversing them swaps the
# precision and recall columns.
print(classification_report(y_test, pred_svc_best))

In [None]:
# Test-set accuracy of the tuned SVC, with arguments in the documented
# (y_true, y_pred) order. Accuracy is symmetric, so the value is unchanged.
print(accuracy_score(y_test, pred_svc_best))

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are true classes and columns
# are predicted classes. With the arguments reversed the matrix is transposed,
# which exchanges the false-positive and false-negative counts.
print(confusion_matrix(y_test, pred_svc_best))