In [None]:
# Data analysis
import numpy as np 
import pandas as pd 

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import *

In [None]:
# Sklearn methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Input
import os

# List every file below the Kaggle input directory and remember each path.
input_paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)
        input_paths.append(input_path)

# Pick the CSV explicitly instead of relying on the loop variables that leak
# out of os.walk: those point at whichever file happened to be visited last
# and do not exist at all when the directory is empty.
csv_paths = sorted(p for p in input_paths if p.lower().endswith('.csv'))
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")
data = pd.read_csv(csv_paths[0])

In [None]:
# EDA

# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Delete the null column. drop(..., errors='ignore') keeps this cell re-runnable:
# `del data[...]` raises KeyError when the cell is executed a second time.
data = data.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Descriptive statistics for the numeric columns.
data.describe()

In [None]:
# Distinct labels present in the target column
data['diagnosis'].unique()

In [None]:
# Split the feature columns into three positional groups of ten:
# "mean", "se" and "worst" columns.
all_columns = data.columns

mean_data = data[all_columns[2:12]]
se_data = data[all_columns[12:22]]
worst_data = data[all_columns[22:32]]

In [None]:
# mean data - correlation matrix, drawn as an annotated heatmap
mean_corr = mean_data.corr()
mean_cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(
    mean_corr,
    xticklabels=mean_corr.columns,
    yticklabels=mean_corr.columns,
    annot=True,
    cmap=mean_cmap,
)

In [None]:
# plot variables with a high correlation
mean_data.plot.scatter(x='area_mean', y='radius_mean')

In [None]:
# Concavity against concave points (mean columns)
mean_data.plot.scatter(x='concavity_mean', y='concave points_mean')

In [None]:
# se data - correlation matrix, drawn as an annotated heatmap
se_corr = se_data.corr()
se_cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(
    se_corr,
    xticklabels=se_corr.columns,
    yticklabels=se_corr.columns,
    annot=True,
    cmap=se_cmap,
)

In [None]:
# Perimeter against radius (se columns)
se_data.plot.scatter(x='perimeter_se', y='radius_se')

In [None]:
# Area against radius (se columns)
se_data.plot.scatter(x='area_se', y='radius_se')

In [None]:
# worst data - correlation matrix, drawn as an annotated heatmap
worst_corr = worst_data.corr()
worst_cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(
    worst_corr,
    xticklabels=worst_corr.columns,
    yticklabels=worst_corr.columns,
    annot=True,
    cmap=worst_cmap,
)

In [None]:
# Perimeter against radius (worst columns)
worst_data.plot.scatter(x='perimeter_worst', y='radius_worst')

In [None]:
# Area against radius (worst columns)
worst_data.plot.scatter(x='area_worst', y='radius_worst')

In [None]:
# Area against perimeter (worst columns)
worst_data.plot.scatter(x='area_worst', y='perimeter_worst')

In [None]:
# Distribution of texture_mean as a 50-bin histogram
mean_data['texture_mean'].plot.hist(
    bins=50,
    figsize=(12, 6),
    facecolor='grey',
    edgecolor='black',
)


In [None]:
# data target split
# Features: every column from position 2 onwards; target: the diagnosis column.
feature_columns = data.columns[2:]
x = data[feature_columns]
y = data['diagnosis']

In [None]:
# Class balance: number of samples per diagnosis label.
y.value_counts()

In [None]:
# Normalize the data

from sklearn.preprocessing import StandardScaler

# Learn the per-feature statistics and apply them in one step.
scaler = StandardScaler()
x_standardized = scaler.fit_transform(x)

# Report what the scaler learned.
print("Mean:\n", scaler.mean_)
print("Var:\n", scaler.var_)
print("Samples seen: ", scaler.n_samples_seen_)

In [None]:
# PCA
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep as many components as are needed for
# the summed explained variance ratio to reach at least 95%.
pca = PCA(n_components = 0.95)
pca.fit(x_standardized)
explained_var_ratio = sum(pca.explained_variance_ratio_)
n_components = pca.n_components_
print(explained_var_ratio)
print(n_components)

# Refit with the resolved component count on the SAME standardized data that
# is projected below. Fitting on the raw `x` (as before) learns axes dominated
# by the large-scale features and then applies them to differently scaled data.
pca = PCA(n_components=n_components, copy=True)
pca.fit(x_standardized)

# NOTE(review): scaler and PCA are fitted on the full dataset before the
# train/test split that follows, so test rows influence the learned transform.
# Consider fitting both on the training portion only.
transformed_x = pca.transform(x_standardized)

In [None]:
# train test split
# Hold out 30% of the PCA-projected samples; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    transformed_x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Learning curve

# Score a default KNN at increasing training-set sizes using 10-fold CV.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=KNeighborsClassifier(),
    X=x_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)

# Aggregate across folds (axis 1) for each training size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

In [None]:
# Training-set sizes returned by learning_curve.
train_sizes

In [None]:
# One entry per curve: (fold mean, fold std, colour, legend label)
curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]

plt.figure()
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()

# Shaded +/- one standard deviation bands first ...
for curve_mean, curve_std, curve_color, curve_label in curves:
    plt.fill_between(train_sizes, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1, color=curve_color)

# ... then the mean score lines on top.
for curve_mean, curve_std, curve_color, curve_label in curves:
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color,
             label=curve_label)

plt.legend(loc="best")

In [None]:
# Cross Validation

In [None]:
# Grid Search

# KNN parameters: n_neighbors (4 values), weights (2), p (2)
# 4 x 2 x 2 = 16 candidates
parameters = {
    "n_neighbors": [1, 3, 5, 7],
    "weights": ["uniform", "distance"],
    "p": [1, 2],
}
neigh = KNeighborsClassifier()

# GridSearchCV with 3 folds
# 16 x 3 = 48 fits
clf = GridSearchCV(estimator=neigh, param_grid=parameters, cv=3)
clf.fit(x_train, y_train)

In [None]:
# Show which fields the search results expose, one per line
for result_key in clf.cv_results_:
    print(result_key)

In [None]:
# Parameter combinations evaluated by the grid search.
clf.cv_results_["params"]

In [None]:
# Winning parameter combination
print("Best params set found: ")
print(clf.best_params_, "\n")

# Per-candidate mean test score with a two-standard-deviation spread
search_results = clf.cv_results_
means = search_results["mean_test_score"]
stds = search_results["std_test_score"]

for idx, candidate in enumerate(search_results["params"]):
    print("%0.3f (+/-%0.3f) for %r" % (means[idx], stds[idx]*2, candidate))

In [None]:
# More reliable estimate of the quality of the chosen hyperparameters
clf = KNeighborsClassifier(n_neighbors=5, p=1, weights="distance")
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Per-fold scores and out-of-fold predictions on the training split
scores = cross_val_score(clf, x_train, y_train, cv=kf, n_jobs=-1)
predictions = cross_val_predict(clf, x_train, y_train, cv=kf, n_jobs=-1)
mean_score = round(scores.mean(), 4)

print("Scores: ", scores)
print("Mean Score: ", mean_score)

In [None]:
# KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes, vote weightings and values of p
params = {
    "n_neighbors": [2, 3, 4, 5, 6],
    "weights": ["uniform", "distance"],
    "p": [1, 2],
}
clf = KNeighborsClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params, cv=3)

grid_result = grid.fit(x_train, y_train)

# Summary
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_summary = grid_result.cv_results_
means = cv_summary["mean_test_score"]
stds = cv_summary["std_test_score"]
params = cv_summary["params"]

# One line per evaluated candidate
for row in range(len(params)):
    print("%f (%f) with: %r" % (means[row], stds[row], params[row]))

In [None]:
# FINAL MODEL
# Train KNN with fixed hyperparameters and score it on the held-out split.
clf = KNeighborsClassifier(n_neighbors=5, p=1, weights="distance").fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)

print("Acc: ", acc)

In [None]:
# Decision Tree Classifier
# Search the split criterion and depth limit with 3-fold cross-validation.
parameters = {"criterion": ["gini", "entropy"],
              "max_depth": [None, 3, 5, 7, 8, 9, 10, 11, 12]}
# Seed the estimator: tree fitting permutes the features at each split, so an
# unseeded search can report different best parameters on every run.
dec_tree = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(dec_tree, parameters, cv=3)
clf.fit(x_train, y_train)

print("Best params:")
print(clf.best_params_)
print(clf.best_score_)

In [None]:
# Fit the decision tree with fixed hyperparameters and score it on the
# held-out split.
# max_features="auto" was deprecated and later removed from scikit-learn; for
# DecisionTreeClassifier it meant sqrt(n_features), so "sqrt" keeps the same
# behaviour. The seed makes the random feature subsampling reproducible.
dt = DecisionTreeClassifier(criterion="entropy", max_depth=3,
                            max_features="sqrt", random_state=42)
dt.fit(x_train, y_train)
score = dt.score(x_test, y_test)
print("Test score: ", score)

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Search forest size, split criterion and depth limit with 3-fold CV.
params = {"n_estimators": [140, 180, 220],
          "criterion": ["gini", "entropy"],
         "max_depth": [i for i in range(50, 65, 2)]}

# Seed the forest: bootstrapping and feature subsampling are random, so an
# unseeded search ranks the candidates differently on every run.
clf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(clf, params, cv=3, n_jobs=-1)

grid_result = grid.fit(x_train, y_train)

# Summary
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_["mean_test_score"]
stds = grid_result.cv_results_["std_test_score"]
params = grid_result.cv_results_["params"]

# One line per evaluated candidate: mean test score (std) and its parameters.
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Fit the random forest with fixed hyperparameters and score it on the
# held-out split. The seed makes the reported accuracy reproducible across
# kernel restarts.
clf = RandomForestClassifier(criterion="gini", max_depth=60, n_estimators=180,
                             random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)

print("Acc: ", acc)

In [None]:
from sklearn.svm import SVC

# Compare three SVC kernels with 3-fold cross-validation
params = {"kernel": ["linear", "rbf", "sigmoid"]}

clf = SVC()
grid = GridSearchCV(estimator=clf, param_grid=params, cv=3)
grid_result = grid.fit(x_train, y_train)

# Summary
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
svc_results = grid_result.cv_results_
means = svc_results["mean_test_score"]
stds = svc_results["std_test_score"]
params = svc_results["params"]

# One line per evaluated kernel
for position, param in enumerate(params):
    print("%f (%f) with: %r" % (means[position], stds[position], param))

In [None]:
# Train the linear-kernel SVC and measure accuracy on the held-out split
clf = SVC(kernel="linear").fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Acc: ", acc)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the held-out labels against the current y_pred,
# i.e. the predictions of whichever model cell ran last.
y_true = y_test
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
print(cm)

In [None]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Search learning rate, number of boosting stages and tree depth with 3-fold CV.
params = {"learning_rate": [0.05, 0.06],
         "n_estimators": [100, 140, 180],
         "max_depth": [i for i in range(35, 50, 2)]}

# Seed the estimator so the ranking of candidates is reproducible, and run
# the 48 x 3 fits in parallel like the random-forest search does.
clf = GradientBoostingClassifier(random_state=42)
grid = GridSearchCV(clf, params, cv=3, n_jobs=-1)

grid_result = grid.fit(x_train, y_train)

# Summary
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_["mean_test_score"]
stds = grid_result.cv_results_["std_test_score"]
params = grid_result.cv_results_["params"]

# One line per evaluated candidate: mean test score (std) and its parameters.
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Fit gradient boosting with fixed hyperparameters and score it on the
# held-out split. The seed makes the reported accuracy reproducible across
# kernel restarts.
clf = GradientBoostingClassifier(n_estimators=180, max_depth=45,
                                 learning_rate=0.06, random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)

print("Acc: ", acc)

In [None]:
# Best model
# Linear-kernel SVC, refitted on the training split and scored on the test split.
clf = SVC(kernel="linear")
y_pred = clf.fit(x_train, y_train).predict(x_test)
acc = accuracy_score(y_test, y_pred)

print("Acc: ", acc)