## Importing Basic Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing,
import matplotlib.pyplot as plt # information ploting

## Importing Dataset

In [None]:
# Relative path of the dataR2.csv file that holds the raw records.
DATASET_CSV = "../input/breast-cancer-coimbra-data-set/dataR2.csv"

# Read the CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv(DATASET_CSV)
dataset.head()

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
dataset.info()

In [None]:
# Split the frame into the target (last column) and the features (every
# other column).
# The target is taken as an independent, writable copy: `.values` may hand
# back a view of `dataset` (or a read-only array when pandas Copy-on-Write is
# active), so editing it in place could either write through into `dataset`
# or raise "assignment destination is read-only".
y = dataset.iloc[:, -1].to_numpy(copy=True)
X = dataset.iloc[:, :-1]

In [None]:
# Re-encode the class column as a binary target:
#   1 -> 0  (labeled as healthy)
#   2 -> 1  (labeled as sick)
# The labels are rebuilt from the raw column instead of being edited in place.
# The previous two-step in-place remap was not idempotent (running the cell a
# second time turned every label into 0) and wrote into an array that may
# share memory with `dataset`.  Any value other than 1 or 2 is passed through
# unchanged, exactly as before.
# This cell must run before the rows are shuffled, since it restores the
# original row order of the labels.
raw_labels = dataset.iloc[:, -1].to_numpy()
y = np.where(raw_labels == 1, 0, np.where(raw_labels == 2, 1, raw_labels))
y

In [None]:
# Column-wise mean of every feature (axis=0 reduces over the rows).
X.mean(axis=0)

### shuffle the dataset

In [None]:
def unison_shuffle(a, b, random_state=None):
    """Shuffle a DataFrame and a label array with one shared permutation.

    Parameters
    ----------
    a : pandas.DataFrame or pandas.Series
        Row-indexed data. Rows are reordered positionally with ``.iloc`` and
        the index is reset to ``0..n-1``.
    b : numpy.ndarray
        One entry per row of ``a``; indexed with the same permutation so the
        row/label pairing is preserved.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for a reproducible permutation. With ``None`` the
        permutation is drawn from NumPy's global random state, which is the
        original behaviour.

    Returns
    -------
    tuple
        ``(a_shuffled, b_shuffled)`` in the same new order.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same number of rows. Previously a
        longer ``b`` was silently truncated.
    """
    n_rows = a.shape[0]
    if len(b) != n_rows:
        raise ValueError(
            'a and b must have the same number of rows, got {} and {}'.format(n_rows, len(b))
        )

    # make a shuffle index array to make a fixed shuffling order for both x, y
    if random_state is None:
        inx = np.random.permutation(n_rows)
    else:
        inx = np.random.default_rng(random_state).permutation(n_rows)

    return a.iloc[inx].reset_index(drop=True), b[inx]

In [None]:
# Seed NumPy's global random state so the shuffle order is the same on every
# Restart & Run All; without a seed each run produced a different row order.
# Note: the cell reassigns X and y, so re-running it alone reshuffles the
# already-shuffled data.
np.random.seed(42)
X, y = unison_shuffle(X, y)

In [None]:
# Display the shuffled feature frame (the index was reset by the shuffle).
X

## Feature Engineering

Heatmap

In [None]:
from seaborn import heatmap

# Annotated heatmap of the pairwise feature correlations (DataFrame.corr()).
# `linewidths` is the keyword seaborn documents for the cell separators; the
# previous `linewidth` spelling only reached matplotlib as a pass-through
# alias next to seaborn's own `linewidths` default.
plt.figure(figsize=(9, 9))
heatmap(X.corr(), linewidths=0.5, annot=True);

As we can see, there is a high correlation between the HOMA and Insulin features.

Solution:
1. We can omit one of the two features.

However, we will instead use PCA for feature extraction in the following sections.

Now let's take a broader look at:
1. first, the distribution of each single feature
2. second, the joint distribution of each pair of features

using scatter plots, to see whether there is any observable class separability based on one or two feature(s).

In [None]:
# One panel per feature: sample position on the x-axis, feature value on the
# y-axis, red for class 1 and blue for class 0.
# The sample positions and the subplot grid are derived from X instead of the
# previous hard-coded 116 rows and fixed 3x3 grid, so the cell keeps working
# if the number of rows or features changes.
sample_positions = np.arange(len(X))
n_cols = 3
n_rows = int(np.ceil(X.shape[1] / n_cols))

fig = plt.figure(figsize=(10, 10))

for index, feature in enumerate(X, 1):
    plt.subplot(n_rows, n_cols, index)
    plt.scatter(sample_positions[y == 1], X[feature][y == 1], c='r')
    plt.scatter(sample_positions[y == 0], X[feature][y == 0], c='b')
    plt.title(feature)

fig.tight_layout()
plt.show()

Seemingly, no single feature gives good class separability.

Let's look at pairs of features:

In [None]:
# Every ordered pair of distinct feature names, e.g. [a, b] and [b, a].
# Iterating a DataFrame yields its column labels.
features_ls = [
    [feature1, feature2]
    for feature1 in X
    for feature2 in X
    if not (feature1 == feature2)
]

In [None]:
# Scatter plot for every ordered feature pair in `features_ls`, coloured by
# class (red = 1, blue = 0) and titled with the pair's correlation.
color = np.where(y == 1, 'r', 'b')
n_features = X.shape[1]

# The correlation matrix does not change inside the loop, so compute it once
# instead of once per feature pair as before.
corr_matrix = X.corr()

fig = plt.figure(figsize=(30, 30))

for index, features in enumerate(features_ls):
    plt.subplot(n_features, n_features - 1, index + 1)
    plt.scatter(X[features[0]], X[features[1]], c=color)

    # corr(x,y) = cov(x,y)/(std(x)*std(y))
    corr = round(corr_matrix[features[0]][features[1]], 4)
    plt.title(corr)
    plt.xlabel(features[0])
    plt.ylabel(features[1])

    # finding linear relationships: overlay a least-squares line when |corr| > 0.8
    if abs(corr) > 0.8:
        m, b = np.polyfit(X[features[0]], X[features[1]], 1)
        y_corr = m * X[features[0]] + b
        plt.plot(X[features[0]], y_corr)

fig.tight_layout()
plt.show()

Again, we cannot rely on these distributions with any certainty, except for the collinearity of the HOMA and Insulin features.

In [None]:
# Close-up of the HOMA / Insulin pair, coloured by class (red = 1, blue = 0).
color = np.where(y == 1, 'r', 'b')

plt.figure(figsize=(6, 6))
plt.scatter(X.HOMA, X.Insulin, c=color)
# Label the figure so it can be read on its own; the trailing ';' hides the
# matplotlib object repr that the cell used to display.
plt.xlabel('HOMA')
plt.ylabel('Insulin')
plt.title('HOMA vs. Insulin');

### PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# NOTE(review): both transformers are fitted on every row of X, i.e. before
# any train/validation split, so held-out rows influence the preprocessing.
# Consider fitting them on the training rows only.

# Step 1 - standardization of the raw feature matrix.
scaler = StandardScaler()
X_new = scaler.fit_transform(X.to_numpy())

# Step 2 - PCA; a float n_components asks scikit-learn to keep as many
# components as are needed to explain that fraction (here 90%) of the variance.
pca = PCA(.9)
X_new = pca.fit_transform(X_new)


In [None]:
# Show the first five transformed rows and the shape of the PCA output.
print('considering new dataset:', X_new[:5])
print('shape of the new dataset:', X_new.shape)

In [None]:
# Wrap the PCA output in a DataFrame with columns PC1..PCk.
# The names are generated from the actual number of retained components:
# PCA(.9) picks that number from the data, so the previous hard-coded list of
# six names raised a ValueError whenever a different number was kept.
pc_names = ['PC{}'.format(i) for i in range(1, X_new.shape[1] + 1)]
ds_new = pd.DataFrame(X_new, columns=pc_names)
ds_new

## Models Prediction


1. SVC(RBF Kernel)
2. SVC(Linear Kernel)
3. Decision Tree
4. KNN
5. Logistic Regression

For each of the models we'll use grid-search hyperparameter tuning with 5-fold cross-validation.

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### train/validation split

In [None]:
# Hold out 20% of the PCA-transformed samples for validation; the fixed
# random_state makes the split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=.2,
    random_state=1,
)

In [None]:
# First five rows of the training split.
X_train[:5]

### SVC( RBF Kernel )

In [None]:
# Search space for the RBF-kernel SVM: every gamma is tried with every C.
rbf_tuning_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
        'C': [1, 10, 100, 1000],
    }
]

# Exhaustive grid search scored with 5-fold cross-validation.
rbf_svm_clf = GridSearchCV(estimator=SVC(), param_grid=rbf_tuning_parameters, cv=5)

In [None]:
# Run the grid search for the RBF SVM on the training split.
rbf_svm_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validation score.
print(rbf_svm_clf.best_params_)

In [None]:
# Predict the held-out split with the tuned RBF SVM, then print the per-class
# report followed by the overall accuracy.
y_pred_rbf_clf = rbf_svm_clf.predict(X_test)
print(classification_report(y_test, y_pred_rbf_clf))
print(accuracy_score(y_test, y_pred_rbf_clf))

### SVC( Linear Kernel )

In [None]:
# Search space for the linear-kernel SVM: only the penalty C is tuned.
lr_tuning_parameters = [
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    }
]

# Exhaustive grid search scored with 5-fold cross-validation.
lr_svm_clf = GridSearchCV(estimator=SVC(), param_grid=lr_tuning_parameters, cv=5)

In [None]:
# Run the grid search for the linear SVM on the training split.
lr_svm_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validation score.
print(lr_svm_clf.best_params_)

In [None]:
# Predict the held-out split with the tuned linear SVM, then print the
# per-class report followed by the overall accuracy.
y_pred_lr_clf = lr_svm_clf.predict(X_test)
print(classification_report(y_test, y_pred_lr_clf))
print(accuracy_score(y_test, y_pred_lr_clf))

### Decision Tree

In [None]:
# Search space for the decision tree: split criterion x maximum depth (1..9).
dt_tuning_parameters = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(1, 10),
    }
]

# Exhaustive grid search scored with 5-fold cross-validation.
ds_clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=dt_tuning_parameters,
    cv=5,
)

In [None]:
# Run the grid search for the decision tree on the training split.
ds_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validation score.
print(ds_clf.best_params_)

In [None]:
# Predict the held-out split with the tuned decision tree, then print the
# per-class report followed by the overall accuracy.
y_pred_ds_clf = ds_clf.predict(X_test)
print(classification_report(y_test, y_pred_ds_clf))
print(accuracy_score(y_test, y_pred_ds_clf))

### KNN

In [None]:
# Search space for k-NN: neighbourhood size (3..9) x vote weighting scheme.
knn_tuning_parameters = [
    {
        'n_neighbors': range(3, 10),
        'weights': ['uniform', 'distance'],
    }
]

# Exhaustive grid search scored with 5-fold cross-validation.
knn_clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_tuning_parameters,
    cv=5,
)

In [None]:
# Run the grid search for k-NN on the training split.
knn_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validation score.
print(knn_clf.best_params_)

In [None]:
# Predict the held-out split with the tuned k-NN model, then print the
# per-class report followed by the overall accuracy.
y_pred_knn_clf = knn_clf.predict(X_test)
print(classification_report(y_test, y_pred_knn_clf))
print(accuracy_score(y_test, y_pred_knn_clf))

### Logistic Regression

In [None]:
# Search space for logistic regression.
# The previous single grid crossed every penalty with every solver, but most
# of those pairs are rejected by scikit-learn (e.g. 'l1' with 'lbfgs'), and
# 'elasticnet' was tried without the `l1_ratio` it needs.  Those candidates
# could only fail and be scored as NaN.  The grid is therefore split into one
# sub-grid per penalty, listing only the solvers that support it:
#   l2         -> newton-cg, lbfgs, liblinear, sag, saga
#   l1         -> liblinear, saga
#   elasticnet -> saga (with an explicit l1_ratio)
ls_reg_C_values = np.logspace(-4, 4, 10)

ls_reg_tuning_parameters = [
    {
        'penalty': ['l2'],
        'C': ls_reg_C_values,
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    {
        'penalty': ['l1'],
        'C': ls_reg_C_values,
        'solver': ['liblinear', 'saga'],
    },
    {
        'penalty': ['elasticnet'],
        'C': ls_reg_C_values,
        'solver': ['saga'],
        # Mixing weights between L1 and L2; required for elastic net.
        'l1_ratio': [0.25, 0.5, 0.75],
    },
]

# Exhaustive grid search scored with 5-fold cross-validation.
ls_reg = GridSearchCV(LogisticRegression(), ls_reg_tuning_parameters, cv=5)

In [None]:
import warnings

# Silence warnings only while this grid search runs.  The previous
# module-level filterwarnings("ignore") stayed active for the rest of the
# session and hid warnings from every later cell as well.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ls_reg.fit(X_train, y_train)

# Display the fitted search object, as the bare fit() call used to.
ls_reg

In [None]:
# Parameter combination with the best cross-validation score.
print(ls_reg.best_params_)

In [None]:
# Predict the held-out split with the tuned logistic regression, then print
# the per-class report followed by the overall accuracy.
y_pred_ls_reg = ls_reg.predict(X_test)
print(classification_report(y_test, y_pred_ls_reg))
print(accuracy_score(y_test, y_pred_ls_reg))

### Customized Model Report

In [None]:
def model_report(X_train, X_test, y_trian, y_test, models, models_name):
    """Print the train and validation accuracy of each fitted model.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed to each model's ``predict`` method.
    y_trian
        True labels for ``X_train``. The misspelled name is kept so existing
        keyword callers keep working.
    y_test
        True labels for ``X_test``.
    models
        Iterable of fitted objects exposing ``predict``.
    models_name
        Display names, paired with ``models`` by position.

    Returns
    -------
    None
        The report is written to standard output.
    """
    for model, model_name in zip(models, models_name):
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        print('Classification Report of {}: '.format(model_name))
        # Score against the labels passed in.  The body used to read the
        # notebook-global `y_train` and ignore this argument entirely.
        print('Train set accuracy:', accuracy_score(y_trian, y_pred_train))
        print('validation set accuracy:', accuracy_score(y_test, y_pred_test))
        print()

In [None]:
# Fitted grid searches and their display names, paired by position.
models = [
    rbf_svm_clf,
    lr_svm_clf,
    ds_clf,
    knn_clf,
    ls_reg,
]
models_name = [
    'SVM(RBF Kernel)',
    'SVM(Linear Kernel)',
    'Decision Tree',
    'KNN',
    'Logistic Regression',
]

# Print train vs. validation accuracy for every model.
model_report(X_train, X_test, y_train, y_test, models, models_name)

## Conclusion

Among the models above, we should rely on those for which:
1. the train set accuracy and the validation set accuracy are close to each other
2. the accuracy is high on both the train set and the validation set

So we can see that the SVM model with the RBF kernel has the best result.