## Breast Cancer Diagnosis by Machine Learning (Project)

#### Breast cancer diagnosis classification project based on EDA (exploratory data analysis) and a comparison of different machine learning classification algorithms, in order to find the classifier that best diagnoses and classifies tumours as benign (noncancerous) or malignant (cancerous)

### Loading libraries and dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
# Read the raw CSV from the Kaggle input directory into a DataFrame.
csv_path = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)

### Separate target from features

In [None]:
# Preview the first five rows of the raw table (5 is the default of head()).
data.head(n=5)

In [None]:
# (rows, columns) of the raw table.
data.shape

In [None]:
# Keep the column index in `col` and print it to list every column name.
col = data.columns
print(col)

In [None]:
# Target: the diagnosis label of every sample.
y = data['diagnosis']

# Features: everything except the label, the sample id and the 'Unnamed: 32'
# column (presumably a CSV artifact -- TODO confirm).
drop_col = ['Unnamed: 32', 'id', 'diagnosis']
x = data.drop(columns=drop_col)
x.head()

In [None]:
# (rows, columns) of the feature matrix after dropping the three columns above.
x.shape

### Plot diagnosis distribution

In [None]:
# Bar chart of how many samples fall in each diagnosis class.
# The series is passed by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
ax = sns.countplot(x=y, label='counts')

# Look the counts up by label instead of relying on the frequency ordering of
# value_counts(), which silently swaps B and M if the class balance changes.
# Assumes the labels are 'B' (benign) and 'M' (malignant) -- TODO confirm.
class_counts = y.value_counts()
B = class_counts.get('B', 0)
M = class_counts.get('M', 0)
print('number of Benign tumor', B)
print('number of Malignant tumor', M)

In [None]:
# Summary statistics of every feature column.
x.describe()

### Visualize standardised data with Seaborn

In [None]:
# Violin plots for the first 10 features.

# Standardise every feature (subtract the mean, divide by the standard
# deviation) so that all of them can share one value axis.
data_std = (x - x.mean()) / x.std()

# Label column + first 10 standardised features, reshaped to long format
# (one row per sample/feature pair) as seaborn expects.
first_ten = pd.concat([y, data_std.iloc[:, 0:10]], axis=1)
data = pd.melt(first_ten, id_vars='diagnosis', var_name='features', value_name='value')

plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x='features', y='value', hue='diagnosis', split=True, inner='quart')
plt.xticks(rotation=45)

In [None]:
# Violin plots for the next 10 standardised features (columns 10-19 of data_std).

next_ten = pd.concat([y, data_std.iloc[:, 10:20]], axis=1)
data = pd.melt(next_ten, id_vars='diagnosis', var_name='features', value_name='value')

plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x='features', y='value', hue='diagnosis', split=True, inner='quart')
plt.xticks(rotation=45)

In [None]:
# Violin plots for the last 10 standardised features (columns 20-29 of data_std).

last_ten = pd.concat([y, data_std.iloc[:, 20:30]], axis=1)
data = pd.melt(last_ten, id_vars='diagnosis', var_name='features', value_name='value')

plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x='features', y='value', hue='diagnosis', split=True, inner='quart')
plt.xticks(rotation=45)

### Using Joint plots for feature comparison

In [None]:
# In the violin plots, concavity_worst and concave points_worst look almost
# identical, so compare them directly with a regression joint plot.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally.
sns.jointplot(data=x, x='concavity_worst', y='concave points_worst', kind='reg')

# The points scatter tightly around the regression line, which indicates that
# the two features are strongly correlated.

### Observing the distribution of the values and their variance with swarm plots

In [None]:
# Swarm plots, first 10 features.

sns.set(style='whitegrid', palette='muted')

# Standardise the features again so this cell does not depend on the violin
# plot cells having been run.
data_std = (x - x.mean()) / x.std()

first_ten = pd.concat([y, data_std.iloc[:, 0:10]], axis=1)
data = pd.melt(first_ten, id_vars='diagnosis', var_name='features', value_name='value')

plt.figure(figsize=(10, 10))
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis')
plt.xticks(rotation=45)

In [None]:
# Swarm plots, next 10 features (columns 10-19 of the standardised table).

sns.set(style='whitegrid', palette='muted')
data_std = (x - x.mean()) / x.std()

next_ten = pd.concat([y, data_std.iloc[:, 10:20]], axis=1)
data = pd.melt(next_ten, id_vars='diagnosis', var_name='features', value_name='value')

plt.figure(figsize=(10, 10))
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis')
plt.xticks(rotation=45)

In [None]:
# Swarm plots, last 10 features (columns 20-29 of the standardised table).

sns.set(style='whitegrid', palette='muted')
data_std = (x - x.mean()) / x.std()

last_ten = pd.concat([y, data_std.iloc[:, 20:30]], axis=1)
data = pd.melt(last_ten, id_vars='diagnosis', var_name='features', value_name='value')

plt.figure(figsize=(10, 10))
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis')
plt.xticks(rotation=45)

### Observing all pairwise correlation

In [None]:
# Annotated heatmap of the pairwise correlation between all features:
# each cell shows how strongly one feature is correlated with another.
a, ax = plt.subplots(figsize=(18, 18))
corr_all = x.corr()
sns.heatmap(corr_all, annot=True, fmt='.1f', linewidths=0.5, ax=ax)

### Dropping correlated columns from the feature list

In [None]:
# Columns removed because they are strongly correlated with a feature that is
# kept (chosen from the correlation heatmap).
drop_cols = [
    'perimeter_mean',
    'radius_mean',
    'compactness_mean',
    'concave points_mean',
    'radius_se',
    'perimeter_se',
    'radius_worst',
    'perimeter_worst',
    'compactness_worst',
    'concave points_worst',
    'compactness_se',
    'concave points_se',
    'texture_worst',
    'area_worst',
]
df = x.drop(columns=drop_cols)
df.head()

In [None]:
# (rows, columns) of the reduced feature table.
df.shape

In [None]:
# Correlation heatmap of the features that remain after dropping the
# correlated columns.
f, ax = plt.subplots(figsize=(14, 14))
corr_reduced = df.corr()
sns.heatmap(corr_reduced, annot=True, fmt='.1f', linewidths=0.5, ax=ax)

# Feature Selection Techniques to get the prediction and highest accuracy

### Feature extraction using principal component analysis (PCA)

In [None]:
## Split the original (full) feature set and labels again, then fit PCA.

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Mean-normalise with statistics taken from the training split only.
# The test split must be transformed with the *training* mean and range:
# using its own statistics puts the two splits on different scales and lets
# information about the test set leak into preprocessing.
train_mean = x_train.mean()
train_range = x_train.max() - x_train.min()
x_train_norm = (x_train - train_mean) / train_range
x_test_norm = (x_test - train_mean) / train_range

# Keep all components; the explained-variance ratio is inspected afterwards.
pca = PCA()
pca.fit(x_train_norm)

In [None]:
# (rows, columns) of the normalised training split.
x_train_norm.shape

In [None]:

# Cumulative explained-variance ratio versus the number of principal
# components kept.
cum_var = np.cumsum(pca.explained_variance_ratio_)

# Number the components from 1 so that the x axis really is "number of
# components"; plotting the bare array would start the axis at index 0.
n_components = np.arange(1, len(cum_var) + 1)

plt.figure(1, figsize=(10, 8))
sns.lineplot(x=n_components, y=cum_var)
plt.xlabel('number of components/features')
plt.ylabel('Cumulative explained variance')

# The curve shows the share of variance retained, not classification accuracy.
# The original note put ~99% of the variance at about 16-17 components; that
# was read off a 0-based axis -- TODO re-check against this 1-based plot.

## 1) XGBoost Classifier

### 1_classification using XGBoost (minimal feature selection)

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

In [None]:
# Re-split using the reduced feature table `df` (correlated columns removed).
x_train, x_test, y_train, y_test = train_test_split(
    df, y, test_size=0.3, random_state=42
)

# Baseline XGBoost model on the reduced feature set; fit() returns the fitted
# estimator itself.
# NOTE(review): y still holds the raw diagnosis labels; newer xgboost releases
# may require integer-encoded classes -- confirm against the installed version.
clf_1 = XGBClassifier(random_state=42).fit(x_train, y_train)

In [None]:
# Evaluate the baseline XGBoost model on the held-out split.
y_pred_1 = clf_1.predict(x_test)
acc_1 = accuracy_score(y_test, y_pred_1)
print('accuracy is : ', acc_1)

# Confusion matrix drawn as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred_1)
sns.heatmap(cm, annot=True, fmt='d')

### 2_Univariate feature selection and XGBoost

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Score every training feature with the chi-squared test and keep the 10 best.
# fit() returns the selector itself, so `select_feature` is the fitted selector.
selector = SelectKBest(chi2, k=10)
select_feature = selector.fit(x_train, y_train)

# Scores are listed in the same order as the column names printed below.
print('score list: ', select_feature.scores_)
print('feature list: ', x_train.columns)

In [None]:
# (rows, columns) of the training split before univariate selection.
x_train.shape

In [None]:
# Keep only the 10 features with the highest SelectKBest scores.
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)

# Train XGBoost on the selected features; fit() returns the fitted estimator.
clf_2 = XGBClassifier().fit(x_train_2, y_train)

# Evaluate on the held-out split.
y_pred_2 = clf_2.predict(x_test_2)
acc_2 = accuracy_score(y_test, y_pred_2)
print('accuracy is: ', acc_2)
cm = confusion_matrix(y_test, y_pred_2)
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# (rows, columns) of the training split after keeping the 10 selected features.
x_train_2.shape

### 3_Recursive feature elimination with cross validation

In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination around an XGBoost estimator:
#   step=1 -> eliminate one feature per iteration
#   cv=5   -> score each candidate subset with 5-fold cross validation
clf_3 = XGBClassifier()
rfe_selector = RFECV(estimator=clf_3, step=1, cv=5, scoring='accuracy', n_jobs=-1)
rfecv = rfe_selector.fit(x_train, y_train)  # fit() returns the selector itself

print('optimal number of features: ', rfecv.n_features_)
print('best features: ', x_train.columns[rfecv.support_])

In [None]:
# Predict on the held-out split through the fitted RFECV selector and report
# the accuracy.
y_pred_3 = rfecv.predict(x_test)
print('accuracy is: ', accuracy_score(y_test, y_pred_3))

## 2) Logistic Regression classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the reduced feature set, with the iteration cap
# raised to 200.
classifier_2 = LogisticRegression(max_iter=200)
classifier_2.fit(X=x_train, y=y_train)

In [None]:
# Evaluate the logistic regression model on the held-out split.
y_pred_lg = classifier_2.predict(x_test)
acc_lg = accuracy_score(y_test, y_pred_lg)
print('accuracy is: ', acc_lg)

# Confusion matrix as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred_lg)
sns.heatmap(cm, annot=True, fmt='d')

## 3) SVM classifier

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel on the reduced feature set.
classifier_3 = SVC(kernel='rbf')
classifier_3.fit(X=x_train, y=y_train)

In [None]:
# Evaluate the SVM on the held-out split.
y_pred_svm = classifier_3.predict(x_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
print('accuracy is: ', acc_svm)

# Confusion matrix as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred_svm)
sns.heatmap(cm, annot=True, fmt='d')

## 4) Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the reduced feature set, default settings.
classifier_4 = GaussianNB()
classifier_4.fit(X=x_train, y=y_train)

In [None]:
# Evaluate the naive Bayes model on the held-out split.
y_pred_nb = classifier_4.predict(x_test)
acc_nb = accuracy_score(y_test, y_pred_nb)
print('Accuracy of model: ', acc_nb)

# Confusion matrix as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred_nb)
sns.heatmap(cm, annot=True, fmt='d')

## 5) Decision Tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the reduced feature set.
# The seed is fixed so that the fitted tree, and therefore the reported
# accuracy, is reproducible from run to run -- consistent with the seeded
# train/test split and with clf_1.
classifier_5 = DecisionTreeClassifier(min_samples_split=2, random_state=42)
classifier_5.fit(x_train, y_train)

In [None]:
# Evaluate the decision tree on the held-out split.
y_pred_dt = classifier_5.predict(x_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print('accuracy is: ', acc_dt)

# Confusion matrix as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred_dt)
sns.heatmap(cm, annot=True, fmt='d')

## 6) Random forest classifier

### 1_minimal feature selection - 16 features

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the reduced feature set.
# The seed is fixed so that the bootstrap samples and feature draws, and
# therefore the reported accuracy, are reproducible from run to run --
# consistent with the seeded train/test split and with clf_1.
classifier_6 = RandomForestClassifier(random_state=42)
classifier_6.fit(x_train, y_train)

In [None]:
# Evaluate the random forest on the held-out split.
y_pred_rf = classifier_6.predict(x_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print('accuracy is: ', acc_rf)

# Confusion matrix as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot=True, fmt='d')

### 2_univariate feature selection

In [None]:
# Reference only (intentionally commented out): the SelectKBest fit/transform
# below was already executed in the XGBoost section, so x_train_2 and x_test_2
# are reused as-is by the random forest cell.

# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2

# select_feature = SelectKBest(chi2, k =10).fit(x_train, y_train)

# print('score list: ', select_feature.scores_)
# print('feature list: ', x_train.columns)

# x_train_2 = select_feature.transform(x_train)
# x_test_2 = select_feature.transform(x_test)

We have already performed the above SelectKBest steps in the XGBoost classifier section, so the selected features (`x_train_2`, `x_test_2`) are reused here.

In [None]:
# Random forest trained on the 10 features chosen by SelectKBest
# (x_train_2 / x_test_2).
# The seed is fixed so that the reported accuracy is reproducible.
clf_7 = RandomForestClassifier(random_state=42)
clf_7.fit(x_train_2, y_train)

# Predict with the random forest that was just trained. The previous code
# called clf_2 (the XGBoost model) here, so the reported accuracy and confusion
# matrix belonged to XGBoost rather than to this classifier.
y_pred_rf10 = clf_7.predict(x_test_2)
print('accuracy is: ', accuracy_score(y_test, y_pred_rf10))
cm = confusion_matrix(y_test, y_pred_rf10)
sns.heatmap(cm, annot=True, fmt='d')