In [None]:
#Importing libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import functools
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense, Dropout
sns.set()

> **MODELS USED**
> * XGB
> * SVM
> * KNN
> * Decision Tree
> * Random Forest
> * Logistic Regression
> * ANN

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the dataset CSV into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Every value in the 'Unnamed: 32' column is null (NaN), so the column carries no information and will be dropped.

In [None]:
# Number of distinct values in the 'id' column.
id_column = df['id']
id_column.nunique()

All values in 'id' are unique, so the column carries no predictive signal and can be dropped.
(However, 'id' could have been assigned on the basis of age, which could play a role, but it's a long shot.)

In [None]:
# Drop 'id' (a unique identifier) and 'Unnamed: 32' (entirely NaN).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Class-imbalance check: print the per-class counts, then plot them.
diagnosis_counts = df['diagnosis'].value_counts()
print(diagnosis_counts)
sns.countplot(x='diagnosis', data=df)
plt.show()

Pass: the class imbalance is mild enough to proceed without resampling.

In [None]:
# Continuous variables: every column stored as float64.
# `np.float` (a plain alias of the builtin float, i.e. float64) was removed
# from NumPy, so the dtype is compared against np.float64 explicitly.
ContVar = df.columns[df.dtypes == np.float64].tolist()

# One density plot per continuous column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# sns.histplot(..., kde=True, stat='density') once the installed seaborn
# version is confirmed to provide it.
for i in ContVar:
    sns.distplot(df[i],color='red',label=i,kde=True)
    plt.legend()
    plt.show()

Density distributions for the continuous variables.

In [None]:
# Target vector: the 'diagnosis' column; feature matrix: every other column.
y = df['diagnosis'].values
X = df.drop(columns='diagnosis').values

In [None]:
# Encode the diagnosis labels as integers; the fitted encoder is kept in
# `le_y` so the original labels can be recovered later.
le_y = LabelEncoder().fit(y)
y = le_y.transform(y)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Accumulator for the per-model metric rows produced by apply_model.
CompleteSummary = []

# Every classifier to evaluate, as [display name, estimator] pairs.
allClf = [
    ['DECISION TREE', DecisionTreeClassifier(random_state=0)],
    ['RANDOM FOREST', RandomForestClassifier(random_state=0)],
    ['LOGISTIC REGRESSION', LogisticRegression(random_state=0)],
    ['XGB', XGBClassifier(eval_metric= 'error')],
    ['SVM', SVC(random_state=0,probability=True)],
    ['KNN', KNeighborsClassifier()],
]

In [None]:
def apply_model(model,X_train, X_test, y_train, y_test,CompleteSummary):
    """Fit one classifier, print its evaluation metrics and record them.

    Parameters
    ----------
    model : sequence
        ``[display_name, estimator]`` pair. The estimator is fitted in place
        on the training data.
    X_train, X_test : array-like
        Feature matrices used for fitting and for hold-out evaluation.
    y_train, y_test : array-like
        Binary target labels aligned with ``X_train`` / ``X_test``.
    CompleteSummary : list
        Mutated in place: one row
        ``[name, accuracy %, CV mean accuracy %, ROC AUC, F1, precision, recall]``
        is appended for this model.

    Returns
    -------
    None
        Results are printed, plotted and appended to ``CompleteSummary``.
    """
    name, clf = model[0], model[1]
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # ROC AUC is a ranking metric and needs continuous scores; feeding it the
    # hard 0/1 predictions collapses the curve to a single operating point.
    # Prefer the positive-class probability, then the decision function, and
    # only fall back to the hard predictions when neither is available.
    if hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(X_test)[:, 1]
    elif hasattr(clf, 'decision_function'):
        y_score = clf.decision_function(X_test)
    else:
        y_score = y_pred

    confmat = confusion_matrix(y_test, y_pred)
    # 15-fold cross-validated accuracy on the training split only.
    accuracies = cross_val_score(estimator = clf, X = X_train, y = y_train, cv = 15)
    accuracy = accuracy_score(y_test, y_pred)*100
    cv_accuracy = accuracies.mean()*100
    roc = roc_auc_score(y_test, y_score)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(name)
    print('CONFUSION MATRIX :')
    # fmt='d' keeps the integer counts from being shown in scientific notation.
    ax = sns.heatmap(confmat, annot=True, fmt='d')
    # Explicit y-limits: presumably a workaround for heatmap rows being cut
    # off in some matplotlib versions (TODO confirm). Note that [0, 2] puts
    # row 0 at the bottom of the plot.
    ax.set_ylim([0,2])
    plt.show()
    print('ACCURACY SCORE :',accuracy)
    print('K-F VALIDATION MEAN ACCURACY :',cv_accuracy)
    print('ROC AUC SCORE :',roc)
    print('F1 :',f1)
    print('PRECISION :',precision)
    print('RECALL :',recall)

    # Row order must match the column order used to build the summary frame.
    CompleteSummary.append([name, accuracy, cv_accuracy, roc, f1, precision, recall])
    print('x'.center(50,'-'))

In [None]:
#Apply apply_model to every classifier on the scaled features.
# A plain loop replaces list(map(...)): the calls are made only for their side
# effects, and the map version left a meaningless [None, None, ...] as output.
# Clearing first keeps the cell re-runnable without duplicating summary rows.
CompleteSummary.clear()
for model in allClf:
    apply_model(model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, CompleteSummary=CompleteSummary)

**SUMMARY**

In [None]:
# Tabulate the per-model metrics, best hold-out accuracy first
# (ties broken by cross-validated accuracy).
summary_columns = ['Model Name', 'Accuracy Score', 'K-F Valid Mean Accuracy','ROC AUC Score', 'F1', 'Precision', 'Recall']
summary_df = pd.DataFrame(CompleteSummary, columns=summary_columns).sort_values(
    by=['Accuracy Score','K-F Valid Mean Accuracy'],
    ascending=False,
)
summary_df

In [None]:
# Horizontal bars comparing hold-out and cross-validated accuracy per model.
# The columns are passed as a list rather than a set: a set has no defined
# order (so bar and legend order could change between runs) and set indexers
# are rejected by recent pandas versions.
ax=summary_df.plot.barh(x='Model Name', y=['Accuracy Score', 'K-F Valid Mean Accuracy'],figsize=(16,9))
ax.legend(bbox_to_anchor=(1,1))

Trying a simple ANN

In [None]:
#Simple NN
def nn(inp):
    """Build and compile a small fully connected binary classifier.

    Architecture: two hidden Dense layers of 24 ReLU units, each followed by
    10% dropout, and a single sigmoid output unit. Compiled with the Adam
    optimizer, binary cross-entropy loss and accuracy as the tracked metric.

    Parameters
    ----------
    inp : int
        Number of input features.

    Returns
    -------
    The compiled, untrained Sequential model.
    """
    layers = [
        Dense(24, input_dim=inp, activation='relu'),
        Dropout(rate=0.1),
        Dense(24, activation='relu'),
        Dropout(rate=0.1),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics= ['accuracy'])
    return network

In [None]:
# Inspect the shape of the scaled training matrix.
X_train.shape

In [None]:
# Size the input layer from the data instead of hard-coding 30, so the cell
# stays valid if the feature matrix changes width.
clf=nn(X_train.shape[1])
# The last 20% of the training rows are held out by Keras for validation.
clf_fit=clf.fit(X_train, y_train,validation_split= 0.2, epochs=100, batch_size=12)

In [None]:
# Sequential.predict_classes was removed from Keras; for a single sigmoid
# output the equivalent is thresholding the predicted probability at 0.5.
y_pred = (clf.predict(X_test) > 0.5).astype('int32').ravel()
print('Accuracy :',accuracy_score(y_test,y_pred)*100)

In [None]:
# Training vs. validation accuracy per epoch, from the Keras fit history.
history = clf_fit.history
fig, ax = plt.subplots()
ax.plot(history['accuracy'], label='Training accuracy')
ax.plot(history['val_accuracy'], label='Validation accuracy')
ax.set_title('ACCURACY PLOT')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epochs')
ax.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch, from the Keras fit history.
history = clf_fit.history
fig, ax = plt.subplots()
ax.plot(history['loss'], label='Training loss')
ax.plot(history['val_loss'], label='Validation loss')
ax.set_title('LOSS PLOT')
ax.set_ylabel('Loss')
ax.set_xlabel('Epochs')
ax.legend()
plt.show()

**DIMENSIONALITY REDUCTION USING PCA (10 Components)**

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto 10 principal components. The projection is
# fitted on the training rows only and then applied to the test rows.
# NOTE: X_train / X_test are overwritten here, so every later cell works on
# the reduced features.
pca = PCA(n_components=10)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# The last cumulative value is the total share of variance that is retained.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
print('VARIANCE EXPLAINED :', cumulative_variance[-1])

~95% variance is explained using 10 components. Can move forward.

In [None]:
# Re-evaluate every classifier on the PCA-reduced features.
# A plain loop replaces list(map(...)): the calls are made only for their side
# effects, and the map version left a meaningless [None, None, ...] as output.
PCACompleteSummary=[]
for model in allClf:
    apply_model(model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, CompleteSummary=PCACompleteSummary)

**SUMMARY PCA(10 Components)**

In [None]:
# Tabulate the PCA-run metrics, best hold-out accuracy first
# (ties broken by cross-validated accuracy).
pca_summary_columns = ['Model Name', 'Accuracy Score', 'K-F Valid Mean Accuracy','ROC AUC Score', 'F1', 'Precision', 'Recall']
PCAsummary_df = pd.DataFrame(PCACompleteSummary, columns=pca_summary_columns)
PCAsummary_df = PCAsummary_df.sort_values(by=['Accuracy Score','K-F Valid Mean Accuracy'], ascending=False)
PCAsummary_df

In [None]:
# Horizontal bars comparing hold-out and cross-validated accuracy per model.
# The columns are passed as a list rather than a set: a set has no defined
# order (so bar and legend order could change between runs) and set indexers
# are rejected by recent pandas versions.
ax=PCAsummary_df.plot.barh(x='Model Name', y=['Accuracy Score', 'K-F Valid Mean Accuracy'],figsize=(16,9))
ax.legend(bbox_to_anchor=(1,1))

Trying a simple ANN

In [None]:
# Size the input layer from the data instead of hard-coding 10, so it always
# matches the number of PCA components kept.
clf=nn(X_train.shape[1])
# The last 20% of the training rows are held out by Keras for validation.
clf_fit=clf.fit(X_train, y_train,validation_split= 0.2, epochs=100, batch_size=12)

In [None]:
# Sequential.predict_classes was removed from Keras; for a single sigmoid
# output the equivalent is thresholding the predicted probability at 0.5.
y_pred = (clf.predict(X_test) > 0.5).astype('int32').ravel()
print('Accuracy :',accuracy_score(y_test,y_pred)*100)

In [None]:
# Training vs. validation accuracy per epoch for the ANN fitted above.
accuracy_curves = (
    ('accuracy', 'Training accuracy'),
    ('val_accuracy', 'Validation accuracy'),
)
for history_key, curve_label in accuracy_curves:
    plt.plot(clf_fit.history[history_key], label=curve_label)
plt.title('ACCURACY PLOT')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch for the ANN fitted above.
loss_curves = (
    ('loss', 'Training loss'),
    ('val_loss', 'Validation loss'),
)
for history_key, curve_label in loss_curves:
    plt.plot(clf_fit.history[history_key], label=curve_label)
plt.title('LOSS PLOT')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

**FIN**

(To do: hyperparameter tuning)