In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence all warnings; this is installed before the plotting imports below.
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV into a DataFrame and preview its first rows.
csv_path='../input/breast-cancer-wisconsin-data/data.csv'
data=pd.read_csv(csv_path)
data.head()

In [None]:
# Remove the 'Unnamed: 32' column. errors='ignore' keeps this cell re-runnable:
# on a second execution the column is already gone and a plain drop would raise.
data=data.drop(columns='Unnamed: 32', errors='ignore')
# Target is the 'diagnosis' column; every other column stays in the feature frame.
y=data.diagnosis
X=data.drop(columns='diagnosis')
X.info()

In [None]:
# Number of missing values in each feature column.
X.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the diagnosis labels as integers.
le=LabelEncoder()
y=le.fit_transform(y)
# Drop the 'id' column from the features. errors='ignore' keeps the cell
# re-runnable once the column has already been removed.
X=X.drop(columns='id', errors='ignore')
from sklearn.model_selection import train_test_split
# 80% train / 20% hold-out; the hold-out (X_val) is then halved into
# X_val1 (used as validation data) and X_test (final evaluation).
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42)
X_val1,X_test,y_val1,y_test=train_test_split(X_val,y_val,test_size=0.5, random_state=42)
print('train set: ', X_train.shape,y_train.shape)
print('test set: ', X_test.shape, y_test.shape)
print('val1 set: ', X_val1.shape, y_val1.shape)

# 1. ANN

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input, Dense, Dropout,Add, BatchNormalization 

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Early stopping: patience of 200 epochs, minimum improvement of 1e-5, and the
# best weights are restored when training stops.
early_stopping=EarlyStopping(
    patience=200,
    min_delta=0.00001,
    restore_best_weights=True,
)
# Halve the learning rate after 200 epochs on a plateau.
# NOTE(review): both callbacks share patience=200, so the learning-rate reduction
# waits as long as early stopping does -- confirm this is intended.
lr_on_plateau=ReduceLROnPlateau(factor=0.5, patience=200)
callbacks=[early_stopping, lr_on_plateau]

In [None]:
def _dense_block(inputs, units, dropout_rate, skip=None):
    """Apply Dense -> (optional residual Add) -> ReLU -> Dropout -> BatchNormalization.

    Args:
        inputs: tensor fed to the Dense layer.
        units: width of the Dense layer.
        dropout_rate: rate passed to the Dropout layer.
        skip: optional tensor added to the Dense output before the activation.

    Returns:
        The output tensor of the BatchNormalization layer.
    """
    x=Dense(units)(inputs)
    if skip is not None:
        x=Add()([x,skip])
    x=Activation('relu')(x)
    x=Dropout(dropout_rate)(x)
    return BatchNormalization()(x)


# The input width follows the training frame instead of a hard-coded 30, and the
# first Dense layer no longer repeats it: the Input tensor already fixes the shape.
input_tensor=Input(shape=(X_train.shape[1],))

# Four 512-unit blocks; the fourth adds the first block's output back in
# (residual connection) before its activation.
block1=_dense_block(input_tensor,512,0.5)
hidden=_dense_block(block1,512,0.5)
hidden=_dense_block(hidden,512,0.5)
hidden=_dense_block(hidden,512,0.5,skip=block1)

# Two 256-unit blocks with lighter dropout, then a single sigmoid output unit.
hidden=_dense_block(hidden,256,0.2)
hidden=_dense_block(hidden,256,0.2)
output_tensor=Dense(1, activation='sigmoid')(hidden)

functional_model=Model(inputs=input_tensor,
                       outputs=output_tensor)


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, binary accuracy metric.
functional_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

In [None]:
# Train on the training split and validate on X_val1 each epoch; the callbacks
# list defined earlier is attached to the run.
fit_options=dict(
    validation_data=(X_val1,y_val1),
    epochs=500,
    batch_size=128,
    callbacks=callbacks,
)
history=functional_model.fit(X_train,y_train,**fit_options)

In [None]:
# Collect the recorded training history into a DataFrame and plot the
# training and validation loss columns.
history_df=pd.DataFrame(data=history.history)
loss_curves=history_df[['loss','val_loss']]
loss_curves.plot()


**Overfitting:** The production of an analysis that corresponds too closely or exactly to a particular set of data, and may therefore fail to fit additional data or predict future observations reliably.[1]
The loss curves of this model may be an example of overfitting (see Figure 1). Training for fewer epochs (15-20) would probably improve the accuracy of the model.

<img src= "https://i.imgur.com/tHiVFnM.png" alt ="Curves" style='width: 500px;'>

Figure 1

In [None]:
# Plot the training and validation binary-accuracy columns.
accuracy_curves=history_df[['binary_accuracy','val_binary_accuracy']]
accuracy_curves.plot();

In [None]:
from sklearn.metrics import mean_squared_error
# Model outputs (sigmoid activations) for the held-out test split.
preds=functional_model.predict(X_test)
# Root-mean-squared error between the true labels and the model outputs.
# Arguments follow sklearn's (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test,preds))

In [None]:
# Compute the ANN's test RMSE from the trained model instead of pasting a number
# from an earlier run, so the value always matches the model that was just fit.
# NOTE(review): this is measured on the sigmoid outputs, whereas error2/error3 are
# measured on hard class predictions -- confirm that comparison is intended.
error=np.sqrt(mean_squared_error(y_test,functional_model.predict(X_test,verbose=0)))

# 2. XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
# Baseline XGBoost classifier with default hyperparameters.
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
# Score on X_test, the same held-out split the ANN is evaluated on, so the
# numbers are comparable across models.
preds=xgb.predict(X_test)
# The printed value is the RMSE of the class predictions (lower is better),
# not an accuracy, so it is labelled accordingly.
print('XGB RMSE: ',np.sqrt(mean_squared_error(y_test,preds)))

# 2.1. Boosting the Accuracy

In [None]:
# Exhaustive 3-fold cross-validated search over the XGBoost hyperparameters below.
# Candidate order is kept as written.
grid_params=dict(
    n_estimators=[100,200,300,400,50],
    learning_rate=[0.010,0.001,0.0001,1],
    max_depth=[5,6,4,7],
)
grid_model=GridSearchCV(XGBClassifier(), grid_params, cv=3)
grid_model.fit(X_train,y_train)
grid_model.best_params_

In [None]:
# XGBoost with fixed hyperparameters
# (presumably taken from the grid search output -- confirm).
xgb_optimized=XGBClassifier(learning_rate= 0.01, max_depth= 4, n_estimators=400)
xgb_optimized.fit(X_train,y_train)
# Score on X_test, the same held-out split the ANN is evaluated on, so the final
# comparison chart uses one evaluation set for every model.
preds=xgb_optimized.predict(X_test)
# RMSE of the class predictions; arguments follow sklearn's (y_true, y_pred) order.
error2=np.sqrt(mean_squared_error(y_test,preds))
print('Error of the optimized XGBClassifier model: ', str(error2))

# 3. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 3-fold cross-validated search over the random-forest hyperparameters below.
grid_params={'n_estimators': [100,200,300,400,50],
            'max_depth':[5,6,4,7]}
# The estimator is constructed here: the name `rf` previously passed in was never
# defined, so this cell raised NameError. A fixed random_state makes the search repeatable.
grid_model=GridSearchCV(estimator=RandomForestClassifier(random_state=42),param_grid=grid_params,cv=3)
grid_model.fit(X_train,y_train)
grid_model.best_params_




In [None]:
# Random forest with fixed hyperparameters
# (presumably taken from the grid search output -- confirm).
# random_state is pinned so the reported error is reproducible across runs.
rf_optimized=RandomForestClassifier(max_depth= 7, n_estimators= 100, random_state=42)
rf_optimized.fit(X_train,y_train)
# Score on X_test, the same held-out split the ANN is evaluated on, so the final
# comparison chart uses one evaluation set for every model.
preds=rf_optimized.predict(X_test)
# RMSE of the class predictions; arguments follow sklearn's (y_true, y_pred) order.
error3=np.sqrt(mean_squared_error(y_test,preds))
print('Error of the Optimized Random Forest Model: ', error3)


In [None]:
# Bar chart comparing the error recorded for each of the three models.
plt.figure(figsize=(10,6))
sns.barplot(x=['XGBClassifier','RandomForest','Functional ANN'],y=[error2,error3,error])
plt.title('Error Rates for Different Predictor Models')
plt.xlabel('$Models$')
# Math mode drops plain spaces, so '$Error Rates$' renders as "ErrorRates";
# the explicit '\ ' keeps the two words apart.
plt.ylabel(r'$Error\ Rates$')
plt.show()

# References:


[1] https://www.lexico.com/definition/overfitting