In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for data visualization
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, name) for name in filenames):
        print(file_path)

# Any results you write to the current directory are saved as output.


### KNN


In [None]:
# Location of the cleaned UCI Breast Cancer Wisconsin CSV
data = '../input/ucibreastcancerwisconsincleaned/UCI-breast-cancer-wisconsin-data.csv'

# Read every line as data (no header inference), then discard the row labelled 0
# (presumably the CSV's own header line -- confirm against the file).
df = pd.read_csv(data, header=None).drop(index=0)

df.shape

We can see that there are 683 instances and 10 attributes in the data set. 
I removed the `Id` attribute.

In the dataset description, it is given that there are 10 attributes and 1 `Class` which is the target variable. Since the `Id` attribute has been removed, we have 9 feature attributes and 1 target variable.

### View top 5 rows of dataset

In [None]:
# show the first five rows of the dataset

df.head(n=5)

### Rename column names

We can see that the dataset does not have proper column names. The columns are merely labelled as 0,1,2.... and so on. We should give proper names to the columns. I will do it as follows:-

In [None]:
# Descriptive names: nine feature columns followed by the target column
col_names = [
    'Clump_thickness',
    'Uniformity_Cell_Size',
    'Uniformity_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Class',
]

df.columns = col_names

df.columns

We can see that the column names are renamed. Now, the columns have meaningful names.

In [None]:
# preview the dataset again, now with the descriptive column names

df.head(n=5)

### View summary of dataset


In [None]:
# Summary of the dataset: index range, column names, non-null counts, dtypes and memory usage

df.info()

We can see that the `Id` column has been removed from the dataset. 

We can see that there are 9 numerical variables and 1 categorical variable in the dataset. I will check the frequency distribution of values in the variables to confirm the same.

### Check data types of columns of dataframe

In [None]:
# Data type of each column (before the numeric conversion in the next cell)
df.dtypes

In [None]:
# Convert every column to a numeric dtype; values that cannot be parsed become NaN.
df = df.apply(pd.to_numeric, errors='coerce')

In [None]:
# Data type of each column after the numeric conversion
df.dtypes

Now, we can see that all the columns of the dataframe are of type numeric.

### Summary of variables


- There are 10 numerical variables in the dataset.


- All of the variables are of discrete type.


- Out of all the 10 variables, the first 9 variables are feature variables and last variable `Class` is the target variable.




# **9. Declare feature vector and target variable** <a class="anchor" id="9"></a>
 

In [None]:
# Feature matrix: every column except the target
X = df.drop(columns='Class')

# Target vector: the class label column
Y = df.loc[:, 'Class']

X.head(n=5)

 
 

In [None]:
# frequency of each class label in the target
class_counts = Y.value_counts()
print(class_counts)

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible

from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, Y, test_size=0.25, random_state=8)
X_train, X_test, Y_train, Y_test = split_parts


In [None]:
# (rows, columns) of the training and test feature matrices

tuple(part.shape for part in (X_train, X_test))

In [None]:
# first five rows of the training features
X_train.head(n=5)

In [None]:
# first five rows of the test features
X_test.head(n=5)

We now have training and testing set ready.

In [None]:
# Keep the feature names so they can be re-attached after scaling
cols = X_train.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only ...
scaler = StandardScaler()
scaler.fit(X_train)

# ... and apply the same transformation to both splits
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Re-wrap the scaled array as a DataFrame with the original feature names.
# Pass the Index itself: wrapping it in a list (``[cols]``) makes pandas build
# MultiIndex columns whose labels are tuples rather than the plain names.
X_train = pd.DataFrame(X_train, columns=cols)

In [None]:
# Re-wrap the scaled array as a DataFrame with the original feature names.
# Pass the Index itself: wrapping it in a list (``[cols]``) makes pandas build
# MultiIndex columns whose labels are tuples rather than the plain names.
X_test = pd.DataFrame(X_test, columns=cols)

In [None]:
# shapes of the scaled training and test frames
train_shape, test_shape = X_train.shape, X_test.shape
print(train_shape, test_shape)

In [None]:
# first five rows of the scaled training features
X_train.head(n=5)

We now have the `X_train` dataset ready to be fed into the K-Nearest Neighbours classifier. I will do it as follows.

# **13. Fit K Neighbours Classifier to the training set** <a class="anchor" id="13"></a>
 

In [None]:
# import KNeighborsClassifier from sklearn
from sklearn.neighbors import KNeighborsClassifier

# k = 5 neighbours, each vote weighted by the inverse of its distance
knn = KNeighborsClassifier(weights='distance', n_neighbors=5)

# fit the model to the training set
knn.fit(X=X_train, y=Y_train)


# **14. Predict test-set results** 
 

In [None]:
# predicted class label for every row of the test set
Y_pred = knn.predict(X=X_test)

Y_pred

# **15. Check train and test errors**

In [None]:
from sklearn.metrics import accuracy_score

# error = 1 - accuracy on the held-out test set
knn_test_error = 1 - accuracy_score(Y_test, Y_pred)
print('Model test error: {0:0.4f}'.format(knn_test_error))

Here, **Y_test** holds the true class labels and **Y_pred** holds the predicted class labels for the test set.

In [None]:
# error = 1 - accuracy on the data the model was fitted to
Y_pred_train = knn.predict(X_train)
knn_train_error = 1 - accuracy_score(Y_train, Y_pred_train)
print('Training-set error: {0:0.4f}'.format(knn_train_error))

# **17. Confusion matrix**  



In [None]:
# Print the confusion matrix of the KNN model (k = 5) and slice it into four pieces

from sklearn.metrics import confusion_matrix

# Rows are true classes and columns are predicted classes, both in sorted label order.
cm = confusion_matrix(Y_test, Y_pred)

print('Confusion matrix\n\n', cm)

# Binary layout with the second (larger) label as the positive class:
#   [[TN, FP],
#    [FN, TP]]
# NOTE(review): the larger label is presumably the malignant class -- confirm against the data.
print('\nTrue Positives(TP) = ', cm[1,1])

print('\nTrue Negatives(TN) = ', cm[0,0])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

# **18. Classification metrics** 

In [None]:
# confusion_matrix rows are true classes and columns are predicted classes, so with
# the second label as the positive class the layout is [[TN, FP], [FN, TP]].
TP = cm[1,1]
TN = cm[0,0]
FP = cm[0,1]
FN = cm[1,0]

# Precision


In [None]:
# precision = TP / (TP + FP)

predicted_positive = TP + FP
precision = TP / predicted_positive

print('Precision : {0:0.4f}'.format(precision))


### Recall or TP Rate




In [None]:
# recall = TP / (TP + FN)
actual_positive = TP + FN
recall = TP / actual_positive

print('Recall or Sensitivity : {0:0.4f}'.format(recall))

### f1-score


In [None]:
# F1 is the harmonic mean of precision and recall
f1_score = (2 * precision * recall) / (precision + recall)
print("F1_score: ", f1_score)


### Decision Tree

In [None]:
# import DecisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier

# Gini-impurity tree, limited to depth 3, with a fixed seed for repeatable results

tree_params = dict(criterion='gini', max_depth=3, random_state=0)
gini = DecisionTreeClassifier(**tree_params)

In [None]:
# train the decision tree on the training split
gini.fit(X=X_train, y=Y_train)

In [None]:
# Predict on both splits, then report the error (1 - accuracy) of each
Y_pred_dt = gini.predict(X_test)
Y_pred_train_dt = gini.predict(X_train)

test_error_dt = 1 - accuracy_score(Y_test, Y_pred_dt)
train_error_dt = 1 - accuracy_score(Y_train, Y_pred_train_dt)

print('Model Test error with criterion gini index: {0:0.4f}'.format(test_error_dt))

print('Model Train error with criterion gini index: {0:0.4f}'.format(train_error_dt))

In [None]:
# Rows are true classes and columns are predicted classes, both in sorted label order.
cm_dt = confusion_matrix(Y_test, Y_pred_dt)

# Binary layout with the second (larger) label as the positive class:
#   [[TN, FP],
#    [FN, TP]]
# NOTE(review): the larger label is presumably the malignant class -- confirm against the data.
TP_dt=cm_dt[1,1]
TN_dt=cm_dt[0,0]
FP_dt=cm_dt[0,1]
FN_dt=cm_dt[1,0]

print('Confusion matrix\n\n', cm_dt)

print('\nTrue Positives(TP) = ', TP_dt)

print('\nTrue Negatives(TN) = ', TN_dt)

print('\nFalse Positives(FP) = ', FP_dt)

print('\nFalse Negatives(FN) = ', FN_dt)


In [None]:
# precision = TP / (TP + FP)
precision_dt = TP_dt / (TP_dt + FP_dt)
print('Precision : {0:0.4f}'.format(precision_dt))

# recall = TP / (TP + FN)
recall_dt = TP_dt / (TP_dt + FN_dt)
print('Recall or Sensitivity : {0:0.4f}'.format(recall_dt))

# F1 = harmonic mean of precision and recall
f1_numerator_dt = 2 * precision_dt * recall_dt
f1_score_dt = f1_numerator_dt / (precision_dt + recall_dt)
print('F1 score : {0:0.4f}'.format(f1_score_dt))
 

### Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings
gnb = GaussianNB()

# train on the training split
gnb.fit(X=X_train, y=Y_train)

In [None]:
# predicted class label for every row of the test set
Y_pred_nbc = gnb.predict(X=X_test)

Y_pred_nbc

In [None]:
# error = 1 - accuracy on the held-out test set
test_error_nbc = 1 - accuracy_score(Y_test, Y_pred_nbc)
print('Model Test error: {0:0.4f}'.format(test_error_nbc))

In [None]:
# error = 1 - accuracy on the data the model was fitted to
Y_pred_train_nbc = gnb.predict(X_train)

train_error_nbc = 1 - accuracy_score(Y_train, Y_pred_train_nbc)
print('Model Train error: {0:0.4f}'.format(train_error_nbc))

The training-set accuracy is 0.9629 while the test-set accuracy is 0.9649 (the cells above print the corresponding errors, i.e. 1 − accuracy). These two values are quite comparable, so there is no sign of overfitting.

In [None]:
# Rows are true classes and columns are predicted classes, both in sorted label order.
cm_nbc = confusion_matrix(Y_test, Y_pred_nbc)

print('Confusion matrix\n\n', cm_nbc)

# Binary layout with the second (larger) label as the positive class:
#   [[TN, FP],
#    [FN, TP]]
# NOTE(review): the larger label is presumably the malignant class -- confirm against the data.
print('\nTrue Positives(TP) = ', cm_nbc[1,1])

print('\nTrue Negatives(TN) = ', cm_nbc[0,0])

print('\nFalse Positives(FP) = ', cm_nbc[0,1])

print('\nFalse Negatives(FN) = ', cm_nbc[1,0])

In [None]:
# confusion_matrix rows are true classes and columns are predicted classes, so with
# the second label as the positive class the layout is [[TN, FP], [FN, TP]].
TP_nbc = cm_nbc[1,1]
TN_nbc = cm_nbc[0,0]
FP_nbc = cm_nbc[0,1]
FN_nbc = cm_nbc[1,0]

In [None]:
# precision = TP / (TP + FP)

predicted_positive_nbc = TP_nbc + FP_nbc
precision_nbc = TP_nbc / predicted_positive_nbc

print('Precision : {0:0.4f}'.format(precision_nbc))

In [None]:
# recall = TP / (TP + FN)
actual_positive_nbc = TP_nbc + FN_nbc
recall_nbc = TP_nbc / actual_positive_nbc

print('Recall or Sensitivity : {0:0.4f}'.format(recall_nbc))

In [None]:
# F1 is the harmonic mean of precision and recall
f1_score_nbc = (2 * precision_nbc * recall_nbc) / (precision_nbc + recall_nbc)
print('F1 score: {0:0.4f}'.format(f1_score_nbc))

### ANN - 

In [None]:
import numpy as np
import tensorflow as tf

In [None]:
# Reload the same CSV for the neural-network section, naming the columns at read time
data = '../input/ucibreastcancerwisconsincleaned/UCI-breast-cancer-wisconsin-data.csv'
col_names = [
    'Clump_thickness',
    'Uniformity_Cell_Size',
    'Uniformity_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Class',
]

# header=None makes pandas treat every line as data; the row labelled 0 is then discarded
df_ann = pd.read_csv(data, names=col_names, header=None).drop(index=0)

# Raw values as a 2-D NumPy array (the last column is the class label)
dataset = df_ann.to_numpy()
dataset

In [None]:
# Features are every column but the last; cast explicitly to float because the
# frame was read without dtype inference (values presumably arrive as strings).
X = dataset[:, :-1].astype(float)
Y = dataset[:, -1]

# Encode the class labels as consecutive integers (0, 1, ...)
from sklearn.preprocessing import LabelEncoder
labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(Y)

# Split BEFORE scaling so that no statistics from the validation/test rows leak
# into the scaler. Both splits are seeded so the notebook is reproducible.
from sklearn.model_selection import train_test_split
X_train_and_val, X_test, Y_train_and_val, Y_test = train_test_split(X, y, test_size=0.25, random_state=8)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_and_val, Y_train_and_val, test_size=0.1, random_state=8)

# Fit the scaler on the training rows only, then apply it everywhere
# (same procedure as in the KNN section).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_train_and_val = scaler.transform(X_train_and_val)
# Kept for any later cell that expects the fully scaled feature matrix.
X_scale = scaler.transform(X)

print(X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape)
 


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
# Baseline network: two 32-unit ReLU hidden layers and a single sigmoid output unit
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(9,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
def compile_and_fit(model,verbose):
    """Compile ``model`` and train it on the module-level training data.

    The model is compiled with the SGD optimizer, binary cross-entropy loss and
    an accuracy metric, then fitted on ``X_train`` / ``Y_train`` for 100 epochs
    with a batch size of 32, using ``X_val`` / ``Y_val`` as validation data.

    Parameters
    ----------
    model : model object exposing ``compile`` and ``fit`` (e.g. a Keras model)
    verbose : verbosity level forwarded unchanged to ``model.fit``

    Returns
    -------
    Whatever ``model.fit`` returns (the training history).
    """
    model.compile(
        loss='binary_crossentropy',
        optimizer='sgd',
        metrics=['accuracy'],
    )
    return model.fit(
        X_train, Y_train,
        validation_data=(X_val, Y_val),
        epochs=100,
        batch_size=32,
        verbose=verbose,
    )

In [None]:
# Train the baseline network (verbose=1 is forwarded to fit) and keep the returned training history
hist=compile_and_fit(model,verbose=1)

In [None]:
def error_model(model):
    """Print the train and test error of ``model``.

    Each error is ``1 - model.evaluate(...)[1]``, i.e. one minus the second
    value returned by ``evaluate`` (the accuracy metric when the model was
    compiled via ``compile_and_fit``). Uses the module-level ``X_train`` /
    ``Y_train`` and ``X_test`` / ``Y_test``.
    """
    train_accuracy = model.evaluate(X_train, Y_train)[1]
    print("Train error:", 1 - train_accuracy)

    test_accuracy = model.evaluate(X_test, Y_test)[1]
    print("Test error:", 1 - test_accuracy)

In [None]:
def metrics(model):
    """Print the confusion matrix, precision, recall and F1 of ``model`` on the test set.

    Predictions for the module-level ``X_test`` are rounded to the nearest
    integer to obtain class labels and compared with ``Y_test``. The second
    label (1 after label encoding) is treated as the positive class.

    Parameters
    ----------
    model : object exposing ``predict(X)``; presumably a fitted Keras model
        with a sigmoid output -- confirm at the call site.
    """
    y_pred = model.predict(X_test)
    # Rows are true classes and columns are predicted classes, in sorted label order.
    cm_ann = confusion_matrix(Y_test, np.rint(y_pred))
    print("Confusion Matrix:\n",cm_ann)

    # Binary layout with the second label as the positive class:
    #   [[TN, FP],
    #    [FN, TP]]
    TP_ann=cm_ann[1,1]
    TN_ann=cm_ann[0,0]
    FP_ann=cm_ann[0,1]
    FN_ann=cm_ann[1,0]
    print("TP :",TP_ann)
    print("TN :",TN_ann)
    print("FP :",FP_ann)
    print("FN :",FN_ann)

    # precision = TP / (TP + FP)
    precision_ann = TP_ann / float(TP_ann + FP_ann)

    print('Precision : {0:0.4f}'.format(precision_ann))

    # recall = TP / (TP + FN)
    recall_ann = TP_ann / float(TP_ann + FN_ann)

    print('Recall or Sensitivity : {0:0.4f}'.format(recall_ann))

    # F1 = harmonic mean of precision and recall
    f1_score_ann=(2*precision_ann*recall_ann)/(precision_ann+recall_ann)

    print('F1 score : {0:0.4f}'.format(f1_score_ann))
     

In [None]:
# Summarise the baseline ANN: train/test error first, then the confusion-matrix metrics
print("ANN metrics")
error_model(model)
print()
metrics(model)

# Q2

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_loss(hist):
    """Plot the training and validation loss curves stored in ``hist``.

    Parameters
    ----------
    hist : object whose ``history`` mapping has ``'loss'`` and ``'val_loss'``
        sequences (one value per epoch), as returned by ``compile_and_fit``.
    """
    history = hist.history
    # Training curve first, validation second -- the legend relies on this order.
    for curve_key in ('loss', 'val_loss'):
        plt.plot(history[curve_key])
    plt.title('Model loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['Train', 'Val'], loc='upper right')
    plt.show()

### 1) ReLU 

In [None]:

# Report the errors and loss curves of the baseline ReLU network trained earlier
print("ReLU activation")
error_model(model)
print()
# metrics(model)
plot_loss(hist)

### 2) Leaky ReLU

In [None]:
# Same 32-32-1 layout as the baseline, with LeakyReLU (alpha=0.02) in both hidden layers
model1 = Sequential()
model1.add(Dense(32, activation=keras.layers.LeakyReLU(alpha=0.02), input_shape=(9,)))
model1.add(Dense(units=32, activation=keras.layers.LeakyReLU(alpha=0.02)))
model1.add(Dense(1, activation='sigmoid'))

# Train silently, then report errors and loss curves
hist1 = compile_and_fit(model1, verbose=0)

print("Model 2- LeakyReLU")
error_model(model1)
print()
# metrics(model1)
plot_loss(hist1)

### 3) tanh

In [None]:
# Same 32-32-1 layout as the baseline, with tanh in both hidden layers
model2 = Sequential()
model2.add(Dense(32, activation='tanh', input_shape=(9,)))
model2.add(Dense(units=32, activation='tanh'))
model2.add(Dense(1, activation='sigmoid'))

# Train silently, then report errors and loss curves
hist2 = compile_and_fit(model2, verbose=0)
print ("model 3- tanh")
error_model(model2)
print()
# metrics(model2)
plot_loss(hist2)

####  It is observed that the training loss and validation loss are very close in all three cases (ReLU, LeakyReLU and tanh). Also, the validation loss is higher than the training loss in all cases, which is what one would expect in a real-world scenario. Hence, these models are not overfitting and are performing quite well.


# Q3

In [None]:
# Accumulators shared by the hidden-layer-size experiments that follow
test_set_errors = []
number_of_nodes = []

# Smallest network: a single ReLU hidden unit feeding the sigmoid output
model_1 = Sequential()
model_1.add(Dense(1, activation='relu', input_shape=(9,)))
model_1.add(Dense(1, activation='sigmoid'))

hist_1 = compile_and_fit(model_1, 0)

# test error = 1 - test accuracy
test_accuracy_1 = model_1.evaluate(X_test, Y_test)[1]
test_set_errors.append(1 - test_accuracy_1)
number_of_nodes.append(1)
print(test_set_errors)

In [None]:
 
# Width of the hidden layer. The same variable feeds both the model and the
# recorded x-value, so the plot always shows what was actually trained.
# NOTE(review): this cell previously trained 10 nodes but recorded 2; if a
# 2-node model was intended, set hidden_nodes = 2 instead.
hidden_nodes = 10

model_2 = Sequential([
    Dense(hidden_nodes, activation='relu', input_shape=(9,)),
    Dense(1, activation='sigmoid' ),
])
hist_2=compile_and_fit(model_2,0)
# test error = 1 - test accuracy
test_set_errors.append((1-model_2.evaluate(X_test, Y_test)[1]))
print(test_set_errors)
number_of_nodes.append(hidden_nodes)


In [None]:
 
# Three ReLU hidden units feeding the sigmoid output
model_3 = Sequential()
model_3.add(Dense(3, activation='relu', input_shape=(9,)))
model_3.add(Dense(1, activation='sigmoid'))

hist_3 = compile_and_fit(model_3, 0)

# test error = 1 - test accuracy
test_accuracy_3 = model_3.evaluate(X_test, Y_test)[1]
test_set_errors.append(1 - test_accuracy_3)
number_of_nodes.append(3)


In [None]:
# Hidden-layer width = floor(sqrt(number of features)); the last column name is the target
n_features = len(col_names) - 1
print("Num of features:", n_features)
num = int(np.sqrt(n_features))
print("Square root of features:", num)

model_sqrt_features = Sequential()
model_sqrt_features.add(Dense(num, activation='relu', input_shape=(9,)))
model_sqrt_features.add(Dense(1, activation='sigmoid'))

hist_4 = compile_and_fit(model_sqrt_features, 0)

# test error = 1 - test accuracy
test_accuracy_sqrt = model_sqrt_features.evaluate(X_test, Y_test)[1]
test_set_errors.append(1 - test_accuracy_sqrt)
number_of_nodes.append(num)


In [None]:
# Hidden-layer width = half the number of features (truncated); the last column name is the target
n_features = len(col_names) - 1
print("Num of features:", n_features)
num = int(n_features / 2)
print("Half of features:", num)

model_half_features = Sequential()
model_half_features.add(Dense(num, activation='relu', input_shape=(9,)))
model_half_features.add(Dense(1, activation='sigmoid'))

hist_5 = compile_and_fit(model_half_features, 0)

# test error = 1 - test accuracy
test_accuracy_half = model_half_features.evaluate(X_test, Y_test)[1]
test_set_errors.append(1 - test_accuracy_half)
number_of_nodes.append(num)


In [None]:
print(test_set_errors)

# The models were not trained in order of size, so sort the (nodes, error)
# pairs by node count; otherwise the line doubles back on itself.
sorted_points = sorted(zip(number_of_nodes, test_set_errors))
nodes_sorted = [nodes for nodes, error in sorted_points]
errors_sorted = [error for nodes, error in sorted_points]

# Markers make individual models visible; the label gives plt.legend() an entry to show.
plt.plot(nodes_sorted, errors_sorted, marker='o', label='Test error')
plt.title('Test set errors vs. Number of nodes')
plt.ylabel('Test set errors ')
plt.xlabel('Number of nodes')
plt.legend()
plt.show()

Test set error decreases as model complexity increases