In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt# for Plotting graphs
import seaborn as sns# same as matplotlib but to make life easier
# Read the breast-cancer Wisconsin CSV into a DataFrame and preview the first rows
DATA_PATH = "../input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

# Analyzing the dataset

In [None]:
# Descriptive statistics of the frame, shown as the cell output
summary_stats = df.describe()
summary_stats

In [None]:
# Report the frame's dimensions, then the per-column dtype / non-null summary
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.info()

# Starting with EDA

In [None]:
# Visualize how many patients fall into each diagnosis class ('M' vs. everything else)
plt.figure(figsize=(6,6))
diagnosis_labels = df['diagnosis'].apply(lambda x: '(M) Ganas' if x == 'M' else '(B) Jinak')
ax = sns.countplot(x=diagnosis_labels)
ax.set_xlabel('Kondisi Pasien')
# Write the count above each bar. The bar height is reported as a float, so it is
# cast to int to label the bar with e.g. "10" instead of "10.0"; the text is
# centred on the bar instead of using a fixed horizontal offset.
for p in ax.patches:
    ax.annotate(
        f'{int(p.get_height())}',
        (p.get_x() + p.get_width() / 2, p.get_height() + 3),
        ha='center',
    )

In [None]:
# Plot the pairwise correlation between the numeric columns.
# The frame also contains the text column 'diagnosis'; DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so only numeric dtypes are passed to it.
plt.figure(figsize=(25,25))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap='Reds', annot=True)

# Starting Data Preprocessing

In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Collect the columns that will be Min-Max scaled: every column of the frame
# except 'diagnosis', 'id' and 'Unnamed: 32'.
non_feature_columns = ['diagnosis', 'id', 'Unnamed: 32']
cols = df.columns.to_list()
for column_name in non_feature_columns:
    cols.remove(column_name)  # raises ValueError if the column is absent
print("Columns with numerical data:")
cols

In [None]:
# Restrict the frame to the feature columns listed in `cols`
df_numerical = df.loc[:, cols]

# Rescale the features with Min-Max scaling.
# NOTE(review): the scaler is fitted on all rows, including rows that later
# serve as test folds in cross-validation -- confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df_numerical)  # result is a NumPy array
x = pd.DataFrame(scaled, columns=cols)  # back to a DataFrame with the feature names
y = df['diagnosis']  # labels

In [None]:
#moving for feature selection
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel 

In [None]:
# Fit an extra-trees ensemble on the scaled features to obtain per-feature
# importance scores. The seed is fixed so the importances (and anything derived
# from them) are the same on every run of the notebook.
clf = ExtraTreesClassifier(n_estimators=50, random_state=0)
clf = clf.fit(x, y)
print("Showing feature importance values")
print(clf.feature_importances_)

In [None]:
# Select features by importance, reusing the already-fitted classifier (prefit=True)
model = SelectFromModel(clf, prefit=True)
cols = x.columns.to_list()  # all candidate feature names
tf = model.get_support()  # boolean mask: True where the feature is kept
selectedcols = [name for name, keep in zip(cols, tf) if keep]
print("showing selected columns")
print(selectedcols)
# Reduce the feature matrix to the selected columns
X_new = model.transform(x)
X_new.shape

In [None]:
# Encode the labels as integers ('M' -> 1, 'B' -> 0) and bind the result to `y`
# explicitly. Calling replace(inplace=True) on df['diagnosis'] would only reach
# `y` if both names shared one underlying Series, which is not the case under
# pandas Copy-on-Write; `y` would then keep its string labels and the
# precision/recall/F1 metrics (positive label 1) would fail.
# `df` itself is left untouched, so this cell can be re-run safely.
y = df['diagnosis'].map({'M': 1, 'B': 0})

# Applying ML Algorithms

In [None]:
# 10-fold cross-validation splitter used to produce the train/test partitions
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
kf.get_n_splits(x)  # number of folds; the returned value is not used

# Per-fold metric lists, one group of four per classifier.
# AdaBoost
AB_accuracy, AB_precision, AB_recall, AB_f1_score = [], [], [], []

# Random Forest
RF_accuracy, RF_precision, RF_recall, RF_f1_score = [], [], [], []

# Gradient Boosting
GB_accuracy, GB_precision, GB_recall, GB_f1_score = [], [], [], []

In [None]:
#initializing the models
#importing libraries of performance Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Build the three ensemble classifiers that are compared.
# Every estimator gets the same fixed seed (previously only the random forest
# had one) so that the classifiers behave deterministically across runs.
clf_ab=AdaBoostClassifier(random_state=0)
clf_rf=RandomForestClassifier(max_depth=5, random_state=0)
clf_gb=GradientBoostingClassifier(random_state=0)

In [None]:
# Run the 10-fold cross-validation: on every fold each classifier is fitted on
# the training part and its accuracy / precision / recall / F1 on the test part
# are appended to that classifier's metric lists.
evaluations = [
    (clf_ab, AB_accuracy, AB_precision, AB_recall, AB_f1_score),  # AdaBoost
    (clf_rf, RF_accuracy, RF_precision, RF_recall, RF_f1_score),  # Random Forest
    (clf_gb, GB_accuracy, GB_precision, GB_recall, GB_f1_score),  # Gradient Boosting
]

# Empty the metric lists first so that re-running this cell does not keep
# appending scores from earlier runs (which would give more entries than folds).
for _, *metric_lists in evaluations:
    for fold_scores in metric_lists:
        fold_scores.clear()

for i, (train_index, test_index) in enumerate(kf.split(X_new), start=1):
    print("%d Number of fold"%i)
    # Splitting the data; kf.split yields positional indices, hence .iloc on the labels
    X_train, X_test = X_new[train_index], X_new[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train and evaluate every classifier on this fold.
    # The fitted estimator is not stored in `model`, so the feature selector
    # bound to that name is left intact.
    for estimator, accuracies, precisions, recalls, f1_scores in evaluations:
        y_pred = estimator.fit(X_train, y_train).predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))

# Analyzing the performance

In [None]:
# Visualizing the per-fold results of the AdaBoost classifier (the AB_* lists)
# NOTE: `x` is rebound here from the feature frame to the fold numbers; the
# fold count is taken from the recorded scores instead of being hard-coded.
x=list(range(1, len(AB_accuracy) + 1))
plt.plot(x,AB_accuracy,label='Accuracy')
plt.plot(x,AB_precision,label='Precision')
plt.plot(x,AB_recall, label='Recall')
plt.plot(x,AB_f1_score,label='F1 Score')
plt.title("Performance of AdaBoost")
plt.legend()
plt.xlabel("Cross Validation Fold")
plt.ylabel("performance")
plt.show()

In [None]:
# Visualizing the per-fold results of the Random Forest classifier
# The fold numbers are derived from the recorded scores, so this cell does not
# depend on `x` having been rebound to a list of fold numbers by another cell.
rf_folds = list(range(1, len(RF_accuracy) + 1))
plt.plot(rf_folds,RF_accuracy,label='Accuracy')
plt.plot(rf_folds,RF_precision,label='Precision')
plt.plot(rf_folds,RF_recall, label='Recall')
plt.plot(rf_folds,RF_f1_score,label='F1 Score')
plt.title("Performance of Random Forest")
plt.xlabel("Cross Validation Fold")
plt.ylabel("performance")
plt.legend()
plt.show()

In [None]:
# Visualizing the per-fold results of the Gradient Boosting classifier (the GB_* lists)
# The fold numbers are derived from the recorded scores, so this cell does not
# depend on `x` having been rebound to a list of fold numbers by another cell.
gb_folds = list(range(1, len(GB_accuracy) + 1))
plt.plot(gb_folds,GB_accuracy,label='Accuracy')
plt.plot(gb_folds,GB_precision,label='Precision')
plt.plot(gb_folds,GB_recall, label='Recall')
plt.plot(gb_folds,GB_f1_score,label='F1 Score')
plt.title("Performance of Gradient Boosting")
plt.xlabel("Cross Validation Fold")
plt.ylabel("performance")
plt.legend()
plt.show()  # must be called; a bare `plt.show` only references the function

In [None]:
# Average every metric over the folds and tabulate one row per classifier
per_model_scores = [
    ("AB ", (AB_accuracy, AB_precision, AB_recall, AB_f1_score)),
    ("RF ", (RF_accuracy, RF_precision, RF_recall, RF_f1_score)),
    ("GB ", (GB_accuracy, GB_precision, GB_recall, GB_f1_score)),
]

# Each row is [label, mean accuracy, mean precision, mean recall, mean F1]
data = [
    [label] + [sum(fold_scores) / len(fold_scores) for fold_scores in metric_lists]
    for label, metric_lists in per_model_scores
]
AB, RF, GB = data

# Present the averaged results as a DataFrame
results = pd.DataFrame(
    data,
    columns=["Algorithms", "Accuracy", "Precision", "Recall", "F1 Score"],
)
results