In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, fname) for fname in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# HABERMAN'S dataset
The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

### Importing required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset. Column labels are supplied explicitly through `names`.
column_names = ["Age", "Year", "A-Nodes", "Survival"]
df = pd.read_csv(
    "../input/habermans-survival-data-set/haberman.csv",
    names=column_names,
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Descriptive statistics for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# Count the rows in each Survival class (most frequent class first).
survival_column = df["Survival"]
survive_list = survival_column.value_counts()
print(survive_list)

In [None]:
# Plotting bar graph for survival status.
# Per the dataset documentation, Survival is coded
# 1 = survived 5 years or longer, 2 = died within 5 years.
status_labels = {1: "Survived", 2: "Didn't Survive"}
# Look the counts up by class code so each bar gets the right label,
# instead of relying on the frequency ordering returned by value_counts().
status_counts = survive_list.reindex(list(status_labels), fill_value=0)
fig = plt.figure(figsize=(5, 5))
plt.bar(list(status_labels.values()), status_counts.values)
plt.ylabel("Count")
plt.title("No. of Survived vs Didn't Survive")
plt.show()

In [None]:
# Age distribution in the dataset (default binning).
ages = df["Age"]
plt.hist(ages)
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

In [None]:
# One histogram panel per Survival class.
# NOTE(review): the plotted variable is also "Survival", so every panel holds a
# single distinct value — confirm whether a feature such as "Age" was intended.
hist_plot = sns.FacetGrid(data=df, col="Survival")
hist_plot.map(sns.histplot, "Survival")

In [None]:
# Probability density plot of Age, one curve per Survival class.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# later removed; the other FacetGrid cells in this notebook already use `height`.
sns.FacetGrid(df, hue="Survival", height=4).map(sns.distplot, 'Age').add_legend();
plt.title('pdf plot1 corresponding to Age failure')
plt.show()

In [None]:
# Axil nodes: distribution of the A-Nodes column, coloured by Survival class.
axil_grid = sns.FacetGrid(df, hue="Survival", height=8)
axil_grid.map(sns.distplot, "A-Nodes").add_legend()
plt.title('pdf plot3 corresponding to Axil Nodes Failure', fontsize=15)
plt.show()

In [None]:
# Year: distribution of the Year column, coloured by Survival class.
year_grid = sns.FacetGrid(df, hue="Survival", height=8)
year_grid.map(sns.distplot, 'Year').add_legend();
plt.title('pdf plot2 corresponding to operation year feature', fontsize=15)
plt.show()

In [None]:
# BOX PLOT
# Age for each Survival class.
sns.boxplot(data=df, x='Survival', y='Age')
plt.title('Box plot using feature Age')
plt.grid()
plt.show()

In [None]:
# Box plot of Year for each Survival class.
sns.boxplot(data=df, x='Survival', y='Year')
plt.title('Box plot using Operation Year')
plt.grid()
plt.show()

In [None]:
# Box plot of A-Nodes for each Survival class.
sns.boxplot(data=df, x='Survival', y='A-Nodes')
plt.title('Box plot using Axil-Nodes')
plt.grid()
plt.show()

In [None]:
# SCATTER PLOT
# Age against Year, coloured by Survival class.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# later removed.
sns.FacetGrid(df, hue="Survival", height=6).map(plt.scatter, "Age", "Year").add_legend()
plt.title("scatter plot of  Age Vs Year")
plt.show()

In [None]:
# Scatter plot of A-Nodes against Year, coloured by Survival class.
sns.set_style("whitegrid")
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# later removed.
sns.FacetGrid(df, hue="Survival", height=6).map(plt.scatter, "A-Nodes", "Year").add_legend()
plt.title("scatter plot of A-nodes Vs Year")
plt.show()

# Splitting training and test dataset

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: all three predictors. The earlier slice `df.iloc[:, :2]`
# kept only Age and Year and dropped the A-Nodes column.
x = df[["Age", "Year", "A-Nodes"]]
# Target: the Survival class label.
y = df["Survival"]
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
  

In [None]:
# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted transform to both splits.
from sklearn.preprocessing import StandardScaler
st_x = StandardScaler()
st_x.fit(x_train)
x_train = st_x.transform(x_train)
x_test = st_x.transform(x_test)

# Model Building

# K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; Minkowski metric with p=2.
classifier_knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier_knn.fit(x_train, y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation scores of the KNN estimator on the full x / y.
knn_cv_scores = cross_val_score(classifier_knn, x, y, cv=5)
print(knn_cv_scores)

# Train: accuracy of the fitted model on the data it was trained on.
knn_predict = classifier_knn.predict(x_train)
score_knn = metrics.accuracy_score(y_true=y_train, y_pred=knn_predict)
print("Your Model Accuracy is", score_knn)

In [None]:
# Test: accuracy of the fitted KNN model on the held-out split.
knn_predict = classifier_knn.predict(x_test)
score_knn1 = metrics.accuracy_score(y_test, knn_predict)
print(cross_val_score(classifier_knn, x, y, cv=5))
# Report the test score (score_knn1); this cell previously printed the
# training score (score_knn) by mistake.
print("Your Model Accuracy is", score_knn1)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy split criterion; seed fixed for repeatability.
classifier_tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
classifier_tree.fit(x_train, y_train)

In [None]:
# Train: accuracy of the fitted tree on the training split, followed by
# 5-fold cross-validation scores on the full x / y.
tree_predict = classifier_tree.predict(x_train)
score_tree = metrics.accuracy_score(y_true=y_train, y_pred=tree_predict)
print("Your Model Accuracy is", score_tree)
tree_cv_scores = cross_val_score(classifier_tree, x, y, cv=5)
print(tree_cv_scores)

In [None]:
# Test: accuracy of the fitted tree on the held-out split.
tree_predict = classifier_tree.predict(x_test)
score_tree1 = metrics.accuracy_score(y_true=y_test, y_pred=tree_predict)
print("Your Model Accuracy is", score_tree1)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel; seed fixed for repeatability.
classifier_svc = SVC(
    kernel='linear',
    random_state=0,
)
classifier_svc.fit(x_train, y_train)

In [None]:
# Train: accuracy and confusion matrix of the fitted SVC on the training split,
# followed by 5-fold cross-validation scores on the full x / y.
from sklearn.metrics import confusion_matrix

svc_predict = classifier_svc.predict(x_train)
score_svc = metrics.accuracy_score(y_true=y_train, y_pred=svc_predict)
cm = confusion_matrix(y_true=y_train, y_pred=svc_predict)
print("Your Model Accuracy is", score_svc)
print(cm)
svc_cv_scores = cross_val_score(classifier_svc, x, y, cv=5)
print(svc_cv_scores)

In [None]:
# Test: accuracy and confusion matrix of the fitted SVC on the held-out split.
svc_predict = classifier_svc.predict(x_test)
score_svc1 = metrics.accuracy_score(y_true=y_test, y_pred=svc_predict)
cm = confusion_matrix(y_true=y_test, y_pred=svc_predict)
print("Your Model Accuracy is", score_svc1)
print(cm)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 14 entropy-criterion trees.
# random_state is fixed (as for the decision tree and SVC in this notebook) so
# the fitted forest and its scores are reproducible across runs.
classifier_random = RandomForestClassifier(
    n_estimators=14,
    criterion="entropy",
    random_state=0,
)
classifier_random.fit(x_train, y_train)

In [None]:
# Train: accuracy and confusion matrix of the fitted forest on the training
# split, followed by 5-fold cross-validation scores on the full x / y.
from sklearn.metrics import confusion_matrix

random_predict = classifier_random.predict(x_train)
score_random = metrics.accuracy_score(y_true=y_train, y_pred=random_predict)
cm = confusion_matrix(y_true=y_train, y_pred=random_predict)
print("Your Model Accuracy is", score_random)
print(cm)
random_cv_scores = cross_val_score(classifier_random, x, y, cv=5)
print(random_cv_scores)

In [None]:
# Test: accuracy and confusion matrix of the fitted forest on the held-out split.
random_predict = classifier_random.predict(x_test)
cm = confusion_matrix(y_test, random_predict)
# Score the forest's own predictions; this cell previously passed svc_predict,
# which reported the SVC's test accuracy under the random-forest name.
score_random1 = metrics.accuracy_score(y_test, random_predict)
print("Your Model Accuracy is", score_random1)
print(cm)

# Gaussian NB

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split.
classifier_NB = GaussianNB()
classifier_NB.fit(X=x_train, y=y_train)

In [None]:
# Train: accuracy and confusion matrix of the fitted naive Bayes model on the
# training split, followed by 5-fold cross-validation scores on the full x / y.
from sklearn.metrics import confusion_matrix

NB_predict = classifier_NB.predict(x_train)
score_NB = metrics.accuracy_score(y_true=y_train, y_pred=NB_predict)
cm = confusion_matrix(y_true=y_train, y_pred=NB_predict)
print("Your Model Accuracy is", score_NB)
print(cm)
nb_cv_scores = cross_val_score(classifier_NB, x, y, cv=5)
print(nb_cv_scores)

In [None]:
# Test: accuracy and confusion matrix of the naive Bayes model on the held-out split.
# Predict with classifier_NB; this cell previously called classifier_random,
# so the "gaussianNB" test score was actually the random forest's.
NB_predict = classifier_NB.predict(x_test)
cm = confusion_matrix(y_test, NB_predict)
score_NB1 = metrics.accuracy_score(y_test, NB_predict)
print("Your Model Accuracy is", score_NB1)
print(cm)

# Pretty Table

In [None]:
from prettytable import PrettyTable

# Side-by-side summary of train / test accuracy for every classifier above.
Table = PrettyTable(["Algorithm", "Accuracy(Train)","Accuracy(Test)"])
summary_rows = [
    [" KNN algorithm", score_knn, score_knn1],
    ["Support Vector", score_svc, score_svc1],
    ["Decision Tree", score_tree, score_tree1],
    ["Random Forest", score_random, score_random1],
    ["gaussianNB", score_NB, score_NB1],
]
for summary_row in summary_rows:
    Table.add_row(summary_row)
print(Table)