In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data Preprocessing

# Importing Data Set

In [None]:
# Load the Haberman survival dataset from the read-only Kaggle input directory.
# NOTE(review): read_csv treats the first line as a header by default; if this CSV
# has no header row, the first record is consumed as column names — confirm, and
# pass header=None if so.
haberman_data=pd.read_csv("/kaggle/input/habermans-survival-data-set/haberman.csv")

# Data Cleaning

## Printing the first 5 rows of the dataset

In [None]:
# Preview the first rows to check how the columns were parsed
haberman_data.head()

## Renaming Columns appropriately

In [None]:
# Replace the column labels positionally with descriptive names, then preview the result
haberman_data.columns=["Age","Year","Nodes","Survival_Status"]
haberman_data.head()

## Finding missing values (if any)

In [None]:
# Total number of records, for comparison with the per-column non-null counts below
row_count = len(haberman_data)
print("The number of rows in the dataset are :", row_count)

In [None]:
# Summarise column dtypes and non-null counts
haberman_data.info()

In [None]:
# Descriptive statistics for each column
haberman_data.describe()

In [None]:
# Count null entries per column
haberman_data.isnull().sum()

##### 1. The number of rows in the dataset equals the non-null count of every attribute (Age, Year, Nodes and Survival_Status), as shown above.
##### 2. Every attribute (Age, Year, Nodes and Survival_Status) is reported as non-null by `haberman_data.info()`.
##### 3. The number of null values in each attribute is zero.
##### Together, these three observations show that there is no missing data in the dataset.

## Handling Categorical Data (if any)

##### There is no need of Label Encoding here because the class label (i.e., Survival_Status here) is already in machine-readable (numeric) form.

# Deciding whether it is a Supervised or Unsupervised Machine Learning Task

##### The dataset consists of 4 attributes, one of which is the class label (i.e., Survival_Status here). As the dataset is labelled, we are going to apply Supervised Machine Learning algorithms.

# Deciding if Classification or Regression suits well

##### Survival_Status is the class label (in this case).
##### It is either 1 if the person survived 5 years(or even longer) or 2 if the person died within 5 years.
##### This shows that each test sample should be mapped to either 1 or 2 (to tell whether the patient survived or not). The class label is therefore discrete, which makes this a Binary Classification problem.

# Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sb

## Understanding the relationship between features using Heatmap

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
correlation_matrix = haberman_data.corr()
sb.heatmap(correlation_matrix, cmap="RdYlGn", center=0.4, annot=True)

## Heatmap for Survived persons' data

In [None]:
# Rows for patients who survived five years or longer (Survival_Status == 1)
data_survived = haberman_data[haberman_data["Survival_Status"] == 1]
# Survival_Status is constant inside this subset, so its correlation with any other
# column is undefined (NaN) and would render as blank cells; correlate features only.
survived_corr = data_survived.drop(columns=["Survival_Status"]).corr()
sb.heatmap(survived_corr, annot=True, cmap="RdYlGn", center=0.4)

## Heatmap for not survived persons' data

In [None]:
# Rows for patients who died within five years (Survival_Status == 2)
data_not_survived = haberman_data[haberman_data["Survival_Status"] == 2]
# Survival_Status is constant inside this subset, so its correlation with any other
# column is undefined (NaN) and would render as blank cells; correlate features only.
not_survived_corr = data_not_survived.drop(columns=["Survival_Status"]).corr()
sb.heatmap(not_survived_corr, annot=True, cmap="RdYlGn", center=0.4)

### From the Heatmap, we observe almost no highly positive or negative correlations between our features and the target value.

## Spotting Outliers in the data using BoxPlot

In [None]:
# Box plot of patient age; points drawn beyond the whiskers would be outliers
sb.boxplot(data=haberman_data, y='Age')

No outliers found in Attribute : Age

In [None]:
# Box plot of the Year column; points drawn beyond the whiskers would be outliers
sb.boxplot(data=haberman_data, y='Year')

No outliers found in Attribute : Year

In [None]:
# Box plot of the Nodes column; points drawn beyond the whiskers are outliers
sb.boxplot(data=haberman_data, y='Nodes')

Several outliers found in Attribute : Nodes

In [None]:
# Box plot of the class label column
sb.boxplot(data=haberman_data, y='Survival_Status')

No Outliers found in Attribute : Survival_Status

### Generating BoxPlot for the entire dataset

In [None]:
# Box plots of the dataset's columns in a single figure, for side-by-side comparison
haberman_data.boxplot()

#### As there are outliers in the attribute "Nodes", Let's begin to remove them before applying any algorithm.

### Outlier Removal

Based on the box plot for Attribute : Nodes, all rows with a Nodes value above 10 are treated as outliers and removed.

In [None]:
# Select the rows whose node count exceeds the outlier threshold of 10
is_outlier = haberman_data["Nodes"] > 10
outliers = haberman_data[is_outlier]
print(outliers)
print("The number of outliers in the Attribute : Nodes is :", len(outliers))

In [None]:
from sklearn.model_selection import train_test_split

# Keep only rows with at most 10 nodes. The explicit copy makes train_data an
# independent frame, so modifying it later neither touches haberman_data nor
# raises SettingWithCopyWarning.
train_data = haberman_data[haberman_data["Nodes"] <= 10].copy()

# Removing Class Label from the dataset to use it for training

In [None]:
# Separate the target column from the features. A non-in-place drop returns a new
# frame instead of mutating a filtered slice, which avoids SettingWithCopyWarning.
class_label = train_data["Survival_Status"]
train_data = train_data.drop(columns=["Survival_Status"])
train_data

In [None]:
# Hold out a test set. A fixed random_state makes the split — and every accuracy
# reported below — reproducible across runs; stratify keeps the class ratio of
# Survival_Status the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, class_label, random_state=42, stratify=class_label
)
X_test

## Insights drawn from EDA on haberman_data

The data has no missing values and there is no need of Label Encoding. We have removed outliers and the features are independent of each other.

# 2. Modeling

# Trying different Supervised ML models (Classification) for this data 

## 1. Naive Bayes Classification 

### 1(a). Gaussian Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fit Gaussian Naive Bayes on the training split and predict the held-out rows
model = GaussianNB().fit(X_train,y_train)
pred = model.predict(X_test)
# sklearn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(classification_report(y_test,pred))
print("The accuracy in the case of Naive Bayes using Gaussian Naive Bayes Classifier is:",accuracy_score(y_test,pred)*100)

### 1(b). Multinomial Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fit Multinomial Naive Bayes on the training split and predict the held-out rows
model = MultinomialNB().fit(X_train,y_train)
pred = model.predict(X_test)
# sklearn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(classification_report(y_test,pred))
print("The accuracy in the case of Naive Bayes using Multinomial Naive Bayes Classifier is:",accuracy_score(y_test,pred)*100)

### 1(c). Bernoulli Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fit Bernoulli Naive Bayes on the training split and predict the held-out rows.
# NOTE(review): BernoulliNB binarizes every feature at 0.0 by default, so columns
# that are always positive carry no information for it — confirm this is intended.
model = BernoulliNB().fit(X_train,y_train)
pred = model.predict(X_test)
# sklearn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(classification_report(y_test,pred))
acc1=accuracy_score(y_test,pred)*100
print("The accuracy in the case of Naive Bayes using Bernoulli Naive Bayes Classifier is:",acc1)

The accuracy is comparatively high in the case of Bernoulli Naive Bayes Classifier

## 2. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training split and predict the held-out rows
model = LogisticRegression().fit(X_train,y_train)
pred2 = model.predict(X_test)
acc2=accuracy_score(y_test,pred2)*100
print("The accuracy for Logistic Regression Model is:",acc2)
# Report on this model's own predictions (pred2): `pred` still holds the output of
# the previously fitted classifier. Argument order is (y_true, y_pred).
print(classification_report(y_test,pred2))

## 3. K-Nearest Neighbor

Website used : https://www.geeksforgeeks.org/k-nearest-neighbor-algorithm-in-python/

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 7-nearest-neighbour classifier and predict the held-out rows
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
pred=knn.predict(X_test)
acc3=accuracy_score(y_test,pred)*100
print("The accuracy for KNN Model is:",acc3)
# sklearn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(classification_report(y_test,pred))

## 4. Support Vector Machines (Classification)

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with default settings and predict the held-out rows
model = SVC().fit(X_train,y_train)
pred = model.predict(X_test)
acc4=accuracy_score(y_test,pred)*100
print("The accuracy for Support Vector Classifier model is :",acc4)
# sklearn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(classification_report(y_test,pred))

## 5. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree and predict the held-out rows. Tie-breaking between equally
# good splits is randomised, so a fixed random_state keeps the fitted tree (and the
# reported accuracy) reproducible.
model = DecisionTreeClassifier(random_state=42).fit(X_train,y_train)
pred = model.predict(X_test)
acc5=accuracy_score(y_test,pred)*100
print("The accuracy for Decision Tree Classifier model is:",acc5)
# sklearn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(classification_report(y_test,pred))

## 6. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest and predict the held-out rows. Bootstrap sampling and feature
# selection are random, so a fixed random_state keeps the reported accuracy reproducible.
model = RandomForestClassifier(random_state=42).fit(X_train,y_train)
pred = model.predict(X_test)
acc6=accuracy_score(y_test,pred)*100
print("The accuracy for Random Forest Classifier Model is:",acc6)
# sklearn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(classification_report(y_test,pred))

The features are more linear and independent. Hence, Naive Bayes, Logistic and SVM worked well.

# 3. Model Evaluation

# Comparing all the Machine Learning Models applied (Best Algorithm Analysis)

Website used : https://dibyendudeb.com/comparing-machine-learning-algorithms/

## Storing machine learning algorithms (MLA) in a variable

In [None]:
# Candidate classifiers for cross-validation, as (short label, unfitted estimator) pairs
models = [
    ('NB', BernoulliNB()),
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('CART', DecisionTreeClassifier()),
    ('RFC', RandomForestClassifier()),
]

## Creating a box plot to compare their accuracy

This part of code creates a box plot for all the models against their cross validation score.

### Importing required modules

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,confusion_matrix, precision_score, recall_score, auc,roc_curve
from sklearn import ensemble, linear_model, neighbors, svm, tree, neural_network
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import svm,model_selection, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process


### K-Fold Cross Validation

In [None]:
# evaluate each model in turn
import random

results = []
names = []
scoring = 'accuracy'
# Use a fixed integer seed with shuffle=True. Passing random.seed() as random_state
# would pass None (the call returns nothing), leaving the folds unseeded, and would
# reseed the global RNG as a side effect. One splitter is shared by all models so
# each is scored on exactly the same 10 folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean accuracy across folds, with the standard deviation in parentheses
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison: one box per model over its per-fold accuracies
fig = plt.figure()
fig.suptitle('Comparison between different MLAs')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## Comparing the algorithms

In [None]:
# Estimators compared on the train/test split in the next cell. SVC is created with
# probability=True; every other estimator uses its default hyperparameters.
MLA = [
    linear_model.LogisticRegressionCV(),
    ensemble.RandomForestClassifier(),
    svm.SVC(probability=True),
    tree.DecisionTreeClassifier(),
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    naive_bayes.MultinomialNB(),
    neighbors.KNeighborsClassifier(),
    ]


In [None]:
# Fit each estimator once and collect its metrics as one record per algorithm.
# The comparison frame is then built in a single step from the list of records
# rather than by growing an empty frame one cell at a time with .loc.
MLA_records = []
for alg in MLA:
    predicted = alg.fit(X_train, y_train).predict(X_test)
    MLA_records.append({
        'Algorithm used': alg.__class__.__name__,
        'Train Accuracy': round(alg.score(X_train, y_train), 4),
        'Test Accuracy': round(alg.score(X_test, y_test), 4),
        # precision/recall are computed for the default positive label (1)
        'Precision': precision_score(y_test, predicted),
        'Recall': recall_score(y_test, predicted),
    })

# Best test accuracy first; sort_values returns a new frame, so no in-place mutation
MLA_compare = pd.DataFrame(MLA_records).sort_values(by=['Test Accuracy'], ascending=False)
MLA_compare

## Creating plot to show the train accuracy

In [None]:

# Bar chart of training accuracy per algorithm, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(13,5))
sb.barplot(data=MLA_compare, x="Algorithm used", y="Train Accuracy", palette='hot', edgecolor=sb.color_palette('dark',7), ax=ax)
ax.set_title('Train Accuracy Comparison')
plt.xticks(rotation=90)
plt.show()

## Creating plot to show the test accuracy

In [None]:
# Bar chart of test accuracy per algorithm, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(13,5))
sb.barplot(data=MLA_compare, x="Algorithm used", y="Test Accuracy", palette='hot', edgecolor=sb.color_palette('dark',7), ax=ax)
ax.set_title('Test Accuracy Comparison')
plt.xticks(rotation=90)
plt.show()

## Creating plots to compare precision

In [None]:
# Bar chart of precision per algorithm, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(13,5))
sb.barplot(data=MLA_compare, x="Algorithm used", y="Precision", palette='hot', edgecolor=sb.color_palette('dark',7), ax=ax)
ax.set_title('Precision Comparison')
plt.xticks(rotation=90)
plt.show()

## Creating plots to compare Recall

In [None]:
# Bar chart of recall per algorithm, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(13,5))
sb.barplot(data=MLA_compare, x="Algorithm used", y="Recall", palette='hot', edgecolor=sb.color_palette('dark',7), ax=ax)
ax.set_title('Recall Comparison')
plt.xticks(rotation=90)
plt.show()

# Comparing Accuracies using Pretty Table

Website Used : https://pypi.org/project/prettytable/

In [None]:
from prettytable import PrettyTable
# Empty table; its header and rows are filled in by the next cell
x = PrettyTable()

In [None]:
# One row per model family, paired with the accuracy value stored when that model was scored
x.field_names = ["Machine Learning Algorithm", "Accuracy"]
accuracy_rows = [
    ["Naive Bayes Classification", acc1],
    ["Logistic Regression", acc2],
    ["K-Nearest Neighbor", acc3],
    ["Support Vector Machines", acc4],
    ["Decision Tree", acc5],
    ["Random Forest", acc6],
]
for accuracy_row in accuracy_rows:
    x.add_row(accuracy_row)

In [None]:
# Render the accuracy comparison table as text
print(x)