In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Problem Statement :
## Classify the patients as having liver problems and not having liver problems based on 10 features related to liver

## About the Data:

   __Context and Content__
- The number of patients with liver disease has been continuously increasing because of excessive consumption of alcohol, inhalation of harmful gases, and intake of contaminated food, pickles and drugs.
- This data set contains 10 variables that are age, gender, total Bilirubin, direct Bilirubin, total proteins, albumin, A/G ratio, SGPT, SGOT and Alkphos.
- This data set contains 416 liver patient records and 167 non-liver patient records collected from North East of Andhra Pradesh, India. The "Dataset" column is a class label used to divide groups into liver patient (liver disease) or not (no disease). This data set contains 441 male patient records and 142 female patient records. Any patient whose age exceeded 89 is listed as being of age "90".

__Acknowledgements__
- This dataset was downloaded from the UCI ML Repository:

- Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [None]:
import matplotlib.pyplot as plt 
plt.rc("font", size=14)

import seaborn as sns
# Apply the seaborn theme once. A preceding sns.set(style="white") call was
# redundant: each sns.set(...) call re-applies the whole theme, so only the
# last style ("darkgrid") ever took effect.
sns.set(style="darkgrid", color_codes=True)

from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

### Load Data and read the data as a data frame

In [None]:
# Location of the liver-patient CSV under the read-only ../input/ directory
DATA_PATH = "../input/indian-liver-patient-records/indian_liver_patient.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns)
df.shape

In [None]:
# Data type pandas inferred for each column
df.dtypes

In [None]:
# Preview the first five records
df.head(5)

In [None]:
# Preview the last five records
df.tail(5)

In [None]:
# Column names, non-null counts, dtypes and memory usage in one summary
df.info()

### Input variables

In [None]:
# dtypes of the input (feature) columns, i.e. everything except the target
feature_dtypes = df.drop(columns='Dataset').dtypes
print(feature_dtypes)

  ### Target variable

In [None]:
# Distinct values and dtype of the target column
target_column = df['Dataset']
print("Dataset : values are {}, dtype is {}".format(target_column.unique(), target_column.dtype))

#### Description of The Target Variables
- __Dataset__	       Dataset: field used to split the data into two sets (patient with liver disease, or no disease)
- __"1" stands for LiverPatients and "2" stands for NonLiverPatients.__

### Null Values

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Report the total number of missing cells, computed from the data so the
# message cannot go stale if the dataset (or the cell order) changes.
print("\nThere are {} Null/Missing values in the dataset\n".format(df.isna().sum().sum()))

### Drop NaN values

In [None]:
# Show the records whose Albumin_and_Globulin_Ratio is missing
missing_ratio_mask = df['Albumin_and_Globulin_Ratio'].isna()
df.loc[missing_ratio_mask]

In [None]:
# Only 4 values are missing, so the affected rows are removed rather than imputed.
# axis=0 / how='any' are the defaults, spelled out: drop every row containing at least one NaN.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Sanity check after dropna: tallies how many columns share each per-column NaN count
df.isna().sum().value_counts()

### Negative values

In [None]:
# Per-column count of negative entries (Gender is excluded from the comparison)
df.drop(columns='Gender').lt(0).sum()

In [None]:
# Derive the message from the data instead of hard-coding the conclusion.
negative_value_count = int((df.drop('Gender', axis=1) < 0).sum().sum())
if negative_value_count == 0:
    print("\nThere are no Negative values in the dataset\n")
else:
    print("\nThere are {} Negative values in the dataset\n".format(negative_value_count))

### Duplicated data

In [None]:
# Count rows that are exact repeats of an earlier row
df.duplicated().sum()

In [None]:
# Show the rows flagged as duplicates (the first occurrence of each is not flagged)
df[df.duplicated()]

In [None]:
# Report the duplicate count computed from the data rather than a hard-coded number.
print("\nThere are {} duplicate records in the dataset\n".format(df.duplicated().sum()))

In [None]:
# Remove repeated records, keeping the first occurrence of each (keep='first' is the default)

df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Rebuild a contiguous 0..n-1 index (rows were removed above); drop=True discards the old index
df.reset_index(drop=True, inplace=True)

In [None]:
# Check the shape again now that NaN rows and duplicate rows have been removed
df.shape

In [None]:
# List the column names available for analysis
df.columns

## Exploratory Data Analysis (EDA)

### numerical columns

- There are 9 numerical columns. Though Dataset is numeric, it is the dependent variable for classification and is treated as categorical

In [None]:
# Numerical predictors used throughout the EDA (the target 'Dataset' is deliberately left out)
num_columns = [
    'Age',
    'Total_Bilirubin',
    'Direct_Bilirubin',
    'Alkaline_Phosphotase',
    'Alamine_Aminotransferase',
    'Aspartate_Aminotransferase',
    'Total_Protiens',
    'Albumin',
    'Albumin_and_Globulin_Ratio',
]

### categorical columns

- Gender is the only categorical independent variable. Dataset is also categorical and is the target variable

In [None]:
# Categorical columns: the Gender feature and the Dataset target
cat_columns = [
    'Gender',
    'Dataset',
]

### 5 point summary of numerical attributes

In [None]:
# Summary statistics for the numeric columns, transposed so each row describes one column
df.describe().T

In [None]:
# Line plot of the summary statistics; the 'count' row is removed before plotting
summary_stats = df.describe().drop(index='count')
summary_stats.plot(figsize=(20, 8))
plt.show()

### Outliers analysis of numerical columns

In [None]:
# Identify outlier limits in a numeric column using the IQR (Inter Quartile Range),
# Q1 (25% quantile) and Q3 (75% quantile).

def identify_outliers(col, data=None):
    """Compute the IQR-based outlier limits for one column.

    Parameters
    ----------
    col : str
        Name of the numeric column to analyse.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which keeps existing ``identify_outliers(col)``
        calls working unchanged.

    Returns
    -------
    tuple
        ``(col, q1, q3, iqr, lower_limit, upper_limit)`` where the limits are
        ``q1 - 1.5*iqr`` and ``q3 + 1.5*iqr``.
    """
    # Fall back to the global frame only when no frame is passed explicitly.
    frame = df if data is None else data
    q1 = frame[col].quantile(0.25)
    q3 = frame[col].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5*iqr
    upper_limit = q3 + 1.5*iqr
    return (col, q1, q3, iqr, lower_limit, upper_limit)

In [None]:
# Report outliers for every numerical column using identify_outliers().
# A value is an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.

for col in num_columns:
    col, q1, q3, iqr, lower_limit, upper_limit = identify_outliers(col)
    print("\nColumn name : {}\n Q1 = {} \n Q3 = {}\n IQR = {}".format(col, q1, q3, iqr))
    print(" Lower limit = {}\n Upper limit = {}\n".format(lower_limit, upper_limit))
    # Build the outlier mask once and reuse it for both the count and the values.
    outlier_mask = (df[col] < lower_limit) | (df[col] > upper_limit)
    outlier_count = int(outlier_mask.sum())
    if outlier_count == 0:
        print("OUTLIERS ARE NOT PRESENT in {} column\n".format(col))
    else:
        print(outlier_count, "OUTLIERS ARE PRESENT in {} column.".format(col))
        print("Outlier datapoints in {} column are:".format(col))
        print(df.loc[outlier_mask, col].to_numpy())

## Visualizing Outliers in dataset using boxplot

In [None]:
# Visualizing outliers in the dataset using boxplots

print('\n\t\tBoxplot to check the presence of outliers in numeric columns')
print('\t\t==============================================================\n')
# 3x3 grid: one panel for each of the 9 entries in num_columns
fig, ax = plt.subplots(3, 3, figsize=(15, 10))
for col, subplot in zip(num_columns, ax.flatten()):
    # Pass the column as a 1-D Series: seaborn expects a vector for x, and a
    # one-column DataFrame (df[[col]]) is 2-D.
    sns.boxplot(x=df[col], width=0.5, color='orange', ax=subplot)
    subplot.set_xlabel(col)
plt.show()

### Observations on outlier analysis of numerical columns:
- Maximum number of outliers, which is 83 is seen in Total_Bilirubin column and is extremely right skewed
- Age and Albumin columns do not have any outliers
- Total_Protiens has 8 outliers and Albumin_and_Globulin_Ratio has 10 outliers
- Direct_Bilirubin, Alkaline_Phosphotase, Alamine_Aminotransferase, Aspartate_Aminotransferase columns also have huge number of outliers and are extremely right skewed

## Variance in numerical columns

In [None]:
# Variance of each numerical column
df[num_columns].var()

## Visualizing variance of numerical columns through lineplot

In [None]:
# Plot the per-column variances on a log scale, since they span several orders of magnitude.
column_variances = df[num_columns].var()
plt.xticks(rotation=90, fontsize=10)
plt.yticks(fontsize=10)
plt.plot(
    column_variances,
    color='green',
    marker='s',
    linewidth=2,
    markersize=5,
)
plt.yscale('log')
plt.show()

### Observations on variance of numerical columns
- The variances of the numerical columns differ by several orders of magnitude
- The variance of the Albumin_and_Globulin_Ratio column is very low, with a value of about 0.1
- The variance of the Aspartate_Aminotransferase column is very high and is close to 10^5

# Univariate Analysis

## Visualizing Distribution of numerical columns through distplots

In [None]:
# One histogram + KDE panel per numerical column on a 3x3 grid.
# The grid is named `axes` and the plot's return value is not reassigned, so the
# loop no longer overwrites the axes-grid variable with a single Axes object.
fig, axes = plt.subplots(3, 3, figsize=(15, 10))
for col, subplot in zip(num_columns, axes.flatten()):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases
    # (histplot/displot are the replacements) — migrate once the minimum
    # seaborn version of the runtime is confirmed.
    sns.distplot(df[col], ax=subplot, hist_kws={'color':'g','alpha':1}, kde_kws={'color':'black', 'lw':2})
plt.show()

### Observations on univariate analysis of numerical columns:
 - The distplots show that the columns Total_Bilirubin, Direct_Bilirubin, Alkaline_Phosphotase, Alamine_Aminotransferase, Aspartate_Aminotransferase and Albumin_and_Globulin_Ratio are extremely right skewed, with long tails on the right side of the distribution. For all these columns, the mean is greater than the median
 - The distribution of the Age column is nearly normal, with very little skewness in either tail. The mean and median are approximately equal for the Age column
 - The distributions of the Total_Protiens, Albumin and Albumin_and_Globulin_Ratio columns are also nearly normal. The mean and median for these columns are also approximately equal

## Visualizing value counts of categorical columns through countplot

In [None]:
# Gender is the only categorical feature (Dataset is the target).
# Frequency of each Gender value:

gender_counts = df['Gender'].value_counts()
gender_counts

In [None]:
# Pass the column through the x keyword: recent seaborn versions accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
ax = sns.countplot(x=df['Gender'])

### Observations on univariate analysis of categorical columns:
- 428 patients (76%) are male and 138 patients (24%) are female

## Target column

In [None]:
# The Target column is 'Dataset'.
# Group by target class and count the non-null entries of every other column per class
df.groupby(by='Dataset').count()

In [None]:
# Class balance of the target. The column is passed through the x keyword because
# recent seaborn versions accept only `data` as a positional argument.
sns.countplot(x=df['Dataset'], palette='plasma')
plt.show()

### Observations on value counts of Target column:
1. __A status of '1' refers to a 'Liver Patient'.__
2. Among the 566 patient records, 404 patients (= 71%) are liver patients
3. __A status of '2' refers to a 'Non Liver Patient'.__
4. Among the 566 patient records, 162 patients (= 29%) are not liver patients

## Visualizing frequency of each feature column by target column

In [None]:
# For every feature, draw a bar chart of value frequencies split by target class
feature_columns = df.drop(columns='Dataset').columns
for col in feature_columns:
    class_counts = pd.crosstab(df[col], df['Dataset'])
    class_counts.plot(kind='bar', color=('b', 'r'), figsize=(20, 5))

### Observations on distribution of Target column:
- Liver patients with 60 years of age are the maximum
- Most of the liver Patients are in the age group of 32 to 60 years
- Most of the liver patients are male
- Patients with a Total_Bilirubin value of less than 1.0 form the largest group of liver patients. So, we can say that Total_Bilirubin is an important feature for assessing the health of the liver
- Patients with a Direct_Bilirubin value of 0.2, and more generally those with a Direct_Bilirubin value of less than 1.6, account for most of the liver disease cases, which suggests that Direct_Bilirubin is also an important factor in assessing liver health
- The plots for Total_Proteins, Albumin and Albumin_and_Globulin_ratio do not reveal any feature importance, because people with both high values and low values are seen having a liver disease

# Bivariate Analysis

In [None]:
# Pairwise scatter plots of the numeric features, coloured by target class
pairplot_vars = df.drop(columns=['Gender', 'Dataset']).columns
sns.pairplot(data=df, vars=pairplot_vars, hue='Dataset')
plt.show()

### Observations on bivariate analysis using pairplot:
- From the pairplot we can see that the column pairs Total_Bilirubin and Direct_Bilirubin, Alamine_Aminotransferase and Aspartate_Aminotransferase, Total_Protiens and Albumin, Total_Protiens and Albumin_and_Globulin_Ratio, and Albumin and Albumin_and_Globulin_Ratio show a positive correlation, and the correlation appears to be strong
- The formation of two good clusters is evident from the KDE plots along the diagonal

## Correlation between numerical columns

In [None]:
# Pairwise correlation of the numerical columns only (Gender and the target are excluded);
# DataFrame.corr() uses the Pearson coefficient by default
corr = df[num_columns].corr()
# Shade the correlation table so stronger values stand out
corr.style.background_gradient(cmap='YlGnBu')

## Visualizing Correlation between numerical columns through Heat map

In [None]:
# Annotated heat map of the correlation matrix on a square 10x10 inch figure
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, square=True, ax=heatmap_ax)
plt.show()

### Observations on Correlation between numerical columns:
- Total_Bilirubin and Direct_Bilirubin columns are having a very strong correlation of 0.87
- Alamine_Aminotransferase and Aspartate_Aminotransferase columns also have a strong correlation of 0.79
- Albumin and Total_Protien columns also have a good correlation of 0.78
- Albumin also has a correlation of 0.69 with Albumin_and_Globulin_Ratio

# Implementing KNN, Logistic and Naïve Bayes Classification models

In [None]:
# Encoder that maps each distinct label to an integer code
le = LabelEncoder()

In [None]:
# Replace the Gender labels with their integer codes, overwriting the column in df
df['Gender'] = le.fit_transform(df['Gender'])

In [None]:
# Original labels in code order: classes_[i] is the label that was encoded as i
le.classes_

In [None]:
# Distribution of the encoded Gender values
df['Gender'].value_counts()

### Create the X(Feature-set) and Y(Target-set) sets for your Data.

In [None]:
# Features are every column except the target; the target is the 'Dataset' column
X, y = df.drop(columns='Dataset'), df['Dataset']

In [None]:
# Report the dimensions of the feature matrix and the target vector
for shape_label, part in (('Shape of Feture-set : ', X), ('Shape of Target-set : ', y)):
    print(shape_label, part.shape)

### Split the data as train set and test set with a ratio of 70:30.

In [None]:
# 70:30 train/test split; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=7,
)

In [None]:
# Report the dimensions of both splits
split_reports = (
    ("Training Set Shape:\nFeatures : {0}  Target : {1}\n", X_train, y_train),
    ("Test Set Shape:\nFeatures : {0}  Target : {1}", X_test, y_test),
)
for template, features_part, target_part in split_reports:
    print(template.format(features_part.shape, target_part.shape))

### Standardize the training dataset and test dataset

In [None]:
# Standardization with sklearn's StandardScaler: the scaling statistics are
# learned from the training split only.

scaler = StandardScaler()
scaler = scaler.fit(X_train)

In [None]:
# Scale the training features with the statistics learned from X_train

X_trainS = scaler.transform(X_train)

In [None]:
# Scale the test features with the same training-set statistics (the scaler is not refit on test data)

X_testS = scaler.transform(X_test)

In [None]:
# Overall mean and standard deviation of each scaled matrix (train first, then test)
for scaled_matrix in (X_trainS, X_testS):
    print(scaled_matrix.mean(), scaled_matrix.std())

In [None]:
# Empty results table: one row per classifier, one column per performance metric.
# Cells are filled in as each model is evaluated.
classifier_names = ['K-NearestNeighbors', 'Logistic Regression', 'Gaussian Naive Bayes']
metric_names = [
    'Trainingset Accuracy',
    'Testset Accuracy',
    'Precision Score',
    'Recall Score',
    'F1 Score',
    'ROC_AUC Score',
]
compare_metrics_df = pd.DataFrame(index=classifier_names, columns=metric_names)

In [None]:
# Label the index so the comparison table shows a header for the classifier names
compare_metrics_df.index.name = 'Classifier Name'

### Build KNN Classification model using train Dataset and predict the class on test dataset.

In [None]:
# KNN classifier with k = 5 neighbours; weights='distance' makes closer neighbours
# count more than distant ones when voting.
# NOTE(review): with distance weighting each training point is its own zero-distance
# neighbour, so training accuracy is expected to be ~100% whatever k is — confirm
# before reading the training score as evidence of overfitting.

knn_clf = KNeighborsClassifier(n_neighbors=5, weights='distance')

In [None]:
# Fit the KNN model on the standardized training features

knn_clf.fit(X_trainS, y_train)

In [None]:
# Predict the class of every standardized test record with the fitted KNN model

yhat_knn = knn_clf.predict(X_testS)

In [None]:
# KNN accuracy on the training set: stored rounded to 2 decimals in the
# comparison table, displayed unrounded as the cell output

knn_train_accuracy = knn_clf.score(X_trainS, y_train)
compare_metrics_df.loc['K-NearestNeighbors', 'Trainingset Accuracy'] = round(knn_train_accuracy, 2)
knn_train_accuracy

In [None]:
# KNN accuracy on the test set: stored rounded to 2 decimals in the
# comparison table, displayed unrounded as the cell output

knn_test_accuracy = knn_clf.score(X_testS, y_test)
compare_metrics_df.loc['K-NearestNeighbors', 'Testset Accuracy'] = round(knn_test_accuracy, 2)
knn_test_accuracy

In [None]:
# Sweep K from 1 to k_range and record train/test accuracy for each value.
# Position i of each array holds the score for K = i + 1.
# NOTE(review): the best K is later picked from the test-set scores, which lets the
# test data influence model selection — consider a validation split or cross-validation.
k_range = 100
mean_train_acc_knn = np.zeros(k_range)
mean_test_acc_knn = np.zeros(k_range)

for slot, n in enumerate(range(1, k_range + 1)):
    KNN = KNeighborsClassifier(n_neighbors=n, weights='distance')
    KNN.fit(X_trainS, y_train)
    mean_train_acc_knn[slot] = KNN.score(X_trainS, y_train)
    mean_test_acc_knn[slot] = KNN.score(X_testS, y_test)

In [None]:
# The score arrays are 0-based while K starts at 1: position i holds the score for K = i + 1.
# The train accuracy must therefore be read at the argmax position itself. Indexing at
# argmax + 1 reported the score of the *next* K and raised IndexError when the best K
# was the last one tried.
best_k_position = mean_test_acc_knn.argmax()
best_k = best_k_position + 1
print('\nBest test accuracy is {0} for a K value of {1}'.format(mean_test_acc_knn.max(), best_k))
print('\nThe train accuracy for best test accuracy is {}'.format(mean_train_acc_knn[best_k_position]))
print('\nThe Best K-value for the classification is K = {}'.format(best_k))

### Model Performance using KNN Classifier

In [None]:
# Confusion matrix for the KNN predictions; kept for the final comparison and shown here
confusion_matrix_knn = confusion_matrix(y_test, yhat_knn)
confusion_matrix_knn

In [None]:
# Evaluate the KNN predictions: each metric is computed once, stored rounded to
# 2 decimals in the comparison table, and printed unrounded.
# NOTE(review): roc_auc_score is given hard class predictions rather than
# probabilities/scores (e.g. predict_proba) — confirm that this is intended.
print("Accuracy Score: ", accuracy_score(y_test, yhat_knn))
knn_metric_functions = (
    ('Precision Score', precision_score),
    ('Recall Score', recall_score),
    ('F1 Score', f1_score),
    ('ROC_AUC Score', roc_auc_score),
)
for metric_label, metric_function in knn_metric_functions:
    metric_value = metric_function(y_test, yhat_knn)
    compare_metrics_df.loc['K-NearestNeighbors', metric_label] = round(metric_value, 2)
    print(metric_label + ": ", metric_value)
print("Classification Report\n", classification_report(y_test, yhat_knn))

### Build Logistic Regression Classification Model using train Dataset and predict the class on test dataset.

In [None]:
# Logistic Regression classifier using the lbfgs solver; random_state is fixed for reproducibility

lgr_clf = LogisticRegression(solver='lbfgs', random_state=7)

In [None]:
# Fit the Logistic Regression model on the standardized training features

lgr_clf.fit(X_trainS, y_train)

In [None]:
# Predict the class of every standardized test record with the fitted Logistic Regression model

yhat_lgr = lgr_clf.predict(X_testS)

In [None]:
# Logistic Regression accuracy on the training set: stored rounded to 2 decimals
# in the comparison table, displayed unrounded as the cell output

lgr_train_accuracy = lgr_clf.score(X_trainS, y_train)
compare_metrics_df.loc['Logistic Regression', 'Trainingset Accuracy'] = round(lgr_train_accuracy, 2)
lgr_train_accuracy

In [None]:
# Logistic Regression accuracy on the test set: stored rounded to 2 decimals
# in the comparison table, displayed unrounded as the cell output

lgr_test_accuracy = lgr_clf.score(X_testS, y_test)
compare_metrics_df.loc['Logistic Regression', 'Testset Accuracy'] = round(lgr_test_accuracy, 2)
lgr_test_accuracy

### Model Performance using Logistic Regression Classifier

In [None]:
# Confusion matrix for the Logistic Regression predictions; kept for the final comparison and shown here
confusion_matrix_lgr = confusion_matrix(y_test, yhat_lgr)
confusion_matrix_lgr

In [None]:
# Evaluate the Logistic Regression predictions: each metric is computed once, stored
# rounded to 2 decimals in the comparison table, and printed unrounded.
# NOTE(review): roc_auc_score is given hard class predictions rather than
# probabilities/scores (e.g. predict_proba) — confirm that this is intended.
print("Accuracy Score: ", accuracy_score(y_test, yhat_lgr))
lgr_metric_functions = (
    ('Precision Score', precision_score),
    ('Recall Score', recall_score),
    ('F1 Score', f1_score),
    ('ROC_AUC Score', roc_auc_score),
)
for metric_label, metric_function in lgr_metric_functions:
    metric_value = metric_function(y_test, yhat_lgr)
    compare_metrics_df.loc['Logistic Regression', metric_label] = round(metric_value, 2)
    print(metric_label + ": ", metric_value)
print("Classification Report\n", classification_report(y_test, yhat_lgr))

### Build Gaussian Naïve Bayes Classification Model using train Dataset and predict the class on test dataset.
- For Gaussian Naive Bayes, the estimator learns the mean and standard deviation of each feature (per class). At prediction time, the likelihood of a value belonging to a class is a function of its distance from the center of that class's distribution. The function used is the Probability Density Function (PDF) of a Normal/Gaussian distribution, and the Normal PDF is just a Standard Normal distribution (0 mean, unit variance) that is scaled by the standard deviation and shifted by the mean. So a value located at mean+(0.5*std) sits at the same relative position in the distribution before and after standardization.
- With standardization the mean and stddev change, but the class probabilities stay the same, and thus so do the classification results. In essence Gaussian Naive Bayes performs standardization internally.
- So, even if we use a Standardised Dataset or not, the classification results and accuracies will be the same.

In [None]:
# Gaussian Naive Bayes classifier with its default settings

gnb_clf = GaussianNB()

In [None]:
# Fit the Gaussian Naive Bayes model on the standardized training features

gnb_clf.fit(X_trainS, y_train)

In [None]:
# Predict the class of every standardized test record with the fitted Gaussian Naive Bayes model

yhat_gnb = gnb_clf.predict(X_testS)

In [None]:
# Gaussian Naive Bayes accuracy on the training set: stored rounded to 2 decimals
# in the comparison table, displayed unrounded as the cell output

gnb_train_accuracy = gnb_clf.score(X_trainS, y_train)
compare_metrics_df.loc['Gaussian Naive Bayes', 'Trainingset Accuracy'] = round(gnb_train_accuracy, 2)
gnb_train_accuracy

In [None]:
# Gaussian Naive Bayes accuracy on the test set: stored rounded to 2 decimals
# in the comparison table, displayed unrounded as the cell output

gnb_test_accuracy = gnb_clf.score(X_testS, y_test)
compare_metrics_df.loc['Gaussian Naive Bayes', 'Testset Accuracy'] = round(gnb_test_accuracy, 2)
gnb_test_accuracy

### Model Performance using Naive Bayes Classifier

In [None]:
# Confusion matrix for the Gaussian Naive Bayes predictions; kept for the final comparison and shown here
confusion_matrix_gnb = confusion_matrix(y_test, yhat_gnb)
confusion_matrix_gnb

In [None]:
# Evaluate the Gaussian Naive Bayes predictions: each metric is computed once, stored
# rounded to 2 decimals in the comparison table, and printed unrounded.
# NOTE(review): roc_auc_score is given hard class predictions rather than
# probabilities/scores (e.g. predict_proba) — confirm that this is intended.
print("Accuracy Score: ", accuracy_score(y_test, yhat_gnb))
gnb_metric_functions = (
    ('Precision Score', precision_score),
    ('Recall Score', recall_score),
    ('F1 Score', f1_score),
    ('ROC_AUC Score', roc_auc_score),
)
for metric_label, metric_function in gnb_metric_functions:
    metric_value = metric_function(y_test, yhat_gnb)
    compare_metrics_df.loc['Gaussian Naive Bayes', metric_label] = round(metric_value, 2)
    print(metric_label + ": ", metric_value)
print("Classification Report\n", classification_report(y_test, yhat_gnb))

## Comparison of K-NN, Logistic Regression and Naïve Bayes Classification Models

In [None]:
# Side-by-side metrics table for the three classifiers
compare_metrics_df

In [None]:
# Print the three stored confusion matrices under a common banner
print("Confusion Matrix of all the 3 models")
print("====================================")
confusion_matrix_sections = (
    ("\nK-Nearest Neighbors:\n", confusion_matrix_knn),
    ("\nLogistic Regression:\n", confusion_matrix_lgr),
    ("\nGaussian Naive Bayes:\n", confusion_matrix_gnb),
)
for section_title, section_matrix in confusion_matrix_sections:
    print(section_title)
    print(section_matrix)

### Interpretation of metrics from accuracy, precision, recall, roc_auc and f1 scores
- Logistic Regression classifier is generalizing and fitting well on the dataset with a test accuracy of 70% and a train accuracy of 73%
- Though Naive Bayes classifier is giving a 100% Precision score and 0 False Positives, its test accuracy is very low at 58%
- KNN classifier is overfitting with a train accuracy of 100%
- __So Logistic Regression Classifier can be applied on this dataset with a test accuracy of 70%__