In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt# for Plotting graphs
import seaborn as sns# same as matplotlib but to make life easier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input directory, one full path per line
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the liver-patient CSV into a DataFrame and preview its first rows
LIVER_CSV_PATH = "/kaggle/input/indian-liver-patient-records/indian_liver_patient.csv"
df = pd.read_csv(LIVER_CSV_PATH)
df.head()

# Analyzing The dataset

In [None]:
# Summary statistics of the dataset as computed by DataFrame.describe()
df.describe()

In [None]:
# Print the (rows, columns) shape of the data, then the per-column
# dtype / non-null summary produced by DataFrame.info()
print(df.shape)
df.info()

In [None]:
# Encode the Gender attribute as integers (Male -> 1, Female -> 0).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['Gender']: that chained in-place call acts
# on an intermediate Series, which recent pandas versions warn about and which
# leaves df untouched once Copy-on-Write is enabled.
# replace() (unlike map()) leaves already-encoded values alone, so the cell
# stays safe to re-run.
df['Gender'] = df['Gender'].replace({'Male': 1, 'Female': 0})

# Starting with EDA

In [None]:
# Pairwise correlation of the columns, drawn as an annotated green heatmap
corr_fig, corr_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), cmap='Greens', annot=True, ax=corr_ax)

In [None]:
# Grid of pairwise relationship plots for the columns of df,
# coloured by the value of the 'Dataset' column
sns.pairplot(df, hue='Dataset')

In [None]:
# Visualize the number of patients per diagnosis:
# one bar for each value of the 'Dataset' column
sns.countplot(data = df, x = 'Dataset');

In [None]:
# Visualizing patient condition (liver disease vs. normal) split by Gender.
# In the Indian Liver Patient data the 'Dataset' column is 1 for patients WITH
# liver disease and 2 for patients without it, so 1 has to be labelled
# 'Liver Disease'; labelling 1 as 'Normal' would invert the whole chart.
plt.figure(figsize=(6,6))
condition = df['Dataset'].apply(lambda v: 'Liver Disease' if v == 1 else 'Normal')
ax = sns.countplot(x=condition, hue=df['Gender'])
ax.set_xlabel('Patient Condition');

# Starting Data Preprocessing

In [None]:
# Count the missing (NaN) values in each column
df.isna().sum()

In [None]:
# Show the rows whose Albumin_and_Globulin_Ratio value is missing
df[df['Albumin_and_Globulin_Ratio'].isna()]

In [None]:
# Correlation heatmap again, this time to see which columns
# Albumin_and_Globulin_Ratio is correlated with
ratio_fig, ratio_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), cmap='Greens', annot=True, ax=ratio_ax)

In [None]:
# Albumin_and_Globulin_Ratio is highly correlated with Albumin (see the heatmap
# above), so Albumin is cut into 10 bins labelled 0-9; the median ratio of a
# row's bin is what will be used to fill a missing ratio.
N_ALBUMIN_BINS = 10
albumin_bin_labels = list(range(N_ALBUMIN_BINS))
df["binned_Albumin"] = pd.cut(df['Albumin'], bins=N_ALBUMIN_BINS, labels=albumin_bin_labels)
# Show the rows with a missing ratio again, now together with their Albumin bin
ratio_is_missing = df['Albumin_and_Globulin_Ratio'].isna()
df[ratio_is_missing]

In [None]:
# One sub-frame per Albumin bin that contains a missing ratio (bins 6, 4, 3 and 8)
df_bin6, df_bin4, df_bin3, df_bin8 = (
    df[df['binned_Albumin'] == bin_label] for bin_label in (6, 4, 3, 8)
)

In [None]:
# Fill every missing Albumin_and_Globulin_Ratio with the median ratio of the
# row's Albumin bin.
#
# This is done with a groupby on df itself instead of slicing out one frame per
# bin, filling it, dropping the bin from df and appending the slice back:
#   * DataFrame.append no longer exists in pandas 2.0+,
#   * fillna(inplace=True) on a column of a filtered slice is chained
#     assignment and is not guaranteed to modify anything,
#   * the affected bins do not have to be hard-coded, and
#   * the rows keep their original order and index.
ratio_col = 'Albumin_and_Globulin_Ratio'
ratio_missing = df[ratio_col].isna()

# Median ratio of each Albumin bin (median() ignores NaN); reported for every
# bin that actually contains a missing ratio.
bin_medians = df.groupby('binned_Albumin', observed=True)[ratio_col].median()
for bin_label in df.loc[ratio_missing, 'binned_Albumin'].dropna().unique():
    print("Median for Albumin Globumin Ratio for bin %d: " % bin_label, bin_medians.loc[bin_label])

# The same medians broadcast back onto the rows; fillna only uses them where
# the ratio is NaN, so re-running the cell changes nothing.
row_bin_median = df.groupby('binned_Albumin', observed=True)[ratio_col].transform('median')
df[ratio_col] = df[ratio_col].fillna(row_bin_median)

#Printing Shape of Dataset
print(df.shape)

In [None]:
# Remove the helper binned-Albumin column.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['binned_Albumin'], errors='ignore')

In [None]:
# Min-Max scaling, step 1: collect the columns to be scaled, i.e. every column
# of df except 'Gender' and the label column 'Dataset'
cols = df.columns.to_list()
for excluded in ('Gender', 'Dataset'):
    cols.remove(excluded)
print("Columns with numerical data:")
cols

In [None]:
#getting Numerical columns:
df_numerical = df[cols]

# Scale the numerical columns with Min-Max scaling.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# cross-validation split, so the test folds influence the scaling. Consider
# fitting it inside each fold (e.g. with a Pipeline).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df_numerical)  # numpy array

# Rebuild the DataFrame on df's own index. Without index=df.index the new frame
# gets a fresh 0..n-1 index, and because the Gender assignment below aligns on
# index labels it would silently produce NaN / mismatched rows whenever df's
# index is not exactly 0..n-1.
x = pd.DataFrame(scaled, columns=cols, index=df.index)
x['Gender'] = df['Gender']  # Gender is added to the attributes unscaled
y = df['Dataset']  # the labels
x

In [None]:
#moving for feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Fit an extra-trees ensemble to obtain one importance score per feature.
# random_state is fixed (the same seed the Random Forest classifier uses) so
# that the importances, and therefore the features selected from them, are
# identical on every run instead of changing between executions.
clf = ExtraTreesClassifier(n_estimators=50, random_state=0)
clf = clf.fit(x, y)
print("Showing feature importance values")
# Label each score with its column name so the output can be read on its own
print(pd.Series(clf.feature_importances_, index=x.columns))

In [None]:
# Select features according to the importances of the classifier fitted above
# (prefit=True: the selector reuses that fit instead of training again)
model = SelectFromModel(clf, prefit=True)
cols = x.columns.to_list()    # names of all candidate features
tf = model.get_support()      # boolean mask, True where a feature is kept
selectedcols = [name for name, keep in zip(cols, tf) if keep]
print("showing selected columns")
print(selectedcols)
# Reduce the feature matrix to the selected columns
X_new = model.transform(x)
X_new.shape

# Applying ML Algorithms

In [None]:
# Splitting the dataset for training and testing with 5-fold cross validation.
# The rows are shuffled (with a fixed seed, so the folds are reproducible):
# an unshuffled KFold cuts the data into consecutive blocks, which makes every
# fold depend on the order in which the rows happen to be stored.
# (The former bare kf.get_n_splits(X_new) call only returned the number of
# splits without using it, so it has been dropped.)
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Comparative study of 3 different ML algorithms: SVM, Random Forest and KNN.
# One list per algorithm and metric; each list collects one score per fold.

#metrics for SVM
SVM_accuracy, SVM_precision, SVM_recall, SVM_f1_score = [], [], [], []

#metrics for Random Forest
RF_accuracy, RF_precision, RF_recall, RF_f1_score = [], [], [], []

#metrics for KNN
KNN_accuracy, KNN_precision, KNN_recall, KNN_f1_score = [], [], [], []

In [None]:
# The three algorithms under comparison and the metrics used to score them
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# One classifier object per algorithm
clf_svm = SVC()                                               # SVM, library defaults
clf_rf = RandomForestClassifier(max_depth=5, random_state=0)  # Random Forest, depth 5, seeded
clf_knn = KNeighborsClassifier(n_neighbors=3)                 # KNN with 3 neighbours

In [None]:
# 5-fold cross validation: train and score every classifier on each fold.

# Start from empty score lists every time this cell runs. Without the reset a
# re-run appends a second set of fold scores to the existing lists, which
# skews the averages and makes the lists longer than the fold axis of the plots.
SVM_accuracy, SVM_precision, SVM_recall, SVM_f1_score = [], [], [], []
RF_accuracy, RF_precision, RF_recall, RF_f1_score = [], [], [], []
KNN_accuracy, KNN_precision, KNN_recall, KNN_f1_score = [], [], [], []

# (classifier, accuracy list, precision list, recall list, F1 list) per algorithm
evaluations = (
    (clf_svm, SVM_accuracy, SVM_precision, SVM_recall, SVM_f1_score),
    (clf_rf, RF_accuracy, RF_precision, RF_recall, RF_f1_score),
    (clf_knn, KNN_accuracy, KNN_precision, KNN_recall, KNN_f1_score),
)

for fold_no, (train_index, test_index) in enumerate(kf.split(X_new), start=1):
    print("%d Number of fold" % fold_no)

    # Splitting the data. kf.split yields row POSITIONS, so the label Series is
    # indexed with .iloc; y[train_index] would look the values up by index
    # label and is only correct while y has a default 0..n-1 index.
    X_train, X_test = X_new[train_index], X_new[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train and evaluate SVM, Random Forest and KNN on this fold. A dedicated
    # loop variable is used so the feature selector stored in `model` is not
    # overwritten by a classifier.
    for estimator, accuracies, precisions, recalls, f1_scores in evaluations:
        y_pred = estimator.fit(X_train, y_train).predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))

# Analyzing the performance

In [None]:
#visualizing results of SVM per fold
# The fold numbers are derived from the recorded scores instead of a hard-coded
# 1..5, so the plot keeps working if the number of folds changes.
# NOTE: the axis is stored in `x` (shadowing the feature DataFrame of the same
# name) because later plotting cells may read it.
x = list(range(1, len(SVM_accuracy) + 1))
plt.plot(x, SVM_accuracy, label='Accuracy')
plt.plot(x, SVM_precision, label='Precision')
plt.plot(x, SVM_recall, label='Recall')
plt.plot(x, SVM_f1_score, label='F1 Score')
plt.title("Performance of SVM")
plt.legend()
plt.xlabel("Cross Validation Fold")
plt.ylabel("performance")  # axis label was misspelled "performace"
plt.show()

In [None]:
#visualizing results of Random Forest per fold
# The fold axis is built here from the recorded scores rather than read from a
# variable `x` left behind by another cell, so the plot does not depend on
# which cells were executed before this one.
rf_folds = list(range(1, len(RF_accuracy) + 1))
plt.plot(rf_folds, RF_accuracy, label='Accuracy')
plt.plot(rf_folds, RF_precision, label='Precision')
plt.plot(rf_folds, RF_recall, label='Recall')
plt.plot(rf_folds, RF_f1_score, label='F1 Score')
plt.title("Performance of Random Forest")
plt.xlabel("Cross Validation Fold")
plt.ylabel("performance")  # axis label was misspelled "performace"
plt.legend()
plt.show()

In [None]:
#visualizing results of KNN per fold
# Fold numbers are derived from the recorded scores instead of a hard-coded 1..5
knn_folds = list(range(1, len(KNN_accuracy) + 1))
plt.plot(knn_folds, KNN_accuracy, label='Accuracy')
plt.plot(knn_folds, KNN_precision, label='Precision')
plt.plot(knn_folds, KNN_recall, label='Recall')
plt.plot(knn_folds, KNN_f1_score, label='F1 Score')
plt.title("Performance of KNN")
plt.xlabel("Cross Validation Fold")
plt.ylabel("performance")  # axis label was misspelled "performace"
plt.legend()
# plt.show must be CALLED; the bare name `plt.show` only evaluates to the
# function object and never renders the figure explicitly.
plt.show()

In [None]:
# Average every metric over the folds and tabulate one row per algorithm
def _fold_mean(scores):
    """Return the arithmetic mean of a list of per-fold scores."""
    return sum(scores) / len(scores)


SVM = ["SVM "] + [_fold_mean(scores) for scores in (SVM_accuracy, SVM_precision, SVM_recall, SVM_f1_score)]
RF = ["RF "] + [_fold_mean(scores) for scores in (RF_accuracy, RF_precision, RF_recall, RF_f1_score)]
KNN = ["KNN "] + [_fold_mean(scores) for scores in (KNN_accuracy, KNN_precision, KNN_recall, KNN_f1_score)]

# One row per algorithm: name followed by its four averaged metrics
data = [SVM, RF, KNN]
result_columns = ["Algorithms", "Accuracy", "Precision", "Recall", "F1 Score"]
results = pd.DataFrame(data, columns=result_columns)
results