# Liver Disease Detection
**The attributes**
* Age Age of the patient
* Gender Gender of the patient
* TB Total Bilirubin
* DB Direct Bilirubin
* Alkphos Alkaline Phosphatase
* Sgpt Alanine Aminotransferase
* Sgot Aspartate Aminotransferase
* TP Total Proteins
* ALB Albumin
* A/G Ratio Albumin and Globulin Ratio
* Selector field used to split the data into two sets (labeled by the experts)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import os
from sklearn.neighbors import KNeighborsClassifier
from xgboost import  XGBClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report, roc_auc_score, roc_curve,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [None]:
# Load the liver patient records CSV into a DataFrame and preview the first rows.
csv_path = '../input/indian-liver-patient-records/indian_liver_patient.csv'
data = pd.read_csv(csv_path)
data.head()



# Exploratory data analysis

In [None]:
# Print each column's dtype and non-null count; a non-null count lower than
# the total number of rows means that column has missing values.
data.info()
# Keep a handle on the column labels.
columns = data.columns

## cleaning dataset



In [None]:
# Encode Gender as a float column: Male -> 0.0, Female -> 1.0. The explicit
# astype(float) pins the dtype instead of relying on replace() to downcast the
# object column implicitly, which newer pandas releases deprecate. An
# unexpected gender label now fails here instead of further downstream.
data['Gender'] = data['Gender'].replace({'Male': 0.0, 'Female': 1.0}).astype(float)
# Drop every row that still contains a missing value.
data = data.dropna(axis=0)
# Recode the target so the labels are 0/1: 2 becomes 0, 1 is left unchanged.
data['Dataset'] = data['Dataset'].replace(2, 0)
# Optional feature pruning, currently disabled:
# data=data.drop(['Albumin','Albumin_and_Globulin_Ratio'],axis=1)

In [None]:
# Number of rows per target label (1 is the positive class).
class_counts = data['Dataset'].value_counts()
class_counts

In [None]:
# The target classes are imbalanced: plot the raw counts per label, then
# print each label's share of the rows as a percentage.
data['Dataset'].value_counts().plot.bar()
cnt = data['Dataset'].value_counts(normalize=True)
positive_pct = round(cnt[1] * 100, 2)
negative_pct = round(cnt[0] * 100, 2)
print('positive samples: {}%\nNegative Samples: {}%'.format(positive_pct, negative_pct))

In [None]:
# Distribution of Age, with the rows coloured by their 'Dataset' label.
sns.displot(data=data,x='Age',hue='Dataset')

In [None]:
# Pairwise plots of the columns, coloured by the 'Dataset' label;
# corner=True draws only the lower triangle of the grid.
pp = sns.pairplot(data=data, hue='Dataset', corner=True)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns.
correlations = data.corr()
plt.figure(figsize=(10, 10))
hm = sns.heatmap(correlations, annot=True)

In [None]:
# Summary statistics for the columns of the cleaned data.
summary_stats = data.describe()
summary_stats

In [None]:
# Break the positive cases (Dataset == 1) down by gender and by age group
# (under 50 vs. 50 and over) and show each group's share as a pie chart.
# Gender 1 is counted as women and Gender 0 as men, matching the legend below.
positive = data[data['Dataset'] == 1].dropna()
is_female = positive['Gender'] == 1
is_male = positive['Gender'] == 0
is_50_plus = positive['Age'] >= 50
is_under_50 = positive['Age'] < 50

fg = int((is_female & is_50_plus).sum())
fl = int((is_female & is_under_50).sum())
mg = int((is_male & is_50_plus).sum())
ml = int((is_male & is_under_50).sum())

label = ['fg', 'fl', 'mg', 'ml']
sizes = [fg, fl, mg, ml]
# Size the figure that actually holds the pie. Calling plt.figure() first and
# plt.subplots() afterwards would leave an extra, empty figure behind and the
# requested size would never reach the chart.
fig, ax1 = plt.subplots(figsize=(10, 10))
ax1.pie(sizes, labels=label, shadow=True, startangle=45, autopct='%1.1f%%')
ax1.legend(["Women >= 50 yrs", "Women <50 yrs", "Men >=50 yrs", "Men <50 yrs"], loc="best")
plt.show()

In [None]:
# Grid of Age histograms: one row per Gender value, one column per Dataset label.
g = sns.FacetGrid(data=data, row="Gender", col="Dataset", margin_titles=True)
g.map(plt.hist, "Age", color="g")
# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Disease by Gender and Age');

# splitting to training and testing sets

In [None]:
# Features are every column except the last; the target is 'Dataset'
# (assumes 'Dataset' is the last column of the frame — TODO confirm).
feature_names = data.columns[:-1]
X = data[feature_names]
Y = data['Dataset']
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=27
)
# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [None]:
# Baseline: default logistic regression fitted on the scaled training split,
# scored by accuracy on the test split.
log_model = LogisticRegression().fit(X_train, Y_train)
Y_pred = log_model.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Confusion matrix of the current test predictions (rows: true labels,
# columns: predicted labels).
confusion_matrix(Y_test,Y_pred)

## balancing the data

In [None]:
# Oversample the training split with SMOTE, then plot how many samples each
# class has after resampling. The test split is left untouched.
sm = SMOTE(random_state=123)
X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)
resampled_counts = pd.Series(Y_train_res).value_counts()
resampled_counts.plot.bar()

In [None]:
# Same default logistic regression, now fitted on the SMOTE-resampled
# training data and scored by accuracy on the original test split.
log_model = LogisticRegression().fit(X_train_res, Y_train_res)
Y_pred = log_model.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Confusion matrix of the current test predictions (rows: true labels,
# columns: predicted labels).
confusion_matrix(Y_test,Y_pred)

In [None]:

# Sweep the regularisation parameter C of an L2-penalised logistic regression
# trained on the resampled data, recording test accuracy and F1 for each value.
c = [1, 100, 0.1, 0.75, 10, 5, 0.01, 0.001, 0.00001]
accuracy = []
f1 = []
for c_val in c:
    log = LogisticRegression(penalty='l2', max_iter=100, C=c_val, class_weight='balanced')
    log.fit(X_train_res, Y_train_res)
    Y_pred = log.predict(X_test)
    accuracy.append(accuracy_score(Y_test, Y_pred))
    f1.append(f1_score(Y_test, Y_pred))
# After the loop, `log` and `Y_pred` hold the model and predictions for the
# last C value in the list.
performance = pd.DataFrame({"accuracy": accuracy, "f1_score": f1, "C": c})
performance

In [None]:
# ROC curve for the logistic regression model currently bound to `log`.
# Both the curve and the area shown in the legend are computed from the
# predicted probability of the positive class; scoring hard 0/1 predictions
# would describe a single operating point, not the plotted curve.
# NOTE(review): `log` is whichever model was fitted last under that name —
# confirm it is the one intended for this plot.
positive_scores = log.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(Y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(Y_test, positive_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a classifier with no skill.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

# KNN


In [None]:
# k-nearest-neighbours with default settings, fitted on the resampled
# training data and scored by accuracy on the test split.
knn = KNeighborsClassifier()
knn.fit(X_train_res, Y_train_res)
Y_pred = knn.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Parameter tuning: 5-fold grid search over the number of neighbours and the
# neighbour weighting scheme, selecting by accuracy on the resampled data.
params = {
    'n_neighbors': [3, 5, 7, 1],
    'weights': ['uniform', 'distance'],
}
random = GridSearchCV(
    estimator=knn,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    refit=True,
    verbose=0,
)
random.fit(X_train_res, Y_train_res)

# Report the best hyperparameter combination found.
print(random.best_params_)

In [None]:
# Confusion matrix of the current test predictions (rows: true labels,
# columns: predicted labels).
confusion_matrix(Y_test,Y_pred)

In [None]:
# Refit KNN with a single neighbour on the resampled training data and
# score it by accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_res, Y_train_res)
Y_pred = knn.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Wider grid search for KNN: number of neighbours, weighting scheme and
# distance metric, selected by cross-validated accuracy on the resampled
# training data. No `cv` is passed, so GridSearchCV uses its default folds.
# The ROC AUC call that used to sit in this cell was removed: its result was
# discarded, and it scored the previous cell's Y_pred, not this search.
neighbours = [1, 3, 5, 11, 13, 7, 9]
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
params = {'n_neighbors': neighbours, 'weights': weights, 'metric': metric}
random = GridSearchCV(estimator=knn, param_grid=params, verbose=0, refit=True, scoring='accuracy')
random.fit(X_train_res, Y_train_res)
print(random.best_params_)

In [None]:
# ROC curve for the KNN model currently bound to `knn`. Both the curve and
# the area shown in the legend are computed from the predicted probability of
# the positive class, so the reported number matches the plotted curve.
# The variable keeps the name `logit_roc_auc` used by the other ROC cells.
knn_scores = knn.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(Y_test, knn_scores)
fpr, tpr, thresholds = roc_curve(Y_test, knn_scores)
plt.figure()
plt.plot(fpr, tpr, label='KNN (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a classifier with no skill.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Confusion matrix of the current test predictions, normalised over the true
# labels (each row sums to 1), drawn as an annotated heatmap.
normalised_cm = confusion_matrix(Y_test, Y_pred, normalize='true')
sns.heatmap(normalised_cm, annot=True)

# Support Vector Machine

In [None]:
# Support vector classifier with default settings, fitted on the resampled
# training data and scored by accuracy on the test split.
svm = SVC()
svm.fit(X_train_res, Y_train_res)
Y_pred = svm.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

# Hyperparameter tuning

In [None]:
# Candidate hyperparameter values for the SVM search.
# NOTE(review): 10 appears twice in the C candidates; it is kept as-is so the
# candidate grid stays exactly what it was.
gamma = [0.001, 0.005, 0.01, 0.1, 1, 10, 100, 150]
c = [0.1, 0.5, 10, 5, 10, 1, 100]
kernel = ["rbf"]
params = dict(kernel=kernel, C=c, gamma=gamma)

In [None]:
# Randomised 5-fold search over the SVM candidates in `params`, selecting by
# accuracy on the resampled training data; the seed fixes which candidates
# are sampled.
random = RandomizedSearchCV(
    estimator=svm,
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    refit=True,
    random_state=0,
    verbose=0,
)
random.fit(X_train_res, Y_train_res)

In [None]:
# Report the best hyperparameter combination found by the search.
best_params = random.best_params_
print(best_params)

In [None]:
# RBF-kernel SVM with fixed hyperparameters (C=1, gamma=10), fitted on the
# resampled training data and scored by accuracy on the test split.
svmc = SVC(kernel='rbf', C=1, gamma=10)
svmc.fit(X_train_res, Y_train_res)
Y_pred = svmc.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

# Decision Tree Classifier

In [None]:
# Decision tree fitted on the resampled training data and scored by accuracy
# on the test split. The seed is fixed, like the split, SMOTE and the
# randomised search elsewhere in this notebook, so the fitted tree and its
# score are reproducible between runs.
dt = DecisionTreeClassifier(random_state=0).fit(X_train_res, Y_train_res)
Y_pred = dt.predict(X_test)
accuracy_score(Y_test, Y_pred)

In [None]:
# ROC curve for the decision tree bound to `dt`. Both the curve and the area
# shown in the legend are computed from the predicted probability of the
# positive class, so the reported number matches the plotted curve.
# The variable keeps the name `logit_roc_auc` used by the other ROC cells.
dt_scores = dt.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(Y_test, dt_scores)
fpr, tpr, thresholds = roc_curve(Y_test, dt_scores)
plt.figure()
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a classifier with no skill.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

# Random Forest

In [None]:
# Construct an unfitted random forest with default settings; as the cell's
# only expression, the notebook displays the estimator.
RandomForestClassifier()

In [None]:
# Random forest fitted on the resampled training data and scored by accuracy
# on the test split. The seed is fixed, like the split, SMOTE and the
# randomised search elsewhere in this notebook, so the fitted forest, its
# score and its feature importances are reproducible between runs.
rf = RandomForestClassifier(random_state=0).fit(X_train_res, Y_train_res)
Y_pred = rf.predict(X_test)
accuracy_score(Y_test, Y_pred)

In [None]:
# Per-feature importances of the fitted forest, one value per input column
# in the order the columns were passed to fit().
rf.feature_importances_