In [None]:
import numpy as np
import pandas as pd 
import os
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# EDA

In [None]:
# Load the liver patient records into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/indian-liver-patient-records/indian_liver_patient.csv',
)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

### The Albumin_and_Globulin_Ratio column contains null values; since there are only 4 of them, we drop them.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Only a handful of rows are missing Albumin_and_Globulin_Ratio, so drop those
# rows rather than throwing away the whole feature. Unlike dropping the column,
# this is also safe to re-run: a second pass finds nothing left to drop.
df.dropna(subset=['Albumin_and_Globulin_Ratio'],inplace=True)

In [None]:
# Re-check the per-column missing-value counts after the clean-up step.
df.isna().sum()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Count how many rows fall into each value of the 'Dataset' target column.
outputDistribution=df['Dataset'].value_counts()

# Pass x and y as keywords: seaborn >= 0.12 no longer accepts them positionally
# (the first positional argument is now `data`).
sns.barplot(x=outputDistribution.index,y=outputDistribution.values)
plt.ylabel('Count')
plt.xlabel('Output Classes')
plt.title('Class Counts')
plt.show()

# value_counts() orders by frequency, not by label, so look the counts up by
# class label instead of by position. Assumes the labels are the integers 1 and
# 2, as the printed message implies.
print ("Ratio of Class 1 to Class 2:",outputDistribution.loc[1]/outputDistribution.loc[2])

### We see that the columns like Alkaline_Phosphotase,Aspartate_Aminotransferase have very high values as compared to others so we would need to do feature scaling.

## Checking the gender column

In [None]:
# Converting strings to binary feature

def getGen(gender):
    """Encode a gender label as a binary flag.

    Returns 0 when ``gender`` is exactly the string 'Male' and 1 for any
    other value.
    """
    return 0 if gender=='Male' else 1

# Replace the string 'Gender' column with its binary encoding in 'Sex':
# pop() removes 'Gender' from the frame and hands back its values to encode.
df['Sex']=df.pop('Gender').apply(getGen)

## Checking The Age column

In [None]:
# Box plot of Age for each target class.
sns.catplot(data=df,kind='box',x='Dataset',y='Age')
plt.show()

### So from the violin plot we can say that the average age of people having liver disease is greater than that of those not having it.

### Most people have an age close to the average, and there seem to be no outliers in the age.

In [None]:
# Violin plot of Age for each target class.
sns.catplot(data=df,kind='violin',x='Dataset',y='Age')
plt.show()

## Checking the Total Proteins Column

In [None]:
# Box plot of Total_Protiens for each target class.
sns.catplot(data=df,kind='box',x='Dataset',y='Total_Protiens')
plt.show()

### So from the box plot we can see the average protein value for people with and without the disease. The majority of people with the disease have a protein value lower than the average value.

### Some points have pretty low protein values and some have pretty high ones, so we treat them as outliers and drop them. Since the majority of points have a protein value between 3 and 9, we remove the values outside this range.

In [None]:
# Boolean mask of rows whose total protein is at or below 3, or at or above 9.
toRemove=df['Total_Protiens'].le(3) | df['Total_Protiens'].ge(9)
# Number of rows the mask flags.
toRemove.sum()

In [None]:
# Keep only the rows that were not flagged, then compare shapes before/after.
df1=df.loc[~toRemove]
(df.shape,df1.shape)

In [None]:
# Box plot of Total_Protiens per class again, now on the filtered frame.
sns.catplot(data=df1,kind='box',x='Dataset',y='Total_Protiens')
plt.show()

## Checking the Albumin Feature

In [None]:
# Preview the first five rows of the protein-filtered frame.
df1.head(n=5)

In [None]:
# Box plot of Albumin for each target class.
sns.catplot(data=df1,kind='box',x='Dataset',y='Albumin')
plt.show()

### From the plot we can infer that people having the disease have a lower average albumin value than the people not having the disease.

### There is a point where the albumin value is much higher than the normal values, so it might be an outlier; hence we drop it.

In [None]:
# Discard the rows whose Albumin value exceeds 5.
df2=df1.loc[~df1['Albumin'].gt(5)]

In [None]:
# Shapes before and after the Albumin filter, to see how many rows were removed.
df1.shape,df2.shape

## Checking the Total_Bilirubin Feature

In [None]:
# Boxen plot of Total_Bilirubin for each target class.
sns.catplot(data=df2,kind='boxen',x='Dataset',y='Total_Bilirubin')
plt.show()

### The total bilirubin content of people with the disease is significantly higher than that of the ones who are fit.

### Points with a bilirubin value higher than 40mg/dl are highly uncommon cases, so we remove them, considering them as outliers.

In [None]:
# Show the rows whose Total_Bilirubin is above 40.
df2.loc[df2['Total_Bilirubin'].gt(40)]

In [None]:
# Remove only the rows strictly above 40, matching the outlier rule stated in
# the notebook and the `> 40` preview cell. The previous `< 40` filter also
# discarded any row sitting exactly at 40.
df3=df2[df2['Total_Bilirubin']<=40]

## Checking the Direct_Bilirubin Feature

In [None]:
# Boxen plot of Direct_Bilirubin for each target class.
sns.catplot(data=df3,kind='boxen',x='Dataset',y='Direct_Bilirubin')
plt.show()

### The people with the disease seem to have higher direct bilirubin than the ones who are fit.

## Machine Learning

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Work on an independent (deep) copy so the modelling steps leave df3 untouched.
dfFinal=df3.copy(deep=True)

In [None]:
# Split off the target: pop() returns the 'Dataset' column and removes it from
# dfFinal in one step, leaving only the feature columns behind.
Y=dfFinal.pop('Dataset')
# Feature matrix as a plain NumPy array.
X=dfFinal.to_numpy()

In [None]:
print(X.shape,Y.shape)
# Fix the seed so the split, and every score computed from it, is reproducible
# across runs. Stratify on the label so the train and test sets keep the same
# class proportions as the full data.
xTrain,xTest,yTrain,yTest=train_test_split(X,Y,random_state=42,stratify=Y)

In [None]:
# Logistic Regression
#
# The features live on very different scales, so standardise them first.
# Wrapping the scaler and the classifier in one pipeline means the scaler is
# fit on the training split only and then re-applied to the test split.
lr_clf=make_pipeline(StandardScaler(),LogisticRegression(max_iter=1000))
lr_clf.fit(xTrain,yTrain)
yPredicted_lr=lr_clf.predict(xTest)

# Mean accuracy on the held-out and the training data.
testScore=lr_clf.score(xTest,yTest)
trainScore=lr_clf.score(xTrain,yTrain)

print ("train score:",trainScore)
print ("test score:",testScore)

# Per-class precision/recall/F1 and the confusion matrix on the test split.
print()
print (classification_report(yTest,yPredicted_lr))
print (confusion_matrix(yTest,yPredicted_lr))
print()

In [None]:
# SVM with RBF Kernel
#
# The RBF kernel is distance-based, so features with large raw values would
# dominate it. Standardise the features inside a pipeline so the scaler is fit
# on the training split only and re-applied to the test split.
svm_clf=make_pipeline(StandardScaler(),svm.SVC())
svm_clf.fit(xTrain,yTrain)

yPredicted_svm=svm_clf.predict(xTest)

# Mean accuracy on the training and the held-out data.
trainScore=svm_clf.score(xTrain,yTrain)
testScore=svm_clf.score(xTest,yTest)

print ("Train Score:",trainScore)
print ("Test Score:",testScore)

# Per-class precision/recall/F1 and the confusion matrix on the test split.
print()
print('Clasification Report:')
print (classification_report(yTest,yPredicted_svm))
print('Confusion Matrix:')
print (confusion_matrix(yTest,yPredicted_svm))

In [None]:
# Random  Forest Classifier
#
# Seed the forest so its bootstrap samples and feature choices, and therefore
# the reported scores, are the same on every run.
clf_rf=RandomForestClassifier(random_state=42)
clf_rf.fit(xTrain,yTrain)

# Mean accuracy on the training and the held-out data, plus test predictions.
trainScore=clf_rf.score(xTrain,yTrain)
testScore=clf_rf.score(xTest,yTest)
yPredicted_rf=clf_rf.predict(xTest)

print ("Train Score:",trainScore)
print ("Test Score:",testScore)

# Per-class precision/recall/F1 and the confusion matrix on the test split.
print()
print (classification_report(yTest,yPredicted_rf))
print (confusion_matrix(yTest,yPredicted_rf))