In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the Indian Liver Patient records into a DataFrame and preview two rows.
df = pd.read_csv(filepath_or_buffer='../input/indian_liver_patient.csv')
df.head(n=2)

### Data Visualisation:

In [None]:
import seaborn as sns
# Number of records per value of the `Dataset` (outcome) column.
patient = df['Dataset'].value_counts()
ax = sns.barplot(x=patient.index, y=patient, palette='magma')
ax.set_xlabel('Dataset')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of records per value of the `Gender` column.
gender = df['Gender'].value_counts()
ax = sns.barplot(x=gender.index, y=gender, palette='magma')
ax.set_xlabel('Gender of the Patients')
ax.set_ylabel('Count')
plt.show()

In [None]:
# `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed.
# `kind='point'` keeps the point plot that `factorplot` drew by default
# (catplot's own default is a strip plot).
sns.catplot(x='Age', y='Gender', hue='Dataset', data=df, palette='magma', kind='point')

Age is definitely an important factor for liver disease. In the `Dataset` column, 1 = liver disease and 2 = fit (no liver disease).

In [None]:
# Per-outcome mean of every numeric feature. `numeric_only=True` skips the
# string-valued `Gender` column, which would otherwise raise on pandas >= 2.0.
df.groupby('Dataset').mean(numeric_only=True)

### Analysing each feature:

Let's consider the two most important columns: Total Bilirubin and Outcome (Dataset)

### Total Bilirubin:

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub = df[['Total_Bilirubin', 'Dataset']]

In [None]:
# Quick look at the first two rows of the subset.
liver_sub.head(n=2)

In [None]:
# Distribution of Total_Bilirubin for each outcome label (outliers shown).
sns.boxplot(data=liver_sub, x='Dataset', y='Total_Bilirubin', palette='magma')

It looks like Total Bilirubin is higher in liver condition cases, especially in the case of the outliers. Let's also check without the outliers. 

In [None]:
# Same comparison with outlier markers hidden so the boxes are readable.
sns.boxplot(
    data=liver_sub,
    x='Dataset',
    y='Total_Bilirubin',
    showfliers=False,
    palette='magma',
)

Even after removing the outlier points, we can see that the level of Total Bilirubin is significantly higher in people with liver disease. 

This actually makes sense, because 'Bilirubin' is known to be a by-product of the breakdown of Red Blood Cells, which should be efficiently dealt with by the liver. So raised levels can indicate that the liver is not functioning correctly. 

### Direct Bilirubin:

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub1 = df[['Direct_Bilirubin', 'Dataset']]
liver_sub1.head(2)

In [None]:
# Distribution of Direct_Bilirubin for each outcome label (outliers shown).
sns.boxplot(data=liver_sub1, x='Dataset', y='Direct_Bilirubin', palette='magma')

In [None]:
# Same comparison with outlier markers hidden.
sns.boxplot(
    data=liver_sub1,
    x='Dataset',
    y='Direct_Bilirubin',
    showfliers=False,
    palette='magma',
)

Again, Direct Bilirubin also shows a higher value for a diseased person than for a fit person. 

### Alkaline Phosphotase:

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub2 = df[['Alkaline_Phosphotase', 'Dataset']]

In [None]:
# Distribution of Alkaline_Phosphotase for each outcome label (outliers shown).
sns.boxplot(data=liver_sub2, x='Dataset', y='Alkaline_Phosphotase', palette='magma')

In [None]:
# Same comparison with outlier markers hidden.
sns.boxplot(
    data=liver_sub2,
    x='Dataset',
    y='Alkaline_Phosphotase',
    showfliers=False,
    palette='magma',
)

Alkaline Phosphotase is also higher in a diseased liver (and the outliers are more sparsely spread). 

### Alamine Aminotransferase:

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub3 = df[['Alamine_Aminotransferase', 'Dataset']]
# Distribution of Alamine_Aminotransferase for each outcome label (outliers shown).
sns.boxplot(x='Dataset', y='Alamine_Aminotransferase', data=liver_sub3, palette='magma')
plt.show()

In [None]:
# Same comparison with outlier markers hidden; trailing `;` suppresses the repr.
sns.boxplot(
    data=liver_sub3,
    x='Dataset',
    y='Alamine_Aminotransferase',
    showfliers=False,
    palette='magma',
);

There's a significant relationship between higher levels of Alamine Aminotransferase and a diseased liver. 

### Aspartate Aminotransferase:

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub4 = df[['Aspartate_Aminotransferase', 'Dataset']]
# Distribution of Aspartate_Aminotransferase for each outcome label (outliers shown).
sns.boxplot(x='Dataset', y='Aspartate_Aminotransferase', data=liver_sub4, palette='magma')
plt.show()

In [None]:
# Same comparison with outlier markers hidden; trailing `;` suppresses the repr.
sns.boxplot(
    data=liver_sub4,
    x='Dataset',
    y='Aspartate_Aminotransferase',
    showfliers=False,
    palette='magma',
);

Again, there's a significant relationship between higher levels of Aspartate Aminotransferase and a diseased liver.

### Total Protiens:

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub5 = df[['Total_Protiens', 'Dataset']]
# Distribution of Total_Protiens for each outcome label.
sns.boxplot(x='Dataset', y='Total_Protiens', data=liver_sub5, palette='magma')
plt.show()

Interesting observation here: 
the level of Total Protiens in a fit person is marginally higher than that in a person with a diseased liver. 

### Albumin:

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub6 = df[['Albumin', 'Dataset']]
# Distribution of Albumin for each outcome label.
sns.boxplot(x='Dataset', y='Albumin', data=liver_sub6, palette='magma')
plt.show()

Albumin in case of diseased liver is significantly lower as compared to a fit person's liver.

In [None]:
# Select by column name rather than by position so the subset does not
# silently change if the column order of `df` ever does.
liver_sub7 = df[['Albumin_and_Globulin_Ratio', 'Dataset']]
# Distribution of the Albumin/Globulin ratio per outcome label, outliers hidden.
sns.boxplot(x='Dataset', y='Albumin_and_Globulin_Ratio', data=liver_sub7, palette='magma', showfliers = False)
plt.show()

### Albumin/Globulin Ratio:

Albumin/Globulin Ratio in diseased liver is much lower than that in a fit person's liver. 

In [None]:
# Rows where the Albumin/Globulin ratio is missing.
df.loc[df['Albumin_and_Globulin_Ratio'].isna()]

In [None]:
# Replace missing Albumin/Globulin ratios with the column mean.
# NOTE(review): the mean is taken over all rows, before the train/test split.
ag_ratio = df['Albumin_and_Globulin_Ratio']
df['Albumin_and_Globulin_Ratio'] = ag_ratio.fillna(ag_ratio.mean())

In [None]:
# One-hot encode `Gender` into `Gender_*` indicator columns.
gender_dummies = pd.get_dummies(df['Gender'], prefix='Gender')

# Drop indicator columns left over from a previous run of this cell before
# appending them, so re-running the cell does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=gender_dummies.columns, errors='ignore'), gender_dummies],
    axis=1,
)

# Feature matrix: everything except the raw `Gender` string and the target.
# NOTE(review): every `Gender_*` indicator is kept, so the indicator columns
# are perfectly collinear with each other; later cells reference `Gender_Male`.
X = df.drop(['Gender', 'Dataset'], axis=1)
X.head(3)

In [None]:
# Target vector: the outcome label stored in the `Dataset` column.
y = df.loc[:, 'Dataset']

In [None]:
# Pairwise Pearson correlation (the pandas default) between all features.
df_corr = X.corr(method='pearson')

In [None]:
# Annotated heatmap of the feature correlation matrix.
plt.figure(figsize=(20, 20))
sns.heatmap(
    df_corr,
    annot=True,
    fmt='.2f',
    annot_kws=dict(size=10),
    cmap='magma',
    cbar=True,
    square=True,
)
plt.title('Correlation between features');

## Observations:

There are a few correlations between the following features:
    - Total & Direct Bilirubin
    - Alamine Aminotransferase & Aspartate Aminotransferase
    - Total Protiens & Albumin
    - Albumin & Albumin/Globulin Ratio

## Applying Machine Learning:

In [None]:
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

### Logistic Regression:

In [None]:
# Fit a default logistic regression on the training split (`fit` returns the
# estimator itself), then predict labels for the held-out rows.
logreg = LogisticRegression().fit(X_train, y_train)
log_predicted = logreg.predict(X_test)

In [None]:
# Fraction of held-out rows the logistic regression labelled correctly.
accuracy_score(y_true=y_test, y_pred=log_predicted)

In [None]:
# Confusion matrix for the logistic regression: rows are true labels, columns predicted.
confusion_matrix(y_true=y_test, y_pred=log_predicted)

In [None]:
# Per-class precision/recall/F1, followed by the confusion matrix as a heatmap.
log_report = classification_report(y_test, log_predicted)
print('Classification Report: \n', log_report)
log_cm = confusion_matrix(y_test, log_predicted)
sns.heatmap(log_cm, annot=True, fmt="d")

### GaussianNB:

In [None]:
# Gaussian Naive Bayes: fit on the training split, predict the held-out rows,
# and show the resulting accuracy.
gaussian = GaussianNB().fit(X_train, y_train)
gauss_predicted = gaussian.predict(X_test)
accuracy_score(y_true=y_test, y_pred=gauss_predicted)

In [None]:
# Per-class metrics and confusion-matrix heatmap for Gaussian Naive Bayes.
gauss_report = classification_report(y_test, gauss_predicted)
print(gauss_report)
gauss_cm = confusion_matrix(y_test, gauss_predicted)
sns.heatmap(gauss_cm, annot=True, fmt="d")

### Random Forest:

In [None]:
# Random forest with 100 trees. The fixed `random_state` (same seed as the
# train/test split) makes the bootstrap samples and feature sub-sampling
# repeatable, so the reported accuracy does not change between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=101)
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)
accuracy_score(y_test,rf_predicted)

In [None]:
# Per-class metrics and confusion-matrix heatmap for the random forest.
rf_report = classification_report(y_test, rf_predicted)
print(rf_report)
rf_cm = confusion_matrix(y_test, rf_predicted)
sns.heatmap(rf_cm, annot=True, fmt="d")

### Linear Regression:

In [None]:
# Ordinary least-squares fit on the training split (`fit` returns the estimator);
# it is used below as the base estimator for recursive feature elimination.
linear = linear_model.LinearRegression().fit(X_train, y_train)
lin_predicted = linear.predict(X_test)

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination down to three features, ranked by the linear model.
# NOTE(review): RFE is fit on the full X/y, including rows later used for
# testing — consider fitting on the training split only.
rfe = RFE(estimator=linear, n_features_to_select=3)
rfe.fit(X, y)

In [None]:
# RFE gives every selected feature a ranking of 1; print those column names.
selected_mask = rfe.ranking_ == 1
for feature_name in X.columns.values[selected_mask]:
    print(feature_name)

As we can see, the above three features have the highest importance in the diagnosis of a person with diseased liver. 

In [None]:
# Keep exactly the columns RFE selected, via its boolean `support_` mask, so
# the reduced feature set always matches the RFE result instead of a
# hand-copied list. The original notebook hard-coded
# ['Total_Protiens', 'Albumin', 'Gender_Male'] here.
X_final = X.loc[:, rfe.support_]

### Analysis after feature selection and elimination: 

#### Logistic Regression II:

In [None]:
# Re-split using only the selected features; same test fraction and seed as before.
# Note that this overwrites the earlier X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.30,
    random_state=101,
)

In [None]:
# Refit the same logistic regression on the reduced feature set and predict again.
log_predicted = logreg.fit(X_train, y_train).predict(X_test)

In [None]:
# Accuracy of the logistic regression on the reduced feature set.
accuracy_score(y_true=y_test, y_pred=log_predicted)

In [None]:
# Per-class precision/recall/F1 for the refitted logistic regression.
log_report = classification_report(y_test, log_predicted)
print('Classification Report: \n', log_report)

In [None]:
# Confusion matrix of the refitted logistic regression, drawn as a heatmap.
log_cm = confusion_matrix(y_test, log_predicted)
sns.heatmap(log_cm, annot=True, fmt="d")

Note: Improved performance after Feature Elimination in case of Logistic Regression.

#### GaussianNB II:

In [None]:
# Refit Gaussian Naive Bayes on the reduced feature set and show its accuracy.
gauss_predicted = gaussian.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=gauss_predicted)

In [None]:
# Per-class metrics and confusion-matrix heatmap for the refitted Naive Bayes.
gauss_report = classification_report(y_test, gauss_predicted)
print(gauss_report)
gauss_cm = confusion_matrix(y_test, gauss_predicted)
sns.heatmap(gauss_cm, annot=True, fmt="d")

Note: Feature Selection resulted in a better performance in case of GaussianNB.

#### Random Forest II:

In [None]:
# Refit the random forest on the reduced feature set and show its accuracy.
rf_predicted = rf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=rf_predicted)

Note: Feature Selection resulted in poor performance in case of RF