In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the liver patient records CSV from the Kaggle input directory.
csv_path = '/kaggle/input/indian-liver-patient-records/indian_liver_patient.csv'
patient_df = pd.read_csv(csv_path)

In [None]:
# Print a concise summary of the loaded frame (column dtypes and non-null counts).
patient_df.info()

In [None]:
# Count missing values per column (isna is the pandas alias of isnull).
patient_df.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
patient_df = patient_df.dropna(axis="index", how="any")

In [None]:
# Build a de-duplicated copy and report how many rows were exact duplicates.
patient_df_copy = patient_df.drop_duplicates()
n_duplicates = len(patient_df) - len(patient_df_copy)
print("Number of duplicate rows: ", n_duplicates)

In [None]:
# Continue the analysis with the de-duplicated frame.
patient_df = patient_df_copy.copy()

# info() prints its summary as a side effect, so it can run first.
patient_df.info()

# head() only renders when it is the last expression of the cell; placed
# before info() its result was silently discarded.
patient_df.head()

### Exploratory Data Analysis

In [None]:
!pip install seaborn==0.11.1

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Display the seaborn version actually imported, to confirm it matches the pinned install.
sns.__version__

In [None]:
# Patient counts per gender, split by the target label ('Dataset').
# font_scale is only applied when a named context is given; with the context
# left as None seaborn returns the current rcParams unchanged, so the original
# call had no effect. "notebook" is seaborn's default context.
sns.set_context("notebook", font_scale=1.4)
sns.countplot(x = 'Gender', data= patient_df, hue='Dataset')

In [None]:
# Count rows (non-null Age) for every Dataset/Gender combination,
# listing the highest Dataset label first.
gender_counts = (
    patient_df[['Dataset', 'Gender', 'Age']]
    .groupby(['Dataset', 'Gender'], as_index=False)
    .count()
)
gender_counts.sort_values(by='Dataset', ascending=False)

In [None]:
# Correlation
# Restrict to numeric columns before computing correlations: 'Gender' has not
# been converted to indicator variables yet, and pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them as older versions did.
corr = patient_df.select_dtypes(include="number").corr()
plt.figure(figsize = (15, 10))
sns.heatmap(corr, annot = True)

We can see high correlation between the following pairs of variables:
*     Direct_Bilirubin and Total_Bilirubin : 0.87
*     Aspartate_Aminotransferase and Alamine_Aminotransferase : 0.79
*     Total_Protiens and Albumin : 0.78
*     Albumin_and_Globulin_Ratio and Albumin : 0.69

Hence, within each pair, one of the two variables is a candidate for elimination.

In [None]:
# Age histograms (10-year bins) in a grid: one row per gender, one column per target label.
age_grid = sns.FacetGrid(data=patient_df, row='Gender', col='Dataset')
age_grid.map_dataframe(sns.histplot, x='Age', binwidth=10)
age_grid.set_axis_labels("Age", "Count")

In [None]:
# One-hot encode the categorical 'Gender' column into 'Gender_*' indicator columns.
categorical_columns = ['Gender']
patient_df = pd.get_dummies(patient_df, columns=categorical_columns, prefix='Gender')

In [None]:
# Remove one member of each strongly correlated feature pair.
correlated_columns = ["Direct_Bilirubin", "Aspartate_Aminotransferase", "Total_Protiens"]
patient_df = patient_df.drop(columns=correlated_columns)

In [None]:
# Hold out 30% of the rows for testing, stratified on the target so both
# splits keep the same class proportions.
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve, cross_validate, KFold

features = patient_df.drop(columns=["Dataset"])
target = patient_df["Dataset"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
    stratify=target,
)

In [None]:
# Scale the variables
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the per-feature mean and standard deviation from the training split only.
X_train_scaled = scaler.fit_transform(X_train)
# Apply the training statistics to the test split. Re-fitting on X_test would
# leak test-set information and put the two splits on different scales.
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression scored with macro-F1 under stratified,
# shuffled 10-fold cross-validation on the scaled training data.
logreg = LogisticRegression(solver='newton-cg', n_jobs=-1, random_state=55)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_results = cross_validate(
    logreg,
    X_train_scaled,
    y_train,
    scoring='f1_macro',
    cv=cv,
    return_train_score=True,
)

In [None]:
# Mean macro-F1 across the CV folds. The training score is measured on the
# folds the model was fitted on; the validation score comes from the held-out
# fold and is the figure that estimates generalisation.
print("Training score: ", cv_results["train_score"].mean())
print("Validation score: ", cv_results["test_score"].mean())

# Stacking Models

In [None]:
# Stacking Models
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier

In [None]:
# Base learners for the stacking ensemble. StackingClassifier expects a flat
# list of (name, estimator) pairs; a misplaced parenthesis previously nested
# the 'xgb' entry inside the 'gb' tuple, producing a malformed 3-tuple.
estimators = [
    ('rf', RandomForestClassifier(random_state=55, class_weight='balanced_subsample')),
    ('gb', GradientBoostingClassifier(random_state=55)),
    ('xgb', XGBClassifier(random_state=55)),
]

In [None]:
# Stack the base learners; a random forest acts as the final estimator and
# the stacking uses 5-fold cross-validation.
meta_learner = RandomForestClassifier(random_state=55)
clf = StackingClassifier(estimators=estimators, final_estimator=meta_learner, cv=5)

In [None]:
# Fit the stacking ensemble on the training split, then show its accuracy on the test split.
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Predict class labels for the held-out test split with the fitted stacking classifier.
pred = clf.predict(X_test)

In [None]:
# Per-class precision/recall/F1 on the training split and on the test split,
# separated by a rule so the two reports are easy to tell apart.
train_pred = clf.predict(X_train)
train_report = classification_report(y_train, train_pred)
test_report = classification_report(y_test, pred)

print("Training classification report \n", train_report)
print("_______"*40)
print("Testing classification report\n", test_report)

In [None]:
# Confusion matrix on the test split (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, pred)
cm

In [None]:
# Render the confusion matrix as an annotated heatmap with readable class names on both axes.
class_names = ["Liver Disease", "Non-liver disease"]
sns.heatmap(cm, annot=True, fmt='g', xticklabels=class_names, yticklabels=class_names)