In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import skew, norm 
from warnings import filterwarnings as filt

filt('ignore')
plt.rcParams['figure.figsize'] = (12,6)
plt.style.use('fivethirtyeight')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the liver-patient records CSV from the Kaggle input directory and show its (rows, columns) shape.
df = pd.read_csv('/kaggle/input/indian-liver-patient-records/indian_liver_patient.csv')
df.shape

In [None]:
df.head()

### handling missing values

In [None]:
df.isnull().values.sum()

In [None]:
df.isnull().sum()

In [None]:
# Distribution of the only column inspected for missing values, drawn before imputing it.
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a density axis is the
# documented replacement and gives the same kind of picture.
sns.histplot(x = df['Albumin_and_Globulin_Ratio'], kde = True, stat = 'density');

In [None]:
# Fill the missing ratios with the column median (less sensitive to outliers than the mean).
ratio_col = 'Albumin_and_Globulin_Ratio'
df[ratio_col] = df[ratio_col].fillna(df[ratio_col].median())

In [None]:
df.isnull().values.sum()

### data cleaning 

In [None]:
# Encode gender numerically: 'Female' -> 0, every other value -> 1.
df['Gender'] = (df['Gender'] != 'Female').astype(int)
# Shift the label down by one and rename it to 'Healthy'
# (presumably the raw labels are 1/2, giving 0/1 here - verify against the dataset description).
df = df.assign(Dataset = df['Dataset'] - 1).rename(columns = {'Dataset' : 'Healthy'})
df.head()

In [None]:
df.info()

### feature engineering

In [None]:
# value_counts() orders classes by frequency, so hard-coded labels only match while class 0 is
# the majority. Sorting by class value (0, then 1) ties each wedge to its label.
healthy_counts = df.Healthy.value_counts().sort_index()
plt.pie(healthy_counts, labels = ['unhealthy liver', 'healthy liver'], shadow = True);

In [None]:
# Pass the vector by keyword: current seaborn treats the first positional argument as `data`,
# not as the variable to count.
sns.countplot(x = df.Healthy);

In [None]:
# Pairwise correlations, showing only the strict lower triangle so each pair is annotated once.
corr = df.corr()
lower_mask = np.tril(np.ones(corr.shape), k = -1).astype(bool)
sns.heatmap(corr.where(lower_mask), fmt = '.2f', annot = True, cmap = 'icefire')

In [None]:
def correlation(df, tol = 0.6):
    """Pick one feature to drop from every highly correlated pair.

    For each pair of numeric columns whose absolute correlation exceeds
    `tol`, the column with the larger mean absolute correlation to all
    columns is selected (the second column of the pair on a tie).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored.
    tol : float, default 0.6
        Absolute-correlation threshold above which a pair counts as
        highly correlated.

    Returns
    -------
    pandas.Index
        Names of the selected columns, in the order they appear in the
        correlation matrix.
    """
    # Restrict to the columns the correlation matrix is built from, and map positions back
    # through `corr.columns` so they never drift from `df.columns` when non-numeric columns exist.
    corr = df.select_dtypes(include = ['number', 'bool']).corr().abs()
    mean_corr = corr.mean()
    n_feats = corr.shape[0]
    drop = set()

    # Visit each unordered pair once (strict upper triangle).
    for row in range(n_feats - 1):
        for col in range(row + 1, n_feats):
            if corr.iloc[row, col] > tol:
                drop.add(row if mean_corr.iloc[row] > mean_corr.iloc[col] else col)

    # Sorted positions give a deterministic result instead of set-iteration order.
    return corr.columns[sorted(drop)]

In [None]:
high_corr_feats = correlation(df)
high_corr_feats.values

The cell above lists, for each pair of features whose absolute correlation exceeds 0.6, the feature with the higher mean correlation to all features.

In [None]:
sns.pairplot(df, hue = 'Healthy');

In [None]:
# regplot

# Regression plots for the strongly correlated feature pairs, one panel per target class.
cols = [('Albumin', 'Total_Protiens') , ('Direct_Bilirubin', 'Total_Bilirubin'), ('Aspartate_Aminotransferase', 'Alamine_Aminotransferase')]
for x_col, y_col in cols:
    # lmplot is a figure-level function that creates its own figure, so a preceding
    # plt.figure() call only leaves an empty figure behind. Dedicated loop names also avoid
    # binding the notebook-level names `x` and `y` to strings.
    sns.lmplot(data = df, x = x_col, y = y_col, hue = 'Healthy', col = 'Healthy', truncate = False)

We'll try the following two approaches:

* try the whole dataset
* remove the highly correlated features

In [None]:
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
from pdpbox.pdp import *
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from shap import force_plot, TreeExplainer, initjs

In [None]:
def permImp(x, y):
    """Fit a random forest on (x, y) and display its permutation importances.

    The importances are computed on the same data the forest was fitted on.
    Returns the object produced by eli5's `show_weights`.
    """
    forest = RandomForestClassifier()
    forest.fit(x, y)
    importance = PermutationImportance(forest).fit(x, y)
    names = x.columns.tolist()
    return show_weights(importance, feature_names = names)

def plot_mi(score):
    """Draw a horizontal bar chart of the `mi_score` column of `score`.

    Rows are sorted ascending before plotting, so the largest score is the
    last bar drawn. Returns None.
    """
    ordered = score.sort_values('mi_score', ascending = True)
    plt.barh(ordered.index, ordered.mi_score)
    plt.title('mutual info classifier')

def mi_score(x, y):
    """Compute, plot and return mutual-information scores of the features in `x` against `y`.

    All features are treated as continuous (`discrete_features=False`).
    Returns a one-column DataFrame ('mi_score') indexed by feature name,
    sorted from highest to lowest score.
    """
    raw_scores = mutual_info_classif(x, y, discrete_features=False)
    score = pd.DataFrame(raw_scores, index = x.columns, columns = ['mi_score'])
    score = score.sort_values('mi_score', ascending = False)
    plot_mi(score)
    return score

def isolate(x, y, col):
    """Fit a random forest on (x, y) and plot the single-feature partial dependence of `col`.

    Returns whatever `pdp_plot` returns (presumably provided by the
    `pdpbox.pdp` star import - confirm).
    """
    forest = RandomForestClassifier()
    forest.fit(x, y)
    isolated = pdp_isolate(forest, model_features = x.columns, dataset = x, feature = col)
    return pdp_plot(isolated, feature_name = col)

def interact(x, y, cols):
    """Fit a random forest on (x, y) and plot the joint partial dependence of the features in `cols`.

    Returns whatever `pdp_interact_plot` returns (presumably provided by the
    `pdpbox.pdp` star import - confirm).
    """
    forest = RandomForestClassifier()
    forest.fit(x, y)
    interaction = pdp_interact(forest, model_features = x.columns, dataset = x, features = cols)
    return pdp_interact_plot(interaction, feature_names = cols)

def forceplot(x, y, n_cls = 1):
    """Draw a SHAP force plot for one randomly chosen sample whose target equals `n_cls`.

    A random forest is fitted on (x, y) after the sample is drawn. The SHAP
    values and the expected value are both indexed by `n_cls`.
    NOTE(review): that indexing presumably relies on shap returning one entry
    per class - confirm against the installed shap version.
    """
    # Draw one row label at random among the rows of the requested class.
    picked = y[y == n_cls].sample( n = 1).index
    sample_row = x.loc[picked]
    print(f'Chose the sample from the index : {picked}')

    forest = RandomForestClassifier()
    forest.fit(x, y)
    explainer = TreeExplainer(forest, feature_names= x.columns)
    class_shap = explainer.shap_values(sample_row)[n_cls]
    class_base = explainer.expected_value[n_cls]
    return force_plot(class_base, class_shap, feature_names = x.columns)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score

In [None]:
# Separate the features from the 'Healthy' target.
x = df.drop(['Healthy'], axis = 1)
y = df.Healthy
# Stratified 80/20 split. The fixed seed makes the split (and every score derived from it)
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Class balance of the training and testing targets, side by side.
fig, ax = plt.subplots(1, 2)
for axis, target, title in zip(ax, (y_train, y_test), ('training target', 'testing target')):
    # sort_index() orders the counts by class value (0, then 1), so the hard-coded labels
    # cannot be swapped by a change in class frequencies.
    axis.pie(target.value_counts().sort_index(), labels = ['unhealthy', 'healthy'], shadow = True)
    axis.set_title(title)

In [None]:
permImp(x_train, y_train)

In [None]:
mscore = mi_score(x_train, y_train)

In [None]:
high_corr_feats

All the correlated features contain a lot of information, so I don't think dropping them is a good idea, but let's still try both of the following approaches.

In [None]:
isolate(x_train, y_train, 'Direct_Bilirubin');

In [None]:
initjs()
forceplot(x_train, y_train, n_cls = 0);

In [None]:
x.head()

In [None]:
# One row per feature except 'Gender': distribution with a fitted normal curve on the left,
# boxplot on the right.
num_cols = x.drop(['Gender'], axis = 1).columns
# Size the grid from the data rather than a hard-coded row count; squeeze=False keeps `ax`
# two-dimensional even with a single row.
fig, ax = plt.subplots(len(num_cols), 2, figsize = (18, 16), squeeze = False)
for ind, col in enumerate(num_cols):
    # NOTE(review): distplot is deprecated in seaborn; kept because `fit=` has no direct
    # histplot equivalent.
    sns.distplot(x[col], ax = ax[ind, 0], fit = norm)
    # Keyword `x=` is required: current seaborn reads the first positional argument as `data`.
    sns.boxplot(x = x[col], ax = ax[ind, 1])
# Lay out after drawing so titles and tick labels are taken into account.
fig.tight_layout()

In [None]:
# Absolute skewness per feature. Computed on the feature frame `x`, not on `df`: the result is
# used further down to select columns of the feature frame, which does not contain the
# 'Healthy' target, and the later skew chart is also computed on features only.
skews = x.skew().abs().sort_values(ascending = True)
plt.barh(skews.index, skews);

In [None]:
high_skews = skews[skews > 2].index

In [None]:
# Work on copies so the untransformed features and target stay available.
new_x = x.copy()
new_y = y.copy()
# log1p compresses the long right tails of the highly skewed features.
new_x[high_skews] = new_x[high_skews].apply(np.log1p)
# Re-split on the transformed features; this replaces the earlier x_train/x_test/y_train/y_test.
# The fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(new_x, new_y, stratify = new_y, test_size = 0.2, random_state = 42)
x_train.head()

In [None]:
skews = x_train.skew().abs().sort_values(ascending = True)
plt.barh(skews.index, skews)

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [None]:
def best_model(x, y, fold = 10, random_state = None):
    """Cross-validate every model/scaler combination and tabulate the mean F1-micro score.

    Parameters
    ----------
    x, y : features and target passed to `cross_val_score`.
    fold : int, default 10
        Number of stratified, shuffled folds.
    random_state : int or None, default None
        Seed for the fold shuffling; None keeps the shuffling unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per model and one column per scaler
        ('none', 'std', 'robust', 'minmax'), holding the mean score.
    """
    models = [LogisticRegression(), RidgeClassifier(), SVC(), GaussianNB(), KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier(), XGBClassifier(verbosity = 0), LGBMClassifier()]
    mnames = ['logistic regg', 'ridge clf', 'svm', 'naive bayes', 'knn', 'decision tree', 'random forest', 'xgboost', 'lgbm']
    scalers = [None, StandardScaler(), RobustScaler(), MinMaxScaler()]
    snames = ['none', 'std', 'robust', 'minmax']
    # One score list per scaler, sized from the scaler list itself.
    scores = [[] for _ in scalers]

    total = len(models) * len(scalers)
    print(f'total number of iterations : {total}')

    for model in models:
        for ind, scaler in enumerate(scalers):
            # Bind the estimator to its own name. Rebinding `model` to the pipeline would make
            # each later scaler wrap the previous pipeline, stacking scalers instead of
            # comparing them.
            if scaler is None:
                estimator = model
            else:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
            cv = StratifiedKFold(n_splits = fold, shuffle = True, random_state = random_state)
            score = cross_val_score(estimator, x, y, cv = cv, scoring = 'f1_micro').mean()
            scores[ind].append(score)

    return pd.DataFrame(scores, index = snames, columns = mnames).T

def report(xt, yt, xtest, ytest, pred, model):
    """Print train/test scores and a classification report, then draw the confusion matrix.

    `model` must already be fitted; `pred` holds its predictions for `xtest`.
    Returns None.
    """
    banner = ' Report '.center(60, '=')
    print(banner)
    print()
    train_score = model.score(xt, yt)
    print(f"Training score :===>  {train_score}")
    test_score = model.score(xtest, ytest)
    print(f"Testing score :===>  {test_score}")
    print()
    print(classification_report(ytest, pred))
    # Rows of the matrix are actual classes, columns are predicted classes.
    matrix = confusion_matrix(ytest, pred)
    sns.heatmap(matrix, fmt = '.1f', annot = True, cmap = 'icefire')
    plt.xlabel('predicted')
    plt.ylabel('actual')

def get_score(xt, yt, xtest, ytest, model, scaler = None, predict = True):
    """Fit `model` (optionally behind `scaler`) on the training data.

    With `predict=True` the test predictions are passed to `report` and
    None is returned; with `predict=False` the fitted estimator is
    returned and nothing is printed.
    """
    estimator = model
    if scaler:
        estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
    estimator.fit(xt, yt)

    if predict:
        pred = estimator.predict(xtest)
        report(xt, yt, xtest, ytest, pred, estimator)
        return None
    return estimator
    
def gridcv(x, y, model, params, scaler = None, fold = 10, random_state = None):
    """Run a stratified grid search scored with F1-micro.

    Parameters
    ----------
    x, y : features and target to search on.
    model : estimator to tune.
    params : dict
        Parameter grid. When `scaler` is given the estimator is wrapped in
        a pipeline whose final step is named 'model', so keys need the
        'model__' prefix.
    scaler : transformer or None, default None
        Optional scaling step placed in front of `model`.
    fold : int, default 10
        Number of stratified, shuffled folds.
    random_state : int or None, default None
        Seed for the fold shuffling; None keeps the shuffling unseeded.

    Returns
    -------
    (clf, res)
        The fitted `GridSearchCV` object and a DataFrame with the mean
        train score, mean test score and parameters of every candidate,
        sorted by mean test score (best first).
    """
    if scaler:
        model = Pipeline(steps = [('scaler', scaler), ('model', model)])

    cv = StratifiedKFold(fold, shuffle = True, random_state = random_state)
    # return_train_score is a boolean switch, not a metric name; recent scikit-learn releases
    # validate parameter types and reject a string here.
    clf = GridSearchCV(model, param_grid = params, return_train_score = True, scoring = 'f1_micro', cv = cv)
    clf.fit(x, y)
    results = pd.DataFrame(clf.cv_results_).sort_values('mean_test_score', ascending = False)
    res = results[['mean_train_score','mean_test_score', 'params']]
    return clf, res

def plot_cv(res):
    """Plot mean train and test scores of grid-search results against their row position in `res`."""
    for column in ('mean_train_score', 'mean_test_score'):
        # reset_index().index gives the 0..n-1 row position of each candidate in `res`.
        sns.lineplot(x = res.reset_index().index, y = res[column])
    plt.legend(['train score', 'test score'])
    plt.title('f1 micro score comparision')

whole dataset 

In [None]:
best_model(x_train, y_train)

dropping high correlated feats 

In [None]:
best_model(x_train.drop(high_corr_feats, axis = 1), y_train)

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
from imblearn.over_sampling import SMOTE

# Over-sample the training split only; the test split is left untouched.
# Disadvantage: overfitting. The resampled data is also cross-validated later, so validation
# folds contain synthetic rows built from training-fold neighbours and CV scores on it are
# optimistic. The fixed seed makes the synthetic rows reproducible.
smote = SMOTE(random_state = 42)
new_x_train, new_y_train = smote.fit_resample(x_train, y_train)
new_x_train.shape, x_train.shape

In [None]:
new_y_train.value_counts()

whole dataset + over sampling 

In [None]:
best_model(new_x_train, new_y_train)

removing high correlated feats + over sampling 

In [None]:
best_model(new_x_train.drop(high_corr_feats, axis = 1), new_y_train)

LightGBM with a standard scaler on the over-sampled dataset gave the highest F1-micro score, 0.83.

In [None]:
get_score(new_x_train, new_y_train, x_test, y_test, LGBMClassifier(), StandardScaler())

The model now suffers from high variance, which is expected.

In [None]:
clf, results = gridcv(new_x_train, new_y_train, LGBMClassifier(), {'model__n_estimators' : np.arange(50, 325, 25)}, StandardScaler(), 10)
plot_cv(results)

Even LightGBM with its default parameters gave a high F1 score for both classes.

In [None]:
# NOTE(review): this repeats, statement for statement, the n_estimators grid search run two
# cells earlier. The folds are shuffled without a seed, so the two runs may report different
# scores. Confirm whether a different parameter grid was intended here.
clf, results = gridcv(new_x_train, new_y_train, LGBMClassifier(), {'model__n_estimators' : np.arange(50, 325, 25)}, StandardScaler(), 10)
plot_cv(results)

In [None]:
results.head(3)

In [None]:
get_score(new_x_train, new_y_train, x_test, y_test, LGBMClassifier(n_estimators = 200, max_depth = 8, reg_lambda = 4, reg_alpha = 1.15), StandardScaler())

This is the best I could get for LightGBM.

In [None]:
get_score(new_x_train, new_y_train, x_test, y_test, XGBClassifier(n_estimators = 200, max_depth = 8, reg_lambda = 4, reg_alpha = 1.15), StandardScaler())

In [None]:
# Compute the cost-complexity pruning path of a single decision tree on the over-sampled
# training data; the alphas are used as ccp_alpha candidates in the next cell's grid search.
dtc = DecisionTreeClassifier()
path = dtc.cost_complexity_pruning_path(new_x_train, new_y_train)
ccp_alphas  = path.ccp_alphas
ccp_alphas

In [None]:
clf, results = gridcv(new_x_train, new_y_train, RandomForestClassifier(), {'model__ccp_alpha' : ccp_alphas}, MinMaxScaler(), 10)
plot_cv(results)

In [None]:
results.iloc[0,-1]

In [None]:
get_score(new_x_train, new_y_train, x_test, y_test, RandomForestClassifier(ccp_alpha=0.0014846306981138452, n_estimators = 200, max_depth=8))

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the three tuned tree models.
# Both boosted models share the same hyper-parameters.
boost_params = dict(n_estimators = 200, max_depth = 8, reg_lambda = 4, reg_alpha = 1.15)
clfs = [
    ('lgbm', LGBMClassifier(**boost_params)),
    ('xgb' , XGBClassifier(**boost_params)),
    ('rf'  , RandomForestClassifier(ccp_alpha=0.0014846306981138452, n_estimators = 200, max_depth=8)),
]

# Fit on the over-sampled training data, then evaluate on the untouched test split.
clf = VotingClassifier(estimators=clfs, voting='soft')
clf.fit(new_x_train, new_y_train)
pred = clf.predict(x_test)
report(new_x_train, new_y_train, x_test, y_test, pred, clf)

LightGBM gave the highest F1 scores: 84% and 65%.