In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from collections import defaultdict
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.svm import SVC
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.data import AUTOTUNE
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Seed handed to `random_state` arguments so stochastic steps can be repeated.
RANDOM_SEED = 2
# Silences every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation notices — narrow the
# filter if one of them needs investigating.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw cirrhosis table from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/cirrhosis-prediction-dataset/cirrhosis.csv')
df.head()

In [None]:
# Column dtypes and non-null counts — a first look at where values are missing.
df.info()

In [None]:
# Missing-value map: one row per record, one column per feature; flagged cells are NaN.
plt.figure(figsize=(20, 16))
sns.heatmap(df.isna())

In [None]:
# Rows without a target (Stage) cannot be used for supervised learning, so discard them,
# then show per-column summary statistics with one feature per row.
df = df.dropna(subset=['Stage'])
df.describe().T

In [None]:
# Print each column name next to its dtype.
for column, dtype in df.dtypes.items():
    print(column, '---->', dtype)

In [None]:
# Pearson correlation of every numeric column with the target.
# The frame is restricted to numeric columns first: from pandas 2.0 on, DataFrame.corr()
# raises on string columns instead of silently skipping them.
# The target's self-correlation is removed by label rather than by position, so the
# result no longer depends on 'Stage' being the last column.
stage_corr = df.select_dtypes(include='number').corr()['Stage'].drop('Stage')
# Keep the numeric features whose absolute correlation with Stage exceeds 0.15.
numerical = stage_corr.loc[stage_corr.abs() > 0.15].index
stage_corr

In [None]:
# Grid of bar charts: one panel per object-dtype column, showing how often each
# category occurs within every Stage (proportions are normalised per Stage).
fig, axes = plt.subplots(3, 3, figsize = (10, 10))
axes = axes.flatten()
# Only the first 305 rows (by position) are plotted.
# NOTE(review): presumably these are the rows where the categorical fields were
# recorded — confirm the cut-off against the data.
short_df = df.iloc[: 305]
for i, column in enumerate(df.loc[:, :'Stage'].select_dtypes('O')):
    # Category frequencies within each Stage group, flattened into a tidy frame for seaborn.
    data = short_df[column].groupby(df['Stage']).value_counts(normalize = True).rename('proportion').to_frame().reset_index()
    sns.barplot(x = column, y = 'proportion', hue = 'Stage', data = data, ax = axes[i])

<font size= '4'>Status, Drug, Hepatomegaly, Spiders, and Edema show fairly large differences in class representation between stages.
We'll keep these.</font>

In [None]:
# Categorical features retained after inspecting the bar charts.
cat = ['Status', 'Drug', 'Hepatomegaly', 'Spiders', 'Edema']
# Narrow the frame to the selected numeric features, the categorical ones, and the target.
df = df[list(numerical) + cat + ['Stage']]
df

In [None]:
# Numeric preprocessing: fill gaps with the column median, then standardise.
pipe = imbpipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler()),
])
# Display names and the matching classifier classes (index-aligned lists).
model_names = ['Decision Tree', 'Random Forest', 'XGB']
models = [DecisionTreeClassifier, RandomForestClassifier, XGBClassifier]

In [None]:
def fit_model(train_indices, val_indices, model):
    """Fit a fresh classifier on one cross-validation fold.

    Reads the notebook-level feature frame ``X`` and target series ``y``.

    Parameters
    ----------
    train_indices, val_indices : array-like of int
        Positional row indices (used with ``.iloc``) of the training and
        validation parts of the fold.
    model : type
        Classifier class, not an instance. It is instantiated with
        ``random_state=RANDOM_SEED`` so repeated runs give the same scores;
        ``XGBClassifier`` additionally receives ``use_label_encoder=False``.

    Returns
    -------
    tuple
        ``(X_val, y_val, mod)``: the validation features, the validation
        targets, and the classifier fitted on the training part.
    """
    X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

    params = {'random_state': RANDOM_SEED}
    # Identity check: `model` is a class object, not an instance.
    if model is XGBClassifier:
        params['use_label_encoder'] = False

    mod = model(**params)
    mod.fit(X_train, y_train)
    return X_val, y_val, mod

In [None]:
# Take an explicit copy so the assignments below modify X itself, not a view of df
# (the original chained `X[column].iloc[...] = ...` may silently write to a temporary).
X = df[[*numerical, *cat]].copy()
y = df['Stage']
for column in cat:
    # Label-encode only the observed values; missing entries stay NaN for the imputer.
    # A mask replaces the hard-coded row count, so this works wherever the gaps are.
    observed = X[column].notna()
    codes = pd.Series(np.nan, index = X.index)
    codes.loc[observed] = LabelEncoder().fit_transform(X.loc[observed, column])
    X[column] = codes
# Fill missing category codes with the code of the single nearest neighbour.
knn = KNNImputer(missing_values = np.nan, n_neighbors = 1)
X[cat] = knn.fit_transform(X[cat])
# Median-impute and standardise the numeric features.
# NOTE(review): both imputers and the scaler are fitted on all rows, before any
# train/validation split, so later validation scores see a little leaked information.
X[numerical] = pipe.fit_transform(X[numerical])

In [None]:
# Zero-based integer class labels (XGBoost expects classes 0..n-1).
# Derived straight from df instead of from the previous `y`, so re-running this cell
# cannot shift the labels a second time.
y = df['Stage'].astype('int') - 1
y.value_counts()

In [None]:
# 5-fold stratified cross-validation of every candidate model; the one-vs-rest
# ROC AUC of each fold is collected per model under the key '<name>ovr'.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = RANDOM_SEED)
score_dic = defaultdict(list)

for train, val in skf.split(X, y):
    for name, model in zip(model_names, models):
        X_val, y_val, mod = fit_model(train, val, model)
        preds = mod.predict_proba(X_val)
        fold_auc = roc_auc_score(y_true = y_val, y_score = preds, multi_class = 'ovr')
        score_dic[name + 'ovr'].append(fold_auc)

In [None]:
# Per-model list of fold-wise one-vs-rest ROC AUC scores.
score_dic

In [None]:
# Hold out a stratified, seeded validation split BEFORE any resampling.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.8, stratify = y, random_state = RANDOM_SEED)

# Oversampled training data, kept for cells that fit directly on X_copy / y_copy.
smote = SMOTE(k_neighbors = 3, random_state = RANDOM_SEED)
X_copy, y_copy = smote.fit_resample(X_train, y_train)

# The validation data must stay real: scoring on SMOTE-generated points does not measure
# performance on actual patients. The *_copy names are kept for the cells that use them.
X_val_copy, y_val_copy = X_val, y_val

# SMOTE sits inside the pipeline so that, within GridSearchCV, each fold oversamples only
# its own training part. Resampling before the search leaks synthetic neighbours of
# validation rows into training and inflates best_score_.
forest = imbpipeline([
    ('smote', SMOTE(k_neighbors = 3, random_state = RANDOM_SEED)),
    ('model', RandomForestClassifier(random_state = RANDOM_SEED)),
])

param_grid = [{
    'model__n_estimators': [3, 10, 30, 50, 75, 100],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [1, 2, 3, 4, 6, 8, 10, 20],
}]
grid_search = GridSearchCV(forest, param_grid, cv = 5, scoring = 'roc_auc_ovr', return_train_score = True)
grid_search.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the search and its mean cross-validated score.
grid_search.best_params_, grid_search.best_score_

In [None]:
# Score on the real hold-out rows rather than on a SMOTE-resampled copy of them:
# synthetic validation points do not reflect performance on actual patients.
grid_search.score(X_val, y_val) #Seems like I'm overfitting?

In [None]:
# XGBoost search. SMOTE is a pipeline step so each CV fold oversamples only its own
# training part, and the search is fitted on the un-resampled training split.
xgb = imbpipeline([
    ('smote', SMOTE(k_neighbors = 3, random_state = RANDOM_SEED)),
    ('model', XGBClassifier(use_label_encoder = False, eval_metric = 'mlogloss', random_state = RANDOM_SEED)),
])
# Grid keys carry the 'model__' prefix so they are routed to the XGBClassifier step.
param_grid_xgb = [{
    'model__eta': [0.005, 0.05, 0.1, 0.3, 0.5],
    'model__max_depth': [2, 4, 6, 8, 10],
    'model__lambda': [0.25, 0.5, 1, 1.5, 2],
}]
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv = 5, scoring = 'roc_auc_ovr', return_train_score = True)
grid_search_xgb.fit(X_train, y_train)

In [None]:
# Best XGBoost hyper-parameters and their mean cross-validated score.
grid_search_xgb.best_params_, grid_search_xgb.best_score_

In [None]:
# Evaluate on the real hold-out rows, not on a SMOTE-resampled copy of them.
grid_search_xgb.score(X_val, y_val)

In [None]:
# Second experiment: repeat the search using the numeric features only.
X = X[numerical]

# Hold out a stratified, seeded validation split BEFORE any resampling.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.7, stratify = y, random_state = RANDOM_SEED)

# Oversampled training data, kept for cells that fit directly on X_copy / y_copy.
smote = SMOTE(k_neighbors = 3, random_state = RANDOM_SEED)
X_copy, y_copy = smote.fit_resample(X_train, y_train)

# The validation data must stay real; the *_copy names are kept for the cells that use them.
X_val_copy, y_val_copy = X_val, y_val

# SMOTE as a pipeline step: each CV fold oversamples only its own training part, which
# avoids the optimistic best_score_ caused by resampling before the search.
forest = imbpipeline([
    ('smote', SMOTE(k_neighbors = 3, random_state = RANDOM_SEED)),
    ('model', RandomForestClassifier(random_state = RANDOM_SEED)),
])

param_grid = [{
    'model__n_estimators': [3, 10, 30, 50, 75, 100],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [1, 2, 3, 4, 6, 8, 10, 20],
}]
grid_search = GridSearchCV(forest, param_grid, cv = 5, scoring = 'roc_auc_ovr', return_train_score = True)
grid_search.fit(X_train, y_train)

In [None]:
# Best hyper-parameters and mean cross-validated score for the numeric-only search.
grid_search.best_params_, grid_search.best_score_

In [None]:
# Evaluate on the real hold-out rows, not on a SMOTE-resampled copy of them.
grid_search.score(X_val, y_val)

In [None]:
# XGBoost search on the numeric-only split. SMOTE is a pipeline step so each CV fold
# oversamples only its own training part; the search is fitted on the un-resampled rows.
xgb = imbpipeline([
    ('smote', SMOTE(k_neighbors = 3, random_state = RANDOM_SEED)),
    ('model', XGBClassifier(use_label_encoder = False, eval_metric = 'mlogloss', random_state = RANDOM_SEED)),
])
# Grid keys carry the 'model__' prefix so they are routed to the XGBClassifier step.
param_grid_xgb = [{
    'model__eta': [0.005, 0.05, 0.1, 0.3, 0.5],
    'model__max_depth': [2, 4, 6, 8, 10],
    'model__lambda': [0.25, 0.5, 1, 1.5, 2],
}]
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv = 5, scoring = 'roc_auc_ovr', return_train_score = True)
grid_search_xgb.fit(X_train, y_train)

In [None]:
# Evaluate on the real hold-out rows, not on a SMOTE-resampled copy of them.
grid_search_xgb.score(X_val, y_val)

In [None]:
# Final random forest with hard-coded hyper-parameters, fitted on the oversampled
# training data and seeded so the numbers reported below can be reproduced.
# NOTE(review): the values were presumably copied from an earlier grid-search result —
# confirm they still match `best_params_`.
forest = RandomForestClassifier(criterion = 'entropy', max_depth = 20, n_estimators = 100, random_state = RANDOM_SEED)
forest.fit(X_copy, y_copy)

In [None]:
# Reload the raw table and predict a stage for every row that has a recorded Stage.
# NOTE(review): these rows include the ones the forest was trained on, so metrics
# computed from them are optimistic.
df = pd.read_csv('/kaggle/input/cirrhosis-prediction-dataset/cirrhosis.csv')
has_stage = df['Stage'].notnull()
# Re-wrap the transformed array in a DataFrame so the column names (and row index) are
# preserved; the forest can then check feature names instead of trusting column order.
X = pd.DataFrame(
    pipe.transform(df.loc[has_stage, numerical]),
    columns = numerical,
    index = df.index[has_stage],
)
y = df.loc[has_stage, 'Stage'].astype(int)
pred_proba = forest.predict_proba(X)
# The model was trained on zero-based labels; shift back to the original numbering.
preds = forest.predict(X) + 1
preds


In [None]:
# Confusion matrix with each row (true stage) scaled to sum to 1.
fig, ax = plt.subplots(figsize = (10, 10))
conf = confusion_matrix(y, preds)
row_totals = conf.sum(axis = 1, keepdims = True)
conf_norm = conf / row_totals
sns.heatmap(conf_norm, ax = ax)
ax.set_title('Normalized Confusion Matrix')

In [None]:
# One-vs-rest multi-class ROC AUC of the predicted class probabilities against `y`.
score = roc_auc_score(y, pred_proba, multi_class = 'ovr')
score

In [None]:
sm = 0
# Predict a stage for every row of the reloaded table; the +1 undoes the zero-based
# labels the model was trained on.
stage = forest.predict(pipe.transform(df[numerical])) + 1
label = df['ID']
# Prediction table keyed by patient ID, plus a side-by-side view of recorded vs predicted stage.
return_df = pd.DataFrame(dict(label = label, stage = stage))
compare_df = pd.DataFrame(dict(Original = df['Stage'], Predict = stage))

In [None]:
# Patient ID alongside the predicted stage.
return_df


In [None]:
# Recorded stage vs predicted stage per row; predictions are offset by 0.2 on the
# x-axis so the two point clouds do not overlap.
fig, axes = plt.subplots(figsize = (10, 10))
row_numbers = range(len(df))
sns.scatterplot(x = df['Stage'], y = row_numbers, ax = axes)
sns.scatterplot(x = stage + .2, y = row_numbers, ax = axes)

The grid search's `best_score_` from 5-fold cross-validation is significantly higher than the score on the held-out validation set (.87 vs .65).

The `roc_auc_score` on the whole, unaugmented dataset is in turn much higher than the best cross-validation score (.95 vs .89).

In terms of raw accuracy, there are roughly 65 misclassifications, which is around 15% of the dataset.

I'll need to think for a bit about what the next steps should be.