In [None]:
## Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

import tensorflow as tf

In [None]:
## Read the cirrhosis dataset from the relative path below into a DataFrame
path = '../input/cirrhosis-prediction-dataset/cirrhosis.csv'
data = pd.read_csv(path)
print(data.shape)  # (number of rows, number of columns)
data.head()  # preview the first rows

In [None]:
# List the column names of the loaded frame
data.columns

Let's check what these attributes are:

1) ID: unique identifier

2) N_Days: number of days between registration and the earlier of death, transplantation, or study analysis time in July 1986

3) Status: status of the patient C (censored), CL (censored due to liver tx), or D (death)

4) Drug: type of drug D-penicillamine or placebo

5) Age: age in [days]

6) Sex: M (male) or F (female)

7) Ascites: presence of ascites N (No) or Y (Yes)

8) Hepatomegaly: presence of hepatomegaly N (No) or Y (Yes)

9) Spiders: presence of spiders N (No) or Y (Yes)

10) Edema: presence of edema N (no edema and no diuretic therapy for edema), S (edema present without diuretics, or edema resolved by diuretics), or Y (edema despite diuretic therapy)

11) Bilirubin: serum bilirubin in [mg/dl]

12) Cholesterol: serum cholesterol in [mg/dl]

13) Albumin: albumin in [gm/dl]

14) Copper: urine copper in [ug/day]

15) Alk_Phos: alkaline phosphatase in [U/liter]

16) SGOT: SGOT in [U/ml]

17) Tryglicerides: triglycerides in [mg/dl] (the column name used in the code is spelled `Tryglicerides`)

18) Platelets: platelets per cubic [ml/1000]

19) Prothrombin: prothrombin time in seconds [s]

20) Stage: histologic stage of disease (1, 2, 3, or 4)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the frame
data.describe()

In [None]:
# Column dtypes and non-null counts
data.info()

In [None]:
## Data Cleaning
# NOTE: this cell is not idempotent - re-running it divides Age by 365 again,
# so run it exactly once after loading the data.
data['Age'] = data['Age'] / 365 # convert age from days to years (365-day years)
data.head()

In [None]:
## Check missing values: True for every column that has at least one null
data.isnull().any()

In [None]:
## Util functions
def plot_count(df, col):
    """Draw a bar chart of how often each value of ``df[col]`` occurs.

    Args:
        df: DataFrame holding the column to summarise.
        col: Name of the column whose values are counted.

    Returns:
        The ``value_counts()`` Series of ``df[col]`` that was plotted.
    """
    counts = df[col].value_counts()
    # One bar per distinct value, drawn on the current pyplot axes.
    plt.bar(counts.index, counts.values)
    plt.title(f'Value counts of {col}')
    plt.xlabel(f'{col}')
    plt.ylabel('Counts')
    return counts

def fill_na_mean(df):
    """Return a copy of ``df`` with missing values replaced by the mean.

    Args:
        df: pandas object (the notebook passes a single numeric column).

    Returns:
        A new object of the same kind where nulls are filled with ``df.mean()``.
    """
    mean_value = df.mean()
    filled = df.fillna(mean_value)
    return filled

def insert_missing_equal_classes(df, col):
    """Impute missing values of ``df[col]`` by alternating between its two most frequent classes.

    The first missing row receives the most frequent class, the second missing
    row the second most frequent class, and so on, so the imputed rows are
    split evenly between the two classes. ``df`` is modified in place.

    Args:
        df: DataFrame to modify.
        col: Name of the categorical column to impute.

    Returns:
        None.
    """
    missing_mask = df[col].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return

    # Determine the fill classes once, before any value is imputed, so the
    # ranking cannot shift while rows are being filled.
    classes = df[col].value_counts().index[:2]
    if len(classes) == 0:
        # Column holds only nulls: there is nothing to impute with.
        return

    # Alternate over the *missing* rows (not over all rows); with a single
    # observed class every missing row receives that class.
    fill_values = [classes[i % len(classes)] for i in range(n_missing)]

    # Single label-based assignment: avoids chained indexing (which may write
    # to a temporary copy) and works for any index, not only 0..n-1.
    df.loc[missing_mask, col] = fill_values
        
def insert_missing_unequal_classes(df, col):
    """Impute missing values of ``df[col]`` using its two most frequent classes.

    With ``n`` missing rows, the first ``n // 2 + 1`` of them (in row order)
    receive the most frequent class and the remaining ones the second most
    frequent class. ``df`` is modified in place.

    Args:
        df: DataFrame to modify.
        col: Name of the categorical column to impute.

    Returns:
        None.
    """
    missing_mask = df[col].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return

    # Rank the classes once, before imputing, so the ranking stays fixed.
    classes = df[col].value_counts().index[:2]
    if len(classes) == 0:
        # Column holds only nulls: there is nothing to impute with.
        return
    majority = classes[0]
    # Fall back to the only observed class when there is no second one.
    minority = classes[1] if len(classes) > 1 else majority

    # Slightly more than half of the missing rows go to the majority class.
    n_majority = min(n_missing, n_missing // 2 + 1)
    fill_values = [majority] * n_majority + [minority] * (n_missing - n_majority)

    # Single label-based assignment: avoids chained indexing (which may write
    # to a temporary copy) and works for any index, not only 0..n-1.
    df.loc[missing_mask, col] = fill_values

In [None]:
# Fix missing values in Drug
plot_count(data, 'Drug')
# Both classes appear about equally frequent, so impute the nulls alternately to keep that balance
print("Number of null rows:", data['Drug'].isnull().sum())
insert_missing_equal_classes(data, 'Drug')

In [None]:
## Handle missing values for Ascites
ascites_count = plot_count(data, 'Ascites')  # class counts before imputation
print("Number of null rows:", data['Ascites'].isnull().sum())
insert_missing_unequal_classes(data, 'Ascites')
data['Ascites'].value_counts()  # class counts after imputation

In [None]:
## Handle missing values for Hepatomegaly
plot_count(data, 'Hepatomegaly')
print(data['Hepatomegaly'].isnull().sum())  # null count before imputation
insert_missing_equal_classes(data, 'Hepatomegaly')
print(data['Hepatomegaly'].isnull().sum())  # null count after imputation

In [None]:
## Handle missing values for Spiders
plot_count(data, 'Spiders')  # show the class distribution first
insert_missing_unequal_classes(data, 'Spiders')

In [None]:
# Impute the remaining columns: mode for Stage (categorical), mean for the rest
numerical_col = ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']
for col in numerical_col:
    col_values = data[col]
    if col == 'Stage':
        # Stage is categorical, so its nulls are filled with the most frequent value first
        col_values = col_values.fillna(col_values.mode()[0])
    # Mean fill is applied to every column; for Stage no nulls remain at this point
    data[col] = fill_na_mean(col_values)

In [None]:
## Label encode all categorical columns
from sklearn import preprocessing

categorical_cols = ['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
# One encoder object is refit on every column, so after the loop `le` only
# holds the mapping of the last column in the list ('Stage').
le = preprocessing.LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])  # replace the values with integer codes

In [None]:
## Let's check the distribution of the target (Stage)
plot_count(data, 'Stage')

In [None]:
# Split the data into features (everything except ID and the target) and target (Stage)
X, y = data.drop(['ID', 'Stage'], axis=1), data['Stage']
# 80/20 train/validation split.
# random_state makes the split reproducible across kernel restarts (same seed as the
# TensorFlow seed used in this notebook); stratify=y keeps the Stage proportions equal
# in both parts so every class is present in the validation set, which the
# multi-class ROC AUC evaluation requires.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.8, random_state = 42, stratify = y)
print(X_train.shape, X_val.shape)

In [None]:
# Start with simpler models like Random Forest
# random_state fixes the bootstrap samples and feature sub-sampling so the
# fitted forest (and its score) is the same on every run.
rf = RandomForestClassifier(criterion = 'entropy', max_depth = 20, n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

In [None]:
# Eval: one-vs-rest ROC AUC on the validation set
probs = rf.predict_proba(X_val)  # per-class probabilities, one column per class
score = roc_auc_score(y_val, probs, multi_class = 'ovr')
score

In [None]:
# Train XGB with an exhaustive grid search over the hyper-parameters
xgb = XGBClassifier(use_label_encoder = False, eval_metric = 'mlogloss')
# 5 values each for eta, max_depth and lambda -> 5 * 5 * 5 = 125 combinations
param_grid_xgb = [{'eta' : [0.005, 0.05, 0.1, 0.3, 0.5], 'max_depth' : [2, 4, 6, 8, 10], 'lambda': [0.25, 0.5, 1, 1.5, 2]}]
# 5-fold cross-validation per combination; no `scoring` is passed, so the
# estimator's default scorer is used. Train scores are kept as well.
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv = 5, return_train_score = True)
grid_search_xgb.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination and its mean cross-validated score
grid_search_xgb.best_params_, grid_search_xgb.best_score_

In [None]:
## Neural Network
# set seed so weight initialisation is reproducible
tf.random.set_seed(42)

# model: three hidden ReLU layers; the final layer emits 4 raw scores (logits),
# one per Stage class, with no activation
model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation="relu"),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(4)
])

# compile
# The target is a single integer class id per row (Stage is label-encoded earlier
# in the notebook) and the network outputs logits, so the matching loss is sparse
# categorical cross-entropy with from_logits=True. Binary cross-entropy is meant
# for two-class / multi-label targets and does not fit this 4-class problem.
# `learning_rate` is the supported argument name (`lr` is a deprecated alias).
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                metrics=["accuracy"])

# fit model (silent training for 500 epochs)
model.fit(X_train, y_train, epochs=500, verbose=0)

In [None]:
# Loss and compiled metric values on the validation set
model.evaluate(X_val, y_val)

### Next Steps?

- For the numerical columns find the outliers and try to reduce them
- Validate numerical columns (e.g. the highest Cholesterol value is 125 — check whether that is possible or a glitch)
- Try stacking of models
- Tune Neural Network to perform better