In [None]:
%%capture
!pip install --upgrade lifelines

# Importing Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import sklearn
import matplotlib.pyplot as plt

from lifelines import CoxPHFitter
from lifelines.utils import concordance_index as cindex
from sklearn.model_selection import train_test_split

# Importing the Dataset

Let's see what our data looks like:

In [None]:
# Read the PBC CSV from the Kaggle input directory and preview the first rows.
PBC_CSV_PATH = '/kaggle/input/mayo-clinic-primary-biliary-cirrhosis-data/pbc.csv'
df = pd.read_csv(PBC_CSV_PATH)
df.head()

Let's create a list `continuous_columns` that holds the names of the continuous variables in the dataset:

In [None]:
# Names of the continuous-valued feature columns; this list selects the
# columns that are standardized in the normalization step.
continuous_columns = ['age', # Age of the Patient
                      'bili', # Serum Bilirubin in mg/dl
                      'chol', # Serum Cholesterol in mg/dl
                      'albumin', # Albumin in gm/dl
                      'copper', # Urine Copper in ug/day
                      'alk.phos', # Alkaline Phosphatase in U/liter
                      'ast', # Aspartate aminotransferase (SGOT) in U/ml per the R `survival::pbc` docs, not the ascites flag -- TODO confirm for this CSV
                      'trig', # Triglycerides in mg/dl
                      'platelet', # Platelets per cubic ml/1000
                      'protime' # Prothrombin time in seconds
                     ]

# Pre-processing

### Assigning female as 0 and male as 1

Convert the `sex` column from its female (`f`) / male (`m`) labels to a binary 0/1 indicator so that it can be used in training.

In [None]:
# Encode `sex` as a numeric indicator: 0 for female ("f"), 1 for anything else
# (i.e. male, "m"). This is done as a single vectorized comparison instead of a
# row-by-row loop, and the result is an integer column rather than an
# object-dtype column holding Python ints.
# An existing 0 is also treated as female, so re-running this cell keeps the
# codes intact instead of turning every row into 1.
is_female = (df['sex'] == 'f') | (df['sex'] == 0)
df['sex'] = (~is_female).astype(int)

In [None]:
# Preview the frame again to check that `sex` now holds 0/1 codes.
df.head()

## Splitting into Training, Validation and Testing Datasets

In [None]:
# Seed NumPy's global RNG: train_test_split draws from it when no random_state
# is passed, so this makes both splits below repeatable.
np.random.seed(0)

# Hold out 20% of the rows for testing, then take 25% of the remainder for
# validation, i.e. roughly 60% train / 20% validation / 20% test overall.
TEST_FRACTION = 0.2
VAL_FRACTION = 0.25
df_dev, df_test = train_test_split(df, test_size=TEST_FRACTION)
df_train, df_val = train_test_split(df_dev, test_size=VAL_FRACTION)

# Normalizing Data

In [None]:
# Standardize the continuous columns using statistics estimated on the
# training split only, so no information from validation/test leaks in.
mean = df_train.loc[:, continuous_columns].mean()
std = df_train.loc[:, continuous_columns].std()


def _standardize(frame):
    """Return a copy of `frame` with its continuous columns z-scored.

    The scaling uses the training-split `mean` and `std` computed above.
    Working on an explicit copy and replacing whole columns avoids writing
    into the frames handed back by `train_test_split` (which triggers
    SettingWithCopyWarning) and lets the columns take a float dtype freely.
    """
    scaled = frame.copy()
    scaled[continuous_columns] = (frame[continuous_columns] - mean) / std
    return scaled


df_train = _standardize(df_train)
df_val = _standardize(df_val)
df_test = _standardize(df_test)

In [None]:
# Sanity check: each standardized training column should show mean ~0 and std ~1.
df_train.loc[:, continuous_columns].describe()

# One-hot Encoding the Values

In [None]:
def one_hot_encoder(dataframe, columns):
    """One-hot encode the given columns of a DataFrame.

    Args:
        dataframe: Input DataFrame; it is not modified.
        columns: Names of the columns to replace with indicator columns.

    Returns:
        A new DataFrame in which each listed column is replaced by float64
        indicator columns, with the first level of every encoded column
        dropped (it serves as the reference level).
    """
    encoded = pd.get_dummies(
        dataframe,
        columns=columns,
        drop_first=True,
        dtype=np.float64,
    )
    return encoded

In [None]:
to_encode = ["edema", "stage"]

# Pin every categorical column to the levels observed in the training split
# before encoding. Encoding each split independently would otherwise produce
# different dummy columns (or drop a different reference level) whenever a
# split happens to lack one of the categories. Values not seen in training
# become missing and therefore encode as all-zero indicator rows.
level_dtypes = {
    col: pd.CategoricalDtype(sorted(df_train[col].dropna().unique()))
    for col in to_encode
}

one_hot_train = one_hot_encoder(df_train.astype(level_dtypes), to_encode)
one_hot_val = one_hot_encoder(df_val.astype(level_dtypes), to_encode)
one_hot_test = one_hot_encoder(df_test.astype(level_dtypes), to_encode)

print(one_hot_val.columns.tolist())
print(f"There are {len(one_hot_val.columns)} columns")

In [None]:
# Inspect the encoded training frame: `edema` and `stage` are now indicator columns.
one_hot_train.head()

## Removing NaN values

In [None]:
# Drop training rows that still contain missing values. Rebinding the name
# (instead of mutating with inplace=True) keeps the step explicit.
one_hot_train = one_hot_train.dropna()

# Fitting a Cox Proportional Hazard Model

In [None]:
# Fit the Cox proportional hazards model on the training frame, with `time`
# as the duration and `status` as the event indicator.
# Current lifelines releases take solver settings through `fit_options`;
# `step_size` is no longer a direct keyword of `fit`, so it is passed there.
# NOTE(review): every remaining column is used as a covariate -- confirm that
# no identifier-like column is left in `one_hot_train`.
cph = CoxPHFitter()
cph.fit(
    one_hot_train,
    duration_col='time',
    event_col='status',
    fit_options={'step_size': 0.1},
)

In [None]:
# Print lifelines' summary of the fitted model.
cph.print_summary()

In [None]:
# Plot how the fitted model's predicted outcome changes when `trt` is set to
# 0 versus 1; the trailing semicolon suppresses the returned object's repr.
treatment_values = [0, 1]
cph.plot_partial_effects_on_outcome('trt', values=treatment_values);