
# Table of Contents
* [Target](#1)
* [Numerical Features](#2)
* [Categorical Features](#3)
* [Target vs Features](#4)
* [Fit Model](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# ML tools
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# load the CSV file into a data frame and preview the first rows
csv_path = '../input/breastcancerdataset/BRCA.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# overview: column names, non-null counts and dtypes of the data frame
df.info()

#### We have a few rows with missing Patient_Status!

In [None]:
# keep only the rows in which Patient_Status is present
df = df.dropna(axis='index', subset=['Patient_Status'])

In [None]:
# number of rows per gender value
df['Gender'].value_counts()

In [None]:
# use only FEMALE individuals
# An explicit copy makes df an independent frame: the column assignments in
# the following cells then modify df itself instead of a slice of the original
# data (which would trigger pandas' SettingWithCopyWarning).
df = df.loc[df['Gender'] == 'FEMALE'].copy()

In [None]:
# parse both date columns into pandas datetime values
for date_col in ['Date_of_Surgery', 'Date_of_Last_Visit']:
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# summary statistics of the surgery dates
try:
    surgery_summary = df['Date_of_Surgery'].describe(datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed the `datetime_is_numeric` keyword; datetime columns
    # are always summarised like numeric ones there
    surgery_summary = df['Date_of_Surgery'].describe()
surgery_summary

In [None]:
# summary statistics of the last-visit dates
try:
    last_visit_summary = df['Date_of_Last_Visit'].describe(datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed the `datetime_is_numeric` keyword; datetime columns
    # are always summarised like numeric ones there
    last_visit_summary = df['Date_of_Last_Visit'].describe()
last_visit_summary

In [None]:
# derive the calendar year of the surgery and count the rows per year
surgery_dates = df['Date_of_Surgery']
df['Surgery_Year'] = surgery_dates.dt.year
df['Surgery_Year'].value_counts()

In [None]:
# derive the calendar year of the last visit and count the rows per year
last_visit_dates = df['Date_of_Last_Visit']
df['LastVisit_Year'] = last_visit_dates.dt.year
df['LastVisit_Year'].value_counts()

In [None]:
# adjust rows having years in the future: cap them at 2021
# A single .loc assignment writes into df itself. The chained form
# `df.LastVisit_Year[mask] = ...` may assign to a temporary object and is
# not guaranteed to change df.
is_future_year = df['LastVisit_Year'] > 2021
df.loc[is_future_year, 'LastVisit_Year'] = 2021

In [None]:
# re-check the year distribution of the last visit
df['LastVisit_Year'].value_counts()

<a id='1'></a>
# Target

In [None]:
# class frequencies of the target, printed and shown as a bar chart
target = 'Patient_Status'
target_counts = df[target].value_counts()
print(target_counts)
target_counts.plot(kind='bar')
plt.grid()
plt.show()

<a id='2'></a>
# Numerical Features

In [None]:
# numerical features used for the plots and as model predictors;
# 'Surgery_Year' is the column derived from Date_of_Surgery earlier in the notebook
features_num = ['Age', 'Protein1', 'Protein2', 'Protein3', 'Protein4',
                'Surgery_Year']

In [None]:
# summary statistics of the numerical features
num_stats = df[features_num].describe()
num_stats

In [None]:
# one 30-bin histogram per numerical feature
for num_col in features_num:
    df[num_col].plot.hist(bins=30)
    plt.title(num_col)
    plt.grid()
    plt.show()

In [None]:
# Pearson and Spearman correlation matrices of the numerical features
corr_pearson = df[features_num].corr(method='pearson')
corr_spearman = df[features_num].corr(method='spearman')

# draw each matrix as an annotated heatmap on a fixed [-1, +1] colour scale
for corr_title, corr_matrix in [('Pearson Correlation', corr_pearson),
                                ('Spearman Correlation', corr_spearman)]:
    fig = plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(corr_title)
    plt.show()

In [None]:
# pairwise scatter plots of all numerical features (seaborn pair plot)
sns.pairplot(df[features_num])
plt.show()

<a id='3'></a>
# Categorical Features

In [None]:
# categorical features to inspect (column names exactly as in the data frame,
# some of them contain a space)
features_cat = ['Tumour_Stage', 'Histology', 'ER status', 
                'PR status', 'HER2 status', 'Surgery_type']

In [None]:
# one bar chart of the level frequencies per categorical feature
for cat_col in features_cat:
    plt.figure(figsize=(14, 4))
    level_counts = df[cat_col].value_counts()
    level_counts.plot(kind='bar')
    plt.title(cat_col)
    plt.grid()
    plt.show()

#### ER status and PR status have only one level => remove these two features.

In [None]:
# categorical features kept for the analysis and the model
# ('ER status' and 'PR status' are no longer part of the list)
features_cat = ['Tumour_Stage', 'Histology', 'HER2 status', 'Surgery_type']

<a id='4'></a>
# Target vs Features

### Numerical Features

In [None]:
# mosaic plots of the target against quantile-binned numerical features
plt_para_save = plt.rcParams['figure.figsize']  # remember plot settings
plt.rcParams['figure.figsize'] = (16, 5)  # increase plot size for mosaics

for num_col in features_num:
    # store a binned copy of the feature (at most 10 quantile bins;
    # duplicate bin edges are dropped) as an extra column "<feature>_bin"
    bin_col = num_col + '_bin'
    df[bin_col] = pd.qcut(df[num_col], q=10, duplicates='drop')

    # binned feature vs target
    mosaic(df, [bin_col, target], title='Target vs ' + num_col + ' [binned]')
    plt.show()

# reset plot size again
plt.rcParams['figure.figsize'] = plt_para_save

### Categorical Features

In [None]:
# plot target vs categorical features using mosaic plot
plt_para_save = plt.rcParams['figure.figsize'] # remember plot settings
plt.rcParams['figure.figsize'] = (16,5) # increase plot size for mosaics

for f in features_cat:
    # categorical features are plotted with their original levels, so the
    # title carries no "[binned]" suffix; x labels are rotated to stay readable
    mosaic(df, [f, target], title='Target vs ' + f,
           label_rotation=90)
    plt.show()

# reset plot size again
plt.rcParams['figure.figsize'] = plt_para_save

<a id='5'></a>
# Fit Model

In [None]:
# start H2O with hard-coded resource limits
# NOTE(review): 12G / 4 threads are fixed values - confirm they fit the machine the notebook runs on
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# predictors = numerical features followed by categorical features
predictors = [*features_num, *features_cat]
print('Number of predictors: ', len(predictors))
print(predictors)

In [None]:
# upload predictor and target columns to the H2O environment
model_cols = predictors + [target]
df_hex = h2o.H2OFrame(df[model_cols])

# force categorical target
target_as_factor = df_hex[target].asfactor()
df_hex[target] = target_as_factor

# train / test split with a fixed seed
train_perc = 0.8
train_hex, test_hex = df_hex.split_frame(ratios=[train_perc], seed=999)

In [None]:
# check target distribution in training data
# (the column is addressed through `target`, like in the rest of the notebook,
# instead of repeating the column name as a literal)
train_hex[target].as_data_frame().value_counts()

In [None]:
# check target distribution in test set
# (the column is addressed through `target`, like in the rest of the notebook,
# instead of repeating the column name as a literal)
test_hex[target].as_data_frame().value_counts()

In [None]:
# define model: binomial GLM with n_cv cross-validation folds
n_cv = 4
glm_params = dict(
    family='binomial',
    nfolds=n_cv,
    standardize=True,
    alpha=0,  # 0: Ridge (L2), 1: LASSO (L1)
    lambda_search=True,
    score_each_iteration=True,
    seed=12345,
)
fit_1 = H2OGeneralizedLinearEstimator(**glm_params)

In [None]:
# fit the model on the training frame and report the elapsed wall-clock time
t1 = time.time()
fit_1.train(x=predictors, y=target, training_frame=train_hex)
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time [s]: ', elapsed)

In [None]:
# show the cross validation metrics summary of the fitted model
fit_1.cross_validation_metrics_summary()

### The performance does not look really convincing here...

### Variable Importance

In [None]:
# basic version of the variable importance plot
# NOTE(review): the argument -1 is presumably meant to show all features - confirm against the H2O docs
fit_1.varimp_plot(-1)

In [None]:
# training performance - AUC
# model_performance(train=True) returns the metrics computed on the training data;
# plot() presumably draws the ROC curve, as in the test-set cell - confirm
perf_train = fit_1.model_performance(train=True)
perf_train.plot()

In [None]:
# calc performance on the held-out test set
perf_test = fit_1.model_performance(test_hex)

# ROC Curve - Test Set
perf_test.plot()