In [1]:
!pip install -q joypy

In [1]:
import numpy as np
import joypy
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as cm
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from collections import Counter
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [1]:
import warnings

# Silence every warning category for the remainder of the kernel session so
# library warnings do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

In [1]:
# Load the sample dataset; the path is relative to the notebook's working directory.
heg_meta = pd.read_csv('../input/one-of-the-biggest-brazilian-cancers-center/heg_sample_data.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
heg_meta.info()

# Preview five random rows; the fixed seed keeps the preview identical across runs.
heg_meta.sample(5, random_state=42)

## Showing all column names

In [1]:
# Display the column labels of heg_meta as an array (last expression of the cell).
heg_meta.columns.values

## Selecting 12 columns to display the correlation matrix

In [1]:
# Positional indices of the 12 columns included in the correlation heatmap.
corr_column_idx = [1, 2, 3, 4, 9, 14, 19, 24, 29, 34, 39, 72]
corr_matrix = heg_meta.iloc[:, corr_column_idx].corr()

plt.figure(dpi=250)
sns.heatmap(
    corr_matrix,
    cmap='viridis',
    annot=True,
    annot_kws={"size": 5},
)
# Very small tick labels on both axes; x labels are tilted by 45 degrees.
plt.tick_params(axis="both", labelsize=3)
plt.xticks(rotation=45)
plt.show()

## Let's check the distribution of some variables across timestamps

In [1]:
# Each ridgeline plot covers five consecutive column positions -- presumably one
# variable measured at five timestamps (TODO confirm against the dataset schema).
joy_group_starts = [39, 9, 14, 24]
joy_group_width = 5

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
spectral_cmap = plt.get_cmap('Spectral')

# One joyplot per column group; as before, `fig`/`axes` end up bound to the last plot.
for start in joy_group_starts:
    group_columns = list(range(start, start + joy_group_width))
    fig, axes = joypy.joyplot(
        heg_meta.iloc[:, group_columns],
        figsize=(12, 6),
        colormap=spectral_cmap,
    )

## The [paper](https://arxiv.org/pdf/2006.05514.pdf) does not clarify what the outcome variable represents. I'm working under the assumption that 1 stands for a negative outcome and 0 for a positive outcome.

In [1]:
# Count how many rows carry each distinct value of the 'outcome' column.
Counter(heg_meta['outcome'].values)

In [1]:
# Feature columns: everything except the first and the last column of heg_meta.
feature_columns = heg_meta.columns[1:-1]
column_names = feature_columns.tolist()

# Plain NumPy arrays for the feature matrix and the target vector.
X = heg_meta.loc[:, feature_columns].to_numpy()
y = heg_meta['outcome'].to_numpy()

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Class frequencies in each partition.
y_train_count, y_test_count = Counter(y_train), Counter(y_test)
print('Train count is {} and test count is {}.'.format(y_train_count, y_test_count))

## Simple linear regression

In [1]:
# Fit an ordinary least-squares model on the outcome; its continuous
# predictions are converted to class labels by thresholding below.
linear_reg = LinearRegression()
lr_model   = linear_reg.fit(X_train, y_train)
lr_predict = lr_model.predict(X_test)

# Predictions strictly above 0.5 become class 1, everything else class 0.
round_pred = np.where(lr_predict > 0.5, 1, 0)
lr_acc     = accuracy_score(y_test, round_pred)
lr_matrix  = confusion_matrix(y_test, round_pred)

# Built-in round() works for both a plain Python float and a NumPy scalar;
# calling `.round()` on the score fails when accuracy_score returns a float.
print('Accuracy: {}'.format(round(lr_acc, 2)))
print(classification_report(y_test, round_pred))

# Confusion-matrix heatmap; confusion_matrix puts true labels on the rows and
# predicted labels on the columns.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(lr_matrix, annot=True, cmap='viridis', fmt='d', ax=ax)
ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Linear regression confusion matrix',
)
plt.show()

In [1]:
# Build the explainer for the fitted linear model, using an Impute masker
# constructed from the training data.
background_masker = shap.maskers.Impute(data=X_train)
linear_explainer = shap.LinearExplainer(lr_model, masker=background_masker)

# NOTE: despite its name, `explainer_lr` holds the SHAP values computed on the
# training set, not the explainer object.
explainer_lr = linear_explainer.shap_values(X_train)

plt.figure(dpi=100)
shap.summary_plot(
    explainer_lr,
    X_train,
    plot_type="violin",
    feature_names=column_names,
)
plt.show()