In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the dataset CSV under Kaggle's input directory; read later with pd.read_csv.
data_path = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"

In [None]:
# Installing TensorFlow Decision Forests
!pip install -q tensorflow_decision_forests
# Install Wurlitzer to display the detailed training logs. This is only needed in colabs.
!pip install -q wurlitzer

In [None]:
import tensorflow_decision_forests as tfdf

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import math

# `sys_pipes` is a context manager used around training to surface the
# training logs. It is optional, so resolve it in order of preference and
# never let a missing helper stop the notebook.
try:
  from wurlitzer import sys_pipes
except ImportError:
  # Catch only import failures; a bare `except:` would also hide unrelated
  # errors (including KeyboardInterrupt).
  try:
    from colabtools.googlelog import CaptureLog as sys_pipes
  except ImportError:
    # Neither helper is installed: fall back to a no-op context manager.
    # Training still runs, only the detailed logs are not displayed.
    from contextlib import nullcontext as sys_pipes

from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [None]:
#@title

# Some of the model training logs can cover the full
# screen if not compressed to a smaller viewport.
# This magic allows setting a max height for a cell.
@register_line_magic
def set_cell_height(size):
  """Line magic that sets a maximum height for the output of a cell.

  Builds a call to `google.colab.output.setIframeHeight` with `size` as the
  `maxHeight` option and hands it to the notebook front end as JavaScript.

  Args:
    size: Value inserted verbatim (via str) as `maxHeight`.
  """
  script = f"google.colab.output.setIframeHeight(0, true, {{maxHeight: {size!s}}})"
  display(Javascript(script))

In [None]:
# Report which TensorFlow Decision Forests release was imported.
print(f"Found TensorFlow Decision Forests v{tfdf.__version__}")

In [None]:
## Breast Cancer Wisconsin (Diagnostic) Data Set
#### Predict whether the cancer is benign or malignant


from sklearn.datasets import load_breast_cancer
# data = load_breast_cancer()
# Read the CSV at `data_path` into a DataFrame and show it as the cell output.
data = pd.read_csv(filepath_or_buffer=data_path)
data

In [None]:
# Remove the "Unnamed: 32" column.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: running it a second time is a no-op rather than
# a KeyError for the already-removed column.
# NOTE(review): if the CSV also carries a row-identifier column, it is still
# passed to the model as a feature — confirm and drop it here if so.
data = data.drop(columns=["Unnamed: 32"], errors="ignore")

In [None]:
# Work on a copy: the label column of `dataset_df` is overwritten in place
# further down, and with a plain alias that would also rewrite `data`.
# With a copy, re-running this cell restores the original label values.
dataset_df = data.copy()

# Encode the categorical label into an integer.
#
# Details:
# This stage is necessary if your classification label is represented as a
# string. Note: Keras expects classification labels to be integers.

# Name of the label column.
label = "diagnosis"

# Distinct label values in order of first appearance; a value's position in
# this list is the integer it is encoded as.
classes = dataset_df[label].unique().tolist()
print(f"Label classes: {classes}")
# NOTE(review): the encoding was previously documented here as
# {1: 'benign', 0: 'malignant'}. That depends on which class appears first in
# the CSV — confirm against the list printed above.

In [None]:
# Swap each class name for its position in `classes`, so the label column holds integers.
dataset_df[label] = dataset_df[label].map(lambda class_name: classes.index(class_name))

# Split the dataset into a training and a testing dataset.

def split_dataset(dataset, test_ratio=0.30, seed=42):
  """Randomly splits a pandas DataFrame into a training and a testing part.

  Each row is assigned to the testing part independently with probability
  `test_ratio`, so the resulting sizes are only approximately proportional.

  Args:
    dataset: DataFrame to split.
    test_ratio: Probability that a row ends up in the testing part.
    seed: Seed for the random generator. The fixed default makes the split,
      and every metric computed from it, repeatable across runs. Pass None
      to draw a different split on every call.

  Returns:
    A (train, test) tuple of DataFrames; every row of `dataset` is in exactly
    one of the two.
  """
  # A dedicated generator keeps the split independent of NumPy's global
  # random state.
  rng = np.random.default_rng(seed)
  test_indices = rng.random(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]


# Split the encoded frame and report how many rows landed in each part.
train_ds_pd, test_ds_pd = split_dataset(dataset_df)
print(f"{len(train_ds_pd)} examples in training, {len(test_ds_pd)} examples for testing.")

In [None]:
# Wrap both pandas splits as TensorFlow datasets, with `label` as the target column.
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(split_df, label=label)
    for split_df in (train_ds_pd, test_ds_pd)
)

In [None]:
## Train Decision Forest Model 

%set_cell_height 300

# Specify the model.
model_1 = tfdf.keras.RandomForestModel()

# Optionally, add evaluation metrics.
model_1.compile(
    metrics=["accuracy"])

# Train the model.
# "sys_pipes" is optional. It enables the display of the training logs.
with sys_pipes():
  model_1.fit(x=train_ds)

In [None]:
# Score the trained model on the testing split; return_dict=True keys the
# results by metric name.
evaluation = model_1.evaluate(test_ds, return_dict=True)
print()

# Report every metric with four decimal places.
for metric_name in evaluation:
  print("{}: {:.4f}".format(metric_name, evaluation[metric_name]))

In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = np.greater(model_1.predict(test_ds), 0.5).astype('int')

# Ground-truth labels of the testing split, in the same row order.
y_true = np.array(test_ds_pd[label].tolist())

In [None]:
from sklearn import metrics
# The F1 score combines precision and recall into a single number.
# It is the harmonic mean of the precision and recall values,
# and reaches its best value at 1 and its worst value at 0.

# F1 of the predictions against the ground truth, shown as the cell output.
metrics.f1_score(y_true=y_true, y_pred=y_pred)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Confusion matrix of the test predictions: rows are the true labels,
# columns the predicted labels.
df_cm = metrics.confusion_matrix(y_true, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
# fmt="d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation for counts of 100 or more.
sn.heatmap(df_cm, annot=True, fmt="d", ax=ax)
# Title and axis labels so the figure can be read on its own.
ax.set(title="Confusion matrix", xlabel="Predicted label", ylabel="True label");

In [None]:
# Build the inspector once and reuse it for all three queries.
inspector = model_1.make_inspector()

# A notebook cell only renders its final bare expression, so the earlier
# results are shown explicitly with display() instead of being discarded.

# The input features
display(inspector.features())

# The feature importances
display(inspector.variable_importances())

# The evaluation reported by the inspector (rendered as the cell output)
inspector.evaluation()