In [None]:
# Use TensorFlow Decision Forests (TF-DF) to predict customer churn

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
import math

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import tensorflow_decision_forests as tfdf

In [4]:
# Load the dataset from a CSV file in the working directory.
# low_memory = False makes pandas parse the file in one pass instead of in internal
# chunks, which avoids mixed-dtype inference on columns of large files.
dataset = pd.read_csv("dataset.csv", low_memory = False)

# Display the (rows, columns) shape as the cell output.
dataset.shape

(413955, 37)

In [5]:
# List the column names of the loaded dataset.
dataset.columns

Index(['subscription_id', 'observation_dt', 'is_retained', 'specialization_id',
       'cnt_courses_in_specialization', 'specialization_domain',
       'is_professional_certificate', 'is_gateway_certificate',
       'learner_days_since_registration', 'learner_country_group',
       'learner_gender', 'learner_cnt_other_courses_active',
       'learner_cnt_other_courses_paid_active',
       'learner_cnt_other_courses_items_completed',
       'learner_cnt_other_courses_paid_items_completed',
       'learner_cnt_other_transactions_past', 'learner_other_revenue',
       'subscription_period_order', 'days_since_last_payment',
       'days_til_next_payment_due',
       'cnt_enrollments_started_before_payment_period',
       'cnt_enrollments_completed_before_payment_period',
       'cnt_enrollments_active_before_payment_period',
       'cnt_items_completed_before_payment_period',
       'cnt_graded_items_completed_before_payment_period',
       'is_subscription_started_with_free_trial',
      

In [6]:
# Discard every row that contains at least one missing value, then display the
# total number of null cells that remain (expected to be 0).
dataset = dataset.dropna(axis = 0, how = "any")
dataset.isnull().sum().sum()

0

In [7]:
# Identifier and timestamp columns are removed so they are not used as features.
drop_features = [
    "subscription_id",
    "observation_dt",
    "specialization_id",
]

dataset = dataset.drop(drop_features, axis = 1)

In [8]:
# Show each column's dtype to tell numeric features apart from object-typed ones.
dataset.dtypes

is_retained                                         float64
cnt_courses_in_specialization                       float64
specialization_domain                                object
is_professional_certificate                          object
is_gateway_certificate                               object
learner_days_since_registration                     float64
learner_country_group                                object
learner_gender                                       object
learner_cnt_other_courses_active                    float64
learner_cnt_other_courses_paid_active               float64
learner_cnt_other_courses_items_completed           float64
learner_cnt_other_courses_paid_items_completed      float64
learner_cnt_other_transactions_past                 float64
learner_other_revenue                               float64
subscription_period_order                           float64
days_since_last_payment                             float64
days_til_next_payment_due               

In [9]:
# Cast the boolean flag columns to strings so that they reach TF-DF as string
# features rather than as Python bools.
# NOTE(review): the original list named "is_active_capstone_during_pay_period" twice.
# The duplicate is removed here; confirm whether a different fifth flag column was
# meant to be listed instead.
bool_cols = [
    "is_professional_certificate",
    "is_gateway_certificate",
    "is_active_capstone_during_pay_period",
    "is_subscription_started_with_free_trial",
]

# One astype call with a column -> dtype mapping replaces the per-column loop and
# returns a new frame, so re-running the cell is harmless.
dataset = dataset.astype({column: str for column in bool_cols})

In [10]:
# The label column is cast to an integer dtype.
dataset = dataset.astype({"is_retained": int})

In [16]:
# Class-balance check: percentage of rows that belong to the positive class
# (is_retained == 1).
n_retained = dataset["is_retained"].sum()

n_retained / len(dataset) * 100

54.36631694902562

### Training a Random Forest model

In [17]:
# Split the data 60 / 20 / 20 into training, validation and test sets:
# first hold out 40 % of the rows, then cut that hold-out in half.
random_state = 1234
label = "is_retained"

train_ds_pd, holdout_ds_pd = train_test_split(dataset, test_size = 0.4, random_state = random_state)
val_ds_pd, test_ds_pd = train_test_split(holdout_ds_pd, test_size = 0.5, random_state = random_state)

In [18]:
# Shape of the training split as (rows, columns).
train_ds_pd.shape

(248371, 34)

In [20]:
# Report how many rows ended up in each of the three splits.
split_sizes = [split.shape[0] for split in (train_ds_pd, val_ds_pd, test_ds_pd)]
print("{} examples in training, {} examples for validation, {} examples for testing.".format(*split_sizes))

248371 examples in training, 82791 examples for validation, 82791 examples for testing.


In [21]:
# Wrap each pandas split in a TensorFlow dataset, with `label` as the target column.
train_ds, val_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(split_pd, label=label)
    for split_pd in (train_ds_pd, val_ds_pd, test_ds_pd)
)

#### Train and evaluate the model

In [22]:
def trainRandomForest(tf_dataset, n_trees, max_depth):
    """Build a TF-DF random forest with the given size limits and fit it.

    Args:
        tf_dataset: training data, passed unchanged to ``fit``.
        n_trees: forwarded to the model as ``num_trees``.
        max_depth: forwarded to the model as ``max_depth``.

    Returns:
        The fitted ``tfdf.keras.RandomForestModel``.
    """
    # Fixed settings: verbose training logs and out-of-bag variable importances.
    hyperparameters = {
        "num_trees": n_trees,
        "max_depth": max_depth,
        "verbose": 2,
        "compute_oob_variable_importances": True,
    }

    forest = tfdf.keras.RandomForestModel(**hyperparameters)
    forest.fit(tf_dataset)
    return forest


def computeMetrics(model, val_dataset, threshold = 0.5):
    """Score a fitted binary classifier on a held-out split.

    Args:
        model: fitted model whose ``predict`` output is flattened into one
            positive-class probability per example.
        val_dataset: either a pandas DataFrame that still contains the module-level
            ``label`` column, or a batched ``tf.data.Dataset`` that yields
            ``(features, labels)`` tuples (as built by
            ``tfdf.keras.pd_dataframe_to_tf_dataset``).
        threshold: probability above which an example is predicted positive.

    Returns:
        dict with the keys "accuracy", "precision", "recall", "f1" and "roc_auc".
    """
    if isinstance(val_dataset, pd.DataFrame):
        # Read the labels from the frame and build the tensor dataset that the
        # model consumes; the conversion keeps the row order.
        y_true = val_dataset[label].to_numpy()
        features_ds = tfdf.keras.pd_dataframe_to_tf_dataset(val_dataset, label = label)
    else:
        # A tf.data.Dataset cannot be indexed by column name, so the labels are
        # collected from its batches instead.
        # Assumes the dataset iterates in the same order every time (i.e. it is not
        # reshuffled) - TODO confirm for datasets built elsewhere.
        y_true = np.concatenate([np.asarray(batch[1]) for batch in val_dataset])
        features_ds = val_dataset

    # predicted probabilities of subscription renewal
    predicted_probs = model.predict(features_ds).flatten()
    predictions = (predicted_probs > threshold).astype(int)

    return {
        "accuracy": accuracy_score(y_true, predictions),
        "precision": precision_score(y_true, predictions),
        "recall": recall_score(y_true, predictions),
        "f1": f1_score(y_true, predictions),
        # ROC AUC is a ranking metric: it must be scored on the probabilities, not
        # on the thresholded 0/1 predictions.
        "roc_auc": roc_auc_score(y_true, predicted_probs),
    }

    

# Specify and train the first model: 50 trees with a maximum depth of 16.
# trainRandomForest is reused so that the remaining settings (verbose logging,
# out-of-bag variable importances) are defined in one place only.
model_1 = trainRandomForest(train_ds, n_trees = 50, max_depth = 16)

Use 8 thread(s) for training
Use /var/folders/sc/c663j6bx65391kdjgk7jy_d00000gn/T/tmpgtv6aiyv as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'cnt_courses_in_specialization': <tf.Tensor 'data:0' shape=(None,) dtype=float64>, 'specialization_domain': <tf.Tensor 'data_1:0' shape=(None,) dtype=string>, 'is_professional_certificate': <tf.Tensor 'data_2:0' shape=(None,) dtype=string>, 'is_gateway_certificate': <tf.Tensor 'data_3:0' shape=(None,) dtype=string>, 'learner_days_since_registration': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'learner_country_group': <tf.Tensor 'data_5:0' shape=(None,) dtype=string>, 'learner_gender': <tf.Tensor 'data_6:0' shape=(None,) dtype=string>, 'learner_cnt_other_courses_active': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'learner_cnt_other_courses_paid_active': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'learner_cnt_other_courses_items_completed': <tf.Tensor 'data_9:0' shape=(None

Training dataset read in 0:00:12.143324. Found 248371 examples.
Training model...
Standard output detected as not visible to the user e.g. running in a notebook. Creating a training log redirection. If training gets stuck, try calling tfdf.keras.set_training_logs_redirection(False).


[INFO 24-04-29 16:12:43.9696 EDT kernel.cc:771] Start Yggdrasil model training
[INFO 24-04-29 16:12:43.9706 EDT kernel.cc:772] Collect training examples
[INFO 24-04-29 16:12:43.9706 EDT kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-04-29 16:12:43.9712 EDT kernel.cc:391] Number of batches: 249
[INFO 24-04-29 16:12:43.9712 EDT kernel.cc:392] Number of examples: 248371
[INFO 24-04-29 16:12:44.1700 EDT kernel.cc:792] Training dataset:
Number of records: 248371
Number of columns: 34

Number of columns by type:
	NUMERICAL: 26 (76.4706%)
	CATEGORICAL: 8 (23.5294%)

Columns:

NUMERICAL: 26 (76.4706%)
	1: "cnt_courses_in_specialization" NUMERICAL mean

[INFO 24-04-29 16:12:56.1697 EDT random_forest.cc:802] Training of tree  6/50 (tree index:6) done accuracy:0.659975 logloss:8.45189
[INFO 24-04-29 16:13:07.7081 EDT random_forest.cc:802] Training of tree  11/50 (tree index:10) done accuracy:0.669018 logloss:6.02425
[INFO 24-04-29 16:13:18.3017 EDT random_forest.cc:802] Training of tree  16/50 (tree index:15) done accuracy:0.675989 logloss:4.74689
[INFO 24-04-29 16:13:28.6988 EDT random_forest.cc:802] Training of tree  21/50 (tree index:20) done accuracy:0.679636 logloss:3.96233
[INFO 24-04-29 16:13:38.8579 EDT random_forest.cc:802] Training of tree  26/50 (tree index:25) done accuracy:0.681781 logloss:3.49722
[INFO 24-04-29 16:13:48.8939 EDT random_forest.cc:802] Training of tree  31/50 (tree index:30) done accuracy:0.682632 logloss:3.14357
[INFO 24-04-29 16:14:00.8548 EDT random_forest.cc:802] Training of tree  37/50 (tree index:36) done accuracy:0.68386 logloss:2.84233
[INFO 24-04-29 16:14:12.6017 EDT random_forest.cc:802] Training o

Model trained in 0:01:49.397346
Compiling model...
Model compiled.


<tf_keras.src.callbacks.History at 0x13ee8b7f0>

In [23]:
# Print the trained model's summary (model type, task, label and input features).
model_1.summary()

Model: "random_forest_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (33):
	cnt_courses_in_specialization
	cnt_days_active_before_payment_period
	cnt_days_active_during_payment_period
	cnt_days_since_last_activity
	cnt_enrollments_active_before_payment_period
	cnt_enrollments_active_during_payment_period
	cnt_enrollments_completed_before_payment_period
	cnt_enrollments_completed_during_payment_period
	cnt_enrollments_started_before_payment_period
	cnt_enrollments_started_during_payment_period
	cnt_graded_items_completed_before_payment_period
	cnt_graded_items_completed_during_payment_period
	cnt_items_completed_before_payment_period
	cnt_items_completed_during_p

In [24]:
# NOTE(review): nothing in this cell reads the compiled accuracy metric - predict()
# is called, not evaluate(). Confirm a later cell relies on it before removing it.
model_1.compile(metrics = ["accuracy"])

# Despite its name, `evaluation` holds the raw predict() output for the validation
# split; the next cell flattens it into per-example probabilities.
evaluation = model_1.predict(val_ds)
# Emits an empty line as the cell's only printed output.
print()




In [27]:
# Flatten the predict() output into a 1-D vector of probabilities.
predicted_probs = evaluation.flatten()

# Hard class decisions: 1 when the probability exceeds 0.5, otherwise 0.
predictions = np.where(predicted_probs > 0.5, 1, 0)

In [28]:
# Ground-truth labels of the validation split, looked up once for all metrics.
y_val = val_ds_pd[label]

# accuracy
accuracy = accuracy_score(y_val, predictions)

# precision
precision = precision_score(y_val, predictions)

# recall
recall = recall_score(y_val, predictions)

# f1
f1 = f1_score(y_val, predictions)

# auc_score
# ROC AUC measures how well the scores rank positives above negatives, so it is
# computed from the predicted probabilities rather than the thresholded 0/1
# predictions, which would collapse the ROC curve to a single operating point.
roc_auc = roc_auc_score(y_val, predicted_probs)