In [None]:
# Use TensorFlow Decision Forests (TF-DF) to predict customer churn.

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
import math

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import tensorflow_decision_forests as tfdf

##### helper functions

##### load dataset

In [3]:
# Read the whole CSV into a DataFrame. low_memory=False asks pandas to parse
# the file in one pass rather than in chunks, so column dtypes are inferred
# from all rows at once.
dataset = pd.read_csv("dataset.csv", low_memory=False)

# Report the row and column counts of the loaded frame.
print("{} examples, and {} features in the dataset".format(*dataset.shape))

413955 examples, and 37 features in the dataset


In [4]:
# List the column names of the loaded dataset.
dataset.columns

Index(['subscription_id', 'observation_dt', 'is_retained', 'specialization_id',
       'cnt_courses_in_specialization', 'specialization_domain',
       'is_professional_certificate', 'is_gateway_certificate',
       'learner_days_since_registration', 'learner_country_group',
       'learner_gender', 'learner_cnt_other_courses_active',
       'learner_cnt_other_courses_paid_active',
       'learner_cnt_other_courses_items_completed',
       'learner_cnt_other_courses_paid_items_completed',
       'learner_cnt_other_transactions_past', 'learner_other_revenue',
       'subscription_period_order', 'days_since_last_payment',
       'days_til_next_payment_due',
       'cnt_enrollments_started_before_payment_period',
       'cnt_enrollments_completed_before_payment_period',
       'cnt_enrollments_active_before_payment_period',
       'cnt_items_completed_before_payment_period',
       'cnt_graded_items_completed_before_payment_period',
       'is_subscription_started_with_free_trial',
      

In [5]:
# Discard every row that contains at least one missing value, then display
# the total number of nulls left in the frame as a sanity check.
dataset = dataset.dropna()
dataset.isna().sum().sum()

0

In [6]:
# Identifier and timestamp columns are removed from the frame before modelling.
drop_features = ["subscription_id", "observation_dt", "specialization_id"]

dataset = dataset.drop(drop_features, axis="columns")

In [7]:
# Inspect the dtype of each remaining column.
dataset.dtypes

is_retained                                         float64
cnt_courses_in_specialization                       float64
specialization_domain                                object
is_professional_certificate                          object
is_gateway_certificate                               object
learner_days_since_registration                     float64
learner_country_group                                object
learner_gender                                       object
learner_cnt_other_courses_active                    float64
learner_cnt_other_courses_paid_active               float64
learner_cnt_other_courses_items_completed           float64
learner_cnt_other_courses_paid_items_completed      float64
learner_cnt_other_transactions_past                 float64
learner_other_revenue                               float64
subscription_period_order                           float64
days_since_last_payment                             float64
days_til_next_payment_due               

In [8]:
# Cast the boolean flag columns to strings.
# NOTE(review): presumably done so the model treats them as categorical
# features rather than numbers; confirm.
#
# Each column is listed exactly once, so no column is converted twice.
# NOTE(review): if another flag column was meant to be in this list, add it here.
bool_cols = [
    "is_professional_certificate",
    "is_gateway_certificate",
    "is_active_capstone_during_pay_period",
    "is_subscription_started_with_free_trial",
]

for column in bool_cols:
    dataset[column] = dataset[column].astype(str)

In [9]:
# The label column is loaded as float64; convert it to integer class ids.
dataset = dataset.astype({"is_retained": int})

In [10]:
# Class-balance check: percentage of rows that belong to the positive class
# (is_retained == 1).

dataset["is_retained"].sum() / len(dataset) * 100

54.36631694902562

### Training a Random Forest model

In [11]:
# Three-way split of the dataset: 60% training, 20% validation, 20% test.

random_state = 1234

label = "is_retained"

# Step 1: hold out 40% of the rows.
train_ds_pd, holdout_ds_pd = train_test_split(
    dataset, test_size=0.4, random_state=random_state
)

# Step 2: cut the held-out rows in half to get the validation and test sets.
val_ds_pd, test_ds_pd = train_test_split(
    holdout_ds_pd, test_size=0.5, random_state=random_state
)

In [14]:
# Report how many rows ended up in each split.
print(
    "{} examples in training, {} examples for validation, {} examples for testing.".format(
        len(train_ds_pd), len(val_ds_pd), len(test_ds_pd)
    )
)

248371 examples in training, 82791 examples for validation, 82791 examples for testing.


In [15]:
# Wrap each pandas split in a TensorFlow dataset, using `label` as the
# target column. The generator is consumed in order, so the three results
# line up with train / validation / test.

train_ds, val_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label)
    for frame in (train_ds_pd, val_ds_pd, test_ds_pd)
)

#### train and evaluate the model

In [16]:
# Forest sizes to sweep over: 50, 100, ..., 1000 trees (the stop value 1050 is exclusive).
trees = range(50, 1050, 50)
# Check the element type of the range.
type(trees[0])

int

In [21]:
def trainRandomForest(tf_dataset, n_trees, max_depth):
    """Fit a TF-DF random forest on ``tf_dataset`` and return the fitted model.

    Args:
        tf_dataset: Training data, passed unchanged to ``model.fit``.
        n_trees: Forwarded to the model as ``num_trees``.
        max_depth: Forwarded to the model as ``max_depth``.

    Returns:
        The fitted ``tfdf.keras.RandomForestModel``.
    """
    # Training is silent (verbose=0) and out-of-bag variable importances
    # are requested for every model.
    forest_options = dict(
        num_trees=n_trees,
        max_depth=max_depth,
        verbose=0,
        compute_oob_variable_importances=True,
    )

    forest = tfdf.keras.RandomForestModel(**forest_options)
    forest.fit(tf_dataset)

    return forest


def computeMetrics(model, val_dataset, label, threshold = 0.5):
    """Score a fitted model on a labelled TensorFlow dataset.

    The ground-truth labels are read from ``val_dataset`` itself, so the
    metrics always match the dataset being scored (validation, test, ...)
    and do not depend on a notebook-level dataframe.

    Args:
        model: Fitted model exposing ``predict``.
        val_dataset: Iterable of batches whose second element holds the
            labels, e.g. the ``(features, labels)`` batches produced by
            ``tfdf.keras.pd_dataframe_to_tf_dataset``. It is iterated twice
            (once by ``predict``, once to collect labels) and must yield
            rows in the same order both times, i.e. no shuffling.
        label: Name of the label column. Kept for backward compatibility
            with existing callers; the labels now come from ``val_dataset``.
        threshold: Probability cut-off above which the positive class is
            predicted. Defaults to 0.5.

    Returns:
        dict with keys "accuracy", "precision", "recall", "f1" and "roc_auc".
    """
    # predicted probabilities of subscription renewal, one value per row
    predicted_probs = np.asarray(model.predict(val_dataset)).flatten()

    # ground truth, collected batch by batch in the order predict() saw the rows
    true_labels = np.concatenate(
        [np.asarray(batch[1]).ravel() for batch in val_dataset]
    )

    # hard class predictions for the threshold-dependent metrics
    predictions = (predicted_probs > threshold).astype(int)

    metrics = {
        "accuracy": accuracy_score(true_labels, predictions),
        "precision": precision_score(true_labels, predictions),
        "recall": recall_score(true_labels, predictions),
        "f1": f1_score(true_labels, predictions),
        # ROC AUC is a ranking metric: it needs the continuous scores,
        # not the thresholded 0/1 predictions.
        "roc_auc": roc_auc_score(true_labels, predicted_probs),
    }

    return metrics

 
# specify the model

# model_1 = tfdf.keras.RandomForestModel(num_trees = 50, max_depth = 16, verbose = 2, 
#                                        compute_oob_variable_importances = True)

# train the model
# model_1.fit(train_ds)

In [None]:
# One list per metric; every entry is a (number_of_trees, score) pair.
accuracy, precision, recall, f1, roc_auc = ([] for _ in range(5))

history = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "roc_auc": roc_auc,
}

for tree in trees:

    # Fit a forest of the current size, then score it on the validation split.
    model = trainRandomForest(train_ds, tree, max_depth=16)

    print("trained forest with {} trees".format(tree))

    metrics = computeMetrics(model, val_ds, label)

    # Record each metric alongside the forest size that produced it.
    for metric_name, scores in history.items():
        scores.append((tree, metrics[metric_name]))
    
    

[INFO 24-04-30 15:07:20.1359 EDT kernel.cc:1233] Loading model from path /var/folders/sc/c663j6bx65391kdjgk7jy_d00000gn/T/tmphxz4uvb0/model/ with prefix ab026115bd7f4d64
[INFO 24-04-30 15:07:21.4079 EDT decision_forest.cc:734] Model loaded with 50 root(s), 517174 node(s), and 33 input feature(s).
[INFO 24-04-30 15:07:21.4080 EDT abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-04-30 15:07:21.4080 EDT kernel.cc:1061] Use fast generic engine


trained forest with 50 trees


[INFO 24-04-30 15:10:57.3742 EDT kernel.cc:1233] Loading model from path /var/folders/sc/c663j6bx65391kdjgk7jy_d00000gn/T/tmpq3qa1s4a/model/ with prefix 46fb34ca8a5f427c
[INFO 24-04-30 15:11:00.0984 EDT decision_forest.cc:734] Model loaded with 100 root(s), 1042346 node(s), and 33 input feature(s).
[INFO 24-04-30 15:11:00.0984 EDT abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-04-30 15:11:00.0984 EDT kernel.cc:1061] Use fast generic engine


trained forest with 100 trees


[INFO 24-04-30 15:16:26.9185 EDT kernel.cc:1233] Loading model from path /var/folders/sc/c663j6bx65391kdjgk7jy_d00000gn/T/tmpa4eh7itr/model/ with prefix 50f70013aa0f4a2c
[INFO 24-04-30 15:16:30.6605 EDT decision_forest.cc:734] Model loaded with 150 root(s), 1558600 node(s), and 33 input feature(s).
[INFO 24-04-30 15:16:30.6605 EDT abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-04-30 15:16:30.6606 EDT kernel.cc:1061] Use fast generic engine


trained forest with 150 trees


[INFO 24-04-30 15:23:29.2653 EDT kernel.cc:1233] Loading model from path /var/folders/sc/c663j6bx65391kdjgk7jy_d00000gn/T/tmpc31qfeqk/model/ with prefix 4fb8d70708a4428f
[INFO 24-04-30 15:23:34.2165 EDT decision_forest.cc:734] Model loaded with 200 root(s), 2066842 node(s), and 33 input feature(s).
[INFO 24-04-30 15:23:34.2165 EDT abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-04-30 15:23:34.2166 EDT kernel.cc:1061] Use fast generic engine


trained forest with 200 trees


In [69]:
# Display the (number_of_trees, accuracy) pairs collected by the sweep.
accuracy

[]

In [29]:
# model_1.summary()

Model: "random_forest_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (33):
	cnt_courses_in_specialization
	cnt_days_active_before_payment_period
	cnt_days_active_during_payment_period
	cnt_days_since_last_activity
	cnt_enrollments_active_before_payment_period
	cnt_enrollments_active_during_payment_period
	cnt_enrollments_completed_before_payment_period
	cnt_enrollments_completed_during_payment_period
	cnt_enrollments_started_before_payment_period
	cnt_enrollments_started_during_payment_period
	cnt_graded_items_completed_before_payment_period
	cnt_graded_items_completed_during_payment_period
	cnt_items_completed_before_payment_period
	cnt_items_completed_during_p

In [30]:
# model_1.compile(metrics = ["accuracy"])

# evaluation = model_1.predict(val_ds)
# print()




In [31]:
# Score the validation split. `evaluation` has to be computed in this cell:
# nothing else in the notebook assigns it at top level, so without this line
# the cell fails with a NameError on a fresh run.
# NOTE(review): `model` is the last forest trained by the sweep loop; use a
# different model here if a specific one was intended.
evaluation = model.predict(val_ds)

# predicted probability of the positive class, one value per validation row
predicted_probs = evaluation.flatten()

# hard class predictions at the 0.5 cut-off
predictions = (predicted_probs > 0.5).astype(int)

In [32]:
# Validation metrics for a single model.
# NOTE: these assignments rebind accuracy / precision / recall / f1 / roc_auc,
# which hold the per-tree lists built by the sweep cell, to scalar scores.

# accuracy
accuracy = accuracy_score(val_ds_pd[label], predictions)

# precision
precision = precision_score(val_ds_pd[label], predictions)

# recall
recall = recall_score(val_ds_pd[label], predictions)

# f1
f1 = f1_score(val_ds_pd[label], predictions)

# auc_score: ROC AUC ranks examples by score, so it is computed from the
# predicted probabilities rather than the thresholded 0/1 predictions.
roc_auc = roc_auc_score(val_ds_pd[label], predicted_probs)