In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def get_flattened_transition_matrix(cluster_list, K1=7):
    """Build a row-normalised cluster transition matrix and flatten it.

    Every consecutive pair ``(cluster_list[i], cluster_list[i + 1])`` counts
    as one transition from the first cluster to the second. Each row of the
    resulting ``K1 x K1`` count matrix is divided by its row total so it holds
    the empirical probabilities of moving out of that cluster.

    Parameters
    ----------
    cluster_list : sequence of int
        Ordered cluster ids; each id is used directly as a row/column index
        into the ``K1 x K1`` matrix.
    K1 : int, default 7
        Number of clusters, i.e. the side length of the transition matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, K1 * K1)`` holding the matrix in row-major
        order. Rows for clusters with no outgoing transition are all zeros.
    """
    states = np.asarray(cluster_list)
    counts = np.zeros((K1, K1))

    # Fewer than two observations means there is no transition to count.
    if states.size > 1:
        # np.add.at accumulates repeated (from, to) pairs, unlike fancy-index +=.
        np.add.at(counts, (states[:-1], states[1:]), 1)

    row_totals = counts.sum(axis=1, keepdims=True)
    # Divide only where the row has mass; empty rows stay at zero without
    # triggering a 0/0 RuntimeWarning that would then need nan_to_num.
    probabilities = np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals > 0,
    )

    return probabilities.reshape(1, -1)

# Loading and preprocessing data

In [3]:
# Integer code assigned to each user class when the label vectors are built.
class_mapping = dict(sustainers=0, burnouts=1, churnouts=2)

In [4]:
# Relative path of the folder holding the cluster CSVs. The trailing slash
# matters: file paths are built from it by plain string concatenation.
datadir = "../data/"

#### Train Data

In [5]:
# Load the chunk-1 cluster assignments and normalise user ids to strings.
df_train = pd.read_csv(datadir + "dtcr_chunk1_clusters.csv").assign(
    user_id=lambda frame: frame["user_id"].astype(int).astype(str)
)

# One row per (user_id, class): the user's cluster sequence is collected in
# file order and turned into a flattened transition matrix.
# NOTE(review): assumes the CSV rows are already in chronological order - confirm.
df_train = (
    df_train.groupby(["user_id", "class"])["cluster"]
    .apply(list)
    .apply(get_flattened_transition_matrix)
    .reset_index()
)

# Stack the per-user (1, K1*K1) rows into the feature matrix and encode labels.
X_train = np.concatenate(list(df_train["cluster"]), axis=0)
y_train = df_train["class"].map(class_mapping).to_numpy()

del df_train


#### Test Data

In [6]:
# Load the chunk-2 cluster assignments and normalise user ids to strings.
df_test = pd.read_csv(datadir + "dtcr_chunk2_clusters.csv").assign(
    user_id=lambda frame: frame["user_id"].astype(int).astype(str)
)

# One row per (user_id, class): the user's cluster sequence is collected in
# file order and turned into a flattened transition matrix.
# NOTE(review): assumes the CSV rows are already in chronological order - confirm.
df_test = (
    df_test.groupby(["user_id", "class"])["cluster"]
    .apply(list)
    .apply(get_flattened_transition_matrix)
    .reset_index()
)

# Stack the per-user (1, K1*K1) rows into the feature matrix and encode labels.
X_test = np.concatenate(list(df_test["cluster"]), axis=0)
y_test = df_test["class"].map(class_mapping).to_numpy()

del df_test

# Modeling

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
import statsmodels.api as sm

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [8]:
# Row labels for classification_report, listed in the order of the integer
# codes assigned by class_mapping (0, 1, 2).
target_names = ["sustainers", "burnouts", "churnouts"]

### Logistic Regression

In [9]:
# Logistic regression on the flattened transition-matrix features.
clf = LogisticRegression(C=1, max_iter=1000, random_state=0)
clf.fit(X_train, y_train)

# Report on the data the model was fitted on.
print(
    "Train Classification Report",
    "--------------------------",
    classification_report(y_train, clf.predict(X_train), target_names=target_names),
    sep="\n",
)

print("\n")

# Report on the held-out chunk.
print(
    "Test Classification Report",
    "--------------------------",
    classification_report(y_test, clf.predict(X_test), target_names=target_names),
    sep="\n",
)

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.63      0.33      0.43       115
    burnouts       0.57      0.17      0.26        78
   churnouts       0.64      0.92      0.76       256

    accuracy                           0.64       449
   macro avg       0.61      0.47      0.48       449
weighted avg       0.63      0.64      0.59       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.33      0.15      0.20       115
    burnouts       0.20      0.04      0.07        77
   churnouts       0.57      0.85      0.68       256

    accuracy                           0.53       448
   macro avg       0.37      0.34      0.32       448
weighted avg       0.44      0.53      0.45       448



### Random Forest

In [10]:
# Random forest with unrestricted depth but at least 8 samples per leaf.
clf = RandomForestClassifier(min_samples_leaf=8, max_depth=None, random_state=0)
clf.fit(X_train, y_train)

# Report on the data the model was fitted on.
print(
    "Train Classification Report",
    "---------------------------",
    classification_report(y_train, clf.predict(X_train), target_names=target_names),
    sep="\n",
)

print("\n")

# Report on the held-out chunk.
print(
    "Test Classification Report",
    "--------------------------",
    classification_report(y_test, clf.predict(X_test), target_names=target_names),
    sep="\n",
)

Train Classification Report
---------------------------
              precision    recall  f1-score   support

  sustainers       0.97      0.53      0.69       115
    burnouts       1.00      0.10      0.19        78
   churnouts       0.67      1.00      0.80       256

    accuracy                           0.72       449
   macro avg       0.88      0.54      0.56       449
weighted avg       0.81      0.72      0.67       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.36      0.08      0.13       115
    burnouts       0.67      0.03      0.05        77
   churnouts       0.58      0.95      0.72       256

    accuracy                           0.57       448
   macro avg       0.54      0.35      0.30       448
weighted avg       0.54      0.57      0.45       448



### SVM

In [11]:
# Support vector classifier with C=2 and otherwise default settings.
clf = SVC(random_state=0, C=2)
clf.fit(X_train, y_train)

# Report on the data the model was fitted on.
print(
    "Train Classification Report",
    "--------------------------",
    classification_report(y_train, clf.predict(X_train), target_names=target_names),
    sep="\n",
)

print("\n")

# Report on the held-out chunk.
print(
    "Test Classification Report",
    "--------------------------",
    classification_report(y_test, clf.predict(X_test), target_names=target_names),
    sep="\n",
)

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.97      0.63      0.76       115
    burnouts       1.00      0.44      0.61        78
   churnouts       0.74      0.99      0.85       256

    accuracy                           0.80       449
   macro avg       0.91      0.68      0.74       449
weighted avg       0.85      0.80      0.79       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.25      0.11      0.15       115
    burnouts       0.29      0.03      0.05        77
   churnouts       0.57      0.86      0.68       256

    accuracy                           0.52       448
   macro avg       0.37      0.33      0.30       448
weighted avg       0.44      0.52      0.44       448



### XGBoost

In [12]:
# Gradient-boosted trees: 100 estimators, multi-class log-loss as the eval
# metric, and a very large reg_lambda (4000).
clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",
    n_estimators=100,
    max_depth=None,
    reg_lambda=4000,
    random_state=0,
)
clf.fit(X_train, y_train)

# Report on the data the model was fitted on.
print(
    "Train Classification Report",
    "--------------------------",
    classification_report(y_train, clf.predict(X_train), target_names=target_names),
    sep="\n",
)

print("\n")

# Report on the held-out chunk.
print(
    "Test Classification Report",
    "--------------------------",
    classification_report(y_test, clf.predict(X_test), target_names=target_names),
    sep="\n",
)

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.96      0.22      0.35       115
    burnouts       1.00      0.08      0.14        78
   churnouts       0.61      1.00      0.76       256

    accuracy                           0.64       449
   macro avg       0.86      0.43      0.42       449
weighted avg       0.77      0.64      0.55       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.33      0.04      0.08       115
    burnouts       0.50      0.01      0.03        77
   churnouts       0.58      0.97      0.72       256

    accuracy                           0.57       448
   macro avg       0.47      0.34      0.27       448
weighted avg       0.50      0.57      0.44       448



### ANN

In [13]:
# Small neural network: a single hidden layer with 2 units.
clf = MLPClassifier(hidden_layer_sizes=(2,), max_iter=10000, random_state=0)
clf.fit(X_train, y_train)

# Report on the data the model was fitted on.
print(
    "Train Classification Report",
    "--------------------------",
    classification_report(y_train, clf.predict(X_train), target_names=target_names),
    sep="\n",
)

print("\n")

# Report on the held-out chunk.
print(
    "Test Classification Report",
    "--------------------------",
    classification_report(y_test, clf.predict(X_test), target_names=target_names),
    sep="\n",
)

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.68      0.37      0.47       115
    burnouts       0.67      0.28      0.40        78
   churnouts       0.66      0.91      0.76       256

    accuracy                           0.66       449
   macro avg       0.67      0.52      0.54       449
weighted avg       0.66      0.66      0.62       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.31      0.14      0.19       115
    burnouts       0.20      0.06      0.10        77
   churnouts       0.57      0.82      0.67       256

    accuracy                           0.52       448
   macro avg       0.36      0.34      0.32       448
weighted avg       0.44      0.52      0.45       448

