In [1]:
!pip install xgboost



In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Names of the three user classes handled in this notebook.
user_classes = [
    "sustainers",
    "burnouts",
    "churnouts",
]

In [None]:
# Input locations. Both chunks live in the same directory, so the directory is
# defined once and can be changed in a single place.
DATA_DIR = "../data"
train_path = f"{DATA_DIR}/chunk_1.npy"  # chunk loaded as the training split
test_path = f"{DATA_DIR}/chunk_2.npy"   # chunk loaded as the test split

In [4]:
# Columns are addressed by number, stored as strings ("1" .. "39").
# The group names suggest how each range is meant to be preprocessed -- TODO confirm.
non_feature_cols = [str(idx) for idx in range(1, 5)]        # "1"  .. "4"
no_change_cols = [str(idx) for idx in range(5, 9)]          # "5"  .. "8"
standardization_cols = [str(idx) for idx in range(9, 19)]   # "9"  .. "18"
normalization_cols = [str(idx) for idx in range(19, 31)]    # "19" .. "30"
binary_cols = [str(idx) for idx in range(31, 40)]           # "31" .. "39"

# Every column that is cast to float and used as a model feature.
float_cols = no_change_cols + standardization_cols + normalization_cols + binary_cols

# NOTE(review): the feature columns come first and the non-feature columns last,
# so this ordering must match the column layout of the loaded arrays -- confirm.
all_cols = float_cols + non_feature_cols

# Loading and preprocessing data

In [5]:
# Integer code for each user class; these codes become the model targets.
class_mapping = {
    label: code
    for code, label in enumerate(("sustainers", "burnouts", "churnouts"))
}

#### Train Data

In [6]:
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code on load -- only use it on chunk files from a trusted source.
data_train = np.load(train_path, allow_pickle=True)

# Labels for the flattened records: the feature / non-feature columns followed
# by the three trailing metadata fields.
record_columns = [*all_cols, "class", "user_id", "settlement_part_id"]

# Keep at most the first 100 entries along axis 1, flatten to one row per entry,
# and drop rows that are entirely zero (presumably padding -- TODO confirm).
data_train = data_train[:, :100, :]
data_train = data_train.reshape(-1, len(record_columns))
data_train = data_train[~np.all(data_train == 0, axis=1)]

df_train = pd.DataFrame(data_train, columns=record_columns)
del data_train

# Cast the feature columns with `astype` on the frame. Assigning through
# `df.loc[:, col] = ...` writes into the existing column in place on
# pandas >= 2.0, so the columns would silently keep their original dtype.
df_train = df_train.astype({col: float for col in float_cols})

# Group once so labels and features are produced in the same user_id order.
users_train = df_train.groupby("user_id")

# One label per user: the first class value recorded for that user, encoded with
# `class_mapping`. Stored as an integer (n_users, 1) column, which is the shape
# the later flattening cell expects. An unmapped class makes the int cast fail.
y_train = (
    users_train["class"]
    .first()
    .map(class_mapping)
    .astype(int)
    .to_numpy()
    .reshape(-1, 1)
)

# One feature vector per user: the mean of every float column over the user's rows.
X_train = users_train[float_cols].mean().to_numpy()
del df_train, users_train

#### Test Data

In [7]:
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code on load -- only use it on chunk files from a trusted source.
data_test = np.load(test_path, allow_pickle=True)

# Labels for the flattened records: the feature / non-feature columns followed
# by the three trailing metadata fields.
record_columns = [*all_cols, "class", "user_id", "settlement_part_id"]

# Keep at most the first 100 entries along axis 1, flatten to one row per entry,
# and drop rows that are entirely zero (presumably padding -- TODO confirm).
data_test = data_test[:, :100, :]
data_test = data_test.reshape(-1, len(record_columns))
data_test = data_test[~np.all(data_test == 0, axis=1)]

df_test = pd.DataFrame(data_test, columns=record_columns)
del data_test

# Cast the feature columns with `astype` on the frame. Assigning through
# `df.loc[:, col] = ...` writes into the existing column in place on
# pandas >= 2.0, so the columns would silently keep their original dtype.
df_test = df_test.astype({col: float for col in float_cols})

# Group once so labels and features are produced in the same user_id order.
users_test = df_test.groupby("user_id")

# One label per user: the first class value recorded for that user, encoded with
# `class_mapping`. Stored as an integer (n_users, 1) column, which is the shape
# the later flattening cell expects. An unmapped class makes the int cast fail.
y_test = (
    users_test["class"]
    .first()
    .map(class_mapping)
    .astype(int)
    .to_numpy()
    .reshape(-1, 1)
)

# One feature vector per user: the mean of every float column over the user's rows.
X_test = users_test[float_cols].mean().to_numpy()
del df_test, users_test

In [8]:
# Collapse the per-user label columns into 1-D vectors before they are passed
# to the estimators and to classification_report.
y_train = y_train.flatten()
y_test = y_test.flatten()

# Modeling

In [9]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import classification_report

In [10]:
# Row labels for the classification reports, ordered by integer code so each
# name lines up with the label it was encoded as. Derived from `class_mapping`
# instead of a second hard-coded list, so the two cannot drift apart.
target_names = sorted(class_mapping, key=class_mapping.get)

### Logistic Regression

In [11]:
# Logistic regression (C=2, at most 1000 solver iterations) fit on the
# per-user mean features.
clf = LogisticRegression(C=2, max_iter=1000, random_state=0)
clf.fit(X_train, y_train)

# Print the training report first, then the test report, separated by blank lines.
report_inputs = (
    ("Train Classification Report", X_train, y_train),
    ("Test Classification Report", X_test, y_test),
)
for position, (title, features, labels) in enumerate(report_inputs):
    if position > 0:
        print("\n")
    print(title)
    print("--------------------------")
    print(classification_report(labels, clf.predict(features), target_names=target_names))

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.65      0.26      0.37       115
    burnouts       0.67      0.08      0.14        78
   churnouts       0.61      0.93      0.74       256

    accuracy                           0.61       449
   macro avg       0.64      0.42      0.42       449
weighted avg       0.63      0.61      0.54       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.45      0.18      0.26       115
    burnouts       0.60      0.04      0.07        77
   churnouts       0.59      0.91      0.72       256

    accuracy                           0.58       448
   macro avg       0.55      0.38      0.35       448
weighted avg       0.56      0.58      0.49       448



### Random Forest

In [12]:
# Random forest with trees limited to depth 7, fit on the per-user mean features.
clf = RandomForestClassifier(random_state=0, max_depth=7)
clf.fit(X_train, y_train)

# Print the training report first, then the test report, separated by blank lines.
report_inputs = (
    ("Train Classification Report", X_train, y_train),
    ("Test Classification Report", X_test, y_test),
)
for position, (title, features, labels) in enumerate(report_inputs):
    if position > 0:
        print("\n")
    print(title)
    print("--------------------------")
    print(classification_report(labels, clf.predict(features), target_names=target_names))

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       1.00      0.70      0.82       115
    burnouts       1.00      0.42      0.59        78
   churnouts       0.76      1.00      0.86       256

    accuracy                           0.82       449
   macro avg       0.92      0.71      0.76       449
weighted avg       0.86      0.82      0.81       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.42      0.13      0.20       115
    burnouts       0.50      0.06      0.11        77
   churnouts       0.59      0.93      0.72       256

    accuracy                           0.57       448
   macro avg       0.50      0.37      0.34       448
weighted avg       0.53      0.57      0.48       448



### SVM

In [13]:
# Support vector classifier with C=35 (all other settings left at their defaults),
# fit on the per-user mean features.
clf = SVC(random_state=0, C=35)
clf.fit(X_train, y_train)

# Print the training report first, then the test report, separated by blank lines.
report_inputs = (
    ("Train Classification Report", X_train, y_train),
    ("Test Classification Report", X_test, y_test),
)
for position, (title, features, labels) in enumerate(report_inputs):
    if position > 0:
        print("\n")
    print(title)
    print("--------------------------")
    print(classification_report(labels, clf.predict(features), target_names=target_names))

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.74      0.43      0.55       115
    burnouts       0.91      0.13      0.22        78
   churnouts       0.65      0.93      0.76       256

    accuracy                           0.67       449
   macro avg       0.76      0.50      0.51       449
weighted avg       0.71      0.67      0.61       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.37      0.17      0.23       115
    burnouts       0.25      0.01      0.02        77
   churnouts       0.59      0.90      0.71       256

    accuracy                           0.56       448
   macro avg       0.40      0.36      0.32       448
weighted avg       0.47      0.56      0.47       448



### XGBoost

In [14]:
# Gradient-boosted ensemble of 100 depth-1 trees, scored with multiclass log loss.
# NOTE(review): `use_label_encoder` is deprecated in newer xgboost releases --
# confirm it is still accepted by the installed version.
clf = XGBClassifier(
    n_estimators=100,
    max_depth=1,
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=0,
)
clf.fit(X_train, y_train)

# Print the training report first, then the test report, separated by blank lines.
report_inputs = (
    ("Train Classification Report", X_train, y_train),
    ("Test Classification Report", X_test, y_test),
)
for position, (title, features, labels) in enumerate(report_inputs):
    if position > 0:
        print("\n")
    print(title)
    print("--------------------------")
    print(classification_report(labels, clf.predict(features), target_names=target_names))

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.83      0.56      0.67       115
    burnouts       0.93      0.36      0.52        78
   churnouts       0.71      0.95      0.81       256

    accuracy                           0.74       449
   macro avg       0.82      0.62      0.66       449
weighted avg       0.78      0.74      0.72       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.39      0.23      0.29       115
    burnouts       0.43      0.12      0.18        77
   churnouts       0.60      0.84      0.70       256

    accuracy                           0.56       448
   macro avg       0.47      0.39      0.39       448
weighted avg       0.51      0.56      0.50       448



### ANN

In [15]:
# Multi-layer perceptron with a single hidden layer of 16 units, trained for at
# most 1000 iterations on the per-user mean features.
clf = MLPClassifier(hidden_layer_sizes=(16,), max_iter=1000, random_state=0)
clf.fit(X_train, y_train)

# Print the training report first, then the test report, separated by blank lines.
report_inputs = (
    ("Train Classification Report", X_train, y_train),
    ("Test Classification Report", X_test, y_test),
)
for position, (title, features, labels) in enumerate(report_inputs):
    if position > 0:
        print("\n")
    print(title)
    print("--------------------------")
    print(classification_report(labels, clf.predict(features), target_names=target_names))

Train Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.71      0.46      0.56       115
    burnouts       0.67      0.21      0.31        78
   churnouts       0.65      0.89      0.75       256

    accuracy                           0.66       449
   macro avg       0.67      0.52      0.54       449
weighted avg       0.67      0.66      0.63       449



Test Classification Report
--------------------------
              precision    recall  f1-score   support

  sustainers       0.42      0.22      0.29       115
    burnouts       0.36      0.05      0.09        77
   churnouts       0.58      0.86      0.70       256

    accuracy                           0.56       448
   macro avg       0.46      0.38      0.36       448
weighted avg       0.51      0.56      0.49       448

