In [4]:
from pathlib import Path
import pandas as pd

In [33]:
SEED = 42

PRIVATE_DATA = Path('../files/private')

In [34]:
agg_session = pd.read_csv(PRIVATE_DATA / 'agg_data.agg_session.csv', index_col=0).drop(columns=['event_date', 'process_date', 'app_bundle_id'])
# agg_session = agg_session[(agg_session['install_datetime'] >= '2022-02-01') & (agg_session['install_datetime'] < '2022-03-01')]
agg_session = agg_session.sort_values(['install_datetime', 'user_id', 'start_session'])
agg_session = agg_session[agg_session['session_time'] != 0]

target = (agg_session['user_id'] == agg_session['user_id'].shift(-1)).astype(int)
agg_session = agg_session.assign(target=target).drop(columns=['user_id', 'install_datetime']).sample(frac=1, random_state=SEED)

In [41]:
mt_level_start = pd.read_csv(PRIVATE_DATA / 'prod_mt.MT_level_start.csv', index_col=0).astype({'booster_1_count': int, 'booster_2_count': int, 'booster_3_count': int}).drop(columns=['user_id', 'install_id'], errors='ignore')
mt_level_end = pd.read_csv(PRIVATE_DATA / 'prod_mt.MT_level_end.csv', index_col=0).fillna(0).astype({'booster_1_count': int, 'booster_2_count': int, 'booster_3_count': int}).drop(columns=['user_id', 'install_id'], errors='ignore')
mt_click_button = pd.read_csv(PRIVATE_DATA / 'prod_mt.MT_click_button.csv', index_col=0).drop(columns=['user_id', 'install_id'], errors='ignore')
mt_math_exercise = pd.read_csv(PRIVATE_DATA / 'prod_mt.MT_math_exercise.csv', index_col=0).drop(columns=['send_age']).astype({'exercise_result': int}).drop(columns=['user_id', 'install_id'], errors='ignore')
mt_session_end = pd.read_csv(PRIVATE_DATA / 'prod_mt.MT_session_end.csv', index_col=0).drop(columns=['user_id', 'install_id'], errors='ignore')
mt_show_windows = pd.read_csv(PRIVATE_DATA / 'prod_mt.MT_show_windows.csv', index_col=0).drop(columns=['user_id', 'install_id'], errors='ignore')

In [42]:
import numpy as np

def count_unique(data: pd.Series) -> int:
    return len(data.unique())

def count_zeroes(data: pd.Series) -> int:
    return np.count_nonzero(data == 0)

def count_ones(data: pd.Series) -> int:
    return np.count_nonzero(data == 1)
    

In [47]:
def default_filter(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df['session_id'].isin(agg_session['session_id'])]
    df = df.drop(columns=[
        'meta_accepted_at',
        'process_date',
        'event_id',
        'event_name',
        'event_date',
        'client_version',
        'install_id',
        'app_bundle_id',
        'device_langauge',
    ], errors='ignore')
    df = df.sort_values(['event_datetime'])
    df = df.fillna(0)
    return df

def level_agg(df: pd.DataFrame) -> pd.DataFrame:
    df_agg = df.groupby('session_id').agg({
        'device_hw_model': ['min'],
        'platform': ['min'],
        'match_type': ['count'],
        'match_count': [count_unique],
        'moves': ['min', 'max', 'sum', 'mean', 'median'],
        'goal_1_count': ['min', 'max', 'sum', 'mean', 'median'],
        'goal_2_count': ['min', 'max', 'sum', 'mean', 'median'],
        'goal_3_count': ['min', 'max', 'sum', 'mean', 'median'],
        'goal_4_count': ['min', 'max', 'sum', 'mean', 'median'],
        'booster_1_count': ['min', 'max', 'sum', 'mean', 'median'],
        'booster_2_count': ['min', 'max', 'sum', 'mean', 'median'],
        'booster_3_count': ['min', 'max', 'sum', 'mean', 'median'],
        'is_match_Campaign': ['sum'],
        'is_match_Cycled': ['sum'],
        'is_match_Lives': ['sum'],
        'is_match_Busters': ['sum'],
        'is_match_Skills': ['sum'],
        # 'device_hw_model': ['min', 'max', 'sum', 'count', 'mean', 'median', count_unique, 'unique'],
    })

    return df_agg

def click_button_agg(df: pd.DataFrame) -> pd.DataFrame:
    df_agg = (df.drop(columns=['user_id', 'event_datetime', 'device_hw_model', 'platform', 'button_id', 'from_page'], errors='ignore')
                .groupby('session_id')
                .agg('sum'))

    return df_agg

def math_exercise_agg(df: pd.DataFrame) -> pd.DataFrame:
    df_agg = df.groupby('session_id').agg({'exercise_result': [count_zeroes, count_ones]})
    return df_agg

def filter_agg_level_data(df: pd.DataFrame, agg_func: callable, filter_func: callable = default_filter, col_prefix: str = None,) -> pd.DataFrame:
    df = filter_func(df)
    df_agg = agg_func(df)
    if col_prefix:
        df_agg.columns = [ col_prefix + '__'.join(col) for col in df_agg.columns.to_flat_index()]
    return df_agg

In [44]:
match_types_ls = {f"is_match_{match}": (mt_level_start['match_type'] == match).astype(int) for match in mt_level_start['match_type'].unique()}
mt_level_start_agg = filter_agg_level_data(mt_level_start.assign(**match_types_ls), agg_func=level_agg,  col_prefix='ls__')
mt_level_start_agg.head()

Unnamed: 0_level_0,ls__device_hw_model__min,ls__platform__min,ls__match_type__count,ls__match_count__count_unique,ls__moves__min,ls__moves__max,ls__moves__sum,ls__moves__mean,ls__moves__median,ls__goal_1_count__min,...,ls__booster_3_count__min,ls__booster_3_count__max,ls__booster_3_count__sum,ls__booster_3_count__mean,ls__booster_3_count__median,ls__is_match_Campaign__sum,ls__is_match_Cycled__sum,ls__is_match_Lives__sum,ls__is_match_Busters__sum,ls__is_match_Skills__sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000B8147B53845A7947CE32930AB9971,samsung SM-G780G,android,1,1,27,27,27,27.0,27.0,5,...,0,0,0,0.0,0.0,1,0,0,0,0
0010EFD3723342CB9D083FC1CF8D4BC4,realme RMX1927,android,2,2,20,27,47,23.5,23.5,21,...,0,0,0,0.0,0.0,0,2,0,0,0
0011478E59AE4A1C8E9DAC2282000C30,samsung SM-G973F,android,4,4,27,27,108,27.0,27.0,19,...,0,0,0,0.0,0.0,4,0,0,0,0
001685A8EAB64A669C9C9B823AFD6F01,samsung SM-J400F,android,2,2,21,21,42,21.0,21.0,2,...,0,0,0,0.0,0.0,0,0,0,0,2
001B4885BD7845428EA1EE5BC5797BA8,Xiaomi 2109119DG,android,6,3,23,27,146,24.333333,23.0,13,...,0,0,0,0.0,0.0,2,0,4,0,0


In [45]:
match_types_le = {f"is_match_{match}": (mt_level_end['match_type'] == match).astype(int) for match in mt_level_end['match_type'].unique()}
mt_level_end_agg = filter_agg_level_data(mt_level_end.assign(**match_types_le), agg_func=level_agg, col_prefix='le__')
mt_level_end_agg.head()

Unnamed: 0_level_0,le__device_hw_model__min,le__platform__min,le__match_type__count,le__match_count__count_unique,le__moves__min,le__moves__max,le__moves__sum,le__moves__mean,le__moves__median,le__goal_1_count__min,...,le__booster_3_count__min,le__booster_3_count__max,le__booster_3_count__sum,le__booster_3_count__mean,le__booster_3_count__median,le__is_match_Campaign__sum,le__is_match_Cycled__sum,le__is_match_Lives__sum,le__is_match_Busters__sum,le__is_match_Skills__sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0010EFD3723342CB9D083FC1CF8D4BC4,realme RMX1927,android,4,2,0.0,0.0,0.0,0.0,0.0,21,...,0,0,0,0.0,0.0,0,4,0,0,0
0011478E59AE4A1C8E9DAC2282000C30,samsung SM-G973F,android,5,5,0.0,0.0,0.0,0.0,0.0,8,...,0,0,0,0.0,0.0,5,0,0,0,0
001685A8EAB64A669C9C9B823AFD6F01,samsung SM-J400F,android,1,1,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0.0,0.0,0,0,0,0,1
001B4885BD7845428EA1EE5BC5797BA8,Xiaomi 2109119DG,android,6,3,0.0,0.0,0.0,0.0,0.0,6,...,0,0,0,0.0,0.0,4,0,2,0,0
00293EEF0DB54D9E99CC9AF10C130B22,realme RMX2063,android,4,4,0.0,0.0,0.0,0.0,0.0,3,...,0,0,0,0.0,0.0,0,4,0,0,0


In [48]:
button_ids = {f"is_button_{button}": (mt_click_button['button_id'] == button).astype(int) for button in mt_click_button['button_id'].unique()}
from_pages = {f"is_page_{button}": (mt_click_button['from_page'] == button).astype(int) for button in mt_click_button['from_page'].unique()}
mt_click_button_agg = filter_agg_level_data(mt_click_button.assign(**button_ids, **from_pages), agg_func=click_button_agg).add_prefix('bc__').add_suffix('__count')
mt_click_button_agg.head()

Unnamed: 0_level_0,bc__event_timestamp__count,bc__is_button_MainPlayButton__count,bc__is_button_RateApp_Star_2__count,bc__is_button_RateApp_Star_4__count,bc__is_button_RateApp_DoRate__count,bc__is_button_RateApp_Star_1__count,bc__is_button_RateApp_Star_5__count,bc__is_button_RateApp_Star_3__count,bc__is_button_Lives_PlayButton__count,bc__is_button_Booster_PlayButton__count,bc__is_button_Skills_PlayButton__count,bc__is_page_Menu__count,bc__is_page_RateApp__count,bc__is_page_BonusLocation__count
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
000B8147B53845A7947CE32930AB9971,1648650889,1,0,0,0,0,0,0,0,0,0,1,0,0
0010EFD3723342CB9D083FC1CF8D4BC4,3296739431,2,0,0,0,0,0,0,0,0,0,2,0,0
001685A8EAB64A669C9C9B823AFD6F01,4944302493,0,0,0,0,0,0,0,0,0,3,0,0,3
001B4885BD7845428EA1EE5BC5797BA8,6591790430,0,0,0,0,0,0,0,4,0,0,0,0,4
0025850739B5442F826D4E9C0978757F,1648503693,1,0,0,0,0,0,0,0,0,0,1,0,0


In [49]:
mt_math_exercise_agg = filter_agg_level_data(mt_math_exercise, agg_func=math_exercise_agg, col_prefix='me__')
mt_math_exercise_agg.head()

Unnamed: 0_level_0,me__exercise_result__count_zeroes,me__exercise_result__count_ones
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0010EFD3723342CB9D083FC1CF8D4BC4,1,5
0011478E59AE4A1C8E9DAC2282000C30,1,14
001685A8EAB64A669C9C9B823AFD6F01,3,0
001B4885BD7845428EA1EE5BC5797BA8,2,10
00293EEF0DB54D9E99CC9AF10C130B22,1,5


# Unite data

In [50]:
data = pd.merge(agg_session, mt_level_start_agg, how="left", on='session_id')
data = pd.merge(data, mt_level_end_agg, how="left", on='session_id')
data = pd.merge(data, mt_click_button_agg, how="left", on='session_id')
data = pd.merge(data, mt_math_exercise_agg, how="left", on='session_id')
data

Unnamed: 0,session_id,start_session,end_session,LT,session_time,match_count,target,ls__device_hw_model__min,ls__platform__min,ls__match_type__count,...,bc__is_button_RateApp_Star_5__count,bc__is_button_RateApp_Star_3__count,bc__is_button_Lives_PlayButton__count,bc__is_button_Booster_PlayButton__count,bc__is_button_Skills_PlayButton__count,bc__is_page_Menu__count,bc__is_page_RateApp__count,bc__is_page_BonusLocation__count,me__exercise_result__count_zeroes,me__exercise_result__count_ones
0,47ED078863704B81BF3EECC80DDB0CF4,2022-03-24 03:47:06+00:00,2022-03-24 03:56:42+00:00,22,9.600000,2,1,samsung SM-J600F,android,2.0,...,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,2.0,4.0
1,B726C878A56342EF99A8AA451F9E60D4,2022-03-28 11:57:52+00:00,2022-03-28 11:58:16+00:00,17,0.400000,0,1,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,,
2,3CA8B6D6F7A4486685D0DA8A56AD64E9,2022-03-23 04:57:13+00:00,2022-03-23 05:02:59+00:00,16,5.766667,1,1,Xiaomi M2006C3LG,android,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,3.0,,
3,AABD3B2E38A54C1CAEADEA11975A50BF,2022-03-27 03:01:10+00:00,2022-03-27 03:25:15+00:00,18,24.083333,8,1,realme RMX3263,android,9.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,8.0,1.0,35.0
4,5804DC5F91944DEAB7D05FBA1F283377,2022-03-28 09:44:09+00:00,2022-03-28 09:50:35+00:00,22,6.433333,2,1,samsung SM-A325F,android,3.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13077,BD33FA7F77784D088893C3B627BCC0D6,2022-03-28 17:21:36+00:00,2022-03-28 17:27:02+00:00,15,5.433333,1,1,Xiaomi 2109119DG,android,3.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0
13078,8B791FD31FA94EC7970206855167F43A,2022-03-28 18:47:14+00:00,2022-03-28 18:51:52+00:00,22,4.633333,1,1,HONOR NTN-LX1,android,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,2.0
13079,A2A6319E076D467F8F7B26B77079D3AE,2022-03-25 16:07:48+00:00,2022-03-25 16:10:40+00:00,18,2.866667,2,1,Xiaomi M2006C3LG,android,2.0,...,,,,,,,,,,
13080,B91814EAA18C4A4487E9EEDDED87CC72,2022-03-28 11:23:06+00:00,2022-03-28 11:29:25+00:00,25,6.316667,2,1,Xiaomi Redmi Note 5,android,3.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,7.0


In [None]:
data.describe()

Unnamed: 0,LT,session_time,match_count,target,ls__match_type__count,ls__match_count__count_unique,ls__moves__min,ls__moves__max,ls__moves__sum,ls__moves__mean,...,bc__is_button_RateApp_Star_1__count,bc__is_button_RateApp_Star_4__count,bc__is_button_Skills_PlayButton__count,bc__is_button_Lives_PlayButton__count,bc__is_button_Booster_PlayButton__count,bc__is_page_Menu__count,bc__is_page_RateApp__count,bc__is_page_BonusLocation__count,me__exercise_result__count_zeroes,me__exercise_result__count_ones
count,88745.0,88745.0,88745.0,88745.0,74258.0,74258.0,74258.0,74258.0,74258.0,74258.0,...,69886.0,69886.0,69886.0,69886.0,69886.0,69886.0,69886.0,69886.0,48895.0,48895.0
mean,16.252589,9.822571,2.7729,0.936177,36.780549,4.874721,22.057583,26.167753,896.288009,24.070936,...,0.008857,0.019317,1.005523,2.944409,0.709842,17.265003,0.11068,4.659774,13.672216,38.878495
std,11.781756,14.505606,3.465362,0.244439,114.987602,5.581624,3.906057,4.025521,2841.907927,3.396525,...,0.292804,0.740704,5.738849,16.468386,5.160194,55.233181,1.988516,18.788314,62.687468,127.933493
min,0.0,0.016667,0.0,0.0,1.0,1.0,15.0,15.0,15.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,1.45,1.0,1.0,4.0,1.0,20.0,23.0,83.0,21.857143,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,5.0
50%,15.0,4.933333,2.0,1.0,9.0,3.0,20.0,27.0,216.0,24.5,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.0,12.0
75%,25.0,12.683333,3.0,1.0,27.0,6.0,25.0,30.0,648.0,26.333333,...,0.0,0.0,0.0,0.0,0.0,12.0,0.0,2.0,7.0,30.0
max,46.0,380.716667,93.0,1.0,2355.0,150.0,30.0,30.0,58875.0,30.0,...,55.0,53.0,226.0,448.0,210.0,1371.0,106.0,448.0,2229.0,3765.0


In [51]:
from collections import Counter

import numpy as np
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

In [52]:
SEED = 42

np.random.seed(SEED)

In [53]:
df = data.drop(columns=[
    'user_id', 'session_id', 'start_session', 'end_session', 'install_datetime', 'le__platform__min', 'le__device_hw_model__min'
], errors='ignore')
df['ls__device_hw_model__min'].fillna('', inplace=True)
df['ls__platform__min'].fillna('', inplace=True)

In [54]:
X = df.drop(['target'], axis=1)
Y = df['target']

In [60]:
cat_features = ['ls__device_hw_model__min', 'ls__platform__min']
valid_pool = catboost.Pool(X, cat_features=cat_features)

In [62]:
model = catboost.CatBoostClassifier()

model.load_model("../models/retention_catboost.bin")

<catboost.core.CatBoostClassifier at 0x7f7a0b25e4c0>

In [64]:
for threshold in np.arange(0.1, 1, 0.1):
    threshold = round(threshold, 1)
    pred = model.predict_proba(X)[:, 1]
    pred = [(p >= threshold).astype('int') for p in pred]
    roc = round(roc_auc_score(Y, pred), 4)
    print(f"threshold: {threshold}\nROC: {roc}")
    print(classification_report(Y, pred, zero_division=0))
    print("-" * 53)

threshold: 0.1
ROC: 0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       966
           1       0.93      1.00      0.96     12116

    accuracy                           0.93     13082
   macro avg       0.46      0.50      0.48     13082
weighted avg       0.86      0.93      0.89     13082

-----------------------------------------------------
threshold: 0.2
ROC: 0.5114
              precision    recall  f1-score   support

           0       0.49      0.02      0.05       966
           1       0.93      1.00      0.96     12116

    accuracy                           0.93     13082
   macro avg       0.71      0.51      0.50     13082
weighted avg       0.90      0.93      0.89     13082

-----------------------------------------------------
threshold: 0.3
ROC: 0.5374
              precision    recall  f1-score   support

           0       0.26      0.10      0.14       966
           1       0.93      0.98      0.95     121