In [9]:
import pandas as pd
import dask.dataframe as dd
from catboost import CatBoost, Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, log_loss

In [10]:
def calculate_ctr(gt):
    """Return the share of entries in ``gt`` that are equal to 1.

    Args:
        gt: Sized iterable of ground-truth labels.

    Returns:
        Number of entries equal to 1 divided by ``len(gt)``, as a float.

    Raises:
        ZeroDivisionError: If ``gt`` is empty.
    """
    n_positive = sum(1 for label in gt if label == 1)
    return n_positive / float(len(gt))

def compute_rce(pred, gt):
    """Relative cross-entropy of ``pred`` against a constant baseline, in percent.

    The baseline ("strawman") predicts the positive rate of ``gt``, as returned
    by ``calculate_ctr``, for every sample.

    Args:
        pred: Predictions, forwarded to ``log_loss`` as its second argument.
        gt: Ground-truth labels.

    Returns:
        ``(1 - model_loss / baseline_loss) * 100``. The value is positive when
        the model's log loss is lower than the baseline's and negative when
        it is higher.
    """
    model_loss = log_loss(gt, pred)
    baseline_prob = calculate_ctr(gt)
    baseline_loss = log_loss(gt, [baseline_prob] * len(gt))
    improvement = 1.0 - model_loss / baseline_loss
    return improvement * 100.0

In [11]:
import numpy as np

def getBoolean(n):
    """Map the string ``'False'`` to ``False`` and any other value to ``True``.

    Note that only the exact string ``'False'`` yields ``False``; other values
    (including the boolean ``False``) yield ``True``.
    """
    return False if n == 'False' else True

def getBooleanList(pred):
    """Apply ``getBoolean`` to every element of ``pred`` and return a NumPy array.

    Iterating ``pred`` yields its top-level elements, so each of those is
    passed to ``getBoolean`` as a whole.
    """
    flags = [getBoolean(item) for item in pred]
    return np.array(flags)

## Proof of concept: CatBoost

In [12]:
%%time

columns = [
    'tweet_timestamp', 
    'creator_follower_count', 
    'creator_following_count',
    'creator_is_verified', 
    'creator_creation_timestamp',
    'engager_follower_count', 
    'engager_following_count',
    'engager_is_verified', 
    'engager_creation_timestamp',
    'engagement_creator_follows_engager', 
    'number_of_photo', 
    'number_of_gif', 
    'number_of_video',
    'engagement_like_timestamp',
]

dask_df = dd.read_parquet("/Users/arcangelopisa/Downloads/sample_dataset", engine='pyarrow', columns=columns)
dask_df = dask_df.sample(0.8)
dask_df['engagement_like_timestamp'] = (dask_df['engagement_like_timestamp'] != -1).astype(np.uint8)

pandas_df = dask_df.compute()

del dask_df

pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59815580 entries, 66752 to 32846
Data columns (total 14 columns):
 #   Column                              Dtype
---  ------                              -----
 0   tweet_timestamp                     int32
 1   creator_follower_count              int32
 2   creator_following_count             int32
 3   creator_is_verified                 bool 
 4   creator_creation_timestamp          int32
 5   engager_follower_count              int32
 6   engager_following_count             int32
 7   engager_is_verified                 bool 
 8   engager_creation_timestamp          int32
 9   engagement_creator_follows_engager  bool 
 10  number_of_photo                     uint8
 11  number_of_gif                       uint8
 12  number_of_video                     uint8
 13  engagement_like_timestamp           uint8
dtypes: bool(3), int32(7), uint8(4)
memory usage: 2.4 GB
CPU times: user 20.5 s, sys: 12.2 s, total: 32.7 s
Wall time: 20.3 s


In [13]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore the metrics computed on it, can be reproduced on a re-run.
SPLIT_SEED = 42
TARGET = 'engagement_like_timestamp'

train, test = train_test_split(pandas_df, train_size=0.8, random_state=SPLIT_SEED)

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_test = test.drop(columns=[TARGET])
y_test = test[TARGET]

# Drop the intermediate frames; only the X/y splits are used from here on.
# This makes the cell non-idempotent: re-run the loading cell before re-running it.
del pandas_df, train, test

In [15]:
%%time

classifier = CatBoostClassifier(iterations=200,
                           depth=12,
                           learning_rate=0.25,
                           loss_function='CrossEntropy',
                           verbose = True)

classifier.fit(X_train, y_train, verbose = True)

0:	learn: 0.6769802	total: 11.1s	remaining: 36m 47s
1:	learn: 0.6676905	total: 20.7s	remaining: 34m 13s
2:	learn: 0.6622337	total: 28.8s	remaining: 31m 31s
3:	learn: 0.6589760	total: 36.4s	remaining: 29m 43s
4:	learn: 0.6568149	total: 44.7s	remaining: 29m 2s
5:	learn: 0.6555372	total: 53s	remaining: 28m 32s
6:	learn: 0.6545651	total: 1m 1s	remaining: 28m 28s
7:	learn: 0.6538272	total: 1m 11s	remaining: 28m 27s
8:	learn: 0.6533410	total: 1m 18s	remaining: 27m 46s
9:	learn: 0.6528154	total: 1m 25s	remaining: 27m 3s
10:	learn: 0.6523025	total: 1m 32s	remaining: 26m 26s
11:	learn: 0.6519275	total: 1m 39s	remaining: 25m 55s
12:	learn: 0.6515228	total: 1m 46s	remaining: 25m 33s
13:	learn: 0.6512405	total: 1m 53s	remaining: 25m 5s
14:	learn: 0.6509289	total: 2m	remaining: 24m 46s
15:	learn: 0.6507033	total: 2m 7s	remaining: 24m 25s
16:	learn: 0.6504745	total: 2m 14s	remaining: 24m 10s
17:	learn: 0.6501685	total: 2m 21s	remaining: 23m 52s
18:	learn: 0.6500128	total: 2m 28s	remaining: 23m 37s
1

<catboost.core.CatBoostClassifier at 0x130aef1f0>

In [7]:
# Persist the trained classifier to the file `like_classifier` in "cbm" format.
classifier.save_model(fname='like_classifier', format="cbm")

In [8]:
%%time

y_pred = classifier.predict_proba(X_test)

result = getBooleanList(y_pred)

  if n == 'False':
CPU times: user 1min 24s, sys: 1.62 s, total: 1min 26s
Wall time: 1min 15s


In [10]:
# Inspect the raw output of `classifier.predict_proba(X_test)`.
y_pred

array([[0.90085285, 0.09914715],
       [0.71922361, 0.28077639],
       [0.39333767, 0.60666233],
       ...,
       [0.72635317, 0.27364683],
       [0.66672469, 0.33327531],
       [0.53503352, 0.46496648]])

In [9]:
# Relative cross-entropy of the scores against the constant-CTR baseline.
rce_score = compute_rce(result, y_test)
print('RCE is {}'.format(rce_score))

# Average precision of the same scores on the held-out labels.
ap_score = average_precision_score(y_test, result)
print('Average precision is {}'.format(ap_score))

RCE is -2998.4876221844934
Average precision is 0.39726230189525874
