In [None]:
import pandas as pd
import dask.dataframe as dd
from catboost import CatBoost, Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, log_loss

In [None]:
def calculate_ctr(gt):
    """Return the fraction of entries in ``gt`` that are equal to 1.

    Args:
        gt: sized iterable of ground-truth labels.

    Returns:
        Number of entries equal to 1 divided by ``len(gt)``, as a float.

    Raises:
        ZeroDivisionError: if ``gt`` is empty.
    """
    hits = sum(1 for label in gt if label == 1)
    total = float(len(gt))
    return hits / total

def compute_rce(pred, gt):
    """Relative cross-entropy of ``pred`` against a constant baseline, in percent.

    The baseline ("strawman") predicts, for every row, the share of entries
    in ``gt`` that equal 1. The result is
    ``(1 - log_loss(model) / log_loss(baseline)) * 100``.

    Args:
        pred: predictions handed to ``log_loss`` as ``y_pred``.
        gt: ground-truth labels handed to ``log_loss`` as ``y_true``.

    Returns:
        The relative cross-entropy as a float percentage.
    """
    model_loss = log_loss(gt, pred)
    baseline_rate = calculate_ctr(gt)
    # Same constant prediction repeated once per ground-truth row.
    baseline_pred = [baseline_rate] * len(gt)
    baseline_loss = log_loss(gt, baseline_pred)
    return (1.0 - model_loss / baseline_loss) * 100.0

In [None]:
import numpy as np

def getBoolean(n):
    """Return True when ``n`` is strictly greater than 0.5, otherwise False."""
    # bool() evaluates the comparison exactly as the `if` statement would.
    return bool(n > 0.5)

def getBooleanList(pred):
    """Apply ``getBoolean`` to every item of ``pred`` and return a NumPy array.

    Args:
        pred: iterable of values comparable with 0.5.

    Returns:
        ``np.ndarray`` with one thresholded entry per item of ``pred``.
    """
    flags = [getBoolean(score) for score in pred]
    return np.array(flags)

def getFirst(n):
    """Return the element at index 0 of the indexable ``n``."""
    head = n[0]
    return head

def getFirstValuePrediction(pred):
    """Collect the first element of every item in ``pred`` into a NumPy array.

    Args:
        pred: iterable whose items are indexable (each is passed to ``getFirst``).

    Returns:
        ``np.ndarray`` holding ``item[0]`` for each item of ``pred``.
    """
    firsts = [getFirst(row) for row in pred]
    return np.array(firsts)

## Proof of concept: CatBoost

In [None]:
%%time

# Columns read from the parquet dataset. The last entry is converted into the
# binary training target further down.
columns = [
    'tweet_timestamp',
    'creator_follower_count',
    'creator_following_count',
    'creator_is_verified',
    'creator_creation_timestamp',
    'engager_follower_count',
    'engager_following_count',
    'engager_is_verified',
    'engager_creation_timestamp',
    'engagement_creator_follows_engager',
    'number_of_photo',
    'number_of_gif',
    'number_of_video',
    'engagement_retweet_timestamp',
]

# NOTE(review): machine-specific absolute path; point it at your local copy
# of the dataset before running.
DATA_PATH = "/Users/arcangelopisa/Downloads/sample_dataset"
# Share of rows kept for the experiment, and the seed that makes the
# subsample repeatable across Restart & Run All.
SAMPLE_FRACTION = 0.8
SAMPLE_SEED = 42

dask_df = dd.read_parquet(DATA_PATH, engine='pyarrow', columns=columns)
# dask only supports fractional sampling, so the fraction must be passed as
# `frac`; a positional value lands in the unsupported `n` parameter.
dask_df = dask_df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
# Binarise the target: 1 where the timestamp differs from -1, 0 otherwise
# (presumably -1 is the "no retweet" sentinel - confirm against the dataset spec).
dask_df['engagement_retweet_timestamp'] = (dask_df['engagement_retweet_timestamp'] != -1).astype(np.uint8)

# Materialise the lazy dask graph into an in-memory pandas frame.
pandas_df = dask_df.compute()

# The dask handle is no longer needed once the pandas frame exists.
del dask_df

pandas_df.info()

In [None]:
# Name of the binary label column that is separated from the features.
TARGET_COLUMN = 'engagement_retweet_timestamp'

# A fixed seed keeps the 80/20 partition - and therefore every metric computed
# on the held-out part - identical from one run to the next.
train, test = train_test_split(pandas_df, train_size=0.8, random_state=42)

X_train = train.drop(columns=[TARGET_COLUMN])
y_train = train[TARGET_COLUMN]

X_test = test.drop(columns=[TARGET_COLUMN])
y_test = test[TARGET_COLUMN]

# Release the full frame and the intermediate splits; only the X/y pieces
# are used from here on.
del pandas_df, train, test

In [None]:
%%time

# Hyper-parameters for the proof-of-concept model, gathered in one place.
classifier_params = {
    'iterations': 150,
    'depth': 12,
    'learning_rate': 0.25,
    'loss_function': 'CrossEntropy',
    'verbose': True,
}

classifier = CatBoostClassifier(**classifier_params)

# Train on the training split with progress logging switched on.
classifier.fit(X_train, y_train, verbose=True)

# Persist the fitted model in CatBoost's binary "cbm" format.
classifier.save_model('retweet_classifier', format="cbm")

In [None]:
%%time

# Score the held-out rows with the fitted classifier.
# NOTE(review): predict_proba presumably returns one column per class -
# confirm against the CatBoost documentation.
y_pred = classifier.predict_proba(X_test)
# Bare expression so the notebook displays the result of this cell.
y_pred

In [None]:
# Display the first element of every prediction row.
# NOTE(review): if y_pred holds one column per class, index 0 is likely the
# negative-class probability rather than the retweet probability - confirm.
getFirstValuePrediction(y_pred)

In [None]:
# predict_proba yields one row per sample with a probability per class, so
# each row must be reduced to a scalar before thresholding; passing whole rows
# makes the `> 0.5` check ambiguous and raises ValueError.
# Column 1 is the probability of the positive class (label 1).
result = getBooleanList(y_pred[:, 1])
result

In [None]:
# Both metrics need a continuous score for the positive class rather than
# thresholded labels: log-loss on hard 0/1 predictions is dominated by
# clipping, and average precision ranks samples by score.
# Column 1 of the predict_proba output is the probability of label 1.
positive_proba = y_pred[:, 1]

print('RCE is {}'.format(compute_rce(positive_proba, y_test)))
print('Average precision is {}'.format(average_precision_score(y_test, positive_proba)))