Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
import os, time
start = time.time()
very_start = time.time()

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
import dask
from dask.distributed import Client, wait, LocalCluster
import dask.dataframe as dd

In [4]:

client = Client(n_workers=8, 
                       threads_per_worker=1,
                       memory_limit='90GB',ip='10.2.48.253')
#client = Client(ip='10.2.48.253',memory_limit='100GB')
client

0,1
Client  Scheduler: tcp://10.2.48.253:41469  Dashboard: http://10.2.48.253:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 720.00 GB


# Load Train

In [5]:
%%time
path = '/raid/data/recsys/train_split'
train = dd.read_parquet(f'{path}/train-preproc-fold-*.parquet')#,dtypes=dtypes)

CPU times: user 43.3 ms, sys: 3.88 ms, total: 47.1 ms
Wall time: 44.3 ms


In [6]:
%%time
# DROP UNUSED COLUMNS
cols_drop = ['links','hashtags0', 'hashtags1', 'fold']
train = train.drop(cols_drop,axis=1)

CPU times: user 10.9 ms, sys: 365 µs, total: 11.2 ms
Wall time: 9.73 ms


In [7]:
%%time
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-3ca0acaf-8dfd-4314-a900-5ec2a9d49516'), 25)
CPU times: user 1.94 ms, sys: 3.77 ms, total: 5.71 ms
Wall time: 4.98 ms


In [8]:
%%time
train = train.repartition(npartitions=8)
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-4af478ea-baac-41c0-a1b5-1294f8225550'), 25)
CPU times: user 8.26 ms, sys: 21 µs, total: 8.28 ms
Wall time: 7.44 ms


In [9]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [10]:
label_names = ['reply', 'retweet', 'retweet_comment', 'like']
for col in train.columns:
    if col in label_names:
        train[col] = train[col].astype('float32')
    elif train[col].dtype=='int64':
        train[col] = train[col].astype('int32')
    elif train[col].dtype=='int16':
        train[col] = train[col].astype('int8')

In [11]:
%%time
train = train.reset_index(drop=True)

CPU times: user 14.8 ms, sys: 332 µs, total: 15.1 ms
Wall time: 13.8 ms


In [12]:
%%time
train, = dask.persist(train)
print(train.shape)

(Delayed('int-ab8d5782-7fda-45e1-9870-0004c78683f7'), 25)
CPU times: user 11.4 ms, sys: 248 µs, total: 11.7 ms
Wall time: 10.1 ms


In [13]:
%%time 
# TIME FEATURES
# RAPIDS does this 5x faster than Pandas CPU
# If we didn't need to copy CPU to GPU to CPU, then 1300x faster!
def split_time(df):
    #gf = cudf.from_pandas(df[['timestamp']])
    df['dt_dow']  = df['timestamp'].dt.weekday#.to_array() 
    df['dt_hour'] = df['timestamp'].dt.hour#.to_array()
    df['dt_minute'] = df['timestamp'].dt.minute#.to_array()
    df['dt_second'] = df['timestamp'].dt.second#.to_array()
    return df

train = split_time(train)

CPU times: user 53.8 ms, sys: 4.4 ms, total: 58.2 ms
Wall time: 53.5 ms


In [14]:
train.head()[['engage_time','timestamp']]

Unnamed: 0,engage_time,timestamp
0,1970-01-01 00:00:00,2020-02-09 09:26:50
1,1970-01-01 00:00:00,2020-02-09 18:41:35
2,1970-01-01 00:00:00,2020-02-09 01:13:28
3,2020-02-07 12:36:47,2020-02-07 12:15:20
4,2020-02-09 13:33:47,2020-02-08 14:14:39


In [15]:
train.timestamp.dtype

dtype('<M8[ns]')

In [16]:
%%time
# ELAPSED TIME
for col in ['engage_time','timestamp']:
    train[col] = train[col].astype('int64')/1e9

CPU times: user 35.9 ms, sys: 8.05 ms, total: 43.9 ms
Wall time: 40.9 ms


In [17]:
%%time
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-943b7243-63c6-4fc8-8292-c033b18a8f40'), 29)
CPU times: user 15.5 ms, sys: 4.02 ms, total: 19.5 ms
Wall time: 18 ms


In [18]:
train.head()[['engage_time','timestamp']]

Unnamed: 0,engage_time,timestamp
0,0.0,1581240000.0
1,0.0,1581274000.0
2,0.0,1581211000.0
3,1581079000.0,1581078000.0
4,1581255000.0,1581171000.0


In [19]:
def set_nan(ds):
    mask = ds == 0
    ds.loc[mask] = np.nan
    return ds
train['engage_time'] = train['engage_time'].map_partitions(set_nan)

In [20]:
train['elapsed_time'] = train['engage_time'] - train['timestamp']
train['elapsed_time'] = train.elapsed_time.astype('float64')

In [21]:
print(train['elapsed_time'].min().compute(),train['elapsed_time'].max().compute())
print(train['elapsed_time'].mean().compute())

2.0 603956.0
15581.699535245267


In [22]:
train.head()

Unnamed: 0,tweet_id,media,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,engage_time,len_domains,len_hashtags,len_links,dt_dow,dt_hour,dt_minute,dt_second,elapsed_time
0,0,7,0,1,54,1581240000.0,0,14326,408,False,2018-03-31 21:20:17,2022773,27428,600,False,2018-03-13 13:47:49,True,0.0,0.0,0.0,0.0,,0,0,0,6,9,26,50,
1,10,0,0,1,54,1581274000.0,10,237126,1193,True,2010-11-12 21:39:20,15871988,420,518,False,2011-09-05 16:42:09,False,0.0,0.0,0.0,0.0,,0,0,0,6,18,41,35,
2,20,5,0,1,3,1581211000.0,19,23079,1803,False,2010-07-10 21:39:50,10982964,134,408,False,2016-05-19 02:19:01,False,0.0,0.0,0.0,0.0,,0,0,0,6,1,13,28,
3,30,0,5,2,11,1581078000.0,29,769176,190,False,2009-12-18 14:28:33,15871991,2,29,False,2019-09-10 09:17:08,False,0.0,0.0,0.0,1.0,1581079000.0,0,0,0,4,12,15,20,1287.0
4,40,0,0,2,6,1581171000.0,35,73952,13,False,2016-06-21 21:45:25,15871992,2,80,False,2019-12-11 15:38:45,False,0.0,0.0,0.0,1.0,1581255000.0,0,0,0,5,14,14,39,83948.0


# Feature Engineering 

In [23]:
%%time
# TRAIN FIRST 5 DAYS. VALIDATE LAST 2 DAYS
VALID_DOW = [1, 2]# order is [3, 4, 5, 6, 0, 1, 2]
valid = train[train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)
train = train[~train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)

CPU times: user 23.5 ms, sys: 4.41 ms, total: 27.9 ms
Wall time: 25.3 ms


In [24]:
%%time
train,valid = dask.persist(train,valid)
print(type(train), train.shape, valid.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-82428409-d948-4164-af2a-4f9dc88b991c'), 30) (Delayed('int-f62b48cd-97fb-424e-b9a4-1279e9c9afae'), 30)
CPU times: user 33.8 ms, sys: 0 ns, total: 33.8 ms
Wall time: 32.1 ms


In [25]:
%%time
train = train.set_index('timestamp')
valid = valid.set_index('timestamp')
train,valid = dask.persist(train,valid)
train.head()

CPU times: user 9.08 s, sys: 849 ms, total: 9.93 s
Wall time: 1min 3s


Unnamed: 0_level_0,tweet_id,media,domains,tweet_type,language,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,engage_time,len_domains,len_hashtags,len_links,dt_dow,dt_hour,dt_minute,dt_second,elapsed_time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1580947000.0,1680159,5,0,2,59,5303,1923062,0,False,2014-05-16 22:58:30,2812169,286,808,False,2014-01-04 19:54:06,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,
1580947000.0,180802,5,0,2,54,2348,9794149,90,True,2011-11-14 14:57:32,22175905,114,121,False,2013-10-26 00:06:10,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,
1580947000.0,6569932,5,1571,2,54,19001,1843389,392,True,2008-08-23 13:13:44,27557184,15,116,False,2010-07-03 18:06:52,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,
1580947000.0,3552255,9,0,2,3,119247,1747480,1615,True,2008-06-06 22:14:20,30112618,1,69,False,2020-01-07 21:31:04,False,0.0,0.0,0.0,1.0,1581085000.0,0,0,0,3,0,0,0,137335.0
1580947000.0,41656685,9,0,2,47,14702,114453,0,False,2012-12-22 08:53:53,24020411,111,82,False,2019-11-22 01:03:13,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,


In [26]:
%%time
train = train.reset_index()
valid = valid.reset_index()
train,valid = dask.persist(train,valid)
train.head()

CPU times: user 233 ms, sys: 18.3 ms, total: 251 ms
Wall time: 931 ms


Unnamed: 0,timestamp,tweet_id,media,domains,tweet_type,language,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,engage_time,len_domains,len_hashtags,len_links,dt_dow,dt_hour,dt_minute,dt_second,elapsed_time
0,1580947000.0,1680159,5,0,2,59,5303,1923062,0,False,2014-05-16 22:58:30,2812169,286,808,False,2014-01-04 19:54:06,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,
1,1580947000.0,180802,5,0,2,54,2348,9794149,90,True,2011-11-14 14:57:32,22175905,114,121,False,2013-10-26 00:06:10,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,
2,1580947000.0,6569932,5,1571,2,54,19001,1843389,392,True,2008-08-23 13:13:44,27557184,15,116,False,2010-07-03 18:06:52,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,
3,1580947000.0,3552255,9,0,2,3,119247,1747480,1615,True,2008-06-06 22:14:20,30112618,1,69,False,2020-01-07 21:31:04,False,0.0,0.0,0.0,1.0,1581085000.0,0,0,0,3,0,0,0,137335.0
4,1580947000.0,41656685,9,0,2,47,14702,114453,0,False,2012-12-22 08:53:53,24020411,111,82,False,2019-11-22 01:03:13,False,0.0,0.0,0.0,0.0,,0,0,0,3,0,0,0,


In [27]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [28]:
#for i,p in enumerate(valid.partitions):
#    print(i,len(p))

### Target Encode

In [29]:
class MTE_one_shot:
    
    def __init__(self, folds, smooth, seed=42):
        self.folds = folds
        self.seed = seed
        self.smooth = smooth
        
    def fit_transform(self, train, x_col, y_col, y_mean=None, out_col = None, out_dtype=None):
        
        self.y_col = y_col
        np.random.seed(self.seed)
        
        if 'fold' not in train.columns:
            fsize = len(train)//self.folds
            train['fold'] = 1
            train['fold'] = train['fold'].cumsum()
            train['fold'] = train['fold']//fsize
            train['fold'] = train['fold']%self.folds
        
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        
        if y_mean is None:
            y_mean = train[y_col].mean()#.compute().astype('float32')
        self.mean = y_mean
        
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        
        agg_each_fold = train.groupby(cols).agg({y_col:['count','sum']}).reset_index()
        agg_each_fold.columns = cols + ['count_y','sum_y']
        
        agg_all = agg_each_fold.groupby(x_col).agg({'count_y':'sum','sum_y':'sum'}).reset_index()
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all.columns = cols + ['count_y_all','sum_y_all']
        
        agg_each_fold = agg_each_fold.merge(agg_all,on=x_col,how='left')
        agg_each_fold['count_y_all'] = agg_each_fold['count_y_all'] - agg_each_fold['count_y']
        agg_each_fold['sum_y_all'] = agg_each_fold['sum_y_all'] - agg_each_fold['sum_y']
        agg_each_fold[out_col] = (agg_each_fold['sum_y_all']+self.smooth*self.mean)/(agg_each_fold['count_y_all']+self.smooth)
        agg_each_fold = agg_each_fold.drop(['count_y_all','count_y','sum_y_all','sum_y'],axis=1)
        
        agg_all[out_col] = (agg_all['sum_y_all']+self.smooth*self.mean)/(agg_all['count_y_all']+self.smooth)
        agg_all = agg_all.drop(['count_y_all','sum_y_all'],axis=1)
        self.agg_all = agg_all
        
        train.columns
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        train = train.merge(agg_each_fold,on=cols,how='left')
        del agg_each_fold
        #self.agg_each_fold = agg_each_fold
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        train[out_col] = train[out_col].fillna(self.mean)
        
        if out_dtype is not None:
            train[out_col] = train[out_col].astype(out_dtype)
        return train
    
    def transform(self, test, x_col, out_col = None, out_dtype=None):
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        test = test.merge(self.agg_all,on=x_col,how='left')
        test[out_col] = test[out_col].fillna(self.mean)
        if out_dtype is not None:
            test[out_col] = test[out_col].astype(out_dtype)
        return test
 

TE_media_reply 17.8 seconds<br>
TE_tweet_type_reply 27.1 seconds<br>
TE_language_reply 52.5 seconds<br>
TE_a_user_id_reply 180.0 seconds<br>

In [30]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for t in ['reply', 'retweet', 'retweet_comment', 'like']:
    start = time.time()
    for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
        cols.append(out_col)
        train,valid = dask.persist(train,valid)
        del encoder
        #train.head()
        wait(train)
        wait(valid)
        print(out_col,"%.1f seconds"%(time.time()-start))

TE_media_reply 27.4 seconds
TE_tweet_type_reply 45.6 seconds
TE_language_reply 64.2 seconds
TE_a_user_id_reply 151.4 seconds
TE_b_user_id_reply 351.9 seconds
TE_media_retweet 27.0 seconds
TE_tweet_type_retweet 54.8 seconds
TE_language_retweet 83.1 seconds
TE_a_user_id_retweet 181.7 seconds
TE_b_user_id_retweet 392.7 seconds
TE_media_retweet_comment 29.5 seconds
TE_tweet_type_retweet_comment 59.2 seconds
TE_language_retweet_comment 89.3 seconds
TE_a_user_id_retweet_comment 190.3 seconds
TE_b_user_id_retweet_comment 401.2 seconds
TE_media_like 27.0 seconds
TE_tweet_type_like 52.0 seconds
TE_language_like 77.5 seconds
TE_a_user_id_like 163.9 seconds
TE_b_user_id_like 373.0 seconds
CPU times: user 3min 55s, sys: 17.1 s, total: 4min 12s
Wall time: 25min 18s


In [31]:
train['fold'].value_counts().compute()

0    21447306
4    21447304
3    21447304
2    21447304
1    21447304
Name: fold, dtype: int64

### Multiple Column Target Encode

In [32]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols=[]
c = ['domains','language','b_follows_a','tweet_type','media','a_is_verified']
for t in ['reply', 'retweet', 'retweet_comment', 'like']:
    out_col = f'TE_multi_{t}'
    encoder = MTE_one_shot(folds=5,smooth=20)
    train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
    valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
    cols.append(out_col)
    del encoder

CPU times: user 1.04 s, sys: 68.2 ms, total: 1.1 s
Wall time: 1.04 s


In [33]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 30.6 s, sys: 2.08 s, total: 32.7 s
Wall time: 2min 59s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-a50e363cfce36e43c6ce3e38ded602c0', 1)>}, not_done=set())

### Elapsed Time Target Encode

In [34]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language']:#, 'a_user_id', 'b_user_id']:
    for t in ['elapsed_time']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col)
        out_dtype='float32' #if 'user_id' in c else None
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype=out_dtype)
        cols.append(out_col)
        print(out_col,"%.1f seconds"%(time.time()-start))
        #del encoder

TE_media_elapsed_time 0.2 seconds
TE_tweet_type_elapsed_time 0.4 seconds
TE_language_elapsed_time 0.6 seconds
CPU times: user 578 ms, sys: 20.9 ms, total: 599 ms
Wall time: 566 ms


In [35]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 15.9 s, sys: 1.03 s, total: 17 s
Wall time: 1min 28s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-400c6aa823c4197d0c704a041c68d56a', 1)>}, not_done=set())

### Count Encode

In [36]:
class FrequencyEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, x_col, c_col=None, out_col = None):
        np.random.seed(self.seed)
        if c_col is None or c_col not in train.columns:
            c_col = 'dummy'
            train[c_col] = 1
            drop = True
        else:
            drop = False
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        if drop:
            train = train.drop(c_col,axis=1)
        agg_all.columns = cols + [out_col]
        agg_all[out_col] = agg_all[out_col].astype('int32')
        agg_all[out_col] = agg_all[out_col]*1.0/len(train)
        agg_all[out_col] = agg_all[out_col].astype('float32')
    
        train = train.merge(agg_all,on=cols,how='left')
        del agg_all
        #print(train.columns)
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        return train
    
    def transform(self, test, x_col, c_col=None, out_col = None):
        return self.fit_transform(test, x_col, c_col, out_col)
 

In [37]:
class CountEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, test, x_col, out_col = None):
        np.random.seed(self.seed)
        
        common_cols = [i for i in train.columns if i in test.columns and i!=x_col]

        if len(common_cols):
            c_col = common_cols[0]
            drop = False
        else:
            c_col = 'dummy'
            train[c_col] = 1
            test[c_col]=1
            drop = True
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_all.columns = cols + [out_col]
        
        agg_test = test.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_test.columns = cols + [out_col+'_test']
        agg_all = agg_all.merge(agg_test,on=cols,how='left')
        agg_all[out_col+'_test'] = agg_all[out_col+'_test'].fillna(0)
        agg_all[out_col] = agg_all[out_col] + agg_all[out_col+'_test']
        agg_all = agg_all.drop(out_col+'_test', axis=1)
        del agg_test
            
        if drop:
            train = train.drop(c_col,axis=1)
            test = test.drop(c_col,axis=1)
        train = train.merge(agg_all,on=cols,how='left')
        test = test.merge(agg_all,on=cols,how='left')
        del agg_all
        return train,test

In [38]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
    encoder = CountEncoder()
    out_col = f'CE_{c}'
    train,valid = encoder.fit_transform(train, valid, c, out_col=out_col)
    print
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_media 20.8 seconds
CE_tweet_type 48.1 seconds
CE_language 76.7 seconds
CE_a_user_id 132.1 seconds
CE_b_user_id 223.1 seconds
CPU times: user 38.5 s, sys: 2.19 s, total: 40.7 s
Wall time: 3min 43s


In [39]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
    encoder = FrequencyEncoder()
    out_col = f'CE_{c}_norm'
    train = encoder.fit_transform(train, c, c_col='tweet_id', out_col=out_col)
    valid = encoder.transform(valid, c, c_col='tweet_id', out_col=out_col)
    cols.append(out_col)
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_media_norm 42.8 seconds
CE_tweet_type_norm 78.8 seconds
CE_language_norm 118.8 seconds
CE_a_user_id_norm 174.3 seconds
CE_b_user_id_norm 262.9 seconds
CPU times: user 45.2 s, sys: 2.71 s, total: 47.9 s
Wall time: 4min 22s


### Difference Encode (Lag Features)

In [40]:
def diff_encode_cudf_v1(train,col,tar,sft=1):
    train[col+'_sft'] = train[col].shift(sft)
    train[tar+'_sft'] = train[tar].shift(sft)
    out_col = f'DE_{col}_{tar}_{sft}'
    train[out_col] = train[tar]-train[tar+'_sft']
    mask = '__MASK__'
    train[mask] = train[col] == train[col+'_sft']
    train = train.drop([col+'_sft',tar+'_sft'],axis=1)
    train[out_col] = train[out_col]*train[mask]
    train = train.drop(mask,axis=1)
    return train

In [41]:
%%time
start = time.time()
# cuDF DE ENCODING IS FAST!!
idx = 0; cols = []; sc = 'timestamp'
for c in ['b_user_id']:
    for t in ['b_follower_count','b_following_count','language']:
        for s in [1,-1]:
            start = time.time()
            train = diff_encode_cudf_v1(train, col=c, tar=t, sft=s)
            valid = diff_encode_cudf_v1(valid, col=c, tar=t, sft=s)
            train,valid = dask.persist(train,valid)
            wait(train)
            wait(valid)
            end = time.time(); idx += 1
            print('DE',c,t,s,'%.1f seconds'%(end-start))

DE b_user_id b_follower_count 1 22.2 seconds
DE b_user_id b_follower_count -1 21.7 seconds
DE b_user_id b_following_count 1 20.4 seconds
DE b_user_id b_following_count -1 19.5 seconds
DE b_user_id language 1 18.1 seconds
DE b_user_id language -1 19.4 seconds
CPU times: user 24.7 s, sys: 1.51 s, total: 26.3 s
Wall time: 2min 1s


### Diff Language

In [42]:
train_lang = train[['a_user_id', 'language', 'tweet_id']].drop_duplicates()
valid_lang = valid[['a_user_id', 'language', 'tweet_id']].drop_duplicates()
train_lang_count = train_lang.groupby(['a_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
valid_lang_count = valid_lang.groupby(['a_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
train_lang_count,valid_lang_count = dask.persist(train_lang_count,valid_lang_count)
train_lang_count.head()
del train_lang,valid_lang

In [43]:
%%time
train_lang_count = train_lang_count.merge(valid_lang_count,on=['a_user_id', 'language'],how='left')
train_lang_count['tweet_id_y'] = train_lang_count['tweet_id_y'].fillna(0)
train_lang_count['tweet_id_x'] = train_lang_count['tweet_id_x'] + train_lang_count['tweet_id_y']
train_lang_count = train_lang_count.drop('tweet_id_y',axis=1)
train_lang_count.columns = ['a_user_id', 'top_language', 'language_count']
train_lang_count, = dask.persist(train_lang_count)
train_lang_count.head()

CPU times: user 1.5 s, sys: 80.1 ms, total: 1.58 s
Wall time: 9.63 s


Unnamed: 0,a_user_id,top_language,language_count
0,0,3,3.0
1,0,4,1.0
2,0,11,21.0
3,0,13,3.0
4,0,38,1.0


In [44]:
%%time

train_lang_count = train_lang_count.sort_values(['a_user_id', 'language_count'])
train_lang_count['a_user_shifted'] = train_lang_count['a_user_id'].shift(1)
train_lang_count = train_lang_count[train_lang_count['a_user_id']!=train_lang_count['a_user_shifted']]
train_lang_count = train_lang_count.drop(['a_user_shifted','language_count'],axis=1)
train_lang_count.columns = ['a_user_id','top_language']
train_lang_count, = dask.persist(train_lang_count)
train_lang_count.head()

AttributeError: 'DataFrame' object has no attribute 'sort_values'

In [45]:
def diff_language(df,df_lang_count):
    df = df.merge(df_lang_count,how='left', left_on='b_user_id', right_on='a_user_id')
    df['nan_language'] = df['top_language'].isnull()
    df['same_language'] = df['language'] == df['top_language']
    df['diff_language'] = df['language'] != df['top_language']
    df['same_language'] = df['same_language']*(1-df['nan_language'])
    df['diff_language'] = df['diff_language']*(1-df['nan_language'])
    df = df.drop('top_language',axis=1)
    return df

In [46]:
#%%time
#train = diff_language(train,train_lang_count)
#valid = diff_language(valid,train_lang_count)
#train,valid = dask.persist(train,valid)
#train.head()

## Rate feature

In [47]:
%%time
# follow rate feature
train['a_ff_rate'] = (train['a_following_count'] / train['a_follower_count']).astype('float32')
train['b_ff_rate'] = (train['b_follower_count']  / train['b_following_count']).astype('float32')
valid['a_ff_rate']  = (valid['a_following_count'] / valid['a_follower_count']).astype('float32')
valid['b_ff_rate']  = (valid['b_follower_count']  / valid['b_following_count']).astype('float32')

CPU times: user 65.6 ms, sys: 3.93 ms, total: 69.6 ms
Wall time: 66.3 ms


In [48]:
train,valid = dask.persist(train,valid)

In [49]:
wait(train)
wait(valid)

DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-f50bd9d59f87b205508fbd3053c135ef', 0)>}, not_done=set())

# Summarize Features

In [50]:
%%time

label_names = ['reply', 'retweet', 'retweet_comment', 'like']
DONT_USE = ['tweet_id','timestamp','a_account_creation','b_account_creation','engage_time',
            'fold','b_user_id','a_user_id', 'dt_dow',
            'a_account_creation', 'b_account_creation', 'elapsed_time',
             'links','domains','hashtags0','hashtags1']
DONT_USE += label_names
features = [c for c in train.columns if c not in DONT_USE]

RMV = [c for c in DONT_USE if c in train.columns and c not in label_names]
RMV

CPU times: user 204 µs, sys: 0 ns, total: 204 µs
Wall time: 211 µs


['tweet_id',
 'timestamp',
 'a_account_creation',
 'b_account_creation',
 'engage_time',
 'fold',
 'b_user_id',
 'a_user_id',
 'dt_dow',
 'a_account_creation',
 'b_account_creation',
 'elapsed_time',
 'domains']

In [51]:
%%time

for col in RMV:
    #print(col, col in train.columns)
    if col in train.columns:
        train = train.drop(col,axis=1)
        train, = dask.persist(train)
        train.head()
        

CPU times: user 3.75 s, sys: 191 ms, total: 3.94 s
Wall time: 16 s


In [52]:
%%time

for col in RMV:
    #print(col, col in valid.columns)
    if col in valid.columns:
        valid = valid.drop(col,axis=1)
        valid, = dask.persist(valid,)
        valid.head()
        

CPU times: user 3.22 s, sys: 190 ms, total: 3.41 s
Wall time: 13.8 s


# Train Model Validate
We will train on random 10% of first 5 days and validation on last 2 days

In [53]:
%%time

SAMPLE_RATIO = 0.1
SEED = 1

if SAMPLE_RATIO < 1.0:
    print(len(train))
    train = train.sample(frac=SAMPLE_RATIO,random_state=42)
    train, = dask.persist(train)
    train.head()
    print(len(train))

train = train.compute()
Y_train = train[label_names]
train = train.drop(label_names,axis=1)

features = [c for c in train.columns if c not in DONT_USE]
print('Using %i features:'%(len(features)),train.shape[1])
np.asarray(features)

107236522
10723651
Using 61 features: 61
CPU times: user 8.9 s, sys: 8.65 s, total: 17.6 s
Wall time: 22 s


array(['media', 'tweet_type', 'language', 'a_follower_count',
       'a_following_count', 'a_is_verified', 'b_follower_count',
       'b_following_count', 'b_is_verified', 'b_follows_a', 'len_domains',
       'len_hashtags', 'len_links', 'dt_hour', 'dt_minute', 'dt_second',
       'TE_media_reply', 'TE_tweet_type_reply', 'TE_language_reply',
       'TE_a_user_id_reply', 'TE_b_user_id_reply', 'TE_media_retweet',
       'TE_tweet_type_retweet', 'TE_language_retweet',
       'TE_a_user_id_retweet', 'TE_b_user_id_retweet',
       'TE_media_retweet_comment', 'TE_tweet_type_retweet_comment',
       'TE_language_retweet_comment', 'TE_a_user_id_retweet_comment',
       'TE_b_user_id_retweet_comment', 'TE_media_like',
       'TE_tweet_type_like', 'TE_language_like', 'TE_a_user_id_like',
       'TE_b_user_id_like', 'TE_multi_reply', 'TE_multi_retweet',
       'TE_multi_retweet_comment', 'TE_multi_like',
       'TE_media_elapsed_time', 'TE_tweet_type_elapsed_time',
       'TE_language_elapsed_tim

In [54]:
SAMPLE_RATIO = 0.35 # VAL SET NOW SIZE OF TEST SET
SEED = 1
if SAMPLE_RATIO < 1.0:
    print(len(valid))
    valid = valid.sample(frac=SAMPLE_RATIO,random_state=42)
    valid, = dask.persist(valid)
    valid.head()
    print(len(valid))
    
valid = valid.compute()
Y_valid = valid[label_names]
valid = valid.drop(label_names,axis=1)

40838716
14293552


In [55]:
import xgboost as xgb
print('XGB Version',xgb.__version__)

xgb_parms = { 
    'max_depth':8, 
    'learning_rate':0.1, 
    'subsample':0.8,
    'colsample_bytree':0.3, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'nthread':40,
    'tree_method':'hist',
    #'predictor' : 'gpu_predictor'
}


XGB Version 1.2.0-SNAPSHOT


In [56]:
if train.columns.duplicated().sum()>0:
    raise Exception(f'duplicated!: { train.columns[train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {train.shape}')
print(f'X_valid.shape {valid.shape}')

no dup :) 
X_train.shape (10723651, 61)
X_valid.shape (14293552, 61)


In [57]:
%%time

for col in train.columns:
    if train[col].dtype=='bool':
        train[col] = train[col].astype('int8')
        valid[col] = valid[col].astype('int8')

CPU times: user 52.5 ms, sys: 12.1 ms, total: 64.6 ms
Wall time: 57.2 ms


In [58]:
%%time
# TRAIN AND VALIDATE

NROUND = 300
VERBOSE_EVAL = 50
#ESR = 50
    
oof = np.zeros((len(valid),len(label_names)))
preds = []
for i in range(4):

    name = label_names[i]
    print('#'*25);print('###',name);print('#'*25)
       
    start = time.time(); print('Creating DMatrix...')
        
    dtrain = xgb.DMatrix(data=train,label=Y_train.iloc[:, i])
    dvalid = xgb.DMatrix(data=valid,label=Y_valid.iloc[:, i])
    print('Took %.1f seconds'%(time.time()-start))
             
    start = time.time(); print('Training...')
    model = xgb.train(xgb_parms, 
                           dtrain=dtrain,
                           #evals=[(dtrain,'train'),(dvalid,'valid')],
                           num_boost_round=NROUND,
                           #early_stopping_rounds=ESR,
                           verbose_eval=VERBOSE_EVAL) 
    print('Took %.1f seconds'%(time.time()-start))
        
    start = time.time(); print('Predicting...')
    #Y_valid[f'pred_{name}'] = xgb.dask.predict(client,model,valid)
    oof[:, i] += model.predict(dvalid)
    #preds.append(xgb.dask.predict(client,model,valid))
    print('Took %.1f seconds'%(time.time()-start))
        
    del model, dtrain, dvalid

#########################
### reply
#########################
Creating DMatrix...
Took 34.9 seconds
Training...
Took 158.8 seconds
Predicting...
Took 17.8 seconds
#########################
### retweet
#########################
Creating DMatrix...
Took 34.4 seconds
Training...
Took 159.8 seconds
Predicting...
Took 17.3 seconds
#########################
### retweet_comment
#########################
Creating DMatrix...
Took 34.9 seconds
Training...
Took 154.9 seconds
Predicting...
Took 17.6 seconds
#########################
### like
#########################
Creating DMatrix...
Took 34.8 seconds
Training...
Took 159.1 seconds
Predicting...
Took 18.1 seconds
CPU times: user 7h 30min 58s, sys: 9min 14s, total: 7h 40min 12s
Wall time: 14min 4s


In [59]:
yvalid = Y_valid[label_names].values

# Compute Validation Metrics

In [60]:
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
  prec, recall, thresh = precision_recall_curve(gt, pred)
  prauc = auc(recall, prec)
  return prauc

def calculate_ctr(gt):
  positive = len([x for x in gt if x == 1])
  ctr = positive/float(len(gt))
  return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    cross_entropy = log_loss(gt, pred)
    yt = np.mean(gt)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [61]:
%%time
txt = ''
for i in range(4):
    prauc = compute_prauc(oof[:,i], yvalid[:, i])
    rce   = compute_rce_fast(oof[:,i], yvalid[:, i])
    txt_ = f"{label_names[i]:20} PRAUC:{prauc:.5f} RCE:{rce:.5f}"
    print(txt_)
    txt += txt_ + '\n'

reply                PRAUC:0.17641 RCE:19.16150
retweet              PRAUC:0.53312 RCE:28.65558
retweet_comment      PRAUC:0.05191 RCE:11.15791
like                 PRAUC:0.78464 RCE:27.56908
CPU times: user 38.8 s, sys: 1.57 s, total: 40.4 s
Wall time: 34.8 s


In [62]:
print('This notebook took %.1f minutes'%((time.time()-very_start)/60.))

This notebook took 59.5 minutes
