Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# RecSys - Submit - MultiGPU

In [1]:
import os, time
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
VER = 330
start = time.time()

In [2]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask_cudf

In [3]:
import pandas as pd, numpy as np, gc
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import cudf, cupy, time
cudf.__version__

'0.14.0'

# Load Train

In [4]:
def add_freq_tweet(train, valid):
    gf1 = cudf.from_pandas(train[['a_user_id', 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[['a_user_id', 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf2['idx'] = gf2.index
    
    gf = cudf.concat([gf1, gf2], axis=0)
    gf_unique = gf[['a_user_id', 'tweet_id']].drop_duplicates()

    gf_unique = gf_unique.groupby(['a_user_id']).count().reset_index()    
    gf_unique.columns = ['a_user_id_tmp', 'no_tweet']
    gf1 = gf1.merge(gf_unique[['a_user_id_tmp', 'no_tweet']], how='left', left_on='b_user_id', right_on='a_user_id_tmp')
    gf2 = gf2.merge(gf_unique[['a_user_id_tmp', 'no_tweet']], how='left', left_on='b_user_id', right_on='a_user_id_tmp') 
    gf1 = gf1.sort_values('idx')
    gf2 = gf2.sort_values('idx')
    
    train['no_tweet'] = gf1['no_tweet'].fillna(0).astype('int32').to_array()
    valid['no_tweet'] = gf2['no_tweet'].fillna(0).astype('int32').to_array()

def diff_time(train, valid):
    gf1 = cudf.from_pandas(train[['timestamp', 'a_user_id', 'b_user_id', 'tweet_id', 'no_tweet']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[['timestamp', 'a_user_id', 'b_user_id', 'tweet_id', 'no_tweet']]).reset_index(drop=True)
    gf = cudf.concat([gf1, gf2], axis=0)
    gf = dask_cudf.from_cudf(gf, npartitions=16)
    gf['timestamp'] = gf['timestamp'].astype('int64')/1e9
    gf_unique = gf[['timestamp', 'a_user_id', 'tweet_id']].drop_duplicates()
    gf_unique.columns = ['tmp_timestamp', 'tmp_a_user_id', 'tmp_tweet_id']
    gf = gf[gf['no_tweet']!=0]
    gf = gf.drop('no_tweet', axis=1)
    gf = gf.drop('a_user_id', axis=1)
    gf = gf.merge(gf_unique, how='left', left_on='b_user_id', right_on='tmp_a_user_id')
    gf = gf[gf['tweet_id']!=gf['tmp_tweet_id']]
    gf = gf[~gf['tmp_a_user_id'].isna()]

    gf['diff_timestamp_prev'] = gf['timestamp']-gf['tmp_timestamp']
    gf['diff_timestamp_after'] = gf['tmp_timestamp']-gf['timestamp']

    gf['diff_timestamp_after'] = gf.diff_timestamp_after.where(gf['diff_timestamp_after']>0, 15*24*3600)
    gf['diff_timestamp_prev'] = gf.diff_timestamp_prev.where(gf['diff_timestamp_prev']>0, 15*24*3600)

    gf = gf[['tweet_id', 
             'b_user_id', 
             'diff_timestamp_prev', 
             'diff_timestamp_after']].groupby(['tweet_id', 'b_user_id']).min().reset_index()

    gf.to_parquet('/tmp/gf')
    del gf; del gf_unique; del gf1; del gf2; gc.collect()

    gf = cudf.read_parquet('/tmp/gf/part.0.parquet')
    gf1 = cudf.from_pandas(train[['b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf1 = gf1.merge(gf, how='left', left_on=['tweet_id', 'b_user_id'], right_on=['tweet_id', 'b_user_id'])
    gf1 = gf1.sort_values('idx')
    train['diff_timestamp_prev'] = gf1['diff_timestamp_prev'].fillna(15*24*3600).astype('int32').to_array()
    train['diff_timestamp_after'] = gf1['diff_timestamp_after'].fillna(15*24*3600).astype('int32').to_array()
    del gf1; gc.collect()

    gf1 = cudf.from_pandas(valid[['b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf1 = gf1.merge(gf, how='left', left_on=['tweet_id', 'b_user_id'], right_on=['tweet_id', 'b_user_id'])
    gf1 = gf1.sort_values('idx')
    valid['diff_timestamp_prev'] = gf1['diff_timestamp_prev'].fillna(15*24*3600).astype('int32').to_array()
    valid['diff_timestamp_after'] = gf1['diff_timestamp_after'].fillna(15*24*3600).astype('int32').to_array()
    
def add_diff_user1(train, valid, col):

    gf1 = cudf.from_pandas(train[[col, 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[[col, 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf2['idx'] = gf2.index
    
    gf = cudf.concat([gf1, gf2], axis=0)
    gf_lang = gf[['b_user_id', col, 'tweet_id']]#.drop_duplicates()
    gf_lang = gf_lang[gf_lang[col]!=0]
    gf_lang = gf_lang.groupby(['b_user_id', col]).count()
    gf_lang = gf_lang.reset_index()
    gf_lang = gf_lang[gf_lang['tweet_id']>3]
    gf_lang = gf_lang.sort_values(['b_user_id', 'tweet_id'], ascending=False)
    gf_lang['b_user_id_shifted'] = gf_lang['b_user_id'].shift(1)
    gf_lang = gf_lang[gf_lang['b_user_id_shifted']!=gf_lang['b_user_id']]
    gf_lang.columns = ['b_user_id_lang', 'top_' + col, 'drop1', 'drop2']
    gf1 = gf1.merge(gf_lang[['b_user_id_lang', 'top_' + col, 'drop1', 'drop2']], how='left', left_on='b_user_id', right_on='b_user_id_lang')
    gf2 = gf2.merge(gf_lang[['b_user_id_lang', 'top_' + col, 'drop1', 'drop2']], how='left', left_on='b_user_id', right_on='b_user_id_lang')
    
    gf1 = gf1.sort_values('idx')
    gf2 = gf2.sort_values('idx')
    
    gf1['same_' + col] = gf1[col] == gf1['top_' + col]
    gf1['diff_' + col] = gf1[col] != gf1['top_' + col]
    gf1['nan_' + col] = 0
    gf1.loc[gf1['top_' + col].isna(), 'same_' + col] = 0
    gf1.loc[gf1['top_' + col].isna(), 'diff_' + col] = 0
    gf1.loc[gf1['top_' + col].isna(), 'nan_' + col] = 1
    
    gf2['same_' + col] = gf2[col] == gf2['top_' + col]
    gf2['diff_' + col] = gf2[col] != gf2['top_' + col]
    gf2['nan_' + col] = 0
    gf2.loc[gf2['top_' + col].isna(), 'same_' + col] = 0
    gf2.loc[gf2['top_' + col].isna(), 'diff_' + col] = 0
    gf2.loc[gf2['top_' + col].isna(), 'nan_' + col] = 1
    
    train['same_' + col] = gf1['same_' + col].fillna(0).astype('int8').to_array()
    train['diff_' + col] = gf1['diff_' + col].fillna(0).astype('int8').to_array()
    train['nan_' + col] = gf1['nan_' + col].fillna(0).astype('int8').to_array()
    
    valid['same_' + col] = gf2['same_' + col].fillna(0).astype('int8').to_array()
    valid['diff_' + col] = gf2['diff_' + col].fillna(0).astype('int8').to_array()
    valid['nan_' + col] = gf2['nan_' + col].fillna(0).astype('int8').to_array()

def add_diff_user1_fixed(train, valid, col):
    col = 'tw_hash0'
    gf1 = cudf.from_pandas(train[[col, 'tw_hash1', 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[[col, 'tw_hash1', 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf2['idx'] = gf2.index
    
    gf_lang = cudf.concat([gf1[['tw_hash0', 'b_user_id', 'tweet_id']],
                      gf1[['tw_hash1', 'b_user_id', 'tweet_id']],
                      gf2[['tw_hash0', 'b_user_id', 'tweet_id']],
                      gf2[['tw_hash1', 'b_user_id', 'tweet_id']]], axis=0)
    gf_lang = gf_lang[['b_user_id', col, 'tweet_id']].drop_duplicates()
    gf_lang = gf_lang[gf_lang[col]!=0]
    gf_lang = gf_lang.groupby(['b_user_id', col]).count()
    gf_lang = gf_lang.reset_index()
    gf_lang = gf_lang[gf_lang['tweet_id']>3]
    gf_lang = gf_lang.sort_values(['b_user_id', 'tweet_id'], ascending=False)
    gf_lang['b_user_id_shifted'] = gf_lang['b_user_id'].shift(1)
    gf_lang = gf_lang[gf_lang['b_user_id_shifted']!=gf_lang['b_user_id']]
    gf_lang.columns = ['b_user_id_lang', 'top_' + col, 'drop1', 'drop2']
    gf1 = gf1.merge(gf_lang[['b_user_id_lang', 'top_' + col, 'drop1', 'drop2']], how='left', left_on='b_user_id', right_on='b_user_id_lang')
    gf2 = gf2.merge(gf_lang[['b_user_id_lang', 'top_' + col, 'drop1', 'drop2']], how='left', left_on='b_user_id', right_on='b_user_id_lang')
    
    gf1 = gf1.sort_values('idx')
    gf2 = gf2.sort_values('idx')
    
    gf1['same_' + col] = (gf1[col] == gf1['top_' + col]) | (gf1['tw_hash1'] == gf1['top_' + col])
    gf1['diff_' + col] = (gf1[col] != gf1['top_' + col]) & (gf1['tw_hash1'] != gf1['top_' + col])
    gf1['nan_' + col] = 0
    gf1.loc[gf1['top_' + col].isna(), 'same_' + col] = 0
    gf1.loc[gf1['top_' + col].isna(), 'diff_' + col] = 0
    gf1.loc[gf1['top_' + col].isna(), 'nan_' + col] = 1
    
    gf2['same_' + col] = (gf2[col] == gf2['top_' + col]) | (gf2['tw_hash1'] == gf2['top_' + col])
    gf2['diff_' + col] = (gf2[col] != gf2['top_' + col]) & (gf2['tw_hash1'] != gf2['top_' + col])
    gf2['nan_' + col] = 0
    gf2.loc[gf2['top_' + col].isna(), 'same_' + col] = 0
    gf2.loc[gf2['top_' + col].isna(), 'diff_' + col] = 0
    gf2.loc[gf2['top_' + col].isna(), 'nan_' + col] = 1
    
    train['same_' + col] = gf1['same_' + col].fillna(0).astype('int8').to_array()
    train['diff_' + col] = gf1['diff_' + col].fillna(0).astype('int8').to_array()
    train['nan_' + col] = gf1['nan_' + col].fillna(0).astype('int8').to_array()
    
    valid['same_' + col] = gf2['same_' + col].fillna(0).astype('int8').to_array()
    valid['diff_' + col] = gf2['diff_' + col].fillna(0).astype('int8').to_array()
    valid['nan_' + col] = gf2['nan_' + col].fillna(0).astype('int8').to_array()


def add_timeshift(train, valid, shift=1):
    gf1 = cudf.from_pandas(train[['timestamp', 'b_user_id']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[['timestamp', 'b_user_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf2['idx'] = gf2.index
    gf1['type'] = 1
    gf2['type'] = 2
    gf = cudf.concat([gf1, gf2], axis=0)

    gf = gf.sort_values(['b_user_id', 'timestamp'])
    gf['timestamp'] = gf['timestamp'].astype('int64')/1e9
    gf['b_user_id_shifted'] = gf['b_user_id'].shift(shift)
    gf['b_timestamp_shifted'] = gf['timestamp'].shift(shift)
    gf['b_timestamp_1'] = (gf['timestamp']-gf['b_timestamp_shifted']).abs()
    gf.loc[gf['b_user_id']!=gf['b_user_id_shifted'], 'b_timestamp_1'] = 15*24*3600
    gf = gf.sort_values(['idx'])

    train['b_timestamp_' + str(shift)] = gf.loc[gf['type']==1, 'b_timestamp_1'].fillna(0).astype('int8').to_array()
    valid['b_timestamp_' + str(shift)] = gf.loc[gf['type']==2, 'b_timestamp_1'].fillna(0).astype('int8').to_array()

In [5]:
cluster = LocalCUDACluster()
client = Client(cluster)

In [6]:
%%time
train = pd.read_parquet( '../preprocessings/train-1.parquet' )
test0 = pd.read_parquet( '../preprocessings/test-0.parquet' )
test1 = pd.read_parquet( '../preprocessings/test-1.parquet' )
gc.collect()

CPU times: user 48.1 s, sys: 1min 14s, total: 2min 2s
Wall time: 9.93 s


In [7]:
train.shape, test0.shape, test1.shape

((121386431, 27), (12434735, 27), (12434838, 27))

In [8]:
test0['tr'] = 0
test1['tr'] = 1

In [9]:
valid = pd.concat([test0, test1], axis=0)

In [10]:
valid.shape

(24869573, 28)

In [11]:
valid = valid.reset_index(drop=True)

In [12]:
del test0; del test1; gc.collect()

0

In [13]:
txt = pd.read_parquet( '../preprocessings/text-processings-1.parquet' )

In [14]:
%%time

train['timestamp'] = pd.to_datetime(train['timestamp'], unit="s")
train['a_account_creation'] = pd.to_datetime(train['a_account_creation'], unit="s")
train['b_account_creation'] = pd.to_datetime(train['b_account_creation'], unit="s")

valid['timestamp'] = pd.to_datetime(valid['timestamp'], unit="s")
valid['a_account_creation'] = pd.to_datetime(valid['a_account_creation'], unit="s")
valid['b_account_creation'] = pd.to_datetime(valid['b_account_creation'], unit="s")

CPU times: user 16.6 s, sys: 19.6 s, total: 36.2 s
Wall time: 33.9 s


In [30]:
for col in ['count_ats', 'count_char', 'count_words', 'tw_hash0', 'tw_hash1', 'tw_rt_uhash']:
    print(col)
    train[col] = txt.iloc[:(train.shape[0]), ][col]
    valid[col] = txt.iloc[(train.shape[0]):, ][col].values

count_ats
count_char
count_words
tw_hash0
tw_hash1
tw_rt_uhash


In [37]:
del txt; gc.collect()

0

In [38]:
%%time 
# TIME FEATURES
# RAPIDS does this 5x faster than Pandas CPU
# If we didn't need to copy CPU to GPU to CPU, then 1300x faster!
def split_time(df):
    gf = cudf.from_pandas(df[['timestamp']])
    df['dt_dow']  = gf['timestamp'].dt.weekday.to_array() 
    df['dt_hour'] = gf['timestamp'].dt.hour.to_array()
    df['dt_minute'] = gf['timestamp'].dt.minute.to_array()
    df['dt_second'] = gf['timestamp'].dt.second.to_array()
    return

split_time(train)
split_time(valid)

CPU times: user 5.83 s, sys: 5.8 s, total: 11.6 s
Wall time: 11 s


In [39]:
# DROP UNUSED COLUMNS
cols_drop = ['links','hashtags']
train.drop(cols_drop,inplace=True,axis=1)
valid.drop(cols_drop,inplace=True,axis=1)

In [40]:
%%time
# SHUFFLE ROWS because Giba's files have b users in separate files
#train.sort_index(inplace=True) # ORIGINAL RANDOM RECSYS ORDER
train = train.sort_values('timestamp').reset_index(drop=True) #TIME ORDER

CPU times: user 1min 41s, sys: 14.7 s, total: 1min 55s
Wall time: 1min 51s


In [41]:
%%time

# RAPIDS DOESNT IMPLEMENT UINT
def convert2int(df):
    print('Converting uint8 to int8...')
    for c in df.columns:
        if df[c].dtype=='uint8':
            print(c,'max value =',df[c].max(),', ',end='')
            df[c] = df[c].astype('int8')
            
convert2int(train)
convert2int(valid)

Converting uint8 to int8...
media max value = 12 , tweet_type max value = 2 , language max value = 65 , Converting uint8 to int8...
media max value = 12 , tweet_type max value = 2 , language max value = 65 , CPU times: user 561 ms, sys: 363 ms, total: 923 ms
Wall time: 833 ms


In [42]:
%%time

label_names = ['reply', 'retweet', 'retweet_comment', 'like']
train['engage_time'] = train[label_names].min(1)

CPU times: user 1.85 s, sys: 1.68 s, total: 3.52 s
Wall time: 3.36 s


In [43]:
%%time
# ELAPSED TIME
gf = cudf.from_pandas(train[['engage_time','timestamp']])
gf = gf.astype('int64')/1e9
gf.loc[gf.engage_time==0,'engage_time'] = np.nan
gf['elapsed_time'] = gf['engage_time'] - gf['timestamp']
train['elapsed_time'] = gf.elapsed_time.astype('float32').to_array()

CPU times: user 7.35 s, sys: 8.47 s, total: 15.8 s
Wall time: 15 s


In [44]:
del gf; gc.collect()

0

In [45]:
%%time

train['reply'] = (train['reply']>0).astype('int8')
train['retweet'] = (train['retweet']>0).astype('int8')
train['retweet_comment'] = (train['retweet_comment']>0).astype('int8')
train['like'] = (train['like']>0).astype('int8')

CPU times: user 10.5 s, sys: 13 s, total: 23.5 s
Wall time: 22 s


# Feature Engineering 

In [46]:
# RENAME TEST TO VALID and then use exact code from validation notebook
# valid = test
# del test; x=gc.collect()

In [47]:
train.shape,valid.shape

((121386431, 37), (24869573, 36))

In [48]:
%%time

add_diff_user1(train, valid, 'tw_rt_uhash')

CPU times: user 2.89 s, sys: 2.25 s, total: 5.14 s
Wall time: 5.03 s


In [49]:
%%time

add_diff_user1_fixed(train, valid, 'tw_userid0')

CPU times: user 5.09 s, sys: 4.45 s, total: 9.53 s
Wall time: 9.18 s


In [50]:
%%time

add_freq_tweet(train, valid)

CPU times: user 3.57 s, sys: 3.38 s, total: 6.95 s
Wall time: 6.65 s


In [51]:
add_timeshift(train, valid, shift=1)
add_timeshift(train, valid, shift=-1)

In [52]:
%%time

diff_time(train, valid)

CPU times: user 10.6 s, sys: 15.7 s, total: 26.3 s
Wall time: 1min 9s


In [53]:
%%time

train.loc[train['tw_hash0']==0,'diff_tw_hash0'] = 0
train.loc[train['tw_hash0']==0,'same_tw_hash0'] = 0

valid.loc[valid['tw_hash0']==0,'diff_tw_hash0'] = 0
valid.loc[valid['tw_hash0']==0,'same_tw_hash0'] = 0

train.loc[train['tw_rt_uhash']==0,'diff_tw_rt_uhash'] = 0
train.loc[train['tw_rt_uhash']==0,'same_tw_rt_uhash'] = 0

valid.loc[valid['tw_rt_uhash']==0,'diff_tw_rt_uhash'] = 0
valid.loc[valid['tw_rt_uhash']==0,'same_tw_rt_uhash'] = 0

CPU times: user 18.1 s, sys: 8.7 s, total: 26.8 s
Wall time: 25.9 s


## Target Encode

In [54]:
from sklearn.model_selection import KFold
def target_encode_cudf_v3(train, valid, col, tar, n_folds=5, min_ct=0, smooth=20, 
                          seed=42, shuffle=False, t2=None, v2=None, x=-1):
    #
    # col = column to target encode (or if list of columns then multiple groupby)
    # tar = tar column encode against
    # if min_ct>0 then all classes with <= min_ct are consider in new class "other"
    # smooth = Bayesian smooth parameter
    # seed = for 5 Fold if shuffle==True
    # if x==-1 result appended to train and valid
    # if x>=0 then result returned in column x of t2 and v2
    #    
    
    # SINGLE OR MULTIPLE COLUMN
    if not isinstance(col, list): col = [col]
    if (min_ct>0)&(len(col)>1): 
        print('WARNING: Setting min_ct=0 with multiple columns. Not implemented')
        min_ct = 0
    name = "_".join(col)
        
    # FIT ALL TRAIN
    gf = cudf.from_pandas(train[col+[tar]]).reset_index(drop=True)
    gf['idx'] = gf.index #needed because cuDF merge returns out of order
    if min_ct>0: # USE MIN_CT?
        other = gf.groupby(col[0]).size(); other = other[other<=min_ct].index
        save = gf[col[0]].values.copy()
        gf.loc[gf[col[0]].isin(other),col[0]] = -1
    te = gf.groupby(col)[[tar]].agg(['mean','count']).reset_index(); te.columns = col + ['m','c']
    mn = gf[tar].mean().astype('float32')
    te['smooth'] = ((te['m']*te['c'])+(mn*smooth)) / (te['c']+smooth)
    if min_ct>0: gf[col[0]] = save.copy()
    
    # PREDICT VALID
    gf2 = cudf.from_pandas(valid[col]).reset_index(drop=True); gf2['idx'] = gf2.index
    if min_ct>0: gf2.loc[gf2[col[0]].isin(other),col[0]] = -1
    gf2 = gf2.merge(te[col+['smooth']], on=col, how='left', sort=False).sort_values('idx')
    if x==-1: valid[f'TE_{name}_{tar}'] = gf2['smooth'].fillna(mn).astype('float32').to_array()
    elif x>=0: v2[:,x] = gf2['smooth'].fillna(mn).astype('float32').to_array()
    
    # KFOLD ON TRAIN
    tmp = cupy.zeros((train.shape[0]),dtype='float32'); gf['fold'] = 0
    if shuffle: # shuffling is 2x slower
        kf = KFold(n_folds, random_state=seed, shuffle=shuffle)
        for k,(idxT,idxV) in enumerate(kf.split(train)): gf.loc[idxV,'fold'] = k
    else:
        fsize = train.shape[0]//n_folds
        gf['fold'] = cupy.clip(gf.idx.values//fsize,0,n_folds-1)
    for k in range(n_folds):
        if min_ct>0: # USE MIN CT?
            if k<n_folds-1: save = gf[col[0]].values.copy()
            other = gf.loc[gf.fold!=k].groupby(col[0]).size(); other = other[other<=min_ct].index
            gf.loc[gf[col[0]].isin(other),col[0]] = -1
        te = gf.loc[gf.fold!=k].groupby(col)[[tar]].agg(['mean','count']).reset_index(); 
        te.columns = col + ['m','c']
        mn = gf.loc[gf.fold!=k,tar].mean().astype('float32')
        te['smooth'] = ((te['m']*te['c'])+(mn*smooth)) / (te['c']+smooth)
        gf = gf.merge(te[col+['smooth']], on=col, how='left', sort=False).sort_values('idx')
        tmp[(gf.fold.values==k)] = gf.loc[gf.fold==k,'smooth'].fillna(mn).astype('float32').values
        gf.drop_column('smooth')
        if (min_ct>0)&(k<n_folds-1): gf[col[0]] = save.copy()
    if x==-1: train[f'TE_{name}_{tar}'] = cupy.asnumpy(tmp.astype('float32'))
    elif x>=0: t2[:,x] = cupy.asnumpy(tmp.astype('float32'))

In [55]:
# CPU STORAGE FOR NEW FEATURES
# This is faster than adding each new column to Pandas dataframe
train2 = np.zeros((train.shape[0],28),dtype='float32')
valid2 = np.zeros((valid.shape[0],28),dtype='float32')

In [56]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id', 'tw_hash0', 'tw_rt_uhash']:
    for t in ['reply', 'retweet', 'retweet_comment', 'like']:
        start = time.time()
        target_encode_cudf_v3(train, valid, col=c, tar=t, smooth=20, min_ct=0,
                              t2=train2, v2=valid2, x=idx, shuffle=False)
        end = time.time(); idx += 1
        cols.append(f'TE_{c}_{t}')
        print('TE',c,t,'%.1f seconds'%(end-start))

TE media reply 12.2 seconds
TE media retweet 7.7 seconds
TE media retweet_comment 7.2 seconds
TE media like 6.8 seconds
TE tweet_type reply 6.3 seconds
TE tweet_type retweet 6.2 seconds
TE tweet_type retweet_comment 6.3 seconds
TE tweet_type like 6.6 seconds
TE language reply 8.3 seconds
TE language retweet 7.0 seconds
TE language retweet_comment 6.3 seconds
TE language like 6.4 seconds
TE a_user_id reply 12.8 seconds
TE a_user_id retweet 12.6 seconds
TE a_user_id retweet_comment 12.0 seconds
TE a_user_id like 11.6 seconds
TE b_user_id reply 15.3 seconds
TE b_user_id retweet 13.5 seconds
TE b_user_id retweet_comment 15.2 seconds
TE b_user_id like 13.3 seconds
TE tw_hash0 reply 7.9 seconds
TE tw_hash0 retweet 7.4 seconds
TE tw_hash0 retweet_comment 7.8 seconds
TE tw_hash0 like 9.2 seconds
TE tw_rt_uhash reply 8.8 seconds
TE tw_rt_uhash retweet 8.2 seconds
TE tw_rt_uhash retweet_comment 8.3 seconds
TE tw_rt_uhash like 10.5 seconds
CPU times: user 2min 33s, sys: 1min 58s, total: 4min 31s


In [57]:
%%time
# MAKE SURE VALID HAS INDEX 0,1,2,3...
valid = pd.concat([valid,pd.DataFrame(valid2,columns=cols)],axis=1)
del valid2; x=gc.collect()

CPU times: user 8.3 s, sys: 2.58 s, total: 10.9 s
Wall time: 10.3 s


In [58]:
%%time
# MAKE SURE TRAIN HAS INDEX 0,1,2,3...
train = pd.concat([train,pd.DataFrame(train2,columns=cols)],axis=1)
del train2; x=gc.collect()

CPU times: user 38.4 s, sys: 12 s, total: 50.3 s
Wall time: 47.6 s


## Mulitple Column Target Encode

In [59]:
# CPU STORAGE FOR NEW FEATURES
# This is faster than adding each new column to Pandas dataframe
train2 = np.zeros((train.shape[0],4),dtype='float32')
valid2 = np.zeros((valid.shape[0],4),dtype='float32')

In [60]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols = []
c = ['domains','language','b_follows_a','tweet_type','media','a_is_verified']
for t in ['reply', 'retweet', 'retweet_comment', 'like']:
    start = time.time()
    target_encode_cudf_v3(train, valid, col=c, tar=t, smooth=20, min_ct=0,
                            t2=train2, v2=valid2, x=idx, shuffle=False)
    end = time.time(); idx += 1
    cols.append(f'TE_mult_{t}')
    print('TE','mult',t,'%.1f seconds'%(end-start))

TE mult reply 50.6 seconds
TE mult retweet 16.8 seconds
TE mult retweet_comment 16.5 seconds
TE mult like 16.8 seconds
CPU times: user 57.5 s, sys: 48 s, total: 1min 45s
Wall time: 1min 40s


In [61]:
%%time
# MAKE SURE VALID HAS INDEX 0,1,2,3...
valid = pd.concat([valid,pd.DataFrame(valid2,columns=cols)],axis=1)
del valid2; x=gc.collect()

CPU times: user 2.09 s, sys: 2.08 s, total: 4.17 s
Wall time: 4.01 s


In [62]:
%%time
# MAKE SURE TRAIN HAS INDEX 0,1,2,3...
train = pd.concat([train,pd.DataFrame(train2,columns=cols)],axis=1)
del train2; x=gc.collect()

CPU times: user 8.22 s, sys: 9.55 s, total: 17.8 s
Wall time: 17.4 s


## Elapsed Time Target Encode

In [63]:
# CPU STORAGE FOR NEW FEATURES
# This is faster than adding each new column to Pandas dataframe
train2 = np.zeros((train.shape[0],5),dtype='float32')
valid2 = np.zeros((valid.shape[0],5),dtype='float32')

In [64]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
    for t in ['elapsed_time']:
        start = time.time()
        target_encode_cudf_v3(train, valid, col=c, tar=t, smooth=20, min_ct=0,
                              t2=train2, v2=valid2, x=idx, shuffle=False)
        end = time.time(); idx += 1
        cols.append(f'TE_{c}_{t}')
        print('TE',c,t,'%.1f seconds'%(end-start))

TE media elapsed_time 43.5 seconds
TE tweet_type elapsed_time 5.3 seconds
TE language elapsed_time 5.3 seconds
TE a_user_id elapsed_time 10.7 seconds
TE b_user_id elapsed_time 12.9 seconds
CPU times: user 40.7 s, sys: 40.6 s, total: 1min 21s
Wall time: 1min 17s


In [65]:
%%time
# MAKE SURE VALID HAS INDEX 0,1,2,3...
valid = pd.concat([valid,pd.DataFrame(valid2,columns=cols)],axis=1)
del valid2; x=gc.collect()

CPU times: user 2.07 s, sys: 2.33 s, total: 4.4 s
Wall time: 4.24 s


In [66]:
%%time
# MAKE SURE TRAIN HAS INDEX 0,1,2,3...
train = pd.concat([train,pd.DataFrame(train2,columns=cols)],axis=1)
del train2; x=gc.collect()

CPU times: user 8.85 s, sys: 10.4 s, total: 19.3 s
Wall time: 18.9 s


## Count Encode

In [67]:
def count_encode_cudf_v2(train,valid,col,t2=None,v2=None,x=-1):
    #
    # col = column to count encode
    # if x==-1 then result appended to train and valid
    # if x>=0 then result returned in numpy arrays t2 and v2
    #    make sure x is even because it returns in x and x+1 column
    #
    # COUNT TRAIN SEPARATELY
    gf = cudf.from_pandas(train[[col]]).reset_index(drop=True); gf['idx'] = gf.index
    te = gf.groupby(col)[['idx']].agg('count').rename({'idx':'ct'})
    gf = gf.merge(te,left_on=col,right_index=True,how='left').sort_values('idx')
    if x==-1: train[f'CE_{col}_norm'] = (gf.ct/len(gf)).astype('float32').to_array()
    elif x>=0: 
        t2[:,x] = (gf.ct/len(gf)).astype('float32').to_array()
        #t2[:,x+1] = gf.ct.astype('float32').to_array()

    # COUNT VALID SEPARATELY
    gf2 = cudf.from_pandas(valid[[col]]).reset_index(drop=True); gf2['idx'] = gf2.index
    te = gf2.groupby(col)[['idx']].agg('count').rename({'idx':'ct'})
    gf2 = gf2.merge(te,left_on=col,right_index=True,how='left').sort_values('idx')
    if x==-1: valid[f'CE_{col}_norm'] = (gf2.ct/len(gf2)).astype('float32').to_array()
    elif x>=0: 
        v2[:,x] = (gf2.ct/len(gf2)).astype('float32').to_array()
        #v2[:,x+1] = gf2.ct.astype('float32').to_array()
        
    # COUNT TRAIN VALID TOGETHER
    gf3 = cudf.concat([gf,gf2],axis=0)
    te = gf3.groupby(col)[['idx']].agg('count').rename({'idx':'ct2'})
    gf = gf.merge(te,left_on=col,right_index=True,how='left').sort_values('idx')
    gf2 = gf2.merge(te,left_on=col,right_index=True,how='left').sort_values('idx')
    if x==-1:
        train[f'CE_{col}'] = gf.ct2.astype('float32').to_array()
        valid[f'CE_{col}'] = gf2.ct2.astype('float32').to_array()
    elif x>=0:
        t2[:,x+1] = gf.ct2.astype('float32').to_array()
        v2[:,x+1] = gf2.ct2.astype('float32').to_array()

In [68]:
# CPU STORAGE FOR NEW FEATURES
# This is faster than adding each new column to Pandas dataframe
train2 = np.zeros((train.shape[0],10),dtype='float32')
valid2 = np.zeros((valid.shape[0],10),dtype='float32')

In [69]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
        start = time.time()
        count_encode_cudf_v2(train,valid,col=c,t2=train2,v2=valid2,x=idx)
        end = time.time(); idx += 2
        cols.append(f'CE_{c}_norm')
        cols.append(f'CE_{c}')
        print('CE',c,'%.1f seconds'%(end-start))

CE media 45.4 seconds
CE tweet_type 3.7 seconds
CE language 3.6 seconds
CE a_user_id 4.7 seconds
CE b_user_id 5.3 seconds
CPU times: user 33.9 s, sys: 32.8 s, total: 1min 6s
Wall time: 1min 2s


In [70]:
%%time
# MAKE SURE VALID HAS INDEX 0,1,2,3...
valid = pd.concat([valid,pd.DataFrame(valid2,columns=cols)],axis=1)
del valid2; x=gc.collect()

CPU times: user 3.14 s, sys: 2.43 s, total: 5.57 s
Wall time: 5.32 s


In [71]:
%%time
# MAKE SURE TRAIN HAS INDEX 0,1,2,3...
train = pd.concat([train,pd.DataFrame(train2,columns=cols)],axis=1)
del train2; x=gc.collect()

CPU times: user 13.8 s, sys: 12.8 s, total: 26.5 s
Wall time: 25.7 s


## Difference Encode (Lag Features)

In [72]:
def diff_encode_cudf_v1(train,col,tar,sort_col=None,sft=1,t2=None,x=0):
    if sort_col is None: 
        gf = cudf.from_pandas(train[[col, tar]]).reset_index(drop=True)
        gf['idx'] = gf.index        
        gf = gf.sort_values([col])
    else: 
        gf = cudf.from_pandas(train[[col, tar, sort_col]]).reset_index(drop=True)
        gf['idx'] = gf.index
        gf = gf.sort_values([col,sort_col])
    gf[col+'_sft'] = gf[col].shift(sft)
    gf[tar+'_sft'] = gf[tar].shift(sft)
    gf[tar+'_diff'] = gf[tar]-gf[tar+'_sft']
    gf.loc[gf[col]!=gf[col+'_sft'], tar+'_diff'] = 0
    gf = gf.sort_values(['idx'])
    if t2 is None: train[tar+'_diff'] = gf[tar+'_diff'].fillna(0).astype('float32').to_array()
    else: t2[:,x] = gf[tar+'_diff'].fillna(0).astype('float32').to_array()

In [73]:
# CPU STORAGE FOR NEW FEATURES
# This is faster than adding each new column to Pandas dataframe
train2 = np.zeros((train.shape[0],6),dtype='float32')
valid2 = np.zeros((valid.shape[0],6),dtype='float32')

In [74]:
%%time
# cuDF DE ENCODING IS FAST!!
idx = 0; cols = []; sc = 'timestamp'
for c in ['b_user_id']:
    for t in ['b_follower_count','b_following_count','language']:
        for s in [1,-1]:
            start = time.time()
            diff_encode_cudf_v1(train, col=c, tar=t, sft=s, sort_col=sc, t2=train2, x=idx)
            diff_encode_cudf_v1(valid, col=c, tar=t, sft=s, sort_col=sc, t2=valid2, x=idx)
            end = time.time(); idx += 1
            cols.append(f'DE_{c}_{t}_{s}')
            print('DE',c,t,s,'%.1f seconds'%(end-start))

DE b_user_id b_follower_count 1 51.6 seconds
DE b_user_id b_follower_count -1 6.6 seconds
DE b_user_id b_following_count 1 6.6 seconds
DE b_user_id b_following_count -1 6.5 seconds
DE b_user_id language 1 5.9 seconds
DE b_user_id language -1 6.5 seconds
CPU times: user 45.6 s, sys: 42.9 s, total: 1min 28s
Wall time: 1min 23s


In [75]:
%%time
# MAKE SURE VALID HAS INDEX 0,1,2,3...
valid = pd.concat([valid,pd.DataFrame(valid2,columns=cols)],axis=1)
del valid2; x=gc.collect()

CPU times: user 2.59 s, sys: 2.73 s, total: 5.32 s
Wall time: 5.09 s


In [76]:
%%time
# MAKE SURE TRAIN HAS INDEX 0,1,2,3...
train = pd.concat([train,pd.DataFrame(train2,columns=cols)],axis=1)
del train2; x=gc.collect()

CPU times: user 11.2 s, sys: 12.6 s, total: 23.8 s
Wall time: 23.2 s


## Diff Language

In [77]:
def add_diff_language(train, valid):
    gf1 = cudf.from_pandas(train[['a_user_id', 'language', 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[['a_user_id', 'language', 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf2['idx'] = gf2.index
    gf = cudf.concat([gf1, gf2], axis=0)
    gf_lang = gf[['a_user_id', 'language', 'tweet_id']].drop_duplicates()
    gf_lang = gf_lang.groupby(['a_user_id', 'language']).count().reset_index()
    gf_lang = gf_lang.sort_values(['a_user_id', 'tweet_id'], ascending=False)
    gf_lang['a_user_shifted'] = gf_lang['a_user_id'].shift(1)
    gf_lang = gf_lang[gf_lang['a_user_shifted']!=gf_lang['a_user_id']]
    gf_lang.columns = ['a_user_id_lang', 'top_tweet_language', 'drop1', 'drop2']
    gf1 = gf1.merge(gf_lang[['a_user_id_lang', 'top_tweet_language']], how='left', left_on='b_user_id', right_on='a_user_id_lang')
    gf2 = gf2.merge(gf_lang[['a_user_id_lang', 'top_tweet_language']], how='left', left_on='b_user_id', right_on='a_user_id_lang')
    gf1 = gf1.sort_values('idx')
    gf2 = gf2.sort_values('idx')
    gf1['same_language'] = gf1['language'] == gf1['top_tweet_language']
    gf1['diff_language'] = gf1['language'] != gf1['top_tweet_language']
    gf1['nan_language'] = 0
    gf1.loc[gf1['top_tweet_language'].isna(), 'same_language'] = 0
    gf1.loc[gf1['top_tweet_language'].isna(), 'diff_language'] = 0
    gf1.loc[gf1['top_tweet_language'].isna(), 'nan_language'] = 1
    gf2['same_language'] = gf2['language'] == gf2['top_tweet_language']
    gf2['diff_language'] = gf2['language'] != gf2['top_tweet_language']
    gf2['nan_language'] = 0
    gf2.loc[gf2['top_tweet_language'].isna(), 'same_language'] = 0
    gf2.loc[gf2['top_tweet_language'].isna(), 'diff_language'] = 0
    gf2.loc[gf2['top_tweet_language'].isna(), 'nan_language'] = 1
    train['same_language'] = gf1['same_language'].fillna(0).astype('int32').to_array()
    train['diff_language'] = gf1['diff_language'].fillna(0).astype('int32').to_array()
    train['nan_language'] = gf1['nan_language'].fillna(0).astype('int32').to_array()
    valid['same_language'] = gf2['same_language'].fillna(0).astype('int32').to_array()
    valid['diff_language'] = gf2['diff_language'].fillna(0).astype('int32').to_array()
    valid['nan_language'] = gf2['nan_language'].fillna(0).astype('int32').to_array()

In [78]:
%%time
add_diff_language(train,valid)

CPU times: user 25.6 s, sys: 33.3 s, total: 58.9 s
Wall time: 55.4 s


## Follower Ratio

In [79]:
%%time
# follow rate feature
train['a_ff_rate'] = (train['a_following_count'] / train['a_follower_count']).astype('float32')
train['b_ff_rate'] = (train['b_follower_count']  / train['b_following_count']).astype('float32')
valid['a_ff_rate']  = (valid['a_following_count'] / valid['a_follower_count']).astype('float32')
valid['b_ff_rate']  = (valid['b_follower_count']  / valid['b_following_count']).astype('float32')

CPU times: user 1.76 s, sys: 1.2 s, total: 2.96 s
Wall time: 2.77 s


In [80]:
train.to_parquet('results/sub_train.parquet')
valid.to_parquet('results/sub_valid.parquet')

In [81]:
#train = pd.read_parquet('/recsys_features2/sub_train.parquet')
#valid = pd.read_parquet('/recsys_features2/sub_valid.parquet')

In [82]:
%%time
# follow rate feature
train['ab_fing_rate'] = (train['a_following_count'] / train['b_following_count']).astype('float32')
train['ab_fer_rate'] = (train['a_follower_count'] / train['b_follower_count']).astype('float32')
valid['ab_fing_rate'] = (valid['a_following_count'] / valid['b_following_count']).astype('float32')
valid['ab_fer_rate'] = (valid['a_follower_count'] / valid['b_follower_count']).astype('float32')

CPU times: user 1.69 s, sys: 1.21 s, total: 2.9 s
Wall time: 2.72 s


In [83]:
%%time
train['a_age'] = (datetime(2020, 2, 1)-train['a_account_creation']).dt.days/30
train['b_age'] = (datetime(2020, 2, 1)-train['b_account_creation']).dt.days/30
train['ab_age_dff'] = (train['a_account_creation']-train['b_account_creation']).dt.days/30
train['ab_age_rate'] = train['a_age']/train['b_age']

valid['a_age'] = (datetime(2020, 2, 1)-valid['a_account_creation']).dt.days/30
valid['b_age'] = (datetime(2020, 2, 1)-valid['b_account_creation']).dt.days/30
valid['ab_age_dff'] = (valid['a_account_creation']-valid['b_account_creation']).dt.days/30
valid['ab_age_rate'] = valid['a_age']/valid['b_age']

CPU times: user 15.3 s, sys: 7.71 s, total: 23.1 s
Wall time: 21.6 s


In [84]:
def follower_chain_2(train, valid):
    gf1 = cudf.from_pandas(train[['a_user_id', 'b_user_id', 'b_follows_a']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[['a_user_id', 'b_user_id', 'b_follows_a']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf2['idx'] = gf2.index
    
    gf = cudf.concat([gf1, gf2], axis=0)
    gf = gf[gf['b_follows_a']]
    gf.drop_column('idx')
    gf.drop_column('b_follows_a')
    gf = gf.drop_duplicates()
    
    gf1 = gf1.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['b_user_id', 'a_user_id'])
    gf1.columns = ['a_user_id', 'b_user_id', 'b_follows_a', 'idx', 'a_user_id_2', 'b_user_id_2']
    gf1['a_follows_b'] = 0
    gf1.loc[gf1['a_user_id_2']>0, 'a_follows_b'] = 1
    gf1.drop_column('a_user_id_2')
    gf1.drop_column('b_user_id_2')
    
    gf2 = gf2.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['b_user_id', 'a_user_id'])
    gf2.columns = ['a_user_id', 'b_user_id', 'b_follows_a', 'idx', 'a_user_id_2', 'b_user_id_2']
    gf2['a_follows_b'] = 0
    gf2.loc[gf2['a_user_id_2']>0, 'a_follows_b'] = 1
    gf2.drop_column('a_user_id_2')
    gf2.drop_column('b_user_id_2')
    
    gf = gf.merge(gf, how='left', left_on='b_user_id', right_on='a_user_id')
    gf = gf[gf['a_user_id_y']>0]
    gf.drop_column('a_user_id_y')
    gf.columns = ['a_user_id', 'b_user_id', 'b_user_id_2']
    gf.drop_column('b_user_id')
    gf = gf.drop_duplicates()
    gf['b_user_id'] = 0
    
    gf1 = gf1.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['a_user_id', 'b_user_id_2'])
    gf1['b_follows_a_2'] = 0
    gf1.loc[gf1['b_user_id_2']>0, 'b_follows_a_2'] = 1
    gf1.drop_column('b_user_id_y')
    gf1.drop_column('b_user_id_2')
    gf1.columns = ['a_user_id', 'b_user_id', 'b_follows_a', 'idx', 'a_follows_b', 'b_follows_a_2']
    
    gf1 = gf1.merge(gf, how='left', left_on=['b_user_id', 'a_user_id'], right_on=['a_user_id', 'b_user_id_2'])
    gf1['a_follows_b_2'] = 0
    gf1.loc[gf1['b_user_id_2']>0, 'a_follows_b_2'] = 1
    gf1.drop_column('b_user_id_y')
    gf1.drop_column('b_user_id_2')
    gf1.drop_column('a_user_id_y')
    gf1.columns = ['a_user_id', 'b_user_id', 'b_follows_a', 'idx', 'a_follows_b', 'b_follows_a_2', 'a_follows_a_2']
    
    gf2 = gf2.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['a_user_id', 'b_user_id_2'])
    gf2['b_follows_a_2'] = 0
    gf2.loc[gf2['b_user_id_2']>0, 'b_follows_a_2'] = 1
    gf2.drop_column('b_user_id_y')
    gf2.drop_column('b_user_id_2')
    gf2.columns = ['a_user_id', 'b_user_id', 'b_follows_a', 'idx', 'a_follows_b', 'b_follows_a_2']
    
    gf2 = gf2.merge(gf, how='left', left_on=['b_user_id', 'a_user_id'], right_on=['a_user_id', 'b_user_id_2'])
    gf2['a_follows_b_2'] = 0
    gf2.loc[gf2['b_user_id_2']>0, 'a_follows_b_2'] = 1
    gf2.drop_column('b_user_id_y')
    gf2.drop_column('b_user_id_2')
    gf2.drop_column('a_user_id_y')
    gf2.columns = ['a_user_id', 'b_user_id', 'b_follows_a', 'idx', 'a_follows_b', 'b_follows_a_2', 'a_follows_a_2']
    
    gf1 = gf1.sort_values('idx')
    gf2 = gf2.sort_values('idx')
    
    train['a_follows_b'] = gf1['a_follows_b'].fillna(0).astype('int8').to_array()
    train['b_follows_a_2'] = gf1['b_follows_a_2'].fillna(0).astype('int8').to_array()
    train['a_follows_b_2'] = gf1['a_follows_a_2'].fillna(0).astype('int8').to_array()
    
    valid['a_follows_b'] = gf2['a_follows_b'].fillna(0).astype('int8').to_array()
    valid['b_follows_a_2'] = gf2['b_follows_a_2'].fillna(0).astype('int8').to_array()
    valid['a_follows_b_2'] = gf2['a_follows_a_2'].fillna(0).astype('int8').to_array()

In [85]:
%%time

follower_chain_2(train, valid)

CPU times: user 22.1 s, sys: 31 s, total: 53.1 s
Wall time: 49.7 s


In [86]:
def combined_frequency(train, valid):
    gf1 = cudf.from_pandas(train[['a_user_id', 'b_user_id']]).reset_index(drop=True)
    gf1['idx'] = gf1.index
    gf2 = cudf.from_pandas(valid[['a_user_id', 'b_user_id']]).reset_index(drop=True)
    gf2['idx'] = gf2.index
    
    gf = cudf.concat([gf1, gf2])
    gf = gf[['a_user_id', 'b_user_id']].groupby(['a_user_id', 'b_user_id']).size().reset_index()
    gf.columns = ['a_user_id', 'b_user_id', 'freq_same']
    gf1 = gf1.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['a_user_id', 'b_user_id'])
    gf2 = gf2.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['a_user_id', 'b_user_id'])
    gf.columns = ['a_user_id', 'b_user_id', 'freq_diff']
    gf1 = gf1.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['b_user_id', 'a_user_id'])
    gf2 = gf2.merge(gf, how='left', left_on=['a_user_id', 'b_user_id'], right_on=['b_user_id', 'a_user_id'])
    
    train['ab_freq_s'] = gf1['freq_same'].fillna(0).astype('int32').to_array()
    train['ab_freq_d'] = gf1['freq_diff'].fillna(0).astype('int32').to_array()
    train['ab_freq_sd'] = train['ab_freq_s'] + train['ab_freq_d']
    
    valid['ab_freq_s'] = gf2['freq_same'].fillna(0).astype('int32').to_array()
    valid['ab_freq_d'] = gf2['freq_diff'].fillna(0).astype('int32').to_array()
    valid['ab_freq_sd'] = valid['ab_freq_s'] + valid['ab_freq_d']

def add_no_tweet_time(train, valid):
    gf1 = cudf.from_pandas(train[['timestamp', 'a_user_id', 'b_user_id', 'tweet_id', 'no_tweet']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[['timestamp', 'a_user_id', 'b_user_id', 'tweet_id', 'no_tweet']]).reset_index(drop=True)

    gf = cudf.concat([gf1, gf2], axis=0)
    gf = dask_cudf.from_cudf(gf, npartitions=64)
    gf['timestamp'] = gf['timestamp'].astype('int64')/1e9
    gf_unique = gf[['timestamp', 'a_user_id', 'tweet_id']].drop_duplicates()
    gf_unique.columns = ['tmp_timestamp', 'tmp_a_user_id', 'tmp_tweet_id']
    gf = gf[gf['no_tweet']!=0]
    gf = gf.drop('no_tweet', axis=1)
    gf = gf.drop('a_user_id', axis=1)
    gf = gf.merge(gf_unique, how='left', left_on='b_user_id', right_on='tmp_a_user_id')
    gf = gf[gf['tweet_id']!=gf['tmp_tweet_id']]
    gf = gf[~gf['tmp_a_user_id'].isna()]

    for sec_interval in [5,60,240,480,1440]:
        gf['diff_timestamp_prev'] = gf['timestamp']-gf['tmp_timestamp']
        gf['diff_timestamp_after'] = gf['tmp_timestamp']-gf['timestamp']
        gf['diff_timestamp_after'] = gf.diff_timestamp_after.where(gf['diff_timestamp_after']>0, 15*24*3600)
        gf['diff_timestamp_prev'] = gf.diff_timestamp_prev.where(gf['diff_timestamp_prev']>0, 15*24*3600)
        gf['diff_timestamp_after'] = gf.diff_timestamp_after.where(gf['diff_timestamp_after']<sec_interval*60, 0)
        gf['diff_timestamp_after'] = gf.diff_timestamp_after.where(gf['diff_timestamp_after']==0, 1)
        gf['diff_timestamp_prev'] = gf.diff_timestamp_after.where(gf['diff_timestamp_prev']<sec_interval*60, 0)
        gf['diff_timestamp_prev'] = gf.diff_timestamp_after.where(gf['diff_timestamp_prev']==0, 1)
        gf_tmp = gf[['tweet_id', 
                     'b_user_id', 
                     'diff_timestamp_prev', 
                     'diff_timestamp_after']].groupby(['tweet_id', 'b_user_id']).sum().reset_index()

        gf_tmp.to_parquet('/tmp/time2_gf' + str(sec_interval))

    for sec_interval in [5,60,240,480,1440]:
        gf = cudf.read_parquet('/tmp/time2_gf' + str(sec_interval) + '/part.0.parquet')
        gf.columns = ['idx2', 'tweet_id', 'b_user_id', 'tweets_pres_s' + str(sec_interval), 'tweets_after_s' + str(sec_interval)]
        gf = gf.drop('idx2', axis=1)
        gf1 = cudf.from_pandas(train[['b_user_id', 'tweet_id']]).reset_index(drop=True)
        gf1['idx'] = gf1.index
        gf1 = gf1.merge(gf, how='left', left_on=['tweet_id', 'b_user_id'], right_on=['tweet_id', 'b_user_id'])
        gf1 = gf1.sort_values('idx')
        #train['tweets_after_s' + str(sec_interval)] = gf1['tweets_after_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        train['tweets_prev_s' + str(sec_interval)] = gf1['tweets_pres_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        del gf1; gc.collect()

        gf1 = cudf.from_pandas(valid[['b_user_id', 'tweet_id']]).reset_index(drop=True)
        gf1['idx'] = gf1.index
        gf1 = gf1.merge(gf, how='left', left_on=['tweet_id', 'b_user_id'], right_on=['tweet_id', 'b_user_id'])
        gf1 = gf1.sort_values('idx')
        #valid['tweets_after_s' + str(sec_interval)] = gf1['tweets_after_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        valid['tweets_prev_s' + str(sec_interval)] = gf1['tweets_pres_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        del gf1; gc.collect()

def add_no_eng_time(train, valid):
    gf1 = cudf.from_pandas(train[['timestamp', 'b_user_id', 'tweet_id']]).reset_index(drop=True)
    gf2 = cudf.from_pandas(valid[['timestamp', 'b_user_id', 'tweet_id']]).reset_index(drop=True)

    gf = cudf.concat([gf1, gf2], axis=0)
    gf = dask_cudf.from_cudf(gf, npartitions=64)
    gf['timestamp'] = gf['timestamp'].astype('int64')/1e9
    gf_unique = gf[['timestamp', 'b_user_id', 'tweet_id']]
    gf_unique.columns = ['tmp_timestamp', 'tmp_b_user_id', 'tmp_tweet_id']
    gf = gf.merge(gf_unique, how='left', left_on='b_user_id', right_on='tmp_b_user_id')
    gf = gf[gf['tweet_id']!=gf['tmp_tweet_id']]

    for sec_interval in [5,60,240,480,1440]:
        gf['diff_timestamp_prev'] = gf['timestamp']-gf['tmp_timestamp']
        gf['diff_timestamp_prev'] = gf.diff_timestamp_prev.where(gf['diff_timestamp_prev']>0, 15*24*3600)
        gf['diff_timestamp_prev'] = gf.diff_timestamp_prev.where(gf['diff_timestamp_prev']<sec_interval*60, 0)
        gf['diff_timestamp_prev'] = gf.diff_timestamp_prev.where(gf['diff_timestamp_prev']==0, 1)
        gf['diff_timestamp_after'] = gf['tmp_timestamp']-gf['timestamp']
        gf['diff_timestamp_after'] = gf.diff_timestamp_after.where(gf['diff_timestamp_after']>0, 15*24*3600)
        gf['diff_timestamp_after'] = gf.diff_timestamp_after.where(gf['diff_timestamp_after']<sec_interval*60, 0)
        gf['diff_timestamp_after'] = gf.diff_timestamp_after.where(gf['diff_timestamp_after']==0, 1)
        gf_tmp = gf[['tweet_id', 
                     'b_user_id', 
                     'diff_timestamp_prev', 
                     'diff_timestamp_after']].groupby(['tweet_id', 'b_user_id']).sum().reset_index()

        gf_tmp.to_parquet('/tmp/time2_eng_gf' + str(sec_interval))

    for sec_interval in [5,60,240,480,1440]:
        gf = cudf.read_parquet('/tmp/time2_eng_gf' + str(sec_interval) + '/part.0.parquet')
        gf.columns = ['idx2', 'tweet_id', 'b_user_id', 'tweets_pres_s' + str(sec_interval), 'tweets_after_s' + str(sec_interval)]
        gf = gf.drop('idx2', axis=1)
        gf1 = cudf.from_pandas(train[['b_user_id', 'tweet_id']]).reset_index(drop=True)
        gf1['idx'] = gf1.index
        gf1 = gf1.merge(gf, how='left', left_on=['tweet_id', 'b_user_id'], right_on=['tweet_id', 'b_user_id'])
        gf1 = gf1.sort_values('idx')
        train['eng_after_s' + str(sec_interval)] = gf1['tweets_after_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        train['eng_prev_s' + str(sec_interval)] = gf1['tweets_pres_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        del gf1; gc.collect()

        gf1 = cudf.from_pandas(valid[['b_user_id', 'tweet_id']]).reset_index(drop=True)
        gf1['idx'] = gf1.index
        gf1 = gf1.merge(gf, how='left', left_on=['tweet_id', 'b_user_id'], right_on=['tweet_id', 'b_user_id'])
        gf1 = gf1.sort_values('idx')
        valid['eng_after_s' + str(sec_interval)] = gf1['tweets_after_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        valid['eng_prev_s' + str(sec_interval)] = gf1['tweets_pres_s' + str(sec_interval)].fillna(0).astype('int32').to_array()
        del gf1; gc.collect()

In [87]:
%%time

combined_frequency(train, valid)

CPU times: user 3.95 s, sys: 4.4 s, total: 8.34 s
Wall time: 7.93 s


In [88]:
%%time

add_no_tweet_time(train, valid)

CPU times: user 1min 55s, sys: 1min 25s, total: 3min 21s
Wall time: 8min 44s


In [89]:
%%time

add_no_eng_time(train, valid)

CPU times: user 2min 11s, sys: 1min 31s, total: 3min 42s
Wall time: 9min 37s


In [90]:
train.to_parquet('results/sub_train_final.parquet')
valid.to_parquet('results/sub_valid_final.parquet')

# Summarize Features

In [6]:
train = pd.read_parquet('results/sub_train_final.parquet')
valid = pd.read_parquet('results/sub_valid_final.parquet')

In [7]:
label_names = ['reply', 'retweet', 'retweet_comment', 'like']
DONT_USE = ['timestamp','a_account_creation','b_account_creation','engage_time',
            'fold','tweet_id','b_user_id','a_user_id', 'dt_dow',
            'a_account_creation', 'b_account_creation', 'elapsed_time',
             'links','domains','hashtags0','hashtags1', 'tw_hash0', 'tw_hash1', 'tw_rt_uhash', 'id']
DONT_USE += label_names
features = [c for c in train.columns if c not in DONT_USE]

In [8]:
print('Using %i features:'%(len(features)))
np.asarray(features)

Using 115 features:


array(['media', 'tweet_type', 'language', 'a_follower_count',
       'a_following_count', 'a_is_verified', 'b_follower_count',
       'b_following_count', 'b_is_verified', 'b_follows_a',
       'len_hashtags', 'len_domains', 'len_links', 'count_ats',
       'count_char', 'count_words', 'dt_hour', 'dt_minute', 'dt_second',
       'same_tw_rt_uhash', 'diff_tw_rt_uhash', 'nan_tw_rt_uhash',
       'same_tw_hash0', 'diff_tw_hash0', 'nan_tw_hash0', 'no_tweet',
       'b_timestamp_1', 'b_timestamp_-1', 'diff_timestamp_prev',
       'diff_timestamp_after', 'TE_media_reply', 'TE_media_retweet',
       'TE_media_retweet_comment', 'TE_media_like', 'TE_tweet_type_reply',
       'TE_tweet_type_retweet', 'TE_tweet_type_retweet_comment',
       'TE_tweet_type_like', 'TE_language_reply', 'TE_language_retweet',
       'TE_language_retweet_comment', 'TE_language_like',
       'TE_a_user_id_reply', 'TE_a_user_id_retweet',
       'TE_a_user_id_retweet_comment', 'TE_a_user_id_like',
       'TE_b_user_id_re

# Train Model Validate
We will train on random `0.10 * 5/7` of all 7 days. This is same size that we validated with.

In [9]:
SAMPLE_RATIO = (0.15) *(0.724) # Same size as validation train
SEED = 1
if SAMPLE_RATIO < 1.0:
    train = train.sample(frac=SAMPLE_RATIO, random_state=SEED)
    gc.collect()

In [10]:
xgb_parms = { 
    'max_depth':8, 
    'learning_rate':0.1, 
    'subsample':0.8,
    'colsample_bytree':0.3, 
    'eval_metric':'auc',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    'predictor' : 'gpu_predictor'
}

import xgboost as xgb
print('XGB Version',xgb.__version__)

XGB Version 1.0.2


In [11]:
valid.head()

Unnamed: 0,tweet_id,media,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,tr,count_ats,count_char,count_words,tw_hash0,tw_hash1,tw_rt_uhash,dt_dow,dt_hour,dt_minute,dt_second,same_tw_rt_uhash,diff_tw_rt_uhash,nan_tw_rt_uhash,same_tw_hash0,diff_tw_hash0,nan_tw_hash0,no_tweet,b_timestamp_1,b_timestamp_-1,diff_timestamp_prev,diff_timestamp_after,TE_media_reply,TE_media_retweet,TE_media_retweet_comment,TE_media_like,TE_tweet_type_reply,TE_tweet_type_retweet,TE_tweet_type_retweet_comment,TE_tweet_type_like,TE_language_reply,TE_language_retweet,TE_language_retweet_comment,TE_language_like,TE_a_user_id_reply,TE_a_user_id_retweet,TE_a_user_id_retweet_comment,TE_a_user_id_like,TE_b_user_id_reply,TE_b_user_id_retweet,TE_b_user_id_retweet_comment,TE_b_user_id_like,TE_tw_hash0_reply,TE_tw_hash0_retweet,TE_tw_hash0_retweet_comment,TE_tw_hash0_like,TE_tw_rt_uhash_reply,TE_tw_rt_uhash_retweet,TE_tw_rt_uhash_retweet_comment,TE_tw_rt_uhash_like,TE_mult_reply,TE_mult_retweet,TE_mult_retweet_comment,TE_mult_like,TE_media_elapsed_time,TE_tweet_type_elapsed_time,TE_language_elapsed_time,TE_a_user_id_elapsed_time,TE_b_user_id_elapsed_time,CE_media_norm,CE_media,CE_tweet_type_norm,CE_tweet_type,CE_language_norm,CE_language,CE_a_user_id_norm,CE_a_user_id,CE_b_user_id_norm,CE_b_user_id,DE_b_user_id_b_follower_count_1,DE_b_user_id_b_follower_count_-1,DE_b_user_id_b_following_count_1,DE_b_user_id_b_following_count_-1,DE_b_user_id_language_1,DE_b_user_id_language_-1,same_language,diff_language,nan_language,a_ff_rate,b_ff_rate,ab_fing_rate,ab_fer_rate,a_age,b_age,ab_age_dff,ab_age_rate,a_follows_b,b_follows_a_2,a_follows_b_2,ab_freq_s,ab_freq_d,ab_freq_sd,tweets_prev_s5,tweets_prev_s60,tweets_prev_s240,tweets_prev_s480,tweets_prev_s1440,eng_after_s5,eng_prev_s5,eng_after_s60,eng_prev_s60,eng_after_s240,eng_prev_s240,eng_after_s480,eng_prev_s480,eng_after_s1440,eng_prev_s1440
0,57733249,5,0,2,54,2020-02-14 17:58:46,534117,13941,1216,False,2015-11-23 15:23:06,3617447,27448,600,False,2018-03-13 13:47:49,True,0,0,0,0,121386431,0,0,0,0,0,55,5,0,0,0,4,17,58,46,0,0,1,0,0,1,5,33,-128,518474,50494,0.024988,0.094235,0.006692,0.47459,0.034772,0.09841,0.007785,0.510755,0.023573,0.100355,0.008051,0.427159,0.007879,0.064392,0.002241,0.410557,0.021339,0.091062,0.006069,0.403593,0.025726,0.111593,0.007505,0.445434,0.034587,0.095342,0.007865,0.512623,0.075886,0.119572,0.014521,0.703054,-1581221000.0,-1581206000.0,-1581185000.0,-1581258000.0,-1581258000.0,0.183514,27279102.0,0.57834,84679896.0,0.447992,63148424.0,3.61888e-07,54.0,1.206293e-07,7.0,-8.0,0.0,-1.0,0.0,0.0,0.0,1,0,0,0.087225,45.746666,2.026667,0.507906,51.0,22.966667,-28.033333,2.22061,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,57733250,7,0,1,47,2020-02-18 10:30:42,2721240,186,100,False,2010-01-09 23:09:26,12365145,139,956,False,2012-04-22 15:58:19,False,0,0,0,0,121386432,0,0,0,0,0,57,5,0,0,102048,1,10,30,42,0,0,1,0,0,1,1,47,-1,766962,1296000,0.018753,0.138635,0.008074,0.540576,0.007432,0.132841,0.006253,0.288196,0.019282,0.09373,0.006771,0.462129,0.023279,0.09934,0.006621,0.485738,0.016004,0.099546,0.004552,0.552695,0.025726,0.111593,0.007505,0.445434,0.01652,0.0705,0.004699,0.473749,0.004349,0.135523,0.004298,0.343614,-1581273000.0,-1581252000.0,-1581253000.0,-1581258000.0,-1581258000.0,0.070948,10919567.0,0.329188,48525540.0,0.049365,7510074.0,4.020978e-08,3.0,2.010489e-07,17.0,0.0,0.0,0.0,0.0,-7.0,-7.0,0,1,0,0.537634,0.145397,0.104603,1.33813,122.466667,94.666667,-27.8,1.293662,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,57733251,1,0,2,13,2020-02-15 02:48:38,2023199,250470,1,False,2012-12-26 02:17:49,28952089,16,97,False,2017-08-28 17:18:31,False,0,0,0,0,121386433,0,0,0,0,0,49,5,0,0,0,5,2,48,38,0,0,1,0,0,1,0,-128,-128,1296000,1296000,0.026295,0.114113,0.006647,0.434231,0.034772,0.09841,0.007785,0.510755,0.031939,0.132462,0.014753,0.461783,0.010243,0.06371,0.002913,0.333725,0.025607,0.109274,0.007283,0.434312,0.025726,0.111593,0.007505,0.445434,0.034587,0.095342,0.007865,0.512623,0.014791,0.201175,0.011906,0.547681,-1581260000.0,-1581206000.0,-1581222000.0,-1581258000.0,-1581258000.0,0.012145,1637619.0,0.57834,84679896.0,0.006631,1035820.0,1.568181e-05,420.0,4.020978e-08,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,4e-06,0.164948,0.010309,15654.375,86.4,29.533333,-56.9,2.925508,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,57733252,7,0,1,54,2020-02-17 04:26:53,2816974,516,406,False,2015-12-02 22:49:27,13774342,460,693,False,2014-04-01 00:25:56,True,0,0,0,0,121386434,0,0,0,0,0,104,16,0,0,2446,0,4,26,53,0,0,1,0,0,1,0,58,-106,1296000,1296000,0.018753,0.138635,0.008074,0.540576,0.007432,0.132841,0.006253,0.288196,0.023573,0.100355,0.008051,0.427159,0.016004,0.068296,0.004552,0.458945,0.016004,0.099546,0.004552,0.427695,0.025726,0.111593,0.007505,0.445434,0.001847,0.138021,0.00579,0.437448,0.006334,0.144244,0.010178,0.423202,-1581273000.0,-1581252000.0,-1581185000.0,-1581258000.0,-1581258000.0,0.070948,10919567.0,0.329188,48525540.0,0.447992,63148424.0,8.041955e-08,14.0,2.412587e-07,18.0,-2.0,0.0,-1.0,0.0,0.0,0.0,0,0,1,0.786822,0.663781,0.585859,1.121739,50.7,71.033333,20.333333,0.713749,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,57733253,5,0,2,54,2020-02-13 03:49:05,366629,19576,273,True,2009-03-04 15:49:58,11208153,468,3837,False,2011-02-25 15:13:21,False,0,0,0,0,121386435,0,0,0,0,0,82,10,0,0,0,3,3,49,5,0,0,1,0,0,1,1,-128,79,355196,1296000,0.024988,0.094235,0.006692,0.47459,0.034772,0.09841,0.007785,0.510755,0.023573,0.100355,0.008051,0.427159,0.00291,0.05219,0.000828,0.458445,0.025607,0.109274,0.007283,0.434312,0.025726,0.111593,0.007505,0.445434,0.034587,0.095342,0.007865,0.512623,0.018369,0.069014,0.006594,0.517109,-1581221000.0,-1581206000.0,-1581185000.0,-1581258000.0,-1581258000.0,0.183514,27279102.0,0.57834,84679896.0,0.447992,63148424.0,1.005244e-06,181.0,1.206293e-07,3.0,0.0,1.0,0.0,-5.0,0.0,0.0,1,0,0,0.013946,0.12197,0.071149,41.82906,132.833333,108.733333,-24.1,1.221643,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
tr_arr = valid[['tr', 'id']]

In [13]:
# CREATE TRAIN AND VALIDATION SETS
RMV = [c for c in DONT_USE if c in train.columns]

X_train = train.drop(RMV, axis=1)
Y_train = train[label_names]
del train
gc.collect()

X_valid = valid[X_train.columns]
#Y_valid = valid[label_names]
del valid
gc.collect()

if X_train.columns.duplicated().sum()>0:
    raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')
print(f'X_valid.shape {X_valid.shape}')

# I'M NOT A FAN OF REDUCING TO FLOAT16
#utils.reduce_mem_usage(X_train)
#utils.reduce_mem_usage(X_valid)

no dup :) 
X_train.shape (13182566, 115)
X_valid.shape (24869573, 115)


In [14]:
sub_pub = pd.read_csv('../preprocessings/sample_submission_public.csv')
sub_priv = pd.read_csv('../preprocessings/sample_submission_private.csv')

In [15]:
sub_pub.shape, sub_priv.shape

((12434735, 3), (12434838, 3))

In [16]:
np.sum(tr_arr['tr']==0), np.sum(tr_arr['tr']==1)

(12434735, 12434838)

In [17]:
label_names

['reply', 'retweet', 'retweet_comment', 'like']

In [18]:
import dask as dask

In [19]:
%%time
# Dask dataframe
dX_train = dask.dataframe.from_pandas(X_train, npartitions=8)
dY_train = dask.dataframe.from_pandas(Y_train, npartitions=8)
dX_valid = dask.dataframe.from_pandas(X_valid, npartitions=8)
dX_tr_arr = dask.dataframe.from_pandas(tr_arr, npartitions=8)

CPU times: user 43.5 s, sys: 1.89 s, total: 45.4 s
Wall time: 43.6 s


In [20]:
%%time
# CuDF Dask
ddX_train = dask_cudf.from_dask_dataframe(dX_train)
ddY_train = dask_cudf.from_dask_dataframe(dY_train)
ddX_valid = dask_cudf.from_dask_dataframe(dX_valid)
ddX_tr_arr = dask_cudf.from_dask_dataframe(dX_tr_arr)

CPU times: user 766 ms, sys: 345 ms, total: 1.11 s
Wall time: 1.12 s


In [21]:
for c in ddX_train.columns:
    if str(ddX_train[c].dtype)=='bool': 
        ddX_train[c] = ddX_train[c].astype('int8')
        ddX_valid[c] = ddX_valid[c].astype('int8')
        print (c)

a_is_verified
b_is_verified
b_follows_a


In [22]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:42677  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 429.50 GB


In [23]:
# TRAIN AND VALIDATE
LOOP1 = 3
NROUNDS = [554, 708, 448, 353] 
VERBOSE_EVAL = 50

print('loadtest')

dtest = xgb.dask.DaskDMatrix(client,data=ddX_valid)

for i in range(4):
    name = label_names[i]
    print('#'*25);print('###',name);print('#'*25)
    
    sub_pub[name] = 0
    sub_priv[name] = 0
    #models = []
    
    dtrain = xgb.dask.DaskDMatrix(client,data=ddX_train,label=ddY_train.iloc[:, i])
    
    for j in range(LOOP1):
        xgb_parms['seed'] = j
                        
        start = time.time(); print('Training...')
        model = xgb.dask.train(client, xgb_parms, 
                               dtrain=dtrain,
                               num_boost_round=NROUNDS[i],
                               verbose_eval=VERBOSE_EVAL) 
        print('Took %.1f seconds'%(time.time()-start))
        
        tr_arr2 = dX_tr_arr.compute().as_matrix()
        pd_tr_arr2 = pd.DataFrame(tr_arr2, columns=['tr', 'id'])
        
        start = time.time(); print('Predicing...')
        pred = xgb.dask.predict(client,model,dtest).compute()
        print('Took %.1f seconds'%(time.time()-start))
        
        start = time.time(); print('Combining...')
        pd_tr_arr2['pred'] = pred
        pd_tr_arr2 = pd_tr_arr2.sort_values('id')
        
        sub_pub[name] += pd_tr_arr2.loc[pd_tr_arr2['tr']==0, 'pred'].values
        sub_priv[name] += pd_tr_arr2.loc[pd_tr_arr2['tr']==1, 'pred'].values
        print('Took %.1f seconds'%(time.time()-start))
        #models.append(model)
        
        if i<3:
            del model, pd_tr_arr2
            gc.collect()
        print()
    
    if i<3:
        del dtrain
        gc.collect()
    print()
        
sub_pub.iloc[:, 2:] /= LOOP1
sub_priv.iloc[:, 2:] /= LOOP1
sub_pub.to_parquet('sub_pub_1334_mulit_v2.parquet')
sub_priv.to_parquet('sub_priv_1334_mulit_v2.parquet')
#sub.to_csv('sub_like_%i.csv'%VER,index=False,header=False)

loadtest
#########################
### reply
#########################
Training...
Took 81.4 seconds




Predicing...
Took 16.3 seconds
Combining...
Took 2.2 seconds

Training...
Took 78.7 seconds




Predicing...
Took 15.9 seconds
Combining...
Took 2.1 seconds

Training...
Took 78.3 seconds




Predicing...
Took 15.8 seconds
Combining...
Took 2.2 seconds


#########################
### retweet
#########################
Training...
Took 73.6 seconds




Predicing...
Took 19.4 seconds
Combining...
Took 2.2 seconds

Training...
Took 74.6 seconds




Predicing...
Took 19.0 seconds
Combining...
Took 2.2 seconds

Training...
Took 74.7 seconds




Predicing...
Took 19.4 seconds
Combining...
Took 2.2 seconds


#########################
### retweet_comment
#########################
Training...
Took 46.6 seconds




Predicing...
Took 13.7 seconds
Combining...
Took 2.2 seconds

Training...
Took 46.9 seconds




Predicing...
Took 13.6 seconds
Combining...
Took 2.2 seconds

Training...
Took 45.5 seconds




Predicing...
Took 13.5 seconds
Combining...
Took 2.1 seconds


#########################
### like
#########################
Training...
Took 42.1 seconds




Predicing...
Took 13.9 seconds
Combining...
Took 2.1 seconds

Training...
Took 41.3 seconds




Predicing...
Took 11.5 seconds
Combining...
Took 2.2 seconds

Training...
Took 41.9 seconds




Predicing...
Took 11.6 seconds
Combining...
Took 2.2 seconds


