Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
# Standard-library modules used throughout the notebook.
import os, time
# Expose only GPU 0 to this process; this is set before cudf/cupy are imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# NOTE(review): `start` is not read anywhere in the visible notebook; the final
# elapsed-time report uses `startNB` instead.
start = time.time()

In [2]:
# Host-side data stack and utilities.
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
# Allow up to 500 columns / rows when a frame is displayed.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# GPU dataframe / array libraries (plus a repeated `time` import).
import cudf, cupy, time
# Bare expression in the middle of the cell: it is evaluated but not displayed,
# because it is not the last statement of the cell.
cudf.__version__

# Reference time for the elapsed-time report printed in the final cell.
startNB = time.time()

In [3]:
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(gt, pred, nafill=True):
    """Area under the precision-recall curve of scores `pred` against labels `gt`.

    Parameters
    ----------
    gt : array-like
        Ground-truth labels, passed unchanged to `precision_recall_curve`.
    pred : array-like
        Predicted scores.
    nafill : bool, default True
        When true, NaN scores are replaced by the mean of the non-NaN scores
        before the curve is computed.

    Returns
    -------
    float
        `auc(recall, precision)`.

    Notes
    -----
    The NaN fill is done on a new array; the caller's `pred` is never modified.
    """
    if nafill:
        missing = np.isnan(pred)
        if missing.any():
            # np.where allocates a fresh array, so the input stays untouched.
            pred = np.where(missing, np.nanmean(pred), pred)
    prec, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, prec)

@jit
def fast_auc(y_true, y_prob):
    """ROC AUC computed in one pass over the labels sorted by ascending score.

    For every positive label the number of negatives ranked below it is
    accumulated, and the total is divided by (#negatives * #positives).
    Tied scores get no special treatment: their relative order is whatever
    `np.argsort` returns. If only one class is present the divisor is zero.
    """
    labels = np.asarray(y_true)
    labels = labels[np.argsort(y_prob)]
    total = len(labels)
    negatives_seen = 0
    ranked_pairs = 0
    for label in labels:
        negatives_seen += (1 - label)
        ranked_pairs += label * negatives_seen
    # Normalise in place on the same accumulator, as the original does.
    ranked_pairs /= (negatives_seen * (total - negatives_seen))
    return ranked_pairs

@njit
def numba_log_loss(y,x):
    """Mean negative log-likelihood of probabilities `x` given labels `y`.

    A label <= 0 is treated as the negative class (contributing log(1 - x)),
    anything else as the positive class (contributing log(x)). 1e-15 is added
    inside the logarithm to keep it finite. The loop uses `prange`; `njit` is
    applied here without a `parallel` argument.
    """
    count = x.shape[0]
    log_lik = 0.
    for k in prange(count):
        # Probability the model assigned to the observed class.
        if y[k]<=0.:
            p_observed = 1-x[k]
        else:
            p_observed = x[k]
        log_lik += np.log(p_observed + 1e-15)
    return -log_lik / count

def compute_rce(gt , pred, nafill=True, verbose=0):
    """Relative cross-entropy (in percent) of `pred` versus a constant baseline.

    The baseline ("strawman") always predicts the positive rate `mean(gt > 0)`.
    The result is `(1 - logloss(pred) / logloss(strawman)) * 100`.

    Parameters
    ----------
    gt : numpy array
        Ground-truth labels; values > 0 count as positive.
    pred : numpy array
        Predicted probabilities.
    nafill : bool, default True
        When true, NaN predictions are replaced by the mean of the non-NaN
        predictions before scoring.
    verbose : int, default 0
        When truthy, print both log-losses, their ratio and the mean prediction.

    Returns
    -------
    float
        Relative cross-entropy in percent. The baseline entropy is NaN when `gt`
        holds a single class (0 * log(0)), in which case the result is NaN.

    Notes
    -----
    The NaN fill is done on a new array; the caller's `pred` is never modified.
    """
    if nafill:
        missing = np.isnan(pred)
        if missing.any():
            # np.where allocates a fresh array, so the input stays untouched.
            pred = np.where(missing, np.nanmean(pred), pred)

    cross_entropy = numba_log_loss( gt, pred  )

    # Entropy of the constant predictor that outputs the positive rate.
    yt = np.mean(gt>0)
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))

    if verbose:
        print( "logloss: {0:.5f} / {1:.5f} = {2:.5f}".format(cross_entropy, strawman_cross_entropy, cross_entropy/strawman_cross_entropy))
        print( 'mean:    {0:.5f} / {1:.5f}'.format( np.nanmean( pred ) , yt  ) )

    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [4]:
def save_memory( df ):
    """Downcast the columns of `df` in place to narrower dtypes.

    Conversions applied per column dtype:
        uint8 -> int8, bool -> int8, uint32 -> int32, int64 -> int32,
        float64 -> float32.

    Integer columns are converted only when every value fits in the target
    type; a column whose values would wrap around (for example a uint8 column
    holding 200, or an int64 column holding 2**40) is left unchanged. Columns
    of any other dtype are left unchanged too.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    # (source dtype, target dtype, whether the value range must be verified)
    downcasts = [
        ('uint8',   np.int8,    True),
        ('bool',    np.int8,    False),
        ('uint32',  np.int32,   True),
        ('int64',   np.int32,   True),
        ('float64', np.float32, False),
    ]
    # Iterate dtypes by position alongside the column names; this avoids
    # integer-key lookups on the label-indexed `df.dtypes` Series.
    for name, dtype in zip( df.columns, df.dtypes ):
        for source, target, check_range in downcasts:
            if not (dtype == source):
                continue
            column = df[name]
            if check_range and len(column) > 0:
                limits = np.iinfo( target )
                # Compare as Python ints so the bounds check itself cannot overflow.
                if int(column.min()) < limits.min or int(column.max()) > limits.max:
                    break
            df[name] = column.astype( target )
            gc.collect()
            break
    

# Load train and test data

In [5]:
%%time
# Read the preprocessed training split and the two test splits from parquet.
train = pd.read_parquet( '../preprocessings/train-1.parquet' )
test0 = pd.read_parquet( '../preprocessings/test-0.parquet' )
test1 = pd.read_parquet( '../preprocessings/test-1.parquet' )
gc.collect()

# Tag each row with its origin so the splits can be told apart once the frames
# are concatenated: 0 = train, 1 = test0, 2 = test1.
train['tr'] = 0
test0['tr'] = 1
test1['tr'] = 2

# Displayed as the cell output: (rows, columns) of each split.
train.shape, test0.shape, test1.shape

CPU times: user 53.7 s, sys: 32.9 s, total: 1min 26s
Wall time: 5.84 s


((121386431, 25), (12434735, 25), (12434838, 25))

In [6]:
# Stack train and both test splits into a single frame; the `tr` column
# (0/1/2) keeps track of where each row came from.
train = pd.concat( (train,test0,test1), sort=False )
gc.collect()
# The per-split frames are no longer needed once they are part of `train`.
del test0, test1
gc.collect()
train.shape

(146256004, 25)

In [7]:
%%time
# Order the rows by `id` and replace the index with a fresh 0..n-1 range, so
# that row position and index label coincide from here on.
train = train.sort_values('id').reset_index(drop=True) 
gc.collect()

CPU times: user 20.1 s, sys: 12.1 s, total: 32.1 s
Wall time: 32.1 s


0

In [8]:
# Turn each engagement column into a 0/1 indicator: every strictly positive
# value becomes 1, everything else is left as it is.
for target in ['reply', 'retweet', 'retweet_comment', 'like']:
    train.loc[ train[target]>0, target ] = 1
gc.collect()

0

In [9]:
# Preview the first rows after the targets were binarised.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr
0,0,0,0,0,0,2,11,1581262691,0,986,1201,False,1274269909,6237935,94,648,False,1478011810,False,0,0,0,0,0,0
1,3,1,0,0,0,1,11,1581497241,1,1225,677,False,1255778244,879586,1139,46,False,1540395738,True,0,1,0,1,1,0
2,0,2,0,1,1,2,11,1580978528,2,3016,1623,False,1313450503,647103,780,440,False,1432084055,True,0,0,0,1,2,0
3,0,3,0,0,0,1,54,1581321849,3,2121,16,False,1547717153,13774339,1,45,False,1534313747,False,0,0,0,1,3,0
4,0,4,5,0,0,2,11,1580956787,4,813505,200,True,1476348838,13774340,171,388,False,1490166885,False,0,0,0,1,4,0


In [10]:
# Downcast column dtypes in place using the save_memory helper defined above.
save_memory(train)
gc.collect()

0

In [11]:
# Inspect the column dtypes after downcasting.
train.dtypes

hashtags              int32
tweet_id              int32
media                  int8
links                 int32
domains               int32
tweet_type             int8
language               int8
timestamp             int32
a_user_id             int32
a_follower_count      int32
a_following_count     int32
a_is_verified          int8
a_account_creation    int32
b_user_id             int32
b_follower_count      int32
b_following_count     int32
b_is_verified          int8
b_account_creation    int32
b_follows_a            int8
reply                 int32
retweet               int32
retweet_comment       int32
like                  int32
id                    int32
tr                    int32
dtype: object

In [12]:
# Decode the epoch-second timestamps once and reuse the result for all three
# calendar features, instead of running pd.to_datetime over the full column
# three times.
event_time = pd.to_datetime( train['timestamp'] , unit='s' ).dt
train['dt_day']  = event_time.day.values.astype( np.int8 )        # day of month
train['dt_dow']  = event_time.dayofweek.values.astype( np.int8 )  # 0 = Monday
train['dt_hour'] = event_time.hour.values.astype( np.int8 )
del event_time
_=gc.collect()
# Row count per day of month.
train.groupby('dt_day')['id'].agg('count')

dt_day
6     16645598
7     17604774
8     18291076
9     17594952
10    17778468
11    17395299
12    16076264
13     3566976
14     3443360
15     3676153
16     3575402
17     3827601
18     3555061
19     3225020
Name: id, dtype: int64

In [13]:
# Load the `a_count_combined` feature table; per the preview below it carries
# an `id` column alongside the feature columns.
dt = pd.read_parquet( '../preprocessings/a_count_combined-final.parquet' )
dt.head()

Unnamed: 0,id,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,0,18,-1,0,-1,0,0,0
1,1,2,1,1,1,1,1,1
2,2,2,1,1,1,1,1,1
3,3,2,1,1,1,1,1,1
4,4,2,1,1,1,1,1,1


In [14]:
a_count_features = [
    'a_count_combined',
    'a_user_fer_count_delta_time',
    'a_user_fing_count_delta_time',
    'a_user_fering_count_delta_time',
    'a_user_fing_count_mode',
    'a_user_fer_count_mode',
    'a_user_fering_count_mode',
]

# Align the feature table with `train` on the `id` key explicitly. A plain
# `train[c] = dt[c]` aligns on index labels, which is only correct when the
# index labels of `dt` happen to coincide with the row order of `train`.
train_ids = train['id'].values
if not np.array_equal( dt['id'].values, train_ids ):
    dt = dt.set_index('id').reindex( train_ids ).reset_index()
for name in a_count_features:
    train[name] = dt[name].values

# Overwrite these features with the marker value -9 for days 12 and 18.
train.loc[ (train.dt_day==12)|(train.dt_day==18) , a_count_features ] = -9

del dt; _=gc.collect()
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,0,0,0,0,0,2,11,1581262691,0,986,1201,0,1274269909,6237935,94,648,0,1478011810,0,0,0,0,0,0,0,9,6,15,18,-1,0,-1,0,0,0
1,3,1,0,0,0,1,11,1581497241,1,1225,677,0,1255778244,879586,1139,46,0,1540395738,1,0,1,0,1,1,0,12,2,8,-9,-9,-9,-9,-9,-9,-9
2,0,2,0,1,1,2,11,1580978528,2,3016,1623,0,1313450503,647103,780,440,0,1432084055,1,0,0,0,1,2,0,6,3,8,2,1,1,1,1,1,1
3,0,3,0,0,0,1,54,1581321849,3,2121,16,0,1547717153,13774339,1,45,0,1534313747,0,0,0,0,1,3,0,10,0,8,2,1,1,1,1,1,1
4,0,4,5,0,0,2,11,1580956787,4,813505,200,1,1476348838,13774340,171,388,0,1490166885,0,0,0,0,1,4,0,6,3,2,2,1,1,1,1,1,1


In [15]:
# Inspect the column dtypes after the new feature columns were attached.
train.dtypes

hashtags                          int32
tweet_id                          int32
media                              int8
links                             int32
domains                           int32
tweet_type                         int8
language                           int8
timestamp                         int32
a_user_id                         int32
a_follower_count                  int32
a_following_count                 int32
a_is_verified                      int8
a_account_creation                int32
b_user_id                         int32
b_follower_count                  int32
b_following_count                 int32
b_is_verified                      int8
b_account_creation                int32
b_follows_a                        int8
reply                             int32
retweet                           int32
retweet_comment                   int32
like                              int32
id                                int32
tr                                int32


In [16]:
# Load the text-derived feature table; per the preview below it carries an
# `id` column alongside the feature columns.
dt = pd.read_parquet( '../preprocessings/text-processings-1.parquet' )
# Reorder the rows by `id`. Note that sort_values keeps each row's original
# index label; only the row order changes.
dt.sort_values('id', inplace=True)
dt.head()

Unnamed: 0,id,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,68,17,0,0,0,0,0,0,7,0,0,0
1,1,0,182,37,1,1,1,1,1,1,12,0,0,661291
2,2,0,105,24,2,2,2,2,2,2,6,0,0,0
3,3,2,103,22,3,3,3,3,3,3,10,198539,2048,616
4,4,0,237,63,4,4,4,4,4,4,16,0,0,0


In [17]:
text_features = [
    'count_ats', 'count_char', 'count_words',
    'tw_hash', 'tw_freq_hash',
    'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
    'tw_len', 'tw_hash0', 'tw_hash1', 'tw_rt_uhash',
]

# Align the feature table with `train` on the `id` key explicitly. A plain
# `train[c] = dt[c]` aligns on index labels, so the row order of `dt` (and any
# earlier sort of it) would have no influence on which value lands on which row.
train_ids = train['id'].values
if not np.array_equal( dt['id'].values, train_ids ):
    dt = dt.set_index('id').reindex( train_ids ).reset_index()
for name in text_features:
    train[name] = dt[name].values
del dt
gc.collect()

0

In [18]:
# Check the last rows to confirm the text features were attached through the end of the frame.
train.tail()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
146255999,0,73443134,0,0,0,1,59,1582081341,1362744,5390,833,0,1546658592,30848700,56,267,0,1428007228,0,0,0,0,0,146255999,2,19,2,3,2,1,1,1,1,1,1,0,76,15,39170347,34858164,28651,210319,2793,27536,6,0,0,7739
146256000,0,73443135,0,0,0,1,54,1582091134,179475,17747,1467,0,1172558525,4950585,1837,129,0,1272381192,0,0,0,0,0,146256000,2,19,2,5,0,0,1,0,1,0,0,0,159,29,38847520,1535005,300662,267542,11,867,13,0,0,10748
146256001,0,73443136,9,0,0,1,4,1582086464,366281,4386,80,0,1457176980,30312854,8,57,0,1235182992,0,0,0,0,0,146256001,2,19,2,4,2,1,1,1,1,1,1,0,72,10,49748666,44369960,7361311,466019,19261,202536,5,0,0,58462
146256002,845560,73443137,0,0,0,2,11,1581665518,1030299,4236,4119,0,1524226898,3141261,717,464,0,1501554925,1,0,0,0,0,146256002,2,14,4,7,2,1,1,1,1,1,1,0,161,42,50323632,44880036,7551233,985413,112,2712,14,0,0,0
146256003,6845,73443138,7,0,0,2,11,1581799075,6937005,4354,3629,0,1269050894,3141261,717,464,0,1501554925,1,0,0,0,0,146256003,2,15,5,20,2,1,1,1,1,1,1,0,216,71,50323633,44880037,2674107,2769099,646,2378,26,0,0,0


In [19]:
# Row count per origin split (`tr`: 0 = train, 1 = test0, 2 = test1).
train.groupby('tr')['id'].agg('count')

tr
0    121386431
1     12434735
2     12434838
Name: id, dtype: int64

In [20]:
def _te_fold_masks( day ):
    """Yield (fit_mask, apply_mask) boolean arrays for each time-based fold.

    Masks are produced lazily, one pair at a time, in this order:
    days 7..12 (each fitted on all earlier days), days >= 13 (fitted on days
    < 13), and day 6 (fitted on days 7..11).
    """
    for fold in [7,8,9,10,11,12]:
        yield day < fold, day == fold
    yield day < 13, day >= 13
    yield (day >= 7) & (day <= 11), day == 6


def _te_smoothed_table( gf, px, col, tar, L, smooth_method ):
    """Return a one-column ('smooth') frame, indexed by `col`, fitted on rows `px`."""
    if smooth_method==0:
        # Shrink each group mean towards the mean of the fit rows with weight L.
        mn = gf[tar].iloc[px].mean().astype('float32')
        te = gf.iloc[px].groupby(col)[tar].agg(['mean','count'])
        te['smooth']  = (te['mean']*te['count'])
        te['smooth'] += (mn*L)
        te['smooth'] /= (te['count']+L)
    else:
        te = gf.iloc[px].groupby(col)[tar].agg(['sum','count'])
        te['smooth'] = (te['sum']+L) / (te['count']+1)
    # Keep only the encoded value; selecting the column explicitly does not
    # depend on the default axis of DataFrame.drop.
    return te[['smooth']]


def _te_lookup( gf, py, te, col ):
    """Look up `te` for rows `py`; returns a host array ordered like `py`, -999 for unseen groups."""
    gf2 = gf.iloc[py].copy()
    gf2 = gf2.set_index( col )
    # Remember the original row order: the order after the join is not relied
    # on, so rows are re-sorted by this counter afterwards.
    gf2['id'] = cupy.arange( gf2.shape[0] )
    gf2 = gf2.join( te, how='left' )
    gf2 = gf2.sort_values( 'id' )
    smooth = gf2['smooth'].fillna(-999)
    # Use to_array() where it exists (as the original code did) and fall back
    # to to_numpy() on cuDF versions that no longer provide it.
    to_host = getattr( smooth, 'to_array', None ) or smooth.to_numpy
    return to_host()


def MultiTE_gpu( tra, col, tar, L=1, smooth_method=0  ):
    """Time-aware target encoding of the column group `col` for target `tar`.

    Rows are split by `tra.dt_day`. Each block of rows is encoded with
    statistics computed on a different set of days, so a row never contributes
    to its own encoding:
        * day d in 7..12 : fitted on days < d
        * days >= 13     : fitted on days < 13
        * day 6          : fitted on days 7..11
    Rows with `dt_day` < 6 are not written by any fold and keep the value 0.

    Parameters
    ----------
    tra : pandas.DataFrame
        Must contain the columns in `col`, the column `tar` and `dt_day`.
    col : list of str
        Columns whose value combination defines a group.
    tar : str
        Target column.
    L : number, default 1
        Smoothing strength.
    smooth_method : {0, 1}, default 0
        0: (group_mean * count + fit_mean * L) / (count + L)
        1: (group_sum + L) / (count + 1)

    Returns
    -------
    numpy.ndarray of float32, length len(tra)
        Encoded value per row; NaN where the group was not present in the fit
        rows. Internally -999 marks missing values, so any encoded value
        <= -999 is also returned as NaN.

    Raises
    ------
    ValueError
        If `smooth_method` is neither 0 nor 1.
    """
    if smooth_method not in (0, 1):
        raise ValueError( 'smooth_method must be 0 or 1, got {!r}'.format(smooth_method) )

    cols = col+[tar]
    gf = cudf.from_pandas(tra[cols])
    day = tra.dt_day.values

    predtrain = np.zeros( tra.shape[0] )

    for fit_mask, apply_mask in _te_fold_masks( day ):
        px = np.where( fit_mask )[0]
        py = np.where( apply_mask )[0]
        if py.size == 0:
            # Nothing to encode for this fold.
            continue
        if px.size == 0:
            # No rows to fit on: every row of the fold is missing.
            predtrain[py] = -999
            continue
        te = _te_smoothed_table( gf, px, col, tar, L, smooth_method )
        predtrain[py] = _te_lookup( gf, py, te, col )
        del te

    _ = gc.collect()
    predtrain[predtrain <= -999 ] = np.nan
    return predtrain.astype(np.float32)

In [21]:
# Preview the frame before the target-encoding features are added.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,0,0,0,2,11,1581262691,0,986,1201,0,1274269909,6237935,94,648,0,1478011810,0,0,0,0,0,0,0,9,6,15,18,-1,0,-1,0,0,0,0,68,17,0,0,0,0,0,0,7,0,0,0
1,3,1,0,0,0,1,11,1581497241,1,1225,677,0,1255778244,879586,1139,46,0,1540395738,1,0,1,0,1,1,0,12,2,8,-9,-9,-9,-9,-9,-9,-9,0,182,37,1,1,1,1,1,1,12,0,0,661291
2,0,2,0,1,1,2,11,1580978528,2,3016,1623,0,1313450503,647103,780,440,0,1432084055,1,0,0,0,1,2,0,6,3,8,2,1,1,1,1,1,1,0,105,24,2,2,2,2,2,2,6,0,0,0
3,0,3,0,0,0,1,54,1581321849,3,2121,16,0,1547717153,13774339,1,45,0,1534313747,0,0,0,0,1,3,0,10,0,8,2,1,1,1,1,1,1,2,103,22,3,3,3,3,3,3,10,198539,2048,616
4,0,4,5,0,0,2,11,1580956787,4,813505,200,1,1476348838,13774340,171,388,0,1490166885,0,0,0,0,1,4,0,6,3,2,2,1,1,1,1,1,1,0,237,63,4,4,4,4,4,4,16,0,0,0


In [22]:
%%time
for t in ['retweet_comment']:
    for c in [
        ['b_user_id','tweet_type','language'],
        ['tw_first_word','tweet_type','language'],
        ['tw_last_word','tweet_type','language'],
        ['tw_hash0','tweet_type','language'],
        ['tw_hash1','tweet_type','language'],
        ['tw_rt_uhash','tweet_type','language'],
        
        ['a_user_id'],
        ['b_user_id'],
        ['tw_hash'],
        ['tw_freq_hash'],
        
        ['media','tweet_type','language','a_is_verified','b_is_verified','b_follows_a'],
        ['a_count_combined','tweet_type','language'],
        ['a_user_fer_count_delta_time','media','language'],
        ['a_user_fing_count_delta_time','media','language'],
        ['a_user_fering_count_delta_time','tweet_type','language'],
        ['a_user_fing_count_mode','media','language'],
        ['a_user_fer_count_mode','media','language'],
        ['a_user_fering_count_mode','tweet_type','language'],
        
        ['domains','media','tweet_type','language'],
        ['links','media','tweet_type','language'],
        ['hashtags','media','tweet_type','language'],
        ]:
        fname = 'TE_'+'_'.join(c)+'_'+t
        print( fname )
        train[fname] = MultiTE_gpu( train, c, t, 20, 0 )

TE_b_user_id_tweet_type_language_retweet_comment
TE_tw_first_word_tweet_type_language_retweet_comment
TE_tw_last_word_tweet_type_language_retweet_comment
TE_tw_hash0_tweet_type_language_retweet_comment
TE_tw_hash1_tweet_type_language_retweet_comment
TE_tw_rt_uhash_tweet_type_language_retweet_comment
TE_a_user_id_retweet_comment
TE_b_user_id_retweet_comment
TE_tw_hash_retweet_comment
TE_tw_freq_hash_retweet_comment
TE_media_tweet_type_language_a_is_verified_b_is_verified_b_follows_a_retweet_comment
TE_a_count_combined_tweet_type_language_retweet_comment
TE_a_user_fer_count_delta_time_media_language_retweet_comment
TE_a_user_fing_count_delta_time_media_language_retweet_comment
TE_a_user_fering_count_delta_time_tweet_type_language_retweet_comment
TE_a_user_fing_count_mode_media_language_retweet_comment
TE_a_user_fer_count_mode_media_language_retweet_comment
TE_a_user_fering_count_mode_tweet_type_language_retweet_comment
TE_domains_media_tweet_type_language_retweet_comment
TE_links_media_t

In [23]:
# Preview the frame with the new TE_* columns appended.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash,TE_b_user_id_tweet_type_language_retweet_comment,TE_tw_first_word_tweet_type_language_retweet_comment,TE_tw_last_word_tweet_type_language_retweet_comment,TE_tw_hash0_tweet_type_language_retweet_comment,TE_tw_hash1_tweet_type_language_retweet_comment,TE_tw_rt_uhash_tweet_type_language_retweet_comment,TE_a_user_id_retweet_comment,TE_b_user_id_retweet_comment,TE_tw_hash_retweet_comment,TE_tw_freq_hash_retweet_comment,TE_media_tweet_type_language_a_is_verified_b_is_verified_b_follows_a_retweet_comment,TE_a_count_combined_tweet_type_language_retweet_comment,TE_a_user_fer_count_delta_time_media_language_retweet_comment,TE_a_user_fing_count_delta_time_media_language_retweet_comment,TE_a_user_fering_count_delta_time_tweet_type_language_retweet_comment,TE_a_user_fing_count_mode_media_language_retweet_comment,TE_a_user_fer_count_mode_media_language_retweet_comment,TE_a_user_fering_count_mode_tweet_type_language_retweet_comment,TE_domains_media_tweet_type_language_retweet_comment,TE_links_media_tweet_type_language_retweet_comment,TE_hashtags_media_tweet_type_language_retweet_comment
0,0,0,0,0,0,2,11,1581262691,0,986,1201,0,1274269909,6237935,94,648,0,1478011810,0,0,0,0,0,0,0,9,6,15,18,-1,0,-1,0,0,0,0,68,17,0,0,0,0,0,0,7,0,0,0,0.006546,0.006546,0.003446,0.003143,0.003169,0.003143,0.005977,0.006546,,,0.002314,8e-06,0.001292,8.3e-05,0.001497,0.000257,0.000144,0.000166,0.002081,0.002081,0.002543
1,3,1,0,0,0,1,11,1581497241,1,1225,677,0,1255778244,879586,1139,46,0,1540395738,1,0,1,0,1,1,0,12,2,8,-9,-9,-9,-9,-9,-9,-9,0,182,37,1,1,1,1,1,1,12,0,0,661291,,,0.0014,0.003946,0.003858,0.0052,0.0014,0.040914,,0.006618,0.005248,,,,,,,,0.004026,0.004026,0.003733
2,0,2,0,1,1,2,11,1580978528,2,3016,1623,0,1313450503,647103,780,440,0,1432084055,1,0,0,0,1,2,0,6,3,8,2,1,1,1,1,1,1,0,105,24,2,2,2,2,2,2,6,0,0,0,0.006148,0.024947,0.008098,0.003428,0.003439,0.003428,0.000824,0.005088,,,0.002667,0.005562,0.006528,0.004926,0.005593,0.004663,0.006182,0.005412,0.005902,,0.002729
3,0,3,0,0,0,1,54,1581321849,3,2121,16,0,1547717153,13774339,1,45,0,1534313747,0,0,0,0,1,3,0,10,0,8,2,1,1,1,1,1,1,2,103,22,3,3,3,3,3,3,10,198539,2048,616,,0.005433,0.007758,0.005886,0.004415,0.025164,0.005232,0.006727,0.005886,0.005886,0.005144,0.015498,0.014249,0.011357,0.015358,0.010565,0.013418,0.014009,0.006033,0.006033,0.006164
4,0,4,5,0,0,2,11,1580956787,4,813505,200,1,1476348838,13774340,171,388,0,1490166885,0,0,0,0,1,4,0,6,3,2,2,1,1,1,1,1,1,0,237,63,4,4,4,4,4,4,16,0,0,0,0.006707,,0.003712,0.003428,0.003439,0.003428,0.000879,0.005675,,,0.005294,0.005562,0.005306,0.004389,0.005593,0.004256,0.005173,0.005412,0.00322,0.00322,0.003089


In [24]:
# Write the full frame, including the TE_* columns, to parquet.
# NOTE(review): the path is relative; presumably the `data/` directory must already exist — confirm.
train.to_parquet( 'data/train-final-te-retweet_comment-1.parquet' )
gc.collect()

0

In [25]:
# Report the wall-clock time elapsed since `startNB` was set, in minutes.
print('Elapsed Time is %f minutes'%((time.time()-startNB)/60))

Elapsed Time is 17.991000 minutes
