Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
import os, time
# Restrict CUDA device visibility for this process to device "0".
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Wall-clock reference taken when the notebook starts (not read again in the visible cells).
start = time.time()

In [2]:
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
# Let pandas render up to 500 columns / 500 rows before truncating a displayed frame.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# GPU dataframe imports are disabled; every cell below uses pandas/numpy only.
#import cudf, cupy
#cudf.__version__

In [3]:
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(gt, pred, nafill=True):
    """Area under the precision-recall curve of scores `pred` against labels `gt`.

    Parameters
    ----------
    gt : array-like
        Ground-truth labels, passed straight to `precision_recall_curve`.
    pred : array-like of float
        Predicted scores. May contain NaN.
    nafill : bool, default True
        When True, NaN predictions are replaced by the mean of the non-NaN
        predictions before scoring.

    Returns
    -------
    float
        `auc(recall, precision)` over the curve returned by
        `precision_recall_curve`.

    Notes
    -----
    The NaN fill is done on a new array, so the caller's `pred` is left
    untouched (the previous implementation overwrote it in place).
    """
    if nafill:
        nan_mask = np.isnan(pred)
        if nan_mask.any():
            # np.where builds a fresh array; no copy is made when there is nothing to fill.
            pred = np.where(nan_mask, np.nanmean(pred), pred)
    prec, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, prec)

@jit
def fast_auc(y_true, y_prob):
    """ROC AUC from one pass over the labels ordered by ascending score.

    `y_true` is reordered by `np.argsort(y_prob)`. Walking that order, each
    label contributes `label * (number of negatives seen so far)`, i.e. the
    count of (negative, positive) pairs ranked correctly. The total is
    divided by `negatives * positives`.

    Tied scores are taken in whatever order `argsort` yields; they do not get
    half credit. No guard exists for inputs containing a single class (the
    denominator is then zero).
    """
    ranked = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(ranked)
    negatives_seen = 0
    ordered_pairs = 0
    for idx in range(total):
        label = ranked[idx]
        negatives_seen += (1 - label)
        ordered_pairs += label * negatives_seen
    # After the loop negatives_seen is the total number of negatives.
    ordered_pairs /= (negatives_seen * (total - negatives_seen))
    return ordered_pairs

@njit
def numba_log_loss(y,x):
    """Mean binary log loss of probabilities `x` against labels `y`.

    A label `<= 0` is scored as the negative class (`log(1 - x)`), anything
    else as the positive class (`log(x)`). `1e-15` is added inside the log to
    keep it finite at probabilities of exactly 0 or 1.

    The loop uses `prange`, but the decorator does not request
    `parallel=True`. NOTE(review): per the numba docs `prange` then behaves
    like `range` - confirm if parallel execution was intended.
    """
    count = x.shape[0]
    log_sum = 0.
    for k in prange(count):
        likelihood = 1-x[k] if y[k]<=0. else x[k]
        log_sum += np.log(likelihood + 1e-15)
    return -log_sum / count

def compute_rce(gt , pred, nafill=True, verbose=0):
    """Relative cross entropy (in percent) of `pred` against labels `gt`.

    The model's mean log loss is compared with the log loss of a constant
    predictor that always outputs the observed positive rate `mean(gt > 0)`:
    `(1 - model_loss / constant_loss) * 100`.

    Parameters
    ----------
    gt : array
        Ground-truth labels; values `> 0` count as positive.
    pred : array of float
        Predicted probabilities. May contain NaN.
    nafill : bool, default True
        When True, NaN predictions are replaced by the mean of the non-NaN
        predictions before scoring.
    verbose : int or bool, default 0
        When truthy, print both log losses, their ratio and the mean
        prediction next to the positive rate.

    Returns
    -------
    float
        The relative cross entropy. It is NaN when `gt` holds a single class,
        because the constant-predictor loss is then undefined.

    Notes
    -----
    The NaN fill is done on a new array, so the caller's `pred` is left
    untouched (the previous implementation overwrote it in place).
    """
    if nafill:
        nan_mask = np.isnan(pred)
        if nan_mask.any():
            # np.where builds a fresh array; no copy is made when there is nothing to fill.
            pred = np.where(nan_mask, np.nanmean(pred), pred)

    cross_entropy = numba_log_loss( gt, pred  )

    # Positive rate, and the entropy of always predicting that rate.
    yt = np.mean(gt>0)
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))

    if verbose:
        print( "logloss: {0:.5f} / {1:.5f} = {2:.5f}".format(cross_entropy, strawman_cross_entropy, cross_entropy/strawman_cross_entropy))
        print( 'mean:    {0:.5f} / {1:.5f}'.format( np.nanmean( pred ) , yt  ) )

    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0


In [4]:
def save_memory( df ):
    """Downcast wide numeric columns of `df` in place to reduce its memory use.

    Conversions, chosen by the column's current dtype:
    uint8 -> int8, bool -> int8, uint32 -> int32, int64 -> int32,
    float64 -> float32. Columns of any other dtype are left alone.

    An integer column is narrowed only when its observed minimum and maximum
    fit the target type. A column holding out-of-range values keeps its dtype
    instead of being wrapped around silently. float64 -> float32 is always
    applied and can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    None
    """
    narrower = {
        'uint8':   np.int8,
        'bool':    np.int8,
        'uint32':  np.int32,
        'int64':   np.int32,
        'float64': np.float32,
    }
    # Pair names with dtypes directly; positional lookups such as
    # `df.dtypes[i]` on a label-indexed Series are deprecated in pandas.
    for name, dtype in zip(df.columns, df.dtypes):
        target = narrower.get(str(dtype))
        if target is None:
            continue
        column = df[name]
        if np.issubdtype(target, np.integer) and len(column):
            bounds = np.iinfo(target)
            # Skip the cast when it would overflow the narrower integer type.
            if column.min() < bounds.min or column.max() > bounds.max:
                continue
        df[name] = column.astype(target)
        # Release the replaced column right away; frames here can be very large.
        gc.collect()

# Load Train

In [5]:
# Read the three input files from the working directory.
train = pd.read_parquet( 'train-1.parquet' )
test0 = pd.read_parquet( 'test-0.parquet' )
test1 = pd.read_parquet( 'test-1.parquet' )
gc.collect()

# `tr` records the origin of each row: 0 = train file, 1 = either test file.
train['tr'] = 0
test0['tr'] = 1
test1['tr'] = 1

# Displayed as the cell result: (rows, columns) of each frame.
train.shape, test0.shape, test1.shape

((121386431, 25), (12434735, 25), (12434838, 25))

In [6]:
# Stack train and both test frames into a single frame that keeps the name
# `train`; the `tr` column still distinguishes the two origins.
train = pd.concat( (train,test0,test1), sort=False )
gc.collect()
# Drop the per-file frames now that their rows live in `train`.
del test0,test1
gc.collect()

train.shape

(146256004, 25)

In [7]:
%%time
# Order all rows chronologically and replace the index left over from the
# concatenation with a fresh 0..n-1 index.
train = train.sort_values('timestamp').reset_index(drop=True) #TIME ORDER
gc.collect()

CPU times: user 1min 25s, sys: 13.3 s, total: 1min 39s
Wall time: 1min 39s


0

In [8]:
# Collapse each engagement column to an indicator: any positive value becomes 1.
for target in ('reply', 'retweet', 'retweet_comment', 'like'):
    train.loc[ train[target]>0, target ] = 1

# `engage` is 1 when at least one of the four indicators is positive, else 0.
train['engage'] = 0
train.loc[ train[['reply','retweet','retweet_comment','like']].gt(0).any(axis=1), 'engage' ] = 1
gc.collect()

0

In [9]:
# Preview the first rows after sorting and target binarisation.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,engage
0,0,84383,5,0,0,2,59,1580947200,7581,1438254,268,True,1301520627,22274968,41,379,False,1259102347,False,0,0,0,0,96855085,0,0
1,0,3992746,5,473701,9,2,59,1580947200,4749,1045996,518,True,1235504861,23956872,0,64,False,1557970591,False,0,0,0,0,57701281,0,0
2,0,789758,9,0,0,2,54,1580947200,12117,71231,3819,False,1352943191,18642489,13,65,False,1524243737,False,0,0,0,0,89623685,0,0
3,0,6912703,5,141941,9,2,3,1580947200,297779,209838,204,True,1333746722,19850999,75,968,False,1306118149,False,0,0,0,0,51022269,0,0
4,0,1508672,5,0,0,2,54,1580947200,53927,161464,5531,True,1260797124,15177916,94,220,False,1321907890,False,0,0,0,1,2699787,0,1


In [10]:
# Downcast the columns of `train` in place (see save_memory), then reclaim memory.
save_memory(train)
gc.collect()

0

In [11]:
# Check the column dtypes that resulted from the downcast.
train.dtypes

hashtags              int32
tweet_id              int32
media                  int8
links                 int32
domains               int32
tweet_type             int8
language               int8
timestamp             int32
a_user_id             int32
a_follower_count      int32
a_following_count     int32
a_is_verified          int8
a_account_creation    int32
b_user_id             int32
b_follower_count      int32
b_following_count     int32
b_is_verified          int8
b_account_creation    int32
b_follows_a            int8
reply                 int32
retweet               int32
retweet_comment       int32
like                  int32
id                    int32
tr                    int32
engage                int32
dtype: object

In [12]:
# Build one long table of (user, follower_count, following_count, timestamp)
# observations. The a_* columns are taken as they are; the b_* columns are
# relabelled to the a_* names so both halves share one schema.
dt0 = train[['tr','a_user_id','a_follower_count','a_following_count','timestamp','id']].copy()
dt1 = train[['tr','b_user_id','b_follower_count','b_following_count','timestamp','id']].rename(
    columns={
        'b_user_id': 'a_user_id',
        'b_follower_count': 'a_follower_count',
        'b_following_count': 'a_following_count',
    }
)
# Rows coming from the b_* columns get the sentinel id -1, so a later join on
# `id` only picks up the rows that came from the a_* columns.
dt1['id'] = -1
dt = pd.concat( [dt0, dt1], sort=False )
del dt0, dt1
_ = gc.collect()
dt.head()

Unnamed: 0,tr,a_user_id,a_follower_count,a_following_count,timestamp,id
0,0,7581,1438254,268,1580947200,96855085
1,0,4749,1045996,518,1580947200,57701281
2,0,12117,71231,3819,1580947200,89623685
3,0,297779,209838,204,1580947200,51022269
4,0,53927,161464,5531,1580947200,2699787


In [13]:
%%time
# For every (tr, user, follower_count) value: time between the first and the
# last row that carries it. One grouped view serves both extremes.
ts_by_value = dt.groupby(['tr','a_user_id','a_follower_count'])['timestamp']
span = ts_by_value.transform('max') - ts_by_value.transform('min')
del ts_by_value; _=gc.collect()

# Longest such span among all follower_count values of the same (tr, user).
longest = span.groupby([dt['tr'].values, dt['a_user_id'].values]).transform('max'); _=gc.collect()

#  1 -> this row's value is the longest-lived one for its user
#  0 -> another value of the same user lasted longer
# -1 -> the value's span is zero (this case overrides the other two)
dt['a_fc_dif_flag'] = np.where( span == 0, -1, (span == longest).astype(np.int64) )

del span, longest; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 6min 2s, sys: 42.8 s, total: 6min 45s
Wall time: 6min 45s


 1    196262061
 0     63402710
-1     32847237
Name: a_fc_dif_flag, dtype: int64

In [14]:
# Look up each train row's flag by `id`. Only the key and the flag take part
# in the join: merging the complete frames would duplicate every shared column
# for all rows just to read one column back. Rows of `dt` with id == -1 (the
# sentinel given to rows built from the b_* columns) are dropped first; this
# assumes train ids are never -1. validate='many_to_one' raises if an id
# occurs more than once on the right side, which would otherwise add rows and
# silently misalign the assignment.
train['a_user_fer_count_delta_time'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_dif_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_dif_flag'].values ; _=gc.collect()

# The scratch flag column is rebuilt by later cells; remove it from `dt`.
del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fer_count_delta_time'].value_counts()

 1    84461566
 0    47340973
-1    14453465
Name: a_user_fer_count_delta_time, dtype: int64

In [15]:
# Mean of each binarised target per (tr, flag value); tr separates train-file rows (0) from test-file rows (1).
train.groupby(['tr','a_user_fer_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fer_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.020248,0.043432,0.00404,0.203374
0,0,0.000739,0.002376,0.000192,0.008607
0,1,0.040521,0.179154,0.011767,0.708384
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [16]:
%%time
# For every (tr, user, following_count) value: time between the first and the
# last row that carries it. One grouped view serves both extremes.
ts_by_value = dt.groupby(['tr','a_user_id','a_following_count'])['timestamp']
span = ts_by_value.transform('max') - ts_by_value.transform('min')
del ts_by_value; _=gc.collect()

# Longest such span among all following_count values of the same (tr, user).
longest = span.groupby([dt['tr'].values, dt['a_user_id'].values]).transform('max'); _=gc.collect()

#  1 -> this row's value is the longest-lived one for its user
#  0 -> another value of the same user lasted longer
# -1 -> the value's span is zero (this case overrides the other two)
dt['a_fc_dif_flag'] = np.where( span == 0, -1, (span == longest).astype(np.int64) )

del span, longest; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 5min 54s, sys: 42.8 s, total: 6min 36s
Wall time: 6min 36s


 1    211322540
 0     48322632
-1     32866836
Name: a_fc_dif_flag, dtype: int64

In [17]:
# Look up each train row's flag by `id`. Only the key and the flag take part
# in the join: merging the complete frames would duplicate every shared column
# for all rows just to read one column back. Rows of `dt` with id == -1 (the
# sentinel given to rows built from the b_* columns) are dropped first; this
# assumes train ids are never -1. validate='many_to_one' raises if an id
# occurs more than once on the right side, which would otherwise add rows and
# silently misalign the assignment.
train['a_user_fing_count_delta_time'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_dif_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_dif_flag'].values ; _=gc.collect()

# The scratch flag column is rebuilt by later cells; remove it from `dt`.
del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fing_count_delta_time'].value_counts()

 1    103337405
 0     31090824
-1     11827775
Name: a_user_fing_count_delta_time, dtype: int64

In [18]:
# Mean of each binarised target per (tr, flag value); tr separates train-file rows (0) from test-file rows (1).
train.groupby(['tr','a_user_fing_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fing_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.024621,0.051793,0.004836,0.24437
0,0,0.001318,0.004249,0.000348,0.015821
0,1,0.033272,0.147138,0.009664,0.58172
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [19]:
%%time
# For every (tr, user, follower_count, following_count) combination: time
# between the first and the last row that carries it. One grouped view serves
# both extremes.
ts_by_value = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['timestamp']
span = ts_by_value.transform('max') - ts_by_value.transform('min')
del ts_by_value; _=gc.collect()

# Longest such span among all combinations of the same (tr, user).
longest = span.groupby([dt['tr'].values, dt['a_user_id'].values]).transform('max'); _=gc.collect()

#  1 -> this row's combination is the longest-lived one for its user
#  0 -> another combination of the same user lasted longer
# -1 -> the combination's span is zero (this case overrides the other two)
dt['a_fc_dif_flag'] = np.where( span == 0, -1, (span == longest).astype(np.int64) )

del span, longest; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 6min 19s, sys: 46.5 s, total: 7min 6s
Wall time: 7min 6s


 1    181396527
 0     66142423
-1     44973058
Name: a_fc_dif_flag, dtype: int64

In [20]:
# Look up each train row's flag by `id`. Only the key and the flag take part
# in the join: merging the complete frames would duplicate every shared column
# for all rows just to read one column back. Rows of `dt` with id == -1 (the
# sentinel given to rows built from the b_* columns) are dropped first; this
# assumes train ids are never -1. validate='many_to_one' raises if an id
# occurs more than once on the right side, which would otherwise add rows and
# silently misalign the assignment.
train['a_user_fering_count_delta_time'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_dif_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_dif_flag'].values ; _=gc.collect()

# The scratch flag column is no longer needed on `dt`.
del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fering_count_delta_time'].value_counts()

 1    82346467
 0    47606577
-1    16302960
Name: a_user_fering_count_delta_time, dtype: int64

In [21]:
# Mean of each binarised target per (tr, flag value); tr separates train-file rows (0) from test-file rows (1).
train.groupby(['tr','a_user_fering_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fering_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.018202,0.039295,0.003652,0.182442
0,0,0.000494,0.001632,0.000123,0.005825
0,1,0.0416,0.183938,0.012086,0.727258
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [22]:
%%time
# Number of rows sharing this row's (tr, user, following_count) value.
occurrences = dt.groupby(['tr','a_user_id','a_following_count'])['id'].transform('count'); _=gc.collect()
# Highest such row count among all values of the same (tr, user).
top_occurrences = occurrences.groupby([dt['tr'].values, dt['a_user_id'].values]).transform('max'); _=gc.collect()

#  1 -> this row's value is (one of) the most frequent for its user
#  0 -> another value of the same user is more frequent
# -1 -> the value occurs at most once (this case overrides the other two)
dt['a_fc_count_flag'] = np.where( occurrences <= 1, -1, (occurrences == top_occurrences).astype(np.int64) )

del occurrences, top_occurrences
dt['a_fc_count_flag'].value_counts()

CPU times: user 3min 36s, sys: 24.5 s, total: 4min
Wall time: 4min


 1    219055122
 0     41868436
-1     31588450
Name: a_fc_count_flag, dtype: int64

In [23]:
# Look up each train row's flag by `id`. Only the key and the flag take part
# in the join: merging the complete frames would duplicate every shared column
# for all rows just to read one column back. Rows of `dt` with id == -1 (the
# sentinel given to rows built from the b_* columns) are dropped first; this
# assumes train ids are never -1. validate='many_to_one' raises if an id
# occurs more than once on the right side, which would otherwise add rows and
# silently misalign the assignment.
train['a_user_fing_count_mode'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_count_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_count_flag'].values ; _=gc.collect()
# The scratch flag column is rebuilt by later cells; remove it from `dt`.
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fing_count_mode'].value_counts()

 1    106413330
 0     29292845
-1     10549829
Name: a_user_fing_count_mode, dtype: int64

In [24]:
# Mean of each binarised target per (tr, flag value); tr separates train-file rows (0) from test-file rows (1).
train.groupby(['tr','a_user_fing_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fing_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.025053,0.049914,0.004816,0.222856
0,0,0.002767,0.009126,0.000868,0.037778
0,1,0.032191,0.142372,0.009304,0.563611
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [25]:
%%time
# Number of rows sharing this row's (tr, user, follower_count) value.
occurrences = dt.groupby(['tr','a_user_id','a_follower_count'])['id'].transform('count'); _=gc.collect()
# Highest such row count among all values of the same (tr, user).
top_occurrences = occurrences.groupby([dt['tr'].values, dt['a_user_id'].values]).transform('max'); _=gc.collect()

#  1 -> this row's value is (one of) the most frequent for its user
#  0 -> another value of the same user is more frequent
# -1 -> the value occurs at most once (this case overrides the other two)
dt['a_fc_count_flag'] = np.where( occurrences <= 1, -1, (occurrences == top_occurrences).astype(np.int64) )

del occurrences, top_occurrences
dt['a_fc_count_flag'].value_counts()

CPU times: user 3min 51s, sys: 25.7 s, total: 4min 17s
Wall time: 4min 17s


 1    203146666
 0     59085591
-1     30279751
Name: a_fc_count_flag, dtype: int64

In [26]:
# Look up each train row's flag by `id`. Only the key and the flag take part
# in the join: merging the complete frames would duplicate every shared column
# for all rows just to read one column back. Rows of `dt` with id == -1 (the
# sentinel given to rows built from the b_* columns) are dropped first; this
# assumes train ids are never -1. validate='many_to_one' raises if an id
# occurs more than once on the right side, which would otherwise add rows and
# silently misalign the assignment.
train['a_user_fer_count_mode'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_count_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_count_flag'].values ; _=gc.collect()
# The scratch flag column is rebuilt by later cells; remove it from `dt`.
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fer_count_mode'].value_counts()

 1    87289061
 0    47080579
-1    11886364
Name: a_user_fer_count_mode, dtype: int64

In [27]:
# Mean of each binarised target per (tr, flag value); tr separates train-file rows (0) from test-file rows (1).
train.groupby(['tr','a_user_fer_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fer_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.022789,0.046336,0.004451,0.205478
0,0,0.001172,0.004023,0.000345,0.016506
0,1,0.039363,0.17368,0.011396,0.687849
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [28]:
%%time
# Number of rows sharing this row's (tr, user, follower_count, following_count) combination.
occurrences = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['id'].transform('count'); _=gc.collect()
# Highest such row count among all combinations of the same (tr, user).
top_occurrences = occurrences.groupby([dt['tr'].values, dt['a_user_id'].values]).transform('max'); _=gc.collect()

#  1 -> this row's combination is (one of) the most frequent for its user
#  0 -> another combination of the same user is more frequent
# -1 -> the combination occurs at most once (this case overrides the other two)
dt['a_fc_count_flag'] = np.where( occurrences <= 1, -1, (occurrences == top_occurrences).astype(np.int64) )

del occurrences, top_occurrences
dt['a_fc_count_flag'].value_counts()

CPU times: user 4min 15s, sys: 28.3 s, total: 4min 43s
Wall time: 4min 43s


 1    189397117
 0     60798233
-1     42316658
Name: a_fc_count_flag, dtype: int64

In [29]:
# Look up each train row's flag by `id`. Only the key and the flag take part
# in the join: merging the complete frames would duplicate every shared column
# for all rows just to read one column back. Rows of `dt` with id == -1 (the
# sentinel given to rows built from the b_* columns) are dropped first; this
# assumes train ids are never -1. validate='many_to_one' raises if an id
# occurs more than once on the right side, which would otherwise add rows and
# silently misalign the assignment.
train['a_user_fering_count_mode'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_count_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_count_flag'].values ; _=gc.collect()
# The scratch flag column is no longer needed on `dt`.
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fering_count_mode'].value_counts()

 1    85155540
 0    47453340
-1    13647124
Name: a_user_fering_count_mode, dtype: int64

In [30]:
# Mean of each binarised target per (tr, flag value); tr separates train-file rows (0) from test-file rows (1).
train.groupby(['tr','a_user_fering_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fering_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.01999,0.041058,0.003934,0.180545
0,0,0.001066,0.003681,0.000312,0.015235
0,1,0.040303,0.177959,0.011675,0.704806
1,-1,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0


In [31]:
# The long observation table is no longer needed once all six flags are on `train`.
del dt; _=gc.collect()

In [32]:
# Pack the six flags into one categorical code. Each flag is in {-1, 0, 1};
# adding 1 turns it into a base-3 digit, and the digit at position k is
# weighted by 3**k. pd.factorize then relabels the packed values with dense
# integer codes, numbered in order of first appearance in `train`.
train['a_count_combined'] = pd.factorize(
    sum(
        (1 + train[flag_col]) * 3**power
        for power, flag_col in enumerate((
            'a_user_fer_count_delta_time',
            'a_user_fing_count_delta_time',
            'a_user_fering_count_delta_time',
            'a_user_fing_count_mode',
            'a_user_fer_count_mode',
            'a_user_fering_count_mode',
        ))
    )
)[0]
train['a_count_combined'].value_counts()

2      78746518
1      23801964
0      16748586
10      9222848
5       2171185
15      1834331
4       1254366
3       1213266
6       1148744
13       883950
12       858675
29       851647
36       717321
7        664803
21       612722
35       496340
9        485131
8        414983
22       361278
30       359131
18       295766
25       267802
27       267234
49       240921
32       188563
37       144436
23       118961
41       114718
28       107623
54        97331
16        94860
46        86903
44        83745
17        74064
24        72108
50        66234
34        63939
51        53529
19        48210
53        47556
38        46088
72        45505
11        44324
42        43461
43        42764
26        42147
56        38664
48        36448
89        29813
87        28157
31        27810
60        26811
61        22234
63        21258
57        19359
74        18958
40        18169
69        17053
47        16677
59        16108
64        16038
65        13955
45      

In [33]:
# Mean of each binarised target per combined code, restricted to train-file rows (tr == 0).
train.loc[ train.tr==0 ].groupby('a_count_combined')[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,mean,mean,mean,mean
a_count_combined,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.000134,0.000483,3.6e-05,0.001738
1,0.000122,0.000366,2.8e-05,0.001323
2,0.04177,0.186519,0.012111,0.736756
3,0.00515,0.016041,0.001395,0.058424
4,0.003689,0.01447,0.000967,0.051986
5,5.4e-05,0.000196,1.7e-05,0.000704
6,0.038237,0.128285,0.012932,0.545995
7,0.029493,0.110948,0.00935,0.47662
8,0.000804,0.002577,0.000232,0.015041
9,0.034245,0.113368,0.010275,0.49177


In [34]:
# Preview `train` with the six flag columns and the combined code appended.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,engage,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,a_count_combined
0,0,84383,5,0,0,2,59,1580947200,7581,1438254,268,1,1301520627,22274968,41,379,0,1259102347,0,0,0,0,0,96855085,0,0,0,1,0,1,0,0,0
1,0,3992746,5,473701,9,2,59,1580947200,4749,1045996,518,1,1235504861,23956872,0,64,0,1557970591,0,0,0,0,0,57701281,0,0,0,1,0,1,0,0,0
2,0,789758,9,0,0,2,54,1580947200,12117,71231,3819,0,1352943191,18642489,13,65,0,1524243737,0,0,0,0,0,89623685,0,0,0,0,0,0,0,0,1
3,0,6912703,5,141941,9,2,3,1580947200,297779,209838,204,1,1333746722,19850999,75,968,0,1306118149,0,0,0,0,0,51022269,0,0,0,1,0,1,0,0,0
4,0,1508672,5,0,0,2,54,1580947200,53927,161464,5531,1,1260797124,15177916,94,220,0,1321907890,0,0,0,0,1,2699787,0,1,1,1,1,1,1,1,2


In [35]:
# Keep the row id plus the seven engineered columns, ordered by id with a
# fresh 0..n-1 index.
dt = (
    train[[
        'id',
        'a_count_combined',
        'a_user_fer_count_delta_time',
        'a_user_fing_count_delta_time',
        'a_user_fering_count_delta_time',
        'a_user_fing_count_mode',
        'a_user_fer_count_mode',
        'a_user_fering_count_mode',
    ]]
    .sort_values( 'id' )
    .reset_index( drop=True )
)
# Downcast in place before writing, then persist the feature table.
save_memory( dt )
dt.to_parquet( 'a_count_combined-final.parquet' )
dt.tail(50)

Unnamed: 0,id,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
146255954,146255954,2,1,1,1,1,1,1
146255955,146255955,2,1,1,1,1,1,1
146255956,146255956,10,-1,-1,-1,-1,-1,-1
146255957,146255957,0,0,1,0,1,0,0
146255958,146255958,0,0,1,0,1,0,0
146255959,146255959,2,1,1,1,1,1,1
146255960,146255960,25,0,1,1,1,1,1
146255961,146255961,2,1,1,1,1,1,1
146255962,146255962,10,-1,-1,-1,-1,-1,-1
146255963,146255963,0,0,1,0,1,0,0
