In [644]:
import pandas as pd
import numpy as np
import datetime
from sksurv.preprocessing import OneHotEncoder
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

In [645]:
def fix_str_float(ds, col):
    ds[col] = ds[col].str.replace(r'[^0-9\.]','')
    ds[col] = np.where(ds[col]=='',np.nan,ds[col])
    ds[col] = ds[col].astype('float32')
    return ds[col].astype('float32')

In [646]:
# clicks
clicks_df = pd.read_csv('data/clicks.csv', low_memory=False,dtype={'advertiser_id':'int32','action_id':'float32','source_id':'int32','country_code':'category',\
                                                 'latitude':'float32','longitude':'float32','carrier_id':'float32','os_minor':'category',\
                                                  'os_major':'category','specs_brand':'category','timeToClick':'float32','ref_type':'category'\
                                                                  ,'ref_hash':'object'})

clicks_df['touchX'] = fix_str_float(clicks_df,'touchX')
clicks_df['touchY'] = fix_str_float(clicks_df,'touchY')
clicks_df['created'] = pd.to_datetime(clicks_df['created'])
#events
events_df = pd.read_csv('data/events.csv', low_memory=False, dtype={'event_id':'int32','ref_type':'category','application_id':'category',\
                                                                                            'attributed':'bool','device_countrycode':'category','device_city':'category',\
                                                                                            'trans_id':'category','carrier':'category','device_os':'category',\
                                                                                            'connection_type':'category'})
events_df['date'] = pd.to_datetime(events_df['date'])
events_df['wifi'].astype('bool', inplace=True)
events_df.drop(columns=['device_countrycode','session_user_agent','ip_address','device_language'], inplace=True)
# installs
installs_df = pd.read_csv('data/installs.csv', low_memory=False, dtype={'ref_type':'category','application_id':'category',\
                                                      'device_brand':'category','ref_hash':'object','wifi':'category'})
installs_df['kind'] = installs_df['kind'].str.lower()
installs_df['kind'] = installs_df['kind'].astype('category')
installs_df.drop(columns=['session_user_agent','ip_address','device_language','device_model'], inplace=True)
installs_df['created'] = pd.to_datetime(installs_df['created'])
installs_df.drop(['device_countrycode'], axis=1, inplace=True)
# auctions
auctions_df = pd.read_csv('data/auctions.csv', low_memory=False, dtype={'country':'category','platform':'category',\
                                                                        'ref_type_id':'category','source_id':'category','device_id':'object'})

auctions_df['date'] = pd.to_datetime(auctions_df['date'])
print('setup done')

setup done


In [647]:
installs_df.head(5)

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_brand,user_agent,event_uuid,kind,wifi,trans_id
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,,,79837499-2f2a-4605-a663-e322f759424f,app_open,,
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,,,,,,
2,2019-04-20 10:15:36.274,1,1494519392962156891,5230323462636548010,,False,True,,,dda99e3c-9c4b-487d-891c-79f0a02cb4a8,app_open,,
3,2019-04-20 21:56:47.151,1,1494519392962156891,5097163995161606833,,False,True,,,7010c3ce-0fcf-46c6-9be8-374cc0e20af4,app_open,,
4,2019-04-20 22:40:41.239,1,1494519392962156891,6328027616411983332,,False,False,,,,,,


In [648]:
events_df['connection_type'].describe()

count       1809296
unique            4
top       Cable/DSL
freq        1291512
Name: connection_type, dtype: object

In [649]:
events_df['connection_type'].value_counts()

Cable/DSL    1291512
Cellular      517204
Corporate        527
Dialup            53
Name: connection_type, dtype: int64

In [650]:
events_df['connection_type'].isnull().sum()


5935285

In [70]:
#auctions_df = auctions_df.sort_values(by=['device_id','date'])
#auctions_df['date_dif'] = auctions_df['date'].shift(periods=-1) - auctions_df['date']
#auctions_df['in_seconds'] = np.nan
#last_row = False
#last_index = False
#for index, row in auctions_df.iterrows():
#    if not(isinstance(last_row, bool)):
#        if row['device_id']!=last_row['device_id']:
#            auctions_df.at[last_index,'date_dif'] = np.nan
#    auctions_df.at[index,'in_seconds'] = row['date_dif'].total_seconds()
#    last_row = row
#    last_index = index
#auctions_df['in_seconds'] = np.where(auctions_df['date_dif'].isnull(), np.nan, auctions_df['in_seconds'])

In [651]:
auctions_df.head(10)

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-23 18:58:00.842116,2564673204772915246,1,0
1,2019-04-23 18:58:01.530771,4441121667607578179,7,0
2,2019-04-23 18:58:01.767562,7721769811471055264,1,0
3,2019-04-23 18:58:02.363468,6416039086842158968,1,0
4,2019-04-23 18:58:02.397559,1258642015983312729,1,0
5,2019-04-23 18:58:02.675804,6707090658317158573,1,0
6,2019-04-23 18:58:02.848212,8869722088125970841,1,0
7,2019-04-23 18:58:03.048872,7445213948764639634,1,0
8,2019-04-23 18:58:03.285788,2932617030932207332,1,0
9,2019-04-23 18:58:03.532112,6405811806780450397,7,0


In [75]:
#auctions_df.to_csv('data/auctions_seconds.csv')

In [15]:
#datos = pd.merge(auctions_df, installs_df, left_on='device_id', right_on='ref_hash', how='left')

In [652]:
installs_df.shape

(481511, 13)

In [653]:
auctions_df.shape

(47409528, 4)

In [655]:
auctions_1 = auctions_df.loc[auctions_df['date']<'2019-04-20 00:00:00']

In [656]:
# calculate time in seconds
auctions_1.drop_duplicates(inplace=True)
auctions_1 = auctions_1.sort_values(by=['device_id','date'])
auctions_1['date_dif'] = auctions_1['date'].shift(periods=-1) - auctions_1['date']
auctions_1['device_id_next'] = auctions_1['device_id'].astype('object').shift(periods=-1)
auctions_1['date_dif'] = np.where(auctions_1['device_id_next']==auctions_1['device_id'], auctions_1['date_dif'], datetime.datetime(2019,4,20)-auctions_1['date'])
auctions_1['in_seconds'] = auctions_1['date_dif'].dt.total_seconds()
auctions_1['status_censored'] = auctions_1['device_id_next']==auctions_1['device_id']
auctions_1.drop(['device_id_next','date_dif'], axis='columns', inplace=True)
auctions_1['device_id'] = auctions_1['device_id'].astype('object')
#calculate previus time in seconds
auctions_1['date_prev'] = auctions_1['date'].shift()
auctions_1['date_dif_prev'] = auctions_1['date']- auctions_1['date_prev']
auctions_1['device_id_prev'] = auctions_1['device_id'].astype('object').shift()
auctions_1['date_dif_prev'] = np.where(auctions_1['device_id_prev']==auctions_1['device_id'], auctions_1['date_dif_prev'], auctions_1['date']-datetime.datetime(2019,4,18))
auctions_1['last_seen'] = auctions_1['date_dif_prev'].dt.total_seconds()
auctions_1.drop(['device_id_prev','date_dif_prev','date_prev'], axis='columns', inplace=True)
auctions_1 = auctions_1.sort_values(by=['date'])


In [657]:
installs_1 = installs_df.loc[installs_df['created']<'2019-04-20 00:00:00']
installs_1.head(10)

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_brand,user_agent,event_uuid,kind,wifi,trans_id
12,2019-04-18 04:00:27.575,1,1494519392962156891,1084778553542198153,,False,True,,,5d2c31b2-7009-4388-a30c-b8a119b9695f,app_open,,
13,2019-04-18 08:33:46.588,1,1494519392962156891,6412465357609907698,,False,True,,,c4b4577a-e846-4de7-9d40-643d5c66506e,app_open,,
14,2019-04-18 20:38:01.661,1,1494519392962156891,8455113036752892959,,False,True,,,88b28e3e-31bb-4c11-a937-a79ba175cc05,app_open,,
15,2019-04-18 18:51:18.157,1,1494519392962156891,7939167172078327528,,False,False,,,,,,
22,2019-04-19 04:10:34.820,1,1494519392962156891,2251011940536423208,,False,True,,,e9f099f4-47e3-41a6-99e1-e7e6ee154270,app_open,,
23,2019-04-19 04:32:10.018,1,1494519392962156891,8834511447557233212,,False,False,,,,,,
24,2019-04-19 04:31:44.316,1,1494519392962156891,4152412908070003677,,False,True,,,5a53d8bc-edaf-49bd-948a-43cc2b18e40f,app_open,,
25,2019-04-19 04:45:47.500,1,1494519392962156891,5848436212677927803,,False,True,,,54bae8d2-e693-4a0b-9c5f-55fe146ec148,app_open,,
26,2019-04-19 16:57:14.092,1,1494519392962156891,8697087329542169045,,False,False,,,,,,
27,2019-04-19 21:26:47.688,1,1494519392962156891,4137890430319706690,,False,True,,,e6b00fd0-0f10-4b25-a6e4-5053ac135691,app_open,,


In [295]:
installs_df['ref_hash'].describe()

count                  481511
unique                 393565
top       5446085605337844584
freq                       14
Name: ref_hash, dtype: object

In [401]:
installs_1.columns

Index(['created', 'application_id', 'ref_type', 'ref_hash', 'click_hash',
       'attributed', 'implicit', 'device_brand', 'user_agent', 'event_uuid',
       'kind', 'wifi', 'trans_id'],
      dtype='object')

In [658]:
#data X and y
data_full_1 = pd.merge(auctions_1, installs_1, left_on='device_id', right_on='ref_hash', how='inner')
data_full_1 = data_full_1.loc[data_full_1['date']>=data_full_1['created']]
data_full_1['install_diff'] = data_full_1['date']-data_full_1['created']
data_full_1['install_seconds'] = data_full_1['install_diff'].dt.total_seconds()
data_full_1 = data_full_1.loc[data_full_1['in_seconds']>=data_full_1['install_seconds']]
data_x_1 = data_full_1.drop(columns=['in_seconds','status_censored','ref_hash'])
data_y_1 = np.fromiter(zip(data_full_1["status_censored"], data_full_1["in_seconds"]),
                                dtype=[('status_censored', np.bool), ('in_seconds', np.float64)])

In [659]:
class preprocess( BaseEstimator, TransformerMixin ): 
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
        X = X.copy()
        # boolean transformations
        X['event_uuid'] = np.where(X['event_uuid'].isnull(), 0,1)
        X['click_hash'] = np.where(X['click_hash'].isnull(), 0,1)
        X['Android'] = np.where(X['user_agent'].str.contains('Android', regex=False),1,0)
        X['iOS'] = np.where(X['user_agent'].str.contains('Darwin', regex=False) | X['user_agent'].str.contains('iOS', regex=False),1,0)
        X['trans_id'] = np.where(X['trans_id'].isnull(), 0,1)
        # date transformations
        X['created_weekday'] = X['created'].dt.weekday
        X['created_hour'] = X['created'].dt.hour
        X['created_minute'] = X['created'].dt.minute
        X['date_weekday'] = X['date'].dt.weekday
        X['date_hour'] = X['date'].dt.hour
        X['date_minute'] = X['date'].dt.minute
        X['date_second'] = X['date'].dt.second
        #remove unused columns
        X = X.drop(columns=['date','created', 'install_diff','device_brand','install_seconds','user_agent','device_id'])
        X = pd.get_dummies(X, dummy_na=True, prefix_sep='=')
        self.encoded_columns_ = X.columns
        #returns numpy array
        return X

In [400]:
# format features
#data_x_1['event_uuid'] = np.where(data_x_1['event_uuid'].isnull(), 0,1)
#data_x_1['click_hash'] = np.where(data_x_1['click_hash'].isnull(), 0,1)
#data_x_1['Android'] = np.where(data_x_1['user_agent'].str.contains('Android', regex=False),1,0)
#data_x_1['iOS'] = np.where(data_x_1['user_agent'].str.contains('Darwin', regex=False) | data_x_1['user_agent'].str.contains('iOS', regex=False),1,0)
#data_x_1['trans_id'] = np.where(data_x_1['trans_id'].isnull(), 0,1)
#data_x_1['created_weekday'] = data_x_1['created'].dt.weekday
#data_x_1['created_hour'] = data_x_1['created'].dt.hour
#data_x_1['created_minute'] = data_x_1['created'].dt.minute
#data_x_1['date_weekday'] = data_x_1['date'].dt.weekday
#data_x_1['date_hour'] = data_x_1['date'].dt.hour
#data_x_1['date_minute'] = data_x_1['date'].dt.minute
#data_x_1['date_second'] = data_x_1['date'].dt.second
#data_x_1.drop(columns=['date','created', 'install_diff','device_brand','install_seconds','user_agent'], inplace=True)
#data_x_1_numeric = pd.get_dummies(data_x_1, dummy_na=True, prefix_sep='=')


In [660]:
data_y_1 = np.fromiter(zip(data_full_1.head(100)["status_censored"], data_full_1.head(100)["in_seconds"]),
                                dtype=[('status_censored', np.bool), ('in_seconds', np.float64)])

In [408]:
estimator = CoxPHSurvivalAnalysis(alpha=0.1)
estimator.fit(data_x_1_numeric.head(100), data_y_1)
estimator.score(data_x_1_numeric.head(100), data_y_1)

0.8191370444891571

In [662]:
def fit_and_score_features(X, y, alpha=0.1):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis(alpha=alpha)
    for j in range(n_features):
        Xj = X[:, j:j+1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores


In [None]:
scores = fit_and_score_features(data_x_1_numeric.head(100).values, data_y_1)
pd.Series(scores, index=data_x_1_numeric.columns).sort_values(ascending=False)

1     9
21    8
4     8
14    8
0     6
6     6
18    6
17    6
16    6
5     6
12    5
8     4
7     4
3     3
2     3
22    2
13    2
15    2
20    2
23    2
9     1
19    1
Name: date, dtype: int64

In [663]:
def custom_cv_folds(X):
    myCViterator = []
    trainIndices = X.loc[X['date']<'2019-04-18 12:00:00'].index.values.astype(int)
    testIndices =  X.loc[('2019-04-18 12:00:00'<=X['date']) & (X['date']<'2019-04-19 00:00:00')].index.values.astype(int)
    myCViterator.append( (trainIndices, testIndices) )
    trainIndices = X.loc[('2019-04-18 12:00:00'<=X['date']) & (X['date']<'2019-04-19 00:00:00')].index.values.astype(int)
    testIndices =  X.loc[('2019-04-19 00:00:00'<=X['date']) & (X['date']<'2019-04-19 12:00:00')].index.values.astype(int)
    myCViterator.append( (trainIndices, testIndices) )
    trainIndices = X.loc[('2019-04-19 00:00:00'<=X['date']) & (X['date']<'2019-04-19 12:00:00')].index.values.astype(int)
    testIndices =  X.loc[('2019-04-19 12:00:00'<=X['date']) & (X['date']<'2019-04-20 00:00:00')].index.values.astype(int)
    myCViterator.append( (trainIndices, testIndices) )
    return myCViterator


In [664]:
pipe = Pipeline([('preprocess', preprocess()),
                 ('select', SelectKBest(fit_and_score_features, k=50)),
                 ('model', CoxPHSurvivalAnalysis(alpha=0.01))])

In [665]:
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [695]:
data_x_11 = data_x_1.head(100).copy()
data_x_11.reset_index(inplace=True)
custom_cv = custom_cv_folds(data_x_11)
param_grid = {'select__k': np.arange(1, data_x_11.shape[1] + 1)}
gcv = GridSearchCV(pipe, param_grid, return_train_score=True, cv=custom_cv, iid=True) #, n_jobs=-1)
gcv.fit(data_x_11, data_y_1)

pd.DataFrame(gcv.cv_results_).sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
17,3.444344,0.545257,0.04144,0.007898,18,{'select__k': 18},0.728601,0.577904,0.604651,0.64504,0.069675,1,0.836538,0.88309,0.875354,0.864994,0.020368
18,3.317043,0.422457,0.037895,0.001451,19,{'select__k': 19},0.726514,0.575071,0.612403,0.644921,0.068765,2,0.836538,0.885177,0.875354,0.86569,0.021
14,2.95263,0.439941,0.031623,0.001879,15,{'select__k': 15},0.741127,0.620397,0.449612,0.630387,0.11158,3,0.831731,0.853862,0.855524,0.847039,0.010846
15,2.966385,0.338318,0.033349,0.001286,16,{'select__k': 16},0.730689,0.580737,0.457364,0.613367,0.107291,4,0.831731,0.862213,0.872521,0.855488,0.017318
16,3.310719,0.414562,0.033396,0.002868,17,{'select__k': 17},0.732777,0.566572,0.457364,0.609013,0.110026,5,0.831731,0.887265,0.866856,0.86195,0.022936
12,2.892683,0.377848,0.032805,0.001675,13,{'select__k': 13},0.772443,0.524079,0.44186,0.605949,0.140797,6,0.807692,0.805846,0.852691,0.822076,0.021661
13,3.199416,0.585621,0.033671,0.00298,14,{'select__k': 14},0.745303,0.492918,0.465116,0.588815,0.129544,7,0.826923,0.778706,0.852691,0.81944,0.030664
11,2.80754,0.318907,0.031924,0.000535,12,{'select__k': 12},0.743215,0.410765,0.55814,0.579007,0.146329,8,0.8125,0.795407,0.858357,0.822088,0.026578
19,3.303842,0.393862,0.037158,0.000438,20,{'select__k': 20},0.730689,0.368272,0.612403,0.570699,0.160645,9,0.836538,0.941545,0.875354,0.884479,0.043352
2,3.058368,0.366888,0.0361,0.000774,3,{'select__k': 3},0.623173,0.352691,0.767442,0.556754,0.164885,10,0.737981,0.783925,0.784703,0.768869,0.021844


Unnamed: 0,index,date,device_id,ref_type_id,source_id,last_seen,created,application_id,ref_type,click_hash,attributed,implicit,device_brand,user_agent,event_uuid,kind,wifi,trans_id,install_diff,install_seconds
0,218,2019-04-19 21:35:30.813413,5372400508976774833,7,0,8.026659,2019-04-19 21:07:19.165,339,1494519392962156891,,False,False,,TikTok/109005 CFNetwork/978.0.7 Darwin/18.5.0,,,false,,00:28:11.648413,1691.648413
1,598,2019-04-18 12:59:43.010829,2227766999717522159,1,0,41.261367,2019-04-18 12:53:51.436,121,1891515180541284343,,False,True,6.115025880051902e+18,Dalvik/2.1.0 (Linux; U; Android 7.0; HUAWEI VN...,4c59cd3c-a17e-4357-8003-318c3fd9df1a,open,true,,00:05:51.574829,351.574829
2,599,2019-04-18 12:59:43.010829,2227766999717522159,1,0,41.261367,2019-04-18 12:53:53.593,121,1891515180541284343,,False,False,6.115025880051902e+18,Dalvik/2.1.0 (Linux; U; Android 7.0; HUAWEI VN...,,,true,,00:05:49.417829,349.417829
3,1137,2019-04-19 17:26:29.664836,8362777024167972305,1,3,3.234293,2019-04-19 17:13:28.908,36,1891515180541284343,,False,False,1.1367705089334932e+18,Dalvik/2.1.0 (Linux; U; Android 8.1.0; FROZEN ...,,,true,,00:13:00.756836,780.756836
4,1138,2019-04-19 17:46:09.577152,8362777024167972305,1,3,1179.912316,2019-04-19 17:13:28.908,36,1891515180541284343,,False,False,1.1367705089334932e+18,Dalvik/2.1.0 (Linux; U; Android 8.1.0; FROZEN ...,,,true,,00:32:40.669152,1960.669152
5,1168,2019-04-18 03:48:02.977207,521116236442257594,7,1,14.395733,2019-04-18 01:37:39.068,68,1494519392962156891,,False,True,,Grability/17420 CFNetwork/978.0.7 Darwin/18.5.0,1699c5a2-547c-41e9-8153-596084e9758b,open,false,,02:10:23.909207,7823.909207
6,1181,2019-04-18 18:44:46.380013,521116236442257594,7,1,0.835005,2019-04-18 01:37:39.068,68,1494519392962156891,,False,True,,Grability/17420 CFNetwork/978.0.7 Darwin/18.5.0,1699c5a2-547c-41e9-8153-596084e9758b,open,false,,17:07:07.312013,61627.312013
7,1186,2019-04-18 00:21:43.696288,3976437970206438205,1,3,1302.855185,2019-04-18 00:05:50.074,21,1891515180541284343,,False,False,,,,,,,00:15:53.622288,953.622288
8,1549,2019-04-19 17:52:52.892606,4220751993672827036,1,3,0.064043,2019-04-19 17:51:37.436,210,1891515180541284343,,False,False,,Dalvik/2.1.0 (Linux; U; Android 7.0; SM-G920V ...,,,,,00:01:15.456606,75.456606
9,1550,2019-04-19 17:55:14.187190,4220751993672827036,1,3,141.294584,2019-04-19 17:51:37.436,210,1891515180541284343,,False,False,,Dalvik/2.1.0 (Linux; U; Android 7.0; SM-G920V ...,,,,,00:03:36.751190,216.751190


In [563]:
gcv.best_params_

{'model__alpha': 1.0, 'select__k': 15}

In [564]:
data_x_11 = data_x_1.head(100).copy()
pipe.set_params(**gcv.best_params_)
pipe.fit(data_x_11, data_y_1)

encoder, transformer, final_estimator = [s[1] for s in pipe.steps]
pd.Series(final_estimator.coef_, index=encoder.encoded_columns_[transformer.get_support()])

last_seen                      -0.000039
created_weekday                 0.760496
date_hour                       0.016006
date_second                     0.021819
ref_type_id=1                   0.117072
ref_type_id=7                  -0.117072
source_id=1                    -0.890550
source_id=3                     0.539926
source_id=7                    -1.156081
application_id=121             -0.333614
application_id=210             -0.006043
ref_type=1494519392962156891   -0.117072
ref_type=1891515180541284343    0.117072
kind=open                      -0.107620
wifi=nan                        0.273920
dtype: float64

In [565]:
data_x_11 = data_x_1.head(100).copy()
pipe.score(data_x_11, data_y_1)

0.7289740698985344

In [490]:
data_x_11.columns

Index(['date', 'ref_type_id', 'source_id', 'created', 'application_id',
       'ref_type', 'click_hash', 'attributed', 'implicit', 'device_brand',
       'user_agent', 'event_uuid', 'kind', 'wifi', 'trans_id', 'install_diff',
       'install_seconds'],
      dtype='object')

In [485]:
data_y_1.shape

(100,)

In [333]:
data_x_1_numeric.shape

(31377, 1601)

In [336]:
data_y_1.shape

(31377,)

In [None]:
test1 = auctions_1[['device_id','date']]

In [89]:
test1 = test1.sort_values(by=['device_id','date'])

In [90]:
test1 = test1.head(1000)

In [91]:
test1.shape

(1000, 2)

In [92]:
test1['date_dif'] = test1['date'].shift(periods=-1) - test1['date']

In [97]:
test1['device_id_next'] = test1['device_id'].astype('object').shift(periods=-1)

In [99]:
test1['date_dif'] = np.where(test1['device_id_next']==test1['device_id'], test1['date_dif'], datetime.datetime(2019,4,20)-test1['date'])

In [101]:
test1['in_seconds'] = test1['date_dif'].dt.total_seconds()

In [103]:
test1['status-censored'] = test1['device_id_next']==test1['device_id']

In [104]:
test1.head(50)

Unnamed: 0,device_id,date,date_dif,in_seconds,device_id_next,status-censored
30900602,41863526108385,2019-04-19 19:40:28.465866,0 days 04:19:31.534134,15571.534134,161514654074162,False
46872336,161514654074162,2019-04-18 02:52:46.357746,0 days 00:00:28.744020,28.74402,161514654074162,True
7330779,161514654074162,2019-04-18 02:53:15.101766,0 days 00:00:25.982516,25.982516,161514654074162,True
16177030,161514654074162,2019-04-18 02:53:41.084282,0 days 00:00:18.239779,18.239779,161514654074162,True
19683863,161514654074162,2019-04-18 02:53:59.324061,0 days 00:00:02.851261,2.851261,161514654074162,True
22213367,161514654074162,2019-04-18 02:54:02.175322,0 days 00:00:47.365665,47.365665,161514654074162,True
20455826,161514654074162,2019-04-18 02:54:49.540987,1 days 21:05:10.459013,162310.459013,186034136943920,False
16127540,186034136943920,2019-04-18 16:42:46.331894,0 days 02:40:55.874138,9655.874138,186034136943920,True
4859451,186034136943920,2019-04-18 19:23:42.206032,1 days 04:36:17.793968,102977.793968,283297668933729,False
293739,283297668933729,2019-04-18 22:24:44.520583,1 days 01:35:15.479417,92115.479417,345999128501141,False


In [61]:
test1['date_dif'] = test1['date'].shift(periods=-1) - test1['date']

In [69]:
test1.head(50)

Unnamed: 0,date,device_id,date_dif,in_seconds
36601349,2019-04-23 15:00:57.222979,40621410000000.0,NaT,
30900602,2019-04-19 19:40:28.465866,41863530000000.0,07:11:58.427014,25918.427014
14875535,2019-04-20 02:52:26.892880,41863530000000.0,00:06:35.616350,395.61635
32062481,2019-04-20 02:59:02.509230,41863530000000.0,00:06:59.166558,419.166558
45613542,2019-04-20 03:06:01.675788,41863530000000.0,00:02:55.712372,175.712372
42215232,2019-04-20 03:08:57.388160,41863530000000.0,00:02:29.075743,149.075743
22374394,2019-04-20 03:11:26.463903,41863530000000.0,00:00:14.547768,14.547768
12690007,2019-04-20 03:11:41.011671,41863530000000.0,00:00:45.669391,45.669391
42215538,2019-04-20 03:12:26.681062,41863530000000.0,00:00:30.801420,30.80142
12690107,2019-04-20 03:12:57.482482,41863530000000.0,00:01:58.661626,118.661626
