In [3]:
import pandas as pd
import numpy as np
import datetime
import gc
from sksurv.preprocessing import OneHotEncoder
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import NMF
from sklearn.externals import joblib
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

In [4]:
def fix_str_float(ds, col):
    """Convert a noisy string column of ``ds`` into float32.

    Every character that is not a digit or a dot is removed from the values
    of ``ds[col]``. Values that are empty after the clean-up, or that were
    missing to begin with, become NaN. The column is overwritten in ``ds``
    and the converted column is also returned.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame holding the column; modified in place.
    col : str
        Name of the string column to convert.

    Returns
    -------
    pandas.Series
        The converted column, dtype float32.
    """
    # regex=True has to be explicit: newer pandas versions treat the pattern
    # as a literal string by default, which would leave the values untouched.
    cleaned = ds[col].str.replace(r'[^0-9\.]', '', regex=True)
    # an empty string cannot be cast to float, so mark it as missing first
    cleaned = cleaned.replace('', np.nan)
    ds[col] = cleaned.astype('float32')
    return ds[col]

In [3]:
# clicks
clicks_df = pd.read_csv(
    'data/clicks.csv',
    low_memory=False,
    dtype={'advertiser_id':'int32','action_id':'float32','source_id':'int32','country_code':'category',
           'latitude':'float32','longitude':'float32','carrier_id':'float32','os_minor':'category',
           'os_major':'category','specs_brand':'category','timeToClick':'float32','ref_type':'category',
           'ref_hash':'object'})

# touch coordinates arrive as strings; strip stray characters and cast to float32
clicks_df['touchX'] = fix_str_float(clicks_df,'touchX')
clicks_df['touchY'] = fix_str_float(clicks_df,'touchY')
clicks_df['created'] = pd.to_datetime(clicks_df['created'])

#events
events_df = pd.read_csv(
    'data/events.csv',
    low_memory=False,
    dtype={'event_id':'int32','ref_type':'category','application_id':'category',
           'attributed':'bool','device_countrycode':'category','device_city':'category',
           'trans_id':'category','carrier':'category','device_os':'category',
           'connection_type':'category'})
events_df['date'] = pd.to_datetime(events_df['date'])
# astype() returns a new Series and has no in-place mode, so the result must be
# assigned back for the cast to take effect.
# NOTE(review): a plain bool cast turns missing values into True - confirm
# that 'wifi' has no NaN in events.csv.
events_df['wifi'] = events_df['wifi'].astype('bool')
events_df.drop(columns=['device_countrycode','session_user_agent','ip_address','device_language'], inplace=True)

# installs
installs_df = pd.read_csv(
    'data/installs.csv',
    low_memory=False,
    dtype={'ref_type':'category','application_id':'category',
           'device_brand':'category','ref_hash':'object','wifi':'category'})
# normalise the case of 'kind' before turning it into a category
installs_df['kind'] = installs_df['kind'].str.lower().astype('category')
installs_df['created'] = pd.to_datetime(installs_df['created'])
installs_df.drop(columns=['session_user_agent','ip_address','device_language','device_model',
                          'device_countrycode'], inplace=True)

# auctions
auctions_df = pd.read_csv(
    'data/auctions.csv',
    low_memory=False,
    dtype={'country':'category','platform':'category',
           'ref_type_id':'category','source_id':'category','device_id':'object'})

auctions_df['date'] = pd.to_datetime(auctions_df['date'])
print('setup done')

setup done


In [4]:
installs_df.head(5)

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_brand,user_agent,event_uuid,kind,wifi,trans_id
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,,,79837499-2f2a-4605-a663-e322f759424f,app_open,,
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,,,,,,
2,2019-04-20 10:15:36.274,1,1494519392962156891,5230323462636548010,,False,True,,,dda99e3c-9c4b-487d-891c-79f0a02cb4a8,app_open,,
3,2019-04-20 21:56:47.151,1,1494519392962156891,5097163995161606833,,False,True,,,7010c3ce-0fcf-46c6-9be8-374cc0e20af4,app_open,,
4,2019-04-20 22:40:41.239,1,1494519392962156891,6328027616411983332,,False,False,,,,,,


In [5]:
events_df['connection_type'].describe()

count       1809296
unique            4
top       Cable/DSL
freq        1291512
Name: connection_type, dtype: object

In [6]:
events_df['connection_type'].value_counts()

Cable/DSL    1291512
Cellular      517204
Corporate        527
Dialup            53
Name: connection_type, dtype: int64

In [7]:
events_df['connection_type'].isnull().sum()


5935285

In [8]:
#auctions_df = auctions_df.sort_values(by=['device_id','date'])
#auctions_df['date_dif'] = auctions_df['date'].shift(periods=-1) - auctions_df['date']
#auctions_df['in_seconds'] = np.nan
#last_row = False
#last_index = False
#for index, row in auctions_df.iterrows():
#    if not(isinstance(last_row, bool)):
#        if row['device_id']!=last_row['device_id']:
#            auctions_df.at[last_index,'date_dif'] = np.nan
#    auctions_df.at[index,'in_seconds'] = row['date_dif'].total_seconds()
#    last_row = row
#    last_index = index
#auctions_df['in_seconds'] = np.where(auctions_df['date_dif'].isnull(), np.nan, auctions_df['in_seconds'])

In [9]:
auctions_df.head(10)

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-23 18:58:00.842116,2564673204772915246,1,0
1,2019-04-23 18:58:01.530771,4441121667607578179,7,0
2,2019-04-23 18:58:01.767562,7721769811471055264,1,0
3,2019-04-23 18:58:02.363468,6416039086842158968,1,0
4,2019-04-23 18:58:02.397559,1258642015983312729,1,0
5,2019-04-23 18:58:02.675804,6707090658317158573,1,0
6,2019-04-23 18:58:02.848212,8869722088125970841,1,0
7,2019-04-23 18:58:03.048872,7445213948764639634,1,0
8,2019-04-23 18:58:03.285788,2932617030932207332,1,0
9,2019-04-23 18:58:03.532112,6405811806780450397,7,0


In [10]:
#auctions_df.to_csv('data/auctions_seconds.csv')

In [11]:
#datos = pd.merge(auctions_df, installs_df, left_on='device_id', right_on='ref_hash', how='left')

In [12]:
installs_df.shape

(481511, 13)

In [13]:
auctions_df.shape

(47409528, 4)

In [14]:
auctions_1 = auctions_df.loc[auctions_df['date']<'2019-04-20 00:00:00'].copy()

In [15]:
# Survival targets for every auction in the window:
#   in_seconds      - seconds until the next auction of the same device, or
#                     until 2019-04-20 when the device has no later auction
#   status_censored - True when a later auction of the same device exists
#   last_seen       - seconds since the previous auction of the same device,
#                     or since 2019-04-18 for the device's first auction
auctions_1 = auctions_1.drop_duplicates()
auctions_1 = auctions_1.sort_values(by=['device_id','date'])

# forward gap: after the sort, the next row is the next auction of the same
# device unless the device id changes
has_next = auctions_1['device_id'].astype('object').shift(periods=-1) == auctions_1['device_id']
gap_to_next = auctions_1['date'].shift(periods=-1) - auctions_1['date']
gap_to_next = gap_to_next.where(has_next, datetime.datetime(2019,4,20) - auctions_1['date'])
auctions_1['in_seconds'] = gap_to_next.dt.total_seconds()
auctions_1['status_censored'] = has_next
auctions_1['device_id'] = auctions_1['device_id'].astype('object')

# backward gap: same idea looking at the previous row
has_prev = auctions_1['device_id'].astype('object').shift() == auctions_1['device_id']
gap_from_prev = auctions_1['date'] - auctions_1['date'].shift()
gap_from_prev = gap_from_prev.where(has_prev, auctions_1['date'] - datetime.datetime(2019,4,18))
auctions_1['last_seen'] = gap_from_prev.dt.total_seconds()

# back to chronological order
auctions_1 = auctions_1.sort_values(by=['date'])


In [16]:
auctions_1.head(5)

Unnamed: 0,date,device_id,ref_type_id,source_id,in_seconds,status_censored,last_seen
10129864,2019-04-18 00:00:00.015050,1826643666390887030,7,0,38.041873,True,0.01505
25285906,2019-04-18 00:00:00.029014,7037174172278258682,1,0,394.146288,True,0.029014
3946062,2019-04-18 00:00:00.057540,3392065368947589877,1,1,0.585458,True,0.05754
3946063,2019-04-18 00:00:00.126828,1228982273563226229,1,1,9.739882,True,0.126828
24607726,2019-04-18 00:00:00.132510,4123059034628125459,1,8,21.923076,True,0.13251


In [17]:
installs_1 = installs_df.loc[installs_df['created']<'2019-04-20 00:00:00'].copy()
installs_1.head(5)

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_brand,user_agent,event_uuid,kind,wifi,trans_id
12,2019-04-18 04:00:27.575,1,1494519392962156891,1084778553542198153,,False,True,,,5d2c31b2-7009-4388-a30c-b8a119b9695f,app_open,,
13,2019-04-18 08:33:46.588,1,1494519392962156891,6412465357609907698,,False,True,,,c4b4577a-e846-4de7-9d40-643d5c66506e,app_open,,
14,2019-04-18 20:38:01.661,1,1494519392962156891,8455113036752892959,,False,True,,,88b28e3e-31bb-4c11-a937-a79ba175cc05,app_open,,
15,2019-04-18 18:51:18.157,1,1494519392962156891,7939167172078327528,,False,False,,,,,,
22,2019-04-19 04:10:34.820,1,1494519392962156891,2251011940536423208,,False,True,,,e9f099f4-47e3-41a6-99e1-e7e6ee154270,app_open,,


In [18]:
installs_df['application_id'].describe()

count     481511
unique       311
top          121
freq       71487
Name: application_id, dtype: object

In [19]:
installs_1.columns

Index(['created', 'application_id', 'ref_type', 'ref_hash', 'click_hash',
       'attributed', 'implicit', 'device_brand', 'user_agent', 'event_uuid',
       'kind', 'wifi', 'trans_id'],
      dtype='object')

In [20]:
# search for features
auctions_1.columns

Index(['date', 'device_id', 'ref_type_id', 'source_id', 'in_seconds',
       'status_censored', 'last_seen'],
      dtype='object')

In [21]:
auct_cols = auctions_1.columns.tolist()

In [22]:
auctions_1.shape

(10542794, 7)

In [23]:
# Attach install information to every auction of the same device (left join,
# so auctions without any install keep NaN in the install columns)
data_1 = auctions_1.merge(installs_1, left_on='device_id', right_on='ref_hash', how='left')
# keep auctions without an install, and installs created strictly before the auction
data_1 = data_1.loc[data_1['created'].isnull() | (data_1['date'] > data_1['created'])]

In [24]:
# One indicator column per installed application id ('application_id=<id>'),
# plus 'application_id=nan' for rows without an install
app_id_1 = pd.get_dummies(data_1[['application_id']].copy(), dummy_na=True, prefix_sep='=')
# swap the raw id column for its indicator columns, matching rows by index
data_1 = data_1.drop(columns=['application_id'])
data_1 = pd.merge(data_1, app_id_1, left_index=True, right_index=True, how='inner')

In [25]:
app_id_1_columns = app_id_1.columns.tolist()

In [26]:
group_1 = data_1.groupby(auct_cols).agg({col:'sum' for col in app_id_1_columns})

In [27]:
group_1.loc[group_1['application_id=14']>2][['application_id=14']].head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,application_id=14
date,device_id,ref_type_id,source_id,in_seconds,status_censored,last_seen,Unnamed: 7_level_1
2019-04-18 20:10:16.507879,4576801386712364201,1,1,709.971537,True,72616.507879,3
2019-04-18 20:22:06.479416,4576801386712364201,1,1,1.045925,True,709.971537,3
2019-04-18 20:22:07.525341,4576801386712364201,1,1,99472.474659,False,1.045925,3
2019-04-19 03:20:23.891613,3069324546630310450,1,3,74376.108387,False,98423.891613,5
2019-04-19 03:33:28.677124,1705620787460316875,1,1,451.616207,True,78460.727145,5


In [28]:
group_1.reset_index(inplace=True)

In [29]:
group_1.head(5)

Unnamed: 0,date,device_id,ref_type_id,source_id,in_seconds,status_censored,last_seen,application_id=1,application_id=10,application_id=100,...,application_id=90,application_id=91,application_id=93,application_id=94,application_id=95,application_id=96,application_id=97,application_id=98,application_id=99,application_id=nan
0,2019-04-18 00:00:00.015050,1826643666390887030,7,0,38.041873,True,0.01505,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2019-04-18 00:00:00.029014,7037174172278258682,1,0,394.146288,True,0.029014,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2019-04-18 00:00:00.057540,3392065368947589877,1,1,0.585458,True,0.05754,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2019-04-18 00:00:00.126828,1228982273563226229,1,1,9.739882,True,0.126828,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2019-04-18 00:00:00.154022,2544156497510885464,1,1,7.597262,True,0.154022,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [35]:
##delete when no longer needed
#del events_df
#del clicks_df
#del data_1
#del app_id_1
##collect residual garbage
#gc.collect()

163

In [31]:
auctions_1 = pd.merge(auctions_1, group_1, on=['date','device_id','ref_type_id','source_id','in_seconds','status_censored', 'last_seen'], how='left')
auctions_1 = auctions_1.astype({col:'float32' for col in app_id_1_columns})

In [32]:
##delete when no longer needed
#del group_1
##collect residual garbage
#gc.collect()

In [33]:
auctions_1.fillna(value={'application_id=nan':1}, inplace=True)
auctions_1.fillna(value={col:0 for col in app_id_1_columns}, inplace=True)
auctions_1 = auctions_1.astype({col:'int32' for col in app_id_1_columns})

In [34]:
auctions_1.shape

(10542794, 319)

In [36]:
#save to a file
auctions_1.to_csv('data/auctions_1.csv',index=False)

In [None]:
#read file
auctions_1 = pd.read_csv(
    'data/auctions_1.csv',
    low_memory=False,
    dtype={'device_id':'object',
           'ref_type_id':'category','source_id':'category','in_seconds':'float64',
           'status_censored':'bool','last_seen':'float64'})
print('after load ', auctions_1.memory_usage(index=True).sum())
auctions_1['date'] = pd.to_datetime(auctions_1['date'])
# the application indicator columns are counts; store them as int32 to save memory
app_cols = [col for col in auctions_1.columns if col.startswith('application_id')]
auctions_1 = auctions_1.astype({col:'int32' for col in app_cols})
print('after datatype ', auctions_1.memory_usage(index=True).sum())
gc.collect()
print('after GC ', auctions_1.memory_usage(index=True).sum())
auctions_1.dtypes

In [35]:
group_1.shape
#group_1.head(50)

(5202421, 319)

In [1]:
auctions_1.columns

NameError: name 'auctions_1' is not defined

In [658]:
#data X and y
data_full_1 = pd.merge(auctions_1, installs_1, left_on='device_id', right_on='ref_hash', how='inner')
# keep installs created at or before the auction; copy so the column
# assignments below work on an independent frame, not a slice
data_full_1 = data_full_1.loc[data_full_1['date']>=data_full_1['created']].copy()
data_full_1['install_diff'] = data_full_1['date']-data_full_1['created']
data_full_1['install_seconds'] = data_full_1['install_diff'].dt.total_seconds()
# keep rows whose time to the next auction is at least the time since the install
data_full_1 = data_full_1.loc[data_full_1['in_seconds']>=data_full_1['install_seconds']]
data_x_1 = data_full_1.drop(columns=['in_seconds','status_censored','ref_hash'])
# Structured array with one (event flag, time) record per row. The two columns
# must be zipped: passing them as a tuple makes fromiter iterate over the two
# Series objects instead of over the rows.
data_y_1 = np.fromiter(zip(data_full_1["status_censored"], data_full_1["in_seconds"]),
                       dtype=[('status_censored', bool), ('in_seconds', np.float64)],
                       count=len(data_full_1))

In [2]:
class preprocess( BaseEstimator, TransformerMixin ):
    """Encode the raw auction/install columns as a numeric feature frame.

    Steps applied to a copy of the input frame (each one only when the
    source column is present):

    * ``event_uuid``, ``click_hash`` and ``trans_id`` become 0/1 flags
      (1 when a value is present),
    * ``Android`` and ``iOS`` 0/1 flags are derived from ``user_agent``,
    * ``created`` and ``date`` are expanded into weekday/hour/minute
      (and second, for ``date``) columns,
    * the raw columns listed in ``_UNUSED_COLUMNS`` are dropped,
    * the remaining columns are one-hot encoded with ``pd.get_dummies``.

    The output layout is learned while fitting: encoded columns whose sum is
    0 on the fitting data are discarded and the surviving column list is
    stored. ``transform`` always returns exactly those columns, so data seen
    later (e.g. a validation fold) has the same width as the training data.

    Attributes
    ----------
    encoded_columns_ : pandas.Index
        Names of the columns returned by ``transform``.
    """

    # raw columns removed before the one-hot encoding
    _UNUSED_COLUMNS = ['date','created', 'install_diff','device_brand','install_seconds','user_agent','device_id']
    # columns reduced to "a value is present" flags
    _PRESENCE_COLUMNS = ['event_uuid', 'click_hash', 'trans_id']

    def fit( self, X, y = None  ):
        """Learn which encoded columns are non-empty on ``X``; returns self."""
        self.encoded_columns_ = self._non_empty_columns(self._encode(X))
        return self

    def fit_transform(self, X, y = None, **fit_params):
        """Fit on ``X`` and return its encoding, encoding the frame only once."""
        encoded = self._encode(X)
        self.encoded_columns_ = self._non_empty_columns(encoded)
        return encoded.reindex(columns=self.encoded_columns_, fill_value=0)

    def transform(self, X , y = None ):
        """Return the encoded features of ``X`` as a DataFrame.

        Columns learned during fitting that do not appear in ``X`` are added
        and filled with 0; encoded columns not seen during fitting are
        dropped. If the transformer has not been fitted yet, it is fitted on
        ``X`` first.
        """
        if not hasattr(self, 'encoded_columns_'):
            return self.fit_transform(X)
        return self._encode(X).reindex(columns=self.encoded_columns_, fill_value=0)

    def _encode(self, X):
        """Build the derived features and one-hot encode, without column filtering."""
        # work on a copy so the caller's frame is never modified
        X = X.copy()
        # boolean transformations
        for col in self._PRESENCE_COLUMNS:
            if col in X.columns:
                X[col] = np.where(X[col].isnull(), 0, 1)
        if 'user_agent' in X.columns:
            # missing user agents are treated as an empty string (flag = 0)
            agent = X['user_agent'].fillna('').astype(str)
            X['Android'] = np.where(agent.str.contains('Android', regex=False), 1, 0)
            X['iOS'] = np.where(agent.str.contains('Darwin', regex=False) | agent.str.contains('iOS', regex=False), 1, 0)
        # date transformations
        if 'created' in X.columns:
            X['created_weekday'] = X['created'].dt.weekday
            X['created_hour'] = X['created'].dt.hour
            X['created_minute'] = X['created'].dt.minute
        if 'date' in X.columns:
            X['date_weekday'] = X['date'].dt.weekday
            X['date_hour'] = X['date'].dt.hour
            X['date_minute'] = X['date'].dt.minute
            X['date_second'] = X['date'].dt.second
        #remove unused columns
        X = X.drop(columns=[col for col in self._UNUSED_COLUMNS if col in X.columns])
        return pd.get_dummies(X, dummy_na=True, prefix_sep='=')

    @staticmethod
    def _non_empty_columns(encoded):
        """Names of the columns of ``encoded`` whose sum differs from 0."""
        return pd.Index([col for col in encoded.columns if encoded[col].sum() != 0])

NameError: name 'BaseEstimator' is not defined

In [17]:
# format features
#data_x_1['event_uuid'] = np.where(data_x_1['event_uuid'].isnull(), 0,1)
#data_x_1['click_hash'] = np.where(data_x_1['click_hash'].isnull(), 0,1)
#data_x_1['Android'] = np.where(data_x_1['user_agent'].str.contains('Android', regex=False),1,0)
#data_x_1['iOS'] = np.where(data_x_1['user_agent'].str.contains('Darwin', regex=False) | data_x_1['user_agent'].str.contains('iOS', regex=False),1,0)
#data_x_1['trans_id'] = np.where(data_x_1['trans_id'].isnull(), 0,1)
#data_x_1['created_weekday'] = data_x_1['created'].dt.weekday
#data_x_1['created_hour'] = data_x_1['created'].dt.hour
#data_x_1['created_minute'] = data_x_1['created'].dt.minute
#data_x_1['date_weekday'] = data_x_1['date'].dt.weekday
#data_x_1['date_hour'] = data_x_1['date'].dt.hour
#data_x_1['date_minute'] = data_x_1['date'].dt.minute
#data_x_1['date_second'] = data_x_1['date'].dt.second
#data_x_1.drop(columns=['date','created', 'install_diff','device_brand','install_seconds','user_agent'], inplace=True)
#data_x_1_numeric = pd.get_dummies(data_x_1, dummy_na=True, prefix_sep='=')


In [54]:
# (event flag, time) records for the first 100 rows only, for quick experiments.
# The builtin bool is used because the np.bool alias no longer exists in
# current NumPy releases; both name the same type.
data_y_1 = np.fromiter(zip(data_full_1.head(100)["status_censored"], data_full_1.head(100)["in_seconds"]),
                                dtype=[('status_censored', bool), ('in_seconds', np.float64)])

NameError: name 'data_full_1' is not defined

In [408]:
estimator = CoxPHSurvivalAnalysis(alpha=0.1)
estimator.fit(data_x_1_numeric.head(100), data_y_1)
estimator.score(data_x_1_numeric.head(100), data_y_1)

0.8191370444891571

In [8]:
def fit_and_score_features(X, y, alpha=0.1):
    """Score every feature on its own with a one-variable Cox model.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Feature matrix; it is sliced column by column.
    y : structured array
        Survival target passed unchanged to the Cox model.
    alpha : float, default 0.1
        Regularisation strength given to ``CoxPHSurvivalAnalysis``.

    Returns
    -------
    numpy.ndarray
        One score per column of ``X``, in column order.
    """
    model = CoxPHSurvivalAnalysis(alpha=alpha)

    def score_column(idx):
        # the j:j+1 slice keeps the column two-dimensional
        column = X[:, idx:idx+1]
        model.fit(column, y)
        return model.score(column, y)

    return np.array([score_column(idx) for idx in range(X.shape[1])], dtype=float)


In [None]:
scores = fit_and_score_features(data_x_1_numeric.head(100).values, data_y_1)
pd.Series(scores, index=data_x_1_numeric.columns).sort_values(ascending=False)

In [9]:
def custom_cv_folds(X, boundaries=('2019-04-18 12:00:00', '2019-04-19 00:00:00',
                                   '2019-04-19 12:00:00', '2019-04-20 00:00:00')):
    """Build time-ordered (train, test) folds from the ``date`` column of ``X``.

    The boundaries cut the rows into consecutive time segments: the first
    segment holds every row before ``boundaries[0]``, and each following
    segment holds the rows with ``boundaries[i-1] <= date < boundaries[i]``.
    Each fold trains on one segment and tests on the segment right after it.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``date`` column comparable to the boundary values.
    boundaries : sequence, optional
        Increasing segment limits. The default reproduces the three
        half-day folds between 2019-04-18 and 2019-04-20.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        Train and test row positions for each fold. Positions, not index
        labels, are returned, so the folds are valid whatever the index of
        ``X`` is. For a default ``RangeIndex`` the two coincide.
    """
    dates = X['date']
    # first segment is open on the left
    segments = [np.flatnonzero((dates < boundaries[0]).values)]
    for lower, upper in zip(boundaries[:-1], boundaries[1:]):
        in_segment = (lower <= dates) & (dates < upper)
        segments.append(np.flatnonzero(in_segment.values))
    # fold i: train on segment i, test on segment i + 1
    return list(zip(segments[:-1], segments[1:]))


In [10]:
# Fitted pipeline transformers are cached on disk in a throw-away directory
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)
# encode -> reduce dimensionality -> Cox proportional hazards model
pipe = Pipeline(
    steps=[
        ('preprocess', preprocess()),
        ('reduce_dim', NMF()),
        #('select', SelectKBest(fit_and_score_features, k=50)),
        ('model', CoxPHSurvivalAnalysis(alpha=0.01)),
    ],
    memory=memory,
)

In [11]:
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [22]:
#auctions data
# features: everything except the two survival-target columns
# (drop() already returns a new frame, so no extra copy is needed)
data_x_1 = auctions_1.drop(columns=['in_seconds','status_censored'])
# target: one (event flag, time) record per auction; the builtin bool replaces
# the np.bool alias, which no longer exists in current NumPy releases
data_y_1 = np.fromiter(zip(auctions_1["status_censored"], auctions_1["in_seconds"]),
                       dtype=[('status_censored', bool), ('in_seconds', np.float64)],
                       count=len(auctions_1))


In [23]:
#delete when no longer needed
#del events_df
#del clicks_df
#del data_1
#del app_id_1
#del group_1
del auctions_1
#collect residual garbage
gc.collect()

162

In [None]:
# grid search
# drop=True gives the fold builder a fresh 0..n-1 index without adding the old
# index as an 'index' column, which would otherwise reach the model as a feature
data_x_11 = data_x_1.reset_index(drop=True)
custom_cv = custom_cv_folds(data_x_11)
# The searched parameter must belong to a step that exists in `pipe`: the
# 'select' step is commented out there, so the size of 'reduce_dim' is tuned.
param_grid = {'reduce_dim__n_components': np.arange(3, data_x_11.shape[1] + 1)}
gcv = GridSearchCV(pipe, param_grid, return_train_score=True, cv=custom_cv, iid=True) #, n_jobs=-1)
gcv.fit(data_x_11, data_y_1)

pd.DataFrame(gcv.cv_results_).sort_values(by='mean_test_score', ascending=False)

In [1]:
gcv.best_params_

NameError: name 'gcv' is not defined

In [None]:
# random search
# reset_index(drop=True) returns a new frame: data_x_1 stays untouched (so the
# cell can be re-run) and the old index is not added as an 'index' feature
data_x_11 = data_x_1.reset_index(drop=True)
custom_cv = custom_cv_folds(data_x_11)
#param_grid = {'select__k': np.arange(1, data_x_11.shape[1] + 1)}
param_grid = {'reduce_dim__n_components': np.arange(3, data_x_11.shape[1] + 1)}
# fixed random_state so the sampled candidates are the same on every run
rcv = RandomizedSearchCV(pipe, param_grid, return_train_score=True, cv=custom_cv, iid=True, n_iter=10, n_jobs=2,
                         random_state=0)

gc.collect() #release cache

try:
    rcv.fit(data_x_11, data_y_1)
finally:
    # Delete the temporary cache before exiting, even when the search fails
    rmtree(cachedir, ignore_errors=True)

gc.collect() #release cache

pd.DataFrame(rcv.cv_results_).sort_values(by='mean_test_score', ascending=False).head(20)

In [1]:
rcv.best_params_

NameError: name 'rcv' is not defined

In [564]:
data_x_11 = data_x_1.head(100).copy()
pipe.set_params(**gcv.best_params_)
pipe.fit(data_x_11, data_y_1)

joblib.dump(pipe, 'data/model.sav')
encoder, transformer, final_estimator = [s[1] for s in pipe.steps]

last_seen                      -0.000039
created_weekday                 0.760496
date_hour                       0.016006
date_second                     0.021819
ref_type_id=1                   0.117072
ref_type_id=7                  -0.117072
source_id=1                    -0.890550
source_id=3                     0.539926
source_id=7                    -1.156081
application_id=121             -0.333614
application_id=210             -0.006043
ref_type=1494519392962156891   -0.117072
ref_type=1891515180541284343    0.117072
kind=open                      -0.107620
wifi=nan                        0.273920
dtype: float64

In [None]:
# Label the Cox coefficients with feature names when the middle pipeline step is
# a selector exposing get_support() (e.g. SelectKBest). A decomposition step
# such as NMF has no get_support(); its coefficients refer to components and
# keep the default positional index.
if hasattr(transformer, 'get_support'):
    coef_index = encoder.encoded_columns_[transformer.get_support()]
else:
    coef_index = None
pd.Series(final_estimator.coef_, index=coef_index)

In [565]:
data_x_11 = data_x_1.head(100).copy()
pipe.score(data_x_11, data_y_1)

0.7289740698985344

In [490]:
data_x_11.columns

Index(['date', 'ref_type_id', 'source_id', 'created', 'application_id',
       'ref_type', 'click_hash', 'attributed', 'implicit', 'device_brand',
       'user_agent', 'event_uuid', 'kind', 'wifi', 'trans_id', 'install_diff',
       'install_seconds'],
      dtype='object')

In [485]:
data_y_1.shape

(100,)

In [333]:
data_x_1_numeric.shape

(31377, 1601)

In [336]:
data_y_1.shape

(31377,)

In [None]:
test1 = auctions_1[['device_id','date']]

In [89]:
test1 = test1.sort_values(by=['device_id','date'])

In [90]:
test1 = test1.head(1000)

In [91]:
test1.shape

(1000, 2)

In [92]:
test1['date_dif'] = test1['date'].shift(periods=-1) - test1['date']

In [97]:
test1['device_id_next'] = test1['device_id'].astype('object').shift(periods=-1)

In [99]:
test1['date_dif'] = np.where(test1['device_id_next']==test1['device_id'], test1['date_dif'], datetime.datetime(2019,4,20)-test1['date'])

In [101]:
test1['in_seconds'] = test1['date_dif'].dt.total_seconds()

In [103]:
test1['status-censored'] = test1['device_id_next']==test1['device_id']

In [104]:
test1.head(50)

Unnamed: 0,device_id,date,date_dif,in_seconds,device_id_next,status-censored
30900602,41863526108385,2019-04-19 19:40:28.465866,0 days 04:19:31.534134,15571.534134,161514654074162,False
46872336,161514654074162,2019-04-18 02:52:46.357746,0 days 00:00:28.744020,28.74402,161514654074162,True
7330779,161514654074162,2019-04-18 02:53:15.101766,0 days 00:00:25.982516,25.982516,161514654074162,True
16177030,161514654074162,2019-04-18 02:53:41.084282,0 days 00:00:18.239779,18.239779,161514654074162,True
19683863,161514654074162,2019-04-18 02:53:59.324061,0 days 00:00:02.851261,2.851261,161514654074162,True
22213367,161514654074162,2019-04-18 02:54:02.175322,0 days 00:00:47.365665,47.365665,161514654074162,True
20455826,161514654074162,2019-04-18 02:54:49.540987,1 days 21:05:10.459013,162310.459013,186034136943920,False
16127540,186034136943920,2019-04-18 16:42:46.331894,0 days 02:40:55.874138,9655.874138,186034136943920,True
4859451,186034136943920,2019-04-18 19:23:42.206032,1 days 04:36:17.793968,102977.793968,283297668933729,False
293739,283297668933729,2019-04-18 22:24:44.520583,1 days 01:35:15.479417,92115.479417,345999128501141,False


In [61]:
test1['date_dif'] = test1['date'].shift(periods=-1) - test1['date']

In [69]:
test1.head(50)

Unnamed: 0,date,device_id,date_dif,in_seconds
36601349,2019-04-23 15:00:57.222979,40621410000000.0,NaT,
30900602,2019-04-19 19:40:28.465866,41863530000000.0,07:11:58.427014,25918.427014
14875535,2019-04-20 02:52:26.892880,41863530000000.0,00:06:35.616350,395.61635
32062481,2019-04-20 02:59:02.509230,41863530000000.0,00:06:59.166558,419.166558
45613542,2019-04-20 03:06:01.675788,41863530000000.0,00:02:55.712372,175.712372
42215232,2019-04-20 03:08:57.388160,41863530000000.0,00:02:29.075743,149.075743
22374394,2019-04-20 03:11:26.463903,41863530000000.0,00:00:14.547768,14.547768
12690007,2019-04-20 03:11:41.011671,41863530000000.0,00:00:45.669391,45.669391
42215538,2019-04-20 03:12:26.681062,41863530000000.0,00:00:30.801420,30.80142
12690107,2019-04-20 03:12:57.482482,41863530000000.0,00:01:58.661626,118.661626
