In [1]:
import pandas as pd
import numpy as np
import datetime
import gc
from sksurv.preprocessing import OneHotEncoder
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import NMF, PCA
from sklearn.externals import joblib
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

In [2]:
## Import the sound-alert dependencies
from IPython.display import Audio, display

def allDone():
    """Play a notification sound in the notebook output.

    Called at the end of long-running cells in this notebook. Swap the URL
    below for any other audio file you prefer.
    """
    alert = Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True)
    display(alert)

In [3]:
def fix_str_float(ds, col):
    """Clean a string column down to digits and '.' and store it as float32.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame holding the column; ``ds[col]`` is overwritten in place.
    col : str
        Name of a string column.

    Returns
    -------
    pandas.Series
        A float32 copy of the cleaned column. Entries that contain no digit
        or '.' at all become NaN.
    """
    # regex=True is explicit: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave the junk characters in place.
    digits_only = ds[col].str.replace(r'[^0-9\.]', '', regex=True)
    # Strings that were emptied by the cleanup cannot be parsed -> NaN.
    digits_only = digits_only.where(digits_only != '', np.nan)
    ds[col] = digits_only.astype('float32')
    return ds[col].copy()

In [4]:
def fit_and_score_features(X, y, alpha=0.1):
    """Score every feature on its own with a one-feature Cox model.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; it is sliced positionally (``X[:, j:j+1]``).
    y : structured array
        Survival target passed straight to ``CoxPHSurvivalAnalysis.fit``.
    alpha : float, default 0.1
        Regularisation strength given to ``CoxPHSurvivalAnalysis``.

    Returns
    -------
    numpy.ndarray
        One value per column: the model's ``score`` on that single feature.
    """
    cox = CoxPHSurvivalAnalysis(alpha=alpha)

    def _univariate_score(column):
        # keep the slice 2-D so the estimator sees a one-column matrix
        single = X[:, column:column + 1]
        cox.fit(single, y)
        return cox.score(single, y)

    return np.array([_univariate_score(column) for column in range(X.shape[1])], dtype=float)


In [5]:
def remove_unused_cols(X):
    """Drop numeric columns that carry no information (all zero and/or NaN).

    The previous ``sum() == 0`` test also dropped columns whose values merely
    cancel out (e.g. -1 and 1); each value is now checked individually.

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        ``X`` itself when nothing has to be dropped, otherwise a new frame
        without the empty numeric columns. Non-numeric columns are never
        dropped.
    """
    numeric = X.select_dtypes(include='number')
    # NaN counts as "empty", matching the old skip-NaN sum behaviour.
    is_empty = (numeric.fillna(0) == 0).all(axis=0)
    to_drop = [col for col, empty in is_empty.items() if empty]
    if to_drop:
        X = X.drop(columns=to_drop)
    return X

In [6]:
class preprocess(BaseEstimator, TransformerMixin):
    """Stateless feature-engineering step for the auction frames.

    ``transform`` works on a copy of its input, so the caller's DataFrame is
    never modified. It:

    * turns ``event_uuid``, ``click_hash`` and ``trans_id`` into 0/1
      "value present" flags,
    * refreshes existing ``Android`` / ``iOS`` flag columns from ``user_agent``,
    * expands the ``created`` and ``date`` timestamps into weekday / hour /
      minute (and second, for ``date``) columns,
    * drops the raw columns that are not used as features, and
    * one-hot encodes the remaining columns with ``pd.get_dummies``.
    """

    # Columns replaced by a flag: 0 when the value is null, 1 otherwise.
    _PRESENCE_FLAG_COLS = ('event_uuid', 'click_hash', 'trans_id')
    # Raw columns removed once the derived features exist.
    _DROPPED_COLS = ('date', 'created', 'install_diff', 'device_brand',
                     'install_seconds', 'user_agent', 'device_id')

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X`` as a pandas DataFrame."""
        # Copy first: the column assignments below would otherwise leak into
        # the caller's frame (e.g. the notebook-level `auctions_sample`).
        X = X.copy()

        # boolean transformations
        for col in self._PRESENCE_FLAG_COLS:
            if col in X.columns:
                X[col] = np.where(X[col].isnull(), 0, 1)

        # OS flags are only refreshed when the flag column already exists and
        # there is a user agent to derive it from. na=False keeps a missing
        # user agent from being flagged (NaN is truthy inside np.where).
        if 'user_agent' in X.columns:
            agent = X['user_agent']
            if 'Android' in X.columns:
                X['Android'] = np.where(agent.str.contains('Android', regex=False, na=False), 1, 0)
            if 'iOS' in X.columns:
                is_ios = (agent.str.contains('Darwin', regex=False, na=False)
                          | agent.str.contains('iOS', regex=False, na=False))
                X['iOS'] = np.where(is_ios, 1, 0)

        # date transformations (both columns must already be datetime typed)
        if 'created' in X.columns:
            X['created_weekday'] = X['created'].dt.weekday
            X['created_hour'] = X['created'].dt.hour
            X['created_minute'] = X['created'].dt.minute
        if 'date' in X.columns:
            X['date_weekday'] = X['date'].dt.weekday
            X['date_hour'] = X['date'].dt.hour
            X['date_minute'] = X['date'].dt.minute
            X['date_second'] = X['date'].dt.second

        # remove unused columns
        to_drop = [col for col in self._DROPPED_COLS if col in X.columns]
        X = X.drop(columns=to_drop)

        # One-hot encode; dummy_na=True adds an explicit "=nan" indicator.
        X = pd.get_dummies(X, dummy_na=True, prefix_sep='=')
        # returns a pandas DataFrame (not a numpy array)
        return X

    

In [7]:
def custom_cv_folds(X):
    """Build the four time-ordered (train, test) folds used for model selection.

    Each fold trains on one date window of ``X['date']`` and tests on the
    three days that follow it. The first training window has no lower bound.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a datetime ``date`` column.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        Positional row indices for train and test, usable as the ``cv``
        argument of scikit-learn search objects. Positions are returned
        (rather than index labels) so the folds stay valid when ``X`` does
        not have a default 0..n-1 index; with a default index the result is
        the same as before.
    """
    # (train_start, train_end, test_end); the test window starts at train_end.
    # All windows are half-open: start <= date < end.
    windows = [
        (None, '2019-04-21 00:00:00', '2019-04-24 00:00:00'),
        ('2019-04-19 00:00:00', '2019-04-22 00:00:00', '2019-04-25 00:00:00'),
        ('2019-04-20 00:00:00', '2019-04-23 00:00:00', '2019-04-26 00:00:00'),
        ('2019-04-21 00:00:00', '2019-04-24 00:00:00', '2019-04-27 00:00:00'),
    ]
    dates = X['date']

    def _positions(start, end):
        # positional indices of rows with start <= date < end
        mask = dates < end
        if start is not None:
            mask = mask & (dates >= start)
        return np.flatnonzero(mask.values)

    return [(_positions(train_start, train_end), _positions(train_end, test_end))
            for train_start, train_end, test_end in windows]


In [8]:
import warnings

# Silence pandas' SettingWithCopyWarning wherever this pandas version defines it.
# The class moved from pandas.core.common to pandas.errors and is absent from
# some releases, so try both locations and skip the filter if neither exists.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [9]:
# Load the sampled device ids; device_id is kept as object dtype so the ids
# are not parsed as numbers.
auctions_ids_sample = pd.read_csv(
    'data/auctions_ids_sample.csv',
    dtype={'device_id': 'object'},
).loc[:, 'device_id']

In [10]:
# Load the three raw tables and keep only the rows of the sampled devices.

# auctions
auctions_df = pd.read_csv(
    'data/auctions.csv', low_memory=False,
    dtype={'country':'category','platform':'category',
           'ref_type_id':'category','source_id':'category','device_id':'object'})
auctions_df['date'] = pd.to_datetime(auctions_df['date'])
auctions_sample = auctions_df.loc[auctions_df['device_id'].isin(auctions_ids_sample)].copy()

# installs
installs_df = pd.read_csv(
    'data/installs.csv', low_memory=False,
    dtype={'ref_type':'category','application_id':'category',
           'device_brand':'category','ref_hash':'object','wifi':'category'})
# normalise the install kind to lower case before making it categorical
installs_df['kind'] = installs_df['kind'].str.lower().astype('category')
installs_df.drop(columns=['session_user_agent','ip_address','device_language','device_model',
                          'device_countrycode'], inplace=True)
installs_df['created'] = pd.to_datetime(installs_df['created'])
installs_sample = installs_df.loc[installs_df['ref_hash'].isin(auctions_ids_sample)].copy()
#installs_sample.to_csv('data/installs_sample.csv', index=False)

# events
events_df = pd.read_csv(
    'data/events.csv', low_memory=False,
    dtype={'event_id':'int32','ref_type':'category','application_id':'category',
           'attributed':'bool','device_countrycode':'category','device_city':'category',
           'trans_id':'category','carrier':'category','device_os':'category',
           'connection_type':'category'})
events_df['date'] = pd.to_datetime(events_df['date'])
# astype() has no `inplace` option; the converted column must be assigned back.
# NOTE(review): missing values become True under astype('bool') - fill them
# first if that is not the intended meaning.
events_df['wifi'] = events_df['wifi'].astype('bool')
events_df.drop(columns=['device_countrycode','session_user_agent','ip_address','device_language'], inplace=True)
events_sample = events_df.loc[events_df['ref_hash'].isin(auctions_ids_sample)].copy()
#events_sample.to_csv('data/events_sample.csv', index=False)

# free the full tables; only the *_sample frames are used from here on
del events_df
del auctions_df
del installs_df
gc.collect()
allDone()
print('setup done')

setup done


In [12]:
# Sanity check: (rows, columns) of the sampled auctions frame.
auctions_sample.shape

(241085, 4)

In [221]:
# Time (in seconds) from each auction to the same device's next auction.
auctions_sample.drop_duplicates(inplace=True)
# Sort by device then time so that shift(-1) / shift() give the neighbouring
# auction of the same device (or a row of another device at a boundary).
auctions_sample = auctions_sample.sort_values(by=['device_id','date'])
auctions_sample['date_dif'] = auctions_sample['date'].shift(periods=-1) - auctions_sample['date']
auctions_sample['device_id_next'] = auctions_sample['device_id'].astype('object').shift(periods=-1)
# If the next row is the same device, keep the observed gap. Otherwise use the
# time left until 2019-04-27 00:00 when date + pd.DateOffset(3) passes that
# bound, else a flat 3 days.
auctions_sample['date_dif'] = pd.to_timedelta(np.where(auctions_sample['device_id_next']==auctions_sample['device_id'], auctions_sample['date_dif'], np.where(auctions_sample['date']+pd.DateOffset(3)>'2019-04-27 00:00:00', datetime.datetime(2019,4,27)-auctions_sample['date'], pd.to_timedelta(3, unit='d'))))
auctions_sample['in_seconds'] = auctions_sample['date_dif'].dt.total_seconds()
# True only when a following auction of the same device exists within
# 259200 s (3 days).
# NOTE(review): despite the name, True marks rows where the next auction WAS
# observed - confirm this matches the event-indicator convention the survival
# model expects.
auctions_sample['status_censored'] = ((auctions_sample['device_id_next']==auctions_sample['device_id']) & (auctions_sample['in_seconds']<259200.0))
auctions_sample.drop(['device_id_next','date_dif'], axis='columns', inplace=True)
auctions_sample['device_id'] = auctions_sample['device_id'].astype('object')
# Time (in seconds) since the same device's previous auction ("last_seen").
auctions_sample['date_prev'] = auctions_sample['date'].shift()
auctions_sample['date_dif_prev'] = auctions_sample['date']- auctions_sample['date_prev']
auctions_sample['device_id_prev'] = auctions_sample['device_id'].astype('object').shift()
# Mirror of the forward rule: observed gap when the previous row is the same
# device; otherwise the time since 2019-04-18 00:00 when date - pd.DateOffset(3)
# falls before that bound, else a flat 3 days.
auctions_sample['date_dif_prev'] = pd.to_timedelta(np.where(auctions_sample['device_id_prev']==auctions_sample['device_id'], auctions_sample['date_dif_prev'], np.where(auctions_sample['date']-pd.DateOffset(3)<'2019-04-18 00:00:00', auctions_sample['date']-datetime.datetime(2019,4,18), pd.to_timedelta(3, unit='d'))))
auctions_sample['last_seen'] = auctions_sample['date_dif_prev'].dt.total_seconds()
auctions_sample.drop(['device_id_prev','date_dif_prev','date_prev'], axis='columns', inplace=True)
# Back to chronological order for the rest of the notebook.
auctions_sample = auctions_sample.sort_values(by=['date'])



In [222]:
# Snapshot of the auction-level column names (the next cell assigns the same
# list again before using it as group-by keys).
auct_cols = auctions_sample.columns.tolist()

In [224]:
# Build per-auction features: counts of the device's earlier installs/events
# per application and kind, plus the relative auction volume of the hour.
# The numbered prints are progress markers for this long-running cell.
auct_cols = auctions_sample.columns.tolist()
print(1)
# installs and events
events_sample['kind'] = 'event'
print(2)
installs_sample['kind'] = installs_sample['kind'].str.replace(' ', '_', regex=False).str.lower()
print(3)
# information about last installs and events
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
applications = pd.concat(
    [installs_sample[['ref_hash','created','application_id','kind']],
     events_sample[['ref_hash', 'date', 'application_id','kind']].rename(columns = {'date':'created'})],
    ignore_index=True)
print(4)
data_1 = pd.merge(auctions_sample, applications, left_on='device_id', right_on='ref_hash', how='left')
print(5)
# only previous installs or events (auctions without any match are kept too)
data_1 = data_1.loc[(data_1['date']>data_1['created']) | data_1['created'].isnull()]
print(6)

# application_id feature by id
app_id_1 = data_1[['application_id','kind']].copy()
print(7)
app_id_1 = pd.get_dummies(app_id_1, dummy_na=True, prefix_sep='=')
print(8)
data_1.drop(columns=['application_id'], inplace=True)
print(9)
data_1 = pd.merge(data_1, app_id_1, left_index=True, right_index=True, how='inner')
print(10)

app_id_1_columns = app_id_1.columns.tolist()
print(11)

# one row per auction: sum the one-hot indicators over the matched installs/events
group_1 = data_1.groupby(auct_cols).agg({col:'sum' for col in app_id_1_columns})
print(12)
group_1.reset_index(inplace=True)

del installs_sample
del events_sample
gc.collect()
print(13)
# join on the same keys the group-by used, so the two lists cannot drift apart
auctions_sample = pd.merge(auctions_sample, group_1, on=auct_cols, how='left')
print(14)
auctions_sample = auctions_sample.astype({col:'float32' for col in app_id_1_columns})
print(15)

# share of auctions in each (day, hour) slot relative to the busiest slot
auctions_sample['hora'] = auctions_sample['date'].dt.hour
auctions_sample['dia'] = auctions_sample['date'].dt.day
pivot_auctions = auctions_sample.groupby(['dia','hora']).size().reset_index()
pivot_auctions.columns = ['dia','hora', 'hour_day_dist']
max_auctions = pivot_auctions['hour_day_dist'].max()
pivot_auctions['hour_day_dist'] = pivot_auctions['hour_day_dist']/max_auctions
display(pivot_auctions.head(5))
print(auctions_sample.shape)
auctions_sample = pd.merge(auctions_sample, pivot_auctions, on=['dia','hora'], how='left')
auctions_sample.drop(columns=['hora','dia'], inplace=True)
print(auctions_sample.shape)

# auctions that lost all their rows in the "previous only" filter: mark them
# as "no application / no kind", then zero every other count
auctions_sample.fillna(value={'application_id=nan':1,'kind=nan':1}, inplace=True)
print(16)
auctions_sample.fillna(value={col:0 for col in app_id_1_columns}, inplace=True)
print(17)
auctions_sample = auctions_sample.astype({col:'int32' for col in app_id_1_columns})
print(18)
auctions_sample.reset_index(inplace=True, drop=True)
print(19)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


Unnamed: 0,dia,hora,hour_day_dist
0,18,0,0.720353
1,18,1,0.625736
2,18,2,0.664424
3,18,3,0.651388
4,18,4,0.660219


(241085, 128)
(241085, 127)
16
17
18
19


In [225]:
# Total memory footprint of the feature frame in bytes (index included).
auctions_sample.memory_usage(index=True).sum()

125123691

In [226]:
# Compact the dtypes, drop one column of every strongly anti-correlated pair
# and save the merged sample.

# Target dtype of the known columns; every other column is a count feature
# stored as int16. 'datetime64[ns]' carries an explicit unit because newer
# pandas rejects the unit-less 'datetime64'.
special_dtypes = {
    'date': 'datetime64[ns]',
    'device_id': 'object',
    'ref_type_id': 'category',
    'source_id': 'category',
    'in_seconds': 'float64',
    'last_seen': 'float64',
    'hour_day_dist': 'float64',
    'status_censored': 'bool',
}
new_dtypes = {col: special_dtypes.get(col, 'int16') for col in auctions_sample.columns}
auctions_sample = auctions_sample.astype(new_dtypes, copy=False)

features = np.setdiff1d(auctions_sample.columns, ['in_seconds', 'status_censored'] ).tolist()
# Correlate numeric/bool columns only: newer pandas raises on date, object and
# category columns instead of silently skipping them.
corr = auctions_sample[features].select_dtypes(include=['number', 'bool']).corr()
# keep the lower triangle so every pair is looked at once
corr = pd.DataFrame(np.tril(corr), columns=corr.columns, index=corr.index) #triangular inferior
# columns taking part in a pair with correlation below -0.95
removed_corr = corr.loc[(corr<-0.95).any()].index.tolist()
print(removed_corr)
corr_columns = np.setdiff1d(auctions_sample.columns, removed_corr ).tolist()
# identifier / target / key columns are always kept, whatever the filter said
for required in ['date', 'device_id', 'ref_type_id', 'source_id',
                 'in_seconds', 'last_seen', 'status_censored', 'hour_day_dist']:
    if required not in corr_columns:
        corr_columns.append(required)

auctions_sample = auctions_sample[corr_columns]
# dtype map of the surviving columns (shown in the next cell)
new_dtypes = {col: special_dtypes.get(col, 'int16') for col in auctions_sample.columns}
auctions_sample.to_csv('data/auctions_sample_merged_05.csv', index=False)



[]


In [227]:
# Signal that the previous cell finished, then show the column -> dtype map.
allDone()
new_dtypes

{'application_id=101': 'int16',
 'application_id=116': 'int16',
 'application_id=117': 'int16',
 'application_id=121': 'int16',
 'application_id=122': 'int16',
 'application_id=123': 'int16',
 'application_id=126': 'int16',
 'application_id=128': 'int16',
 'application_id=129': 'int16',
 'application_id=13': 'int16',
 'application_id=133': 'int16',
 'application_id=135': 'int16',
 'application_id=14': 'int16',
 'application_id=140': 'int16',
 'application_id=143': 'int16',
 'application_id=145': 'int16',
 'application_id=147': 'int16',
 'application_id=153': 'int16',
 'application_id=155': 'int16',
 'application_id=158': 'int16',
 'application_id=159': 'int16',
 'application_id=161': 'int16',
 'application_id=163': 'int16',
 'application_id=165': 'int16',
 'application_id=167': 'int16',
 'application_id=170': 'int16',
 'application_id=175': 'int16',
 'application_id=180': 'int16',
 'application_id=185': 'int16',
 'application_id=187': 'int16',
 'application_id=188': 'int16',
 'applicat

In [None]:
# (rows, columns) of the feature frame after the correlation filter.
auctions_sample.shape

# old code:

In [44]:
# Grid search over the number of NMF components for the NMF -> Cox pipeline,
# then refit the pipeline with the best setting and persist it.
df_chunk = auctions_sample
#remove empty columns
print('start')
df_chunk = remove_unused_cols(df_chunk)
print('unused cols')
# the folds are built from the 'date' column, so this has to run before
# preprocess drops that column
custom_cv = custom_cv_folds(df_chunk)
print('custom_cv')
df_chunk = preprocess().transform(df_chunk)

print('transform whole data')

cachedir = mkdtemp()
try:
    memory = Memory(location=cachedir, verbose=10)
    print('memory cached', cachedir)
    #pipeline
    pipe = Pipeline([('preprocess', preprocess()),
                     #('select', SelectKBest(fit_and_score_features, k=50)),
                     ('reduce_dim', NMF(max_iter=100)),
                     ('model', CoxPHSurvivalAnalysis(alpha=0.01))], memory=memory)
    print('pipeline')
    #hyperparameters search
    N_FEATURES_OPTIONS = [3, 4, 5] #np.arange(3, df_chunk.shape[1] + 1)
    param_grid = {
        #'select__k': N_FEATURES_OPTIONS
        'reduce_dim__n_components': N_FEATURES_OPTIONS
    }
    rcv = GridSearchCV(pipe, param_grid, return_train_score=True, cv=custom_cv, iid=True)
    print('Grinf config')
    #get X and y data
    # structured target (event flag, time); the builtin `bool` replaces the
    # `np.bool` alias that NumPy 1.24 removed
    data_y = np.fromiter(zip(df_chunk["status_censored"], df_chunk["in_seconds"]), dtype=[('status_censored', bool), ('in_seconds', np.float64)])
    df_chunk = df_chunk.drop(columns=['in_seconds','status_censored'])
    print('X, y data')
    #gc.collect() #release cache
    #start searching hyperparameters
    rcv.fit(df_chunk, data_y)
    print('Grid Fit')

    #with hyperparameters fit pipeline
    pipe.set_params(**rcv.best_params_)
    print('pipe params')
    pipe.fit(df_chunk, data_y)
    joblib.dump(pipe, 'model.sav')
    print('pipe fit')
finally:
    # always remove the temporary joblib cache, even if the search fails or
    # is interrupted
    rmtree(cachedir)
allDone()



start
unused cols
custom_cv
transform whole data
memory cached /var/folders/4j/xd9tnb7n42z3jsjhy_wqwcyc0000gn/T/tmpla4i__uq
pipeline
Grinf config
X, y data
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(preprocess(),             last_seen  application_id=101  application_id=116  \
0            3.049129                   0                   0   
1            4.198257                   0                   0   
2            0.377013                   0                   0   
3            4.713358                   0                   0   
4            0.531197                   0                   0   
5            5.467602                   0                   0   
6            6.280256                   0                   0   
7            9.017695                   0                   0   
8           12.009020                   0                   0   
9           13.363530   

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 6.9s, 0.1min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=100,
  n_components=4, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0), 
          last_seen  application_id=101  application_id=116  \
0          3.049129                   0                   0   
1          4.198257                   0                   0   
2          0.377013                   0                   0   
3          4.713358                   0                   0   
4          0.531197                   0                   0   
5          5.467602                   0                   0   
6          6.280256                   0                   0   
7          9.017695                   0                   0   
8         12.009020        

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


_______________________________________________fit_transform_one - 17.0s, 0.3min
Grid Fit
pipe params
[Memory]6793.5s, 113.2min: Loading _fit_transform_one from /var/folders/4j/xd9tnb7n42z3jsjhy_wqwcyc0000gn/T/tmpla4i__uq/joblib/sklearn/pipeline/_fit_transform_one/c1b8ba70e6e4b48573ae0fd161912cf8
___________________________________fit_transform_one cache loaded - 0.4s, 0.0min
[Memory]6794.6s, 113.2min: Loading _fit_transform_one from /var/folders/4j/xd9tnb7n42z3jsjhy_wqwcyc0000gn/T/tmpla4i__uq/joblib/sklearn/pipeline/_fit_transform_one/49d0f1b8262d29b942584517acb8a1f5
___________________________________fit_transform_one cache loaded - 0.0s, 0.0min
pipe fit


In [51]:
# Search results, best mean test score first.
pd.DataFrame(rcv.cv_results_).sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reduce_dim__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
1,47.601805,0.618391,275.817964,32.387254,4,{'reduce_dim__n_components': 4},0.537848,0.538023,0.545068,0.541817,0.540686,0.003003,1,0.537016,0.536272,0.541994,0.540719,0.539,0.002413
0,48.975152,1.538559,255.89334,22.571795,3,{'reduce_dim__n_components': 3},0.538079,0.537701,0.546252,0.540555,0.540661,0.003443,2,0.538021,0.536572,0.542058,0.541048,0.539425,0.002218
2,48.817669,0.426649,244.821878,15.144905,5,{'reduce_dim__n_components': 5},0.524604,0.533816,0.541542,0.54164,0.535337,0.007005,3,0.531774,0.534657,0.534764,0.540212,0.535352,0.003052


In [45]:
# Parameter setting selected by the search.
rcv.best_params_

{'reduce_dim__n_components': 4}

In [47]:
# (rows, columns) of the design matrix that was used for fitting.
df_chunk.shape

(490041, 132)

In [16]:
# Reload the three raw tables and keep only the rows of the sampled devices.

# auctions
auctions_df = pd.read_csv(
    'data/auctions.csv', low_memory=False,
    dtype={'country':'category','platform':'category',
           'ref_type_id':'category','source_id':'category','device_id':'object'})
auctions_df['date'] = pd.to_datetime(auctions_df['date'])
auctions_sample = auctions_df.loc[auctions_df['device_id'].isin(auctions_ids_sample)].copy()

# installs
installs_df = pd.read_csv(
    'data/installs.csv', low_memory=False,
    dtype={'ref_type':'category','application_id':'category',
           'device_brand':'category','ref_hash':'object','wifi':'category'})
# normalise the install kind to lower case before making it categorical
installs_df['kind'] = installs_df['kind'].str.lower().astype('category')
installs_df.drop(columns=['session_user_agent','ip_address','device_language','device_model',
                          'device_countrycode'], inplace=True)
installs_df['created'] = pd.to_datetime(installs_df['created'])
installs_sample = installs_df.loc[installs_df['ref_hash'].isin(auctions_ids_sample)].copy()

# events
events_df = pd.read_csv(
    'data/events.csv', low_memory=False,
    dtype={'event_id':'int32','ref_type':'category','application_id':'category',
           'attributed':'bool','device_countrycode':'category','device_city':'category',
           'trans_id':'category','carrier':'category','device_os':'category',
           'connection_type':'category'})
events_df['date'] = pd.to_datetime(events_df['date'])
# astype() has no `inplace` option; the converted column must be assigned back.
# NOTE(review): missing values become True under astype('bool') - fill them
# first if that is not the intended meaning.
events_df['wifi'] = events_df['wifi'].astype('bool')
events_df.drop(columns=['device_countrycode','session_user_agent','ip_address','device_language'], inplace=True)
events_sample = events_df.loc[events_df['ref_hash'].isin(auctions_ids_sample)].copy()

# free the full tables; only the *_sample frames are used from here on
del events_df
del auctions_df
del installs_df
gc.collect()

# Time (in seconds) from each auction to the same device's next auction.
auctions_sample.drop_duplicates(inplace=True)
# Sort by device then time so that shift(-1) / shift() give the neighbouring
# auction of the same device (or a row of another device at a boundary).
auctions_sample = auctions_sample.sort_values(by=['device_id','date'])
auctions_sample['date_dif'] = auctions_sample['date'].shift(periods=-1) - auctions_sample['date']
auctions_sample['device_id_next'] = auctions_sample['device_id'].astype('object').shift(periods=-1)
# If the next row is the same device, keep the observed gap. Otherwise use the
# time left until 2019-04-27 00:00 when date + pd.DateOffset(3) passes that
# bound, else a flat 3 days.
auctions_sample['date_dif'] = pd.to_timedelta(np.where(auctions_sample['device_id_next']==auctions_sample['device_id'], auctions_sample['date_dif'], np.where(auctions_sample['date']+pd.DateOffset(3)>'2019-04-27 00:00:00', datetime.datetime(2019,4,27)-auctions_sample['date'], pd.to_timedelta(3, unit='d'))))
auctions_sample['in_seconds'] = auctions_sample['date_dif'].dt.total_seconds()
# True whenever the next row belongs to the same device.
# NOTE(review): the In [221] cell additionally requires in_seconds < 259200.0
# for this flag - confirm which definition the saved model should be scored
# against.
auctions_sample['status_censored'] = auctions_sample['device_id_next']==auctions_sample['device_id']
auctions_sample.drop(['device_id_next','date_dif'], axis='columns', inplace=True)
auctions_sample['device_id'] = auctions_sample['device_id'].astype('object')
# Time (in seconds) since the same device's previous auction ("last_seen").
auctions_sample['date_prev'] = auctions_sample['date'].shift()
auctions_sample['date_dif_prev'] = auctions_sample['date']- auctions_sample['date_prev']
auctions_sample['device_id_prev'] = auctions_sample['device_id'].astype('object').shift()
# Mirror of the forward rule: observed gap when the previous row is the same
# device; otherwise the time since 2019-04-18 00:00 when date - pd.DateOffset(3)
# falls before that bound, else a flat 3 days.
auctions_sample['date_dif_prev'] = pd.to_timedelta(np.where(auctions_sample['device_id_prev']==auctions_sample['device_id'], auctions_sample['date_dif_prev'], np.where(auctions_sample['date']-pd.DateOffset(3)<'2019-04-18 00:00:00', auctions_sample['date']-datetime.datetime(2019,4,18), pd.to_timedelta(3, unit='d'))))
auctions_sample['last_seen'] = auctions_sample['date_dif_prev'].dt.total_seconds()
auctions_sample.drop(['device_id_prev','date_dif_prev','date_prev'], axis='columns', inplace=True)
# Back to chronological order for the rest of the cell.
auctions_sample = auctions_sample.sort_values(by=['date'])

# Per-auction counts of the device's earlier installs/events per application.
auct_cols = auctions_sample.columns.tolist()

#information about last installs
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
applications = pd.concat(
    [installs_sample[['ref_hash','created','application_id']],
     events_sample[['ref_hash', 'date', 'application_id']].rename(columns = {'date':'created'})],
    ignore_index=True)
data_1 = pd.merge(auctions_sample, applications, left_on='device_id', right_on='ref_hash', how='left')
#only previous installs on the window (auctions without any match are kept too)
data_1 = data_1.loc[(data_1['date']>data_1['created']) | data_1['created'].isnull()]

# application_id feature by id
app_id_1 = data_1[['application_id']].copy()
app_id_1 = pd.get_dummies(app_id_1, dummy_na=True, prefix_sep='=')
data_1.drop(columns=['application_id'], inplace=True)
data_1 = pd.merge(data_1, app_id_1, left_index=True, right_index=True, how='inner')

app_id_1_columns = app_id_1.columns.tolist()

# one row per auction: sum the one-hot indicators over the matched rows
group_1 = data_1.groupby(auct_cols).agg({col:'sum' for col in app_id_1_columns})
group_1.reset_index(inplace=True)
auctions_sample = pd.merge(auctions_sample, group_1, on=['date','device_id','ref_type_id','source_id','in_seconds','status_censored', 'last_seen'], how='left')
auctions_sample = auctions_sample.astype({col:'float32' for col in app_id_1_columns})

# auctions that lost all their rows in the "previous only" filter: mark them
# as "no application", then zero every other count
auctions_sample.fillna(value={'application_id=nan':1}, inplace=True)
auctions_sample.fillna(value={col:0 for col in app_id_1_columns}, inplace=True)
auctions_sample = auctions_sample.astype({col:'int32' for col in app_id_1_columns})
auctions_sample.reset_index(inplace=True, drop=True)

# keep only the auctions after 2019-04-24 00:00 for scoring
auctions_sample = auctions_sample.loc[auctions_sample['date']>'2019-04-24 00:00:00']

#get X and y data
# structured target (event flag, time); the builtin `bool` replaces the
# `np.bool` alias that NumPy 1.24 removed
data_y = np.fromiter(zip(auctions_sample["status_censored"], auctions_sample["in_seconds"]), dtype=[('status_censored', bool), ('in_seconds', np.float64)])
auctions_sample = auctions_sample.drop(columns=['in_seconds','status_censored'])



In [22]:
# (rows, columns) of the scoring frame.
auctions_sample.shape

(159651, 317)

In [15]:
# Load the pipeline saved by the training cell.
# NOTE: joblib.load unpickles the file and can execute arbitrary code - only
# load model files you created yourself.
pipe = joblib.load('model.sav')

In [23]:
# Apply the feature transformer, then score the loaded pipeline against data_y.
# NOTE(review): this overwrites auctions_sample, so re-running the cell
# transforms an already transformed frame.
auctions_sample = preprocess().transform(auctions_sample)
pipe.score(auctions_sample, data_y)

0.5441012011351973

In [49]:
# Same scoring step as the previous cell, recorded from a later execution
# (In [49] versus In [23]).
auctions_sample = preprocess().transform(auctions_sample)
pipe.score(auctions_sample, data_y)

0.5457859696275422

In [14]:
# Audible signal that the preceding cells have finished.
allDone()

In [None]:
# random search (draft cell, never executed: In [None])
# NOTE(review): data_x_1 and data_y_1 are not defined in the cells visible
# here - confirm where they come from before running this.
data_x_11 = data_x_1
# data_x_11 is only another name for data_x_1, so this also changes data_x_1;
# without drop=True the old index is added as a regular column.
# NOTE(review): confirm that the index is meant to become a feature.
data_x_11.reset_index(inplace=True)
#param_grid = {'select__k': np.arange(1, data_x_11.shape[1] + 1)}
param_grid = {'reduce_dim__n_components': np.arange(3, data_x_11.shape[1] + 1)}
# NOTE(review): cv=1 - scikit-learn's k-fold splitters need at least 2 folds; confirm.
rcv = RandomizedSearchCV(pipe, param_grid, return_train_score=True, cv=1, iid=True, n_iter=10, n_jobs=2)

gc.collect() #release cache

rcv.fit(data_x_11, data_y_1)

# Delete the temporary cache before exiting
rmtree(cachedir)

gc.collect() #release cache

# 20 best settings by mean test score
pd.DataFrame(rcv.cv_results_).sort_values(by='mean_test_score', ascending=False).head(20)

In [None]:
# Search over the number of features kept by SelectKBest for the
# SelectKBest -> Cox pipeline, then refit the pipeline and persist it.
df_chunk = auctions_sample
#remove empty columns
print('start')
df_chunk = remove_unused_cols(df_chunk)
print('unused cols')
# the folds are built from the 'date' column, so this has to run before
# preprocess drops that column
custom_cv = custom_cv_folds(df_chunk)
print('custom_cv')
df_chunk = preprocess().transform(df_chunk)

print('transform whole data')

cachedir = mkdtemp()
try:
    memory = Memory(location=cachedir, verbose=10)
    print('memory cached', cachedir)
    #pipeline
    pipe = Pipeline([('preprocess', preprocess()),
                     ('select', SelectKBest(fit_and_score_features, k=50)),
                     #('reduce_dim', NMF(max_iter=100)),
                     ('model', CoxPHSurvivalAnalysis(alpha=0.01))], memory=memory)
    print('pipeline')
    #hyperparameters search
    N_FEATURES_OPTIONS = [3, 4, 5] #np.arange(3, df_chunk.shape[1] + 1)
    param_grid = {
        'select__k': N_FEATURES_OPTIONS
        #'reduce_dim__n_components': N_FEATURES_OPTIONS
    }
    # never ask for more iterations than there are candidate settings
    n_iter = min(10, len(N_FEATURES_OPTIONS))
    rcv = RandomizedSearchCV(pipe, param_grid, return_train_score=True, cv=custom_cv, iid=True, n_iter=n_iter)#, n_jobs=2)
    print('Grinf config')
    #get X and y data
    # structured target (event flag, time); the builtin `bool` replaces the
    # `np.bool` alias that NumPy 1.24 removed
    data_y = np.fromiter(zip(df_chunk["status_censored"], df_chunk["in_seconds"]), dtype=[('status_censored', bool), ('in_seconds', np.float64)])
    df_chunk = df_chunk.drop(columns=['in_seconds','status_censored'])
    print('X, y data')
    #gc.collect() #release cache
    #start searching hyperparameters
    rcv.fit(df_chunk, data_y)
    print('Grid Fit')

    #with hyperparameters fit pipeline
    pipe.set_params(**rcv.best_params_)
    print('pipe params')
    pipe.fit(df_chunk, data_y)
    joblib.dump(pipe, 'model.sav')
    print('pipe fit')
finally:
    # always remove the temporary joblib cache, even if the search fails or
    # is interrupted
    rmtree(cachedir)
allDone()



start
unused cols
custom_cv
transform whole data
memory cached /var/folders/4j/xd9tnb7n42z3jsjhy_wqwcyc0000gn/T/tmpsav7kwec
pipeline
Grinf config
X, y data




________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(preprocess(),             last_seen  application_id=101  application_id=116  \
0            3.049129                   0                   0   
1            4.198257                   0                   0   
2            0.377013                   0                   0   
3            4.713358                   0                   0   
4            0.531197                   0                   0   
5            5.467602                   0                   0   
6            6.280256                   0                   0   
7            9.017695                   0                   0   
8           12.009020                   0                   0   
9           13.363530                   0         ..., 
array([( True, 65.363585), ..., ( True,  1.085793)],
      dtype=[('status_censored', '?'), ('in_seconds', '<f8')]), 
None)


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 2.6s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(SelectKBest(k=3, score_func=<function fit_and_score_features at 0x1cf9e427b8>),             last_seen  application_id=101  application_id=116  \
108080      22.347455                   0                   0   
108081      49.599996                   0                   0   
108082    4884.979737                   0                   0   
108083       1.507216                   0                   0   
108084      33.051305                   0                   0   
108085       5.247523                   0                   0   
108086     927.440767                   0                   0   
108087      13.010950                   0                   0   
108088       3.384204                   0                   0   
108089     233.463594            

In [23]:
# Manual cleanup of the joblib cache directory (e.g. after interrupting the
# search). ignore_errors=True makes this safe to re-run when the directory
# has already been removed.
rmtree(cachedir, ignore_errors=True)

In [27]:
# Search results, best mean test score first.
# NOTE(review): the recorded output below lists reduce_dim__n_components, so it
# appears to come from an earlier `rcv` - confirm before relying on it.
pd.DataFrame(rcv.cv_results_).sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reduce_dim__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
1,47.973351,1.196737,275.128404,31.512102,4,{'reduce_dim__n_components': 4},0.537848,0.538023,0.545068,0.541817,0.540686,0.003003,1,0.537016,0.536272,0.541994,0.540719,0.539,0.002413
0,48.412022,1.192643,254.328632,18.308084,3,{'reduce_dim__n_components': 3},0.538079,0.537701,0.546252,0.540555,0.540661,0.003443,2,0.538021,0.536572,0.542058,0.541048,0.539425,0.002218
2,47.820269,0.65989,246.968961,14.303463,5,{'reduce_dim__n_components': 5},0.524604,0.533816,0.541542,0.54164,0.535337,0.007005,3,0.531774,0.534657,0.534764,0.540212,0.535352,0.003052
