In [1]:
import pandas as pd
import numpy as np
import datetime
import gc
from sksurv.preprocessing import OneHotEncoder
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import NMF, PCA
from sklearn.externals import joblib
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

In [2]:
## Import the dependencies for the sound alert
from IPython.display import Audio, display

def allDone():
    """Play an audible notification in the notebook to signal that a long cell finished."""
    # The clip is fetched from this URL and starts playing as soon as it is displayed.
    alert = Audio(
        url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav',
        autoplay=True,
    )
    display(alert)
## Insert whatever audio file you want above

In [3]:
def fix_str_float(ds, col):
    """Clean a string column in place and convert it to float32.

    Every character that is not a digit or a dot is stripped, values that end
    up empty become NaN, and the column is cast to ``float32``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame holding the column; ``ds[col]`` is overwritten with the result.
    col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.Series
        The converted ``float32`` column (same data as ``ds[col]``).

    Raises
    ------
    ValueError
        If a cleaned value is still not a valid float (e.g. ``'1.2.3'``).
    """
    # regex=True must be explicit: newer pandas treats the pattern as a literal
    # string by default, which would leave the values uncleaned.
    cleaned = ds[col].str.replace(r'[^0-9\.]', '', regex=True)
    # Values made only of stripped characters are now '', which cannot be cast.
    values = np.where(cleaned == '', np.nan, cleaned)
    ds[col] = pd.Series(values, index=ds.index, dtype='object').astype('float32')
    return ds[col]

In [4]:
def fit_and_score_features(X, y, alpha=0.1):
    """Score every column of ``X`` with its own single-feature Cox model.

    For each column a ``CoxPHSurvivalAnalysis(alpha=alpha)`` model is fitted on
    that column alone and evaluated on the same data with ``score``.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, a:b]`` slicing
        Feature matrix, one feature per column.
    y : structured array
        Survival target passed unchanged to ``fit`` and ``score``.
    alpha : float, default 0.1
        Penalty forwarded to ``CoxPHSurvivalAnalysis``.

    Returns
    -------
    numpy.ndarray
        One score per column of ``X``, in column order.
    """
    column_count = X.shape[1]
    model = CoxPHSurvivalAnalysis(alpha=alpha)
    per_feature = np.empty(column_count)
    for idx in range(column_count):
        # Slice (rather than index) so the single feature stays 2-D.
        single = X[:, idx:idx + 1]
        model.fit(single, y)
        per_feature[idx] = model.score(single, y)
    return per_feature


In [5]:
def remove_unused_cols(X):
    """Drop numeric columns that carry no information (all zero and/or missing).

    A column is considered empty when every value is 0 or NaN. Checking the
    values themselves (instead of ``sum() == 0``) keeps columns whose non-zero
    entries happen to cancel out, e.g. ``[1, -1]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``X`` itself when nothing needs dropping, otherwise a new frame without
        the empty numeric columns. Non-numeric columns are always kept.
    """
    numeric = X.select_dtypes(include='number')
    # Missing values count as "nothing there", matching how sum() skips NaN.
    is_empty = numeric.fillna(0).eq(0).all(axis=0)
    to_drop = is_empty[is_empty].index.tolist()
    if to_drop:
        X = X.drop(columns=to_drop)
    return X

In [6]:
class preprocess(BaseEstimator, TransformerMixin):
    """Stateless feature-engineering step usable inside a scikit-learn Pipeline.

    ``transform`` turns identifier-like columns into 0/1 presence flags, derives
    platform flags from ``user_agent``, expands timestamp columns into calendar
    parts, drops the raw columns and one-hot encodes what is left.
    """

    # Columns replaced by a 0/1 flag telling whether a value is present.
    _PRESENCE_COLUMNS = ('event_uuid', 'click_hash', 'trans_id')
    # Raw columns removed once the derived features have been extracted.
    _DROP_COLUMNS = ('date', 'created', 'install_diff', 'device_brand',
                     'install_seconds', 'user_agent', 'device_id')

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. It is copied first, so the caller's frame is left
            untouched.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The transformed frame (a DataFrame, not a numpy array).
        """
        # Work on a copy: the assignments below would otherwise leak into the
        # frame owned by the caller.
        X = X.copy()

        # boolean transformations: 1 when a value is present, 0 when it is null
        for col in self._PRESENCE_COLUMNS:
            if col in X.columns:
                X[col] = np.where(X[col].isnull(), 0, 1)

        # Platform flags are derived from user_agent, so that is the column to
        # check for. na=False makes a missing user agent count as "no match".
        if 'user_agent' in X.columns:
            agent = X['user_agent'].str
            X['Android'] = np.where(agent.contains('Android', regex=False, na=False), 1, 0)
            is_ios = (agent.contains('Darwin', regex=False, na=False)
                      | agent.contains('iOS', regex=False, na=False))
            X['iOS'] = np.where(is_ios, 1, 0)

        # date transformations
        if 'created' in X.columns:
            X['created_weekday'] = X['created'].dt.weekday
            X['created_hour'] = X['created'].dt.hour
            X['created_minute'] = X['created'].dt.minute
        if 'date' in X.columns:
            X['date_weekday'] = X['date'].dt.weekday
            X['date_hour'] = X['date'].dt.hour
            X['date_minute'] = X['date'].dt.minute
            X['date_second'] = X['date'].dt.second

        # remove the raw columns that are present in this frame
        X = X.drop(columns=[col for col in self._DROP_COLUMNS if col in X.columns])

        # NOTE(review): get_dummies derives the output columns from the data it
        # is given, so plain object columns may produce different column sets
        # on different subsets — confirm inputs are categorical or pre-encoded.
        X = pd.get_dummies(X, dummy_na=True, prefix_sep='=')
        return X

    

In [7]:
# Default rolling windows as (train_start, train_end, test_start, test_end).
# Starts are inclusive, ends are exclusive; a train_start of None leaves the
# training window open on the left.
_DEFAULT_CV_WINDOWS = (
    (None,                  '2019-04-21 00:00:00', '2019-04-21 00:00:00', '2019-04-24 00:00:00'),
    ('2019-04-19 00:00:00', '2019-04-22 00:00:00', '2019-04-22 00:00:00', '2019-04-25 00:00:00'),
    ('2019-04-20 00:00:00', '2019-04-23 00:00:00', '2019-04-23 00:00:00', '2019-04-26 00:00:00'),
    ('2019-04-21 00:00:00', '2019-04-24 00:00:00', '2019-04-24 00:00:00', '2019-04-27 00:00:00'),
)


def _positions_in_window(dates, start, end):
    """Return the row positions whose date falls in ``[start, end)``."""
    mask = dates < end
    if start is not None:
        mask = mask & (dates >= start)
    return np.flatnonzero(mask.to_numpy())


def custom_cv_folds(X, date_col='date', windows=None):
    """Build time-based (train, test) splits for scikit-learn's ``cv`` argument.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a datetime column named ``date_col``.
    date_col : str, default 'date'
        Column used to assign rows to windows.
    windows : iterable of 4-tuples, optional
        ``(train_start, train_end, test_start, test_end)`` bounds; defaults to
        the four rolling windows over 2019-04-18 .. 2019-04-27.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        One ``(train_positions, test_positions)`` pair per window. The arrays
        hold row positions, not index labels, so the splits stay valid even
        when ``X`` does not have a default 0..n-1 index.

    Raises
    ------
    KeyError
        If ``date_col`` is not a column of ``X``.
    """
    if windows is None:
        windows = _DEFAULT_CV_WINDOWS
    dates = X[date_col]
    return [
        (_positions_in_window(dates, train_start, train_end),
         _positions_in_window(dates, test_start, test_end))
        for train_start, train_end, test_start, test_end in windows
    ]


In [8]:
import warnings

# The warning class has moved between pandas releases: try the public
# location first, then the legacy one, and skip the filter if neither exists.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [9]:
# Load the sampled device ids; they are read as strings ('object') and only
# the 'device_id' column is kept.
auctions_ids_sample = pd.read_csv(
    'data/auctions_ids_sample.csv',
    dtype={'device_id': 'object'},
)['device_id']

In [10]:
# --- auctions: load, parse timestamps, keep only the sampled devices ---
auctions_df = pd.read_csv(
    'data/auctions.csv',
    low_memory=False,
    dtype={
        'country': 'category',
        'platform': 'category',
        'ref_type_id': 'category',
        'source_id': 'category',
        'device_id': 'object',
    },
)
auctions_df['date'] = pd.to_datetime(auctions_df['date'])
auctions_sample = auctions_df.loc[auctions_df['device_id'].isin(auctions_ids_sample)].copy()

# --- installs: load, normalise 'kind', drop unused columns, keep sampled devices ---
installs_df = pd.read_csv(
    'data/installs.csv',
    low_memory=False,
    dtype={
        'ref_type': 'category',
        'application_id': 'category',
        'device_brand': 'category',
        'ref_hash': 'object',
        'wifi': 'category',
    },
)
# Lower-case first so values differing only by case share one category.
installs_df['kind'] = installs_df['kind'].str.lower().astype('category')
installs_df = installs_df.drop(columns=[
    'session_user_agent', 'ip_address', 'device_language', 'device_model',
    'device_countrycode',
])
installs_df['created'] = pd.to_datetime(installs_df['created'])
installs_sample = installs_df.loc[installs_df['ref_hash'].isin(auctions_ids_sample)].copy()

# The full frames are no longer needed once the samples are extracted.
del auctions_df, installs_df
gc.collect()
allDone()
print('setup done')

setup done


In [11]:
# Build the survival target (time until the device's next auction) and the
# 'last_seen' feature (time since the device's previous auction).
# Rows are sorted by device and date so that shift() lines each auction up
# with the neighbouring auction of the same device.
auctions_sample.drop_duplicates(inplace=True)
auctions_sample = auctions_sample.sort_values(by=['device_id','date'])
# Gap to the following row, and the device that following row belongs to.
auctions_sample['date_dif'] = auctions_sample['date'].shift(periods=-1) - auctions_sample['date']
auctions_sample['device_id_next'] = auctions_sample['device_id'].astype('object').shift(periods=-1)
# Keep the gap only when the next row is the same device. Otherwise use the
# time left until 2019-04-27 when date + DateOffset(3) passes that bound, else
# a flat 3 days (presumably DateOffset(3) means three days — TODO confirm).
auctions_sample['date_dif'] = pd.to_timedelta(np.where(auctions_sample['device_id_next']==auctions_sample['device_id'], auctions_sample['date_dif'], np.where(auctions_sample['date']+pd.DateOffset(3)>'2019-04-27 00:00:00', datetime.datetime(2019,4,27)-auctions_sample['date'], pd.to_timedelta(3, unit='d'))))
auctions_sample['in_seconds'] = auctions_sample['date_dif'].dt.total_seconds()
# NOTE: despite its name, this flag is True when a following auction of the
# same device exists (the next row is the same device), and False otherwise.
auctions_sample['status_censored'] = auctions_sample['device_id_next']==auctions_sample['device_id']
auctions_sample.drop(['device_id_next','date_dif'], axis='columns', inplace=True)
auctions_sample['device_id'] = auctions_sample['device_id'].astype('object')
# Same construction looking backwards: gap from the previous row.
auctions_sample['date_prev'] = auctions_sample['date'].shift()
auctions_sample['date_dif_prev'] = auctions_sample['date']- auctions_sample['date_prev']
auctions_sample['device_id_prev'] = auctions_sample['device_id'].astype('object').shift()
# Keep the gap only when the previous row is the same device. Otherwise use the
# time elapsed since 2019-04-18 when date - DateOffset(3) falls before that
# bound, else a flat 3 days.
auctions_sample['date_dif_prev'] = pd.to_timedelta(np.where(auctions_sample['device_id_prev']==auctions_sample['device_id'], auctions_sample['date_dif_prev'], np.where(auctions_sample['date']-pd.DateOffset(3)<'2019-04-18 00:00:00', auctions_sample['date']-datetime.datetime(2019,4,18), pd.to_timedelta(3, unit='d'))))
auctions_sample['last_seen'] = auctions_sample['date_dif_prev'].dt.total_seconds()
auctions_sample.drop(['device_id_prev','date_dif_prev','date_prev'], axis='columns', inplace=True)
# Restore chronological order now that the per-device shifts are done.
auctions_sample = auctions_sample.sort_values(by=['date'])



In [12]:
# Snapshot of the auction columns, taken before install features are merged in;
# these names identify one auction row in the aggregation that follows.
auct_cols = list(auctions_sample.columns)

In [13]:
# Attach to every auction one count column per application installed by the
# same device before that auction.

# information about last installs
data_1 = pd.merge(auctions_sample, installs_sample, left_on='device_id', right_on='ref_hash', how='left')
# keep only installs created before the auction, plus auctions with no install
data_1 = data_1.loc[(data_1['date'] > data_1['created']) | data_1['created'].isnull()]

# application_id feature by id: one-hot encode, with an explicit NaN column
app_id_1 = pd.get_dummies(data_1[['application_id']], dummy_na=True, prefix_sep='=')
# Reassign instead of dropping in place: data_1 is a .loc selection, and an
# in-place drop on it triggers SettingWithCopyWarning.
data_1 = data_1.drop(columns=['application_id'])
data_1 = pd.merge(data_1, app_id_1, left_index=True, right_index=True, how='inner')

app_id_1_columns = app_id_1.columns.tolist()

# observed=True restricts the result to key combinations that actually occur;
# some keys are categorical, and unobserved combinations could never match a
# row in the left merge below anyway.
group_1 = data_1.groupby(auct_cols, observed=True).agg({col: 'sum' for col in app_id_1_columns})
group_1 = group_1.reset_index()
# Merge on the same key list used for grouping so the two cannot drift apart.
auctions_sample = pd.merge(auctions_sample, group_1, on=auct_cols, how='left')
auctions_sample = auctions_sample.astype({col: 'float32' for col in app_id_1_columns})

# Auctions with no matching group have no prior install: flag them in the NaN
# column and zero every other application count.
auctions_sample = auctions_sample.fillna(value={'application_id=nan': 1})
auctions_sample = auctions_sample.fillna(value={col: 0 for col in app_id_1_columns})
auctions_sample = auctions_sample.astype({col: 'int32' for col in app_id_1_columns})
auctions_sample = auctions_sample.reset_index(drop=True)

In [15]:
# Sanity check: (rows, columns) of auctions_sample.
auctions_sample.shape

(958704, 319)

In [14]:
# Sanity check: (rows, columns) of auctions_sample.
# NOTE(review): same expression as the cell before it, with an earlier
# execution count and a different recorded row count — keep only one.
auctions_sample.shape

(490041, 319)

In [None]:
# Fit the survival pipeline on the whole sampled frame: clean the columns,
# build the time-based CV folds, grid-search the NMF size, then fit the
# pipeline with the best parameters.
models = []
df_chunk = auctions_sample
#remove empty columns
print('start')
df_chunk = remove_unused_cols(df_chunk)
print('unused cols')
# Folds must be built here: preprocess() drops the 'date' column they rely on.
custom_cv = custom_cv_folds(df_chunk)
print('custom_cv')
df_chunk = preprocess().transform(df_chunk)

print('transform whole data')

cachedir = mkdtemp()
# try/finally guarantees the temporary cache is removed even if a fit fails.
try:
    memory = Memory(location=cachedir, verbose=10)
    print('memory cached', cachedir)
    #pipeline
    pipe = Pipeline([('preprocess', preprocess()),
                     #('select', SelectKBest(fit_and_score_features, k=50)),
                     ('reduce_dim', NMF(max_iter=100)),
                     ('model', CoxPHSurvivalAnalysis(alpha=0.01))], memory=memory)
    print('pipeline')
    #hyperparameters search
    N_FEATURES_OPTIONS = [3, 30, 50] #np.arange(3, df_chunk.shape[1] + 1)
    param_grid = {
        #'select__k': N_FEATURES_OPTIONS
        'reduce_dim__n_components': N_FEATURES_OPTIONS
    }
    rcv = GridSearchCV(pipe, param_grid, return_train_score=True, cv=custom_cv, iid=True)
    print('Grinf config')
    #get X and y data
    # Structured (event flag, time) array. The builtin bool replaces np.bool,
    # an alias that newer NumPy releases no longer provide.
    data_y = np.fromiter(zip(df_chunk["status_censored"], df_chunk["in_seconds"]), dtype=[('status_censored', bool), ('in_seconds', np.float64)])
    df_chunk = df_chunk.drop(columns=['in_seconds','status_censored'])
    print('X, y data')
    #gc.collect() #release cache
    #start searching hyperparameters
    rcv.fit(df_chunk, data_y)
    print('Grid Fit')

    #with hyperparameters fit pipeline
    pipe.set_params(**rcv.best_params_)
    print('pipe params')
    pipe.fit(df_chunk, data_y)
    print('pipe fit')
    models.append(pipe)
finally:
    rmtree(cachedir)

allDone()



start
unused cols
custom_cv
transform whole data
memory cached
pipeline
Grinf config
X, y data
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(preprocess(),             last_seen  application_id=101  application_id=116  \
0            3.049129                   0                   0   
1            4.198257                   0                   0   
2            0.377013                   0                   0   
3            4.713358                   0                   0   
4            0.531197                   0                   0   
5            5.467602                   0                   0   
6            6.280256                   0                   0   
7            9.017695                   0                   0   
8           12.009020                   0                   0   
9           13.363530                   0         ..., 
array([( True, 65.363585), ..

In [1]:
# NOTE(review): needs the cell that defines allDone to have run first; the
# recorded output is a NameError from a fresh kernel (execution count 1).
allDone()

NameError: name 'allDone' is not defined

In [1]:
# NOTE(review): needs df_chunk from the modelling cell; the recorded output is
# a NameError from a fresh kernel (execution count 1).
df_chunk.head()

NameError: name 'df_chunk' is not defined

In [31]:
# Preview the first rows of df_chunk.
df_chunk.head()

Unnamed: 0,date,device_id,ref_type_id,source_id,last_seen,application_id=1,application_id=10,application_id=100,application_id=101,application_id=102,...,application_id=90,application_id=91,application_id=93,application_id=94,application_id=95,application_id=96,application_id=97,application_id=98,application_id=99,application_id=nan
0,2019-04-18 00:00:00.015050,1826643666390887030,7,0,0.01505,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2019-04-18 00:00:00.029014,7037174172278258682,1,0,0.029014,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2019-04-18 00:00:00.057540,3392065368947589877,1,1,0.05754,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2019-04-18 00:00:00.126828,1228982273563226229,1,1,0.126828,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2019-04-18 00:00:00.132510,4123059034628125459,1,8,0.13251,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# random search over the number of NMF components
# NOTE(review): data_x_1 and data_y_1 are not defined in any visible cell —
# this cell cannot run after Restart & Run All as it stands.
# data_x_11 is only an alias, so the in-place reset_index below also changes
# data_x_1 and turns the old index into a regular column.
data_x_11 = data_x_1
data_x_11.reset_index(inplace=True)
#param_grid = {'select__k': np.arange(1, data_x_11.shape[1] + 1)}
param_grid = {'reduce_dim__n_components': np.arange(3, data_x_11.shape[1] + 1)}
# NOTE(review): an integer cv is presumably taken as a number of folds, which
# needs at least 2 — confirm the intended splitting scheme for cv=1.
rcv = RandomizedSearchCV(pipe, param_grid, return_train_score=True, cv=1, iid=True, n_iter=10, n_jobs=2)

gc.collect() #release cache

rcv.fit(data_x_11, data_y_1)

# Delete the temporary cache before exiting
# NOTE(review): the modelling cell already calls rmtree(cachedir); if that cell
# ran, the directory is gone by now — verify before relying on this call.
rmtree(cachedir)

gc.collect() #release cache

# Show the 20 best parameter settings by mean test score.
pd.DataFrame(rcv.cv_results_).sort_values(by='mean_test_score', ascending=False).head(20)