In [1]:
import pandas as pd
import numpy as np
import datetime
import gc
from sksurv.preprocessing import OneHotEncoder
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import NMF, PCA
from sklearn.externals import joblib
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

In [2]:
def fix_str_float(ds, col):
    """Convert a string column of noisy numbers to float32, in place.

    Every character that is not a digit or a dot is stripped, values that
    end up empty become NaN, and the cleaned column is written back to
    ``ds[col]``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame holding the column; ``ds[col]`` is overwritten.
    col : str
        Name of a string-valued column of ``ds``.

    Returns
    -------
    pandas.Series
        The converted column (dtype float32).

    Raises
    ------
    ValueError
        If a cleaned value still cannot be parsed as a float (e.g. '1.2.3').
    """
    # regex=True has to be explicit: without it, recent pandas releases treat
    # the pattern as a literal string and the junk characters are kept.
    cleaned = ds[col].str.replace(r'[^0-9\.]', '', regex=True)
    # Values made only of stripped characters are now '' -> treat as missing.
    cleaned = cleaned.where(cleaned != '', np.nan)
    # NOTE: '-' is stripped as well, so negative numbers lose their sign.
    ds[col] = cleaned.astype('float32')
    return ds[col]

In [3]:
def fit_and_score_features(X, y, alpha=0.1):
    """Score every feature with its own single-variable Cox model.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; it is sliced as ``X[:, j:j+1]``, so it must support
        NumPy-style column slicing.
    y : structured array
        Survival target passed unchanged to ``CoxPHSurvivalAnalysis.fit``.
    alpha : float, default 0.1
        Forwarded to ``CoxPHSurvivalAnalysis(alpha=...)``.

    Returns
    -------
    numpy.ndarray
        One entry per column of ``X``: the value returned by the model's
        ``score`` on the same data it was fitted on.
    """
    column_count = X.shape[1]
    scores = np.empty(column_count)
    # A single estimator instance is re-fitted for each column.
    cox = CoxPHSurvivalAnalysis(alpha=alpha)
    for idx in range(column_count):
        # Slice (not index) so the column stays two-dimensional.
        single_feature = X[:, idx:idx + 1]
        cox.fit(single_feature, y)
        scores[idx] = cox.score(single_feature, y)
    return scores


In [4]:
def remove_unused_cols(X):
    """Drop numeric columns that carry no information.

    A numeric column counts as empty when every value is zero or missing.
    Testing the values directly (rather than ``sum() == 0``) keeps columns
    whose positive and negative entries merely cancel out.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``X`` itself when nothing is dropped, otherwise a new frame without
        the empty numeric columns. Non-numeric columns are always kept.
    """
    numeric = X.select_dtypes(include='number')
    to_drop = [col for col in numeric.columns
               if numeric[col].fillna(0).eq(0).all()]
    if to_drop:
        X = X.drop(columns=to_drop)
    return X

In [5]:
class preprocess(BaseEstimator, TransformerMixin):
    """Stateless feature-engineering step for the survival pipeline.

    ``transform`` works on a copy of its input and, for each column that is
    present:

    * turns ``event_uuid``, ``click_hash`` and ``trans_id`` into 0/1
      "value present" flags;
    * derives ``Android`` / ``iOS`` 0/1 flags from ``user_agent``;
    * expands ``created`` and ``date`` into weekday/hour/minute(/second)
      columns;
    * drops the raw columns listed in ``_DROP_COLS``;
    * one-hot encodes what is left with ``pd.get_dummies`` (including a NaN
      indicator column, names joined with ``=``).

    NOTE(review): nothing is learned in ``fit``, so the dummy columns are
    derived from whatever frame is passed to ``transform``; confirm that
    train and test frames produce the same columns.
    """

    # Columns whose only signal is whether a value is present.
    _PRESENCE_COLS = ('event_uuid', 'click_hash', 'trans_id')
    # Raw columns removed once the derived features have been built.
    _DROP_COLS = ('date', 'created', 'install_diff', 'device_brand',
                  'install_seconds', 'user_agent', 'device_id')

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new, fully numeric-encoded DataFrame built from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw frame; left untouched.
        y : ignored

        Returns
        -------
        pandas.DataFrame
        """
        # Work on a copy so the caller's frame (often a slice handed over by
        # a cross-validation splitter) is never modified.
        X = X.copy()

        # boolean transformations
        for col in self._PRESENCE_COLS:
            if col in X.columns:
                X[col] = np.where(X[col].isnull(), 0, 1)

        # The platform flags are derived from user_agent, so that is the
        # column whose presence has to be checked.
        if 'user_agent' in X.columns:
            agent = X['user_agent']
            # na=False: a missing user agent matches neither platform.
            is_android = agent.str.contains('Android', regex=False, na=False)
            is_ios = (agent.str.contains('Darwin', regex=False, na=False)
                      | agent.str.contains('iOS', regex=False, na=False))
            X['Android'] = np.where(is_android, 1, 0)
            X['iOS'] = np.where(is_ios, 1, 0)

        # date transformations
        if 'created' in X.columns:
            X['created_weekday'] = X['created'].dt.weekday
            X['created_hour'] = X['created'].dt.hour
            X['created_minute'] = X['created'].dt.minute
        if 'date' in X.columns:
            X['date_weekday'] = X['date'].dt.weekday
            X['date_hour'] = X['date'].dt.hour
            X['date_minute'] = X['date'].dt.minute
            X['date_second'] = X['date'].dt.second

        # remove unused columns
        to_drop = [col for col in self._DROP_COLS if col in X.columns]
        X = X.drop(columns=to_drop)

        # returns a DataFrame (not a numpy array)
        return pd.get_dummies(X, dummy_na=True, prefix_sep='=')

    

In [6]:
def custom_cv_folds(X):
    """Build two time-ordered (train, test) splits from ``X['date']``.

    The date range is cut into three equal-length periods. Fold 1 trains on
    the first period and tests on the second; fold 2 trains on the second
    and tests on the third. The fold sizes and the four boundary dates are
    printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a datetime column named ``date``.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        Positional row indices (0 .. len(X)-1) for each fold. Positions are
        returned instead of index labels so the result stays valid for
        frames whose index does not start at 0, such as later chunks of a
        chunked ``read_csv``.
    """
    dates = X['date']
    max_date = dates.max()
    min_date = dates.min()
    span = max_date - min_date
    one_third = min_date + (span / 3)
    two_third = min_date + (span * 2 / 3)

    # Boolean row masks for the three consecutive periods.
    first_period = (dates < one_third).values
    second_period = ((one_third <= dates) & (dates < two_third)).values
    third_period = (two_third <= dates).values

    myCViterator = []
    for train_mask, test_mask in ((first_period, second_period),
                                  (second_period, third_period)):
        trainIndices = np.flatnonzero(train_mask)
        testIndices = np.flatnonzero(test_mask)
        print('trainIndices.shape', trainIndices.shape)
        print('testIndices.shape', testIndices.shape)
        myCViterator.append((trainIndices, testIndices))

    print(min_date)
    print(one_third)
    print(two_third)
    print(max_date)
    return myCViterator


In [7]:
import warnings

# The import location of SettingWithCopyWarning depends on the pandas
# release: newer versions expose it in pandas.errors, older ones only in
# pandas.core.common. Try both so the cell does not die with ImportError.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # This pandas build has no such warning; nothing to silence.
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [None]:
# Read the file in chunks of 100,000 rows and fit one tuned pipeline per chunk.
models = []
dtypes = {'device_id':'object', 'ref_type_id':'category','source_id':'category','in_seconds':'float64', 'status_censored':'bool','last_seen':'float64'}
for df_chunk in pd.read_csv('data/auctions_1.csv', low_memory=False, dtype=dtypes, chunksize=100000):
    #datatypes
    print('datatypes')
    df_chunk['date'] = pd.to_datetime(df_chunk['date'])
    app_cols = [col for col in df_chunk.columns if col.startswith('application_id')]
    df_chunk = df_chunk.astype({col:'int32' for col in app_cols})
    #remove empty columns
    print('remove empty columns')
    df_chunk = remove_unused_cols(df_chunk)
    # NOTE(review): custom_cv is built (and its fold sizes printed) but the
    # search below still uses cv=3 - confirm which split is intended.
    custom_cv = custom_cv_folds(df_chunk)
    # Width of the preprocessed feature matrix, without the two target
    # columns. The probe runs on a copy and its result is discarded: the
    # pipeline does its own preprocessing, and feeding it an already
    # preprocessed frame would run the transformer twice (a second pass
    # turns the 0/1 presence flags into all ones).
    n_features = (preprocess().transform(df_chunk.copy())
                  .drop(columns=['in_seconds','status_censored'])
                  .shape[1])

    #pipeline
    print('pipeline')
    pipe = Pipeline([('preprocess', preprocess()),
                     #('select', SelectKBest(fit_and_score_features, k=50)),
                     ('reduce_dim', NMF()),
                     ('model', CoxPHSurvivalAnalysis(alpha=0.01))])
    #hyperparameters search
    print('hyperparameters search')
    # Upper bound is the real feature count (targets excluded).
    N_FEATURES_OPTIONS = np.arange(3, n_features + 1)
    param_grid = {
        #'select__k': N_FEATURES_OPTIONS
        'reduce_dim__n_components': N_FEATURES_OPTIONS
    }
    # iid is only accepted by scikit-learn < 0.24 (the version range this
    # notebook's sklearn.externals imports already require).
    rcv = GridSearchCV(pipe, param_grid, return_train_score=True, cv=3, iid=True)

    #get X and y data
    print('get X and y data')
    # Plain `bool` instead of np.bool: the alias was removed from NumPy.
    # NOTE(review): the first field is used as the event indicator by the
    # survival model - confirm 'status_censored' has the expected polarity.
    data_y = np.fromiter(zip(df_chunk["status_censored"], df_chunk["in_seconds"]), dtype=[('status_censored', bool), ('in_seconds', np.float64)])
    df_chunk = df_chunk.drop(columns=['in_seconds','status_censored'])
    #gc.collect() #release cache
    #start searching hyperparameters
    print('start searchin hyperparameters')
    rcv.fit(df_chunk, data_y)

    #with hyperparameters fit pipeline
    print('with hyperparameters fit pipeline')
    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the whole chunk; reuse it instead of fitting again.
    pipe = rcv.best_estimator_
    models.append(pipe)



datatypes
remove empty columns
trainIndices.shape (33281,)
testIndices.shape (34293,)
trainIndices.shape (34293,)
testIndices.shape (32426,)
2019-04-18 00:00:00.015050
2019-04-18 00:06:38.808932666
2019-04-18 00:13:17.602815333
2019-04-18 00:19:56.396698
pipeline
hyperparameters search
get X and y data
start searchin hyperparameters
with hyperparameters fit pipeline
datatypes
remove empty columns
trainIndices.shape (33696,)
testIndices.shape (32044,)
trainIndices.shape (32044,)
testIndices.shape (34260,)
2019-04-18 00:19:56.414567
2019-04-18 00:27:00.528663
2019-04-18 00:34:04.642759
2019-04-18 00:41:08.756855
pipeline
hyperparameters search
get X and y data
start searchin hyperparameters
with hyperparameters fit pipeline
datatypes
remove empty columns
trainIndices.shape (33154,)
testIndices.shape (33288,)
trainIndices.shape (33288,)
testIndices.shape (33558,)
2019-04-18 00:41:08.760438
2019-04-18 00:48:19.686582
2019-04-18 00:55:30.612726
2019-04-18 01:02:41.538870
pipeline
hyperparam

  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


with hyperparameters fit pipeline
datatypes
remove empty columns
trainIndices.shape (33538,)
testIndices.shape (33661,)
trainIndices.shape (33661,)
testIndices.shape (32801,)
2019-04-18 01:23:35.311001
2019-04-18 01:30:27.283348333
2019-04-18 01:37:19.255695666
2019-04-18 01:44:11.228043
pipeline
hyperparameters search
get X and y data
start searchin hyperparameters


  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, 

with hyperparameters fit pipeline
datatypes
remove empty columns
trainIndices.shape (33410,)
testIndices.shape (33490,)
trainIndices.shape (33490,)
testIndices.shape (33100,)
2019-04-18 01:44:11.230585
2019-04-18 01:50:51.362379333
2019-04-18 01:57:31.494173666
2019-04-18 02:04:11.625968
pipeline
hyperparameters search
get X and y data
start searchin hyperparameters


  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


with hyperparameters fit pipeline
datatypes
remove empty columns
trainIndices.shape (32675,)
testIndices.shape (33264,)
trainIndices.shape (33264,)
testIndices.shape (34061,)
2019-04-18 02:04:11.629277
2019-04-18 02:10:46.684640
2019-04-18 02:17:21.740003
2019-04-18 02:23:56.795366
pipeline
hyperparameters search
get X and y data
start searchin hyperparameters


  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


In [37]:
# Release the last chunk. pop() is used instead of `del` so that re-running
# the cell (when df_chunk is already gone) does not raise NameError.
globals().pop('df_chunk', None)
gc.collect() #release cache

860

In [1]:
# Display the list of fitted pipelines. `models` is only created by the
# training cells, so this raises NameError on a fresh kernel until one of
# them has been run.
models

NameError: name 'models' is not defined

In [None]:
# Read the first million rows in one go and fit a single tuned pipeline.
models = []
dtypes = {'device_id':'object', 'ref_type_id':'category','source_id':'category','in_seconds':'float64', 'status_censored':'bool','last_seen':'float64'}
df_chunk = pd.read_csv('data/auctions_1.csv', low_memory=False, dtype=dtypes, nrows=1000000)
#datatypes
df_chunk['date'] = pd.to_datetime(df_chunk['date'])
app_cols = [col for col in df_chunk.columns if col.startswith('application_id')]
df_chunk = df_chunk.astype({col:'int32' for col in app_cols})
#remove empty columns
df_chunk = remove_unused_cols(df_chunk)
# NOTE(review): custom_cv is built (and its fold sizes printed) but the
# search below still uses cv=3 - confirm which split is intended.
custom_cv = custom_cv_folds(df_chunk)
# Width of the preprocessed feature matrix, without the two target columns.
# The probe runs on a copy and its result is discarded: the pipeline does its
# own preprocessing, and feeding it an already preprocessed frame would run
# the transformer twice (a second pass turns the 0/1 presence flags into all
# ones).
n_features = (preprocess().transform(df_chunk.copy())
              .drop(columns=['in_seconds','status_censored'])
              .shape[1])

#pipeline
pipe = Pipeline([('preprocess', preprocess()),
                 #('select', SelectKBest(fit_and_score_features, k=50)),
                 ('reduce_dim', NMF()),
                 ('model', CoxPHSurvivalAnalysis(alpha=0.01))])
#hyperparameters search
# Upper bound is the real feature count (targets excluded).
N_FEATURES_OPTIONS = np.arange(3, n_features + 1)
param_grid = {
    #'select__k': N_FEATURES_OPTIONS
    'reduce_dim__n_components': N_FEATURES_OPTIONS
}
# iid is only accepted by scikit-learn < 0.24 (the version range this
# notebook's sklearn.externals imports already require).
rcv = GridSearchCV(pipe, param_grid, return_train_score=True, cv=3, iid=True)

#get X and y data
# Plain `bool` instead of np.bool: the alias was removed from NumPy.
# NOTE(review): the first field is used as the event indicator by the
# survival model - confirm 'status_censored' has the expected polarity.
data_y = np.fromiter(zip(df_chunk["status_censored"], df_chunk["in_seconds"]), dtype=[('status_censored', bool), ('in_seconds', np.float64)])
df_chunk = df_chunk.drop(columns=['in_seconds','status_censored'])
#gc.collect() #release cache
#start searching hyperparameters
rcv.fit(df_chunk, data_y)

#with hyperparameters fit pipeline
# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the full frame; reuse it instead of fitting again.
pipe = rcv.best_estimator_
models.append(pipe)



trainIndices.shape (320358,)
testIndices.shape (328242,)
trainIndices.shape (328242,)
testIndices.shape (351400,)
2019-04-18 00:00:00.015050
2019-04-18 01:06:58.139688333
2019-04-18 02:13:56.264326666
2019-04-18 03:20:54.388965


In [66]:
# Preview the first rows of df_chunk. In the captured output the only
# application_id dummy column is `application_id=nan`.
df_chunk.head()

Unnamed: 0,date,device_id,ref_type_id,source_id,last_seen,application_id=nan
0,2019-04-18 00:00:00.015050,1826643666390887030,7,0,0.01505,1
1,2019-04-18 00:00:00.029014,7037174172278258682,1,0,0.029014,1
2,2019-04-18 00:00:00.057540,3392065368947589877,1,1,0.05754,1
3,2019-04-18 00:00:00.126828,1228982273563226229,1,1,0.126828,1
4,2019-04-18 00:00:00.132510,4123059034628125459,1,8,0.13251,1


In [31]:
# Preview the first rows of df_chunk. In the captured output the frame has
# one `application_id=<value>` dummy column per value plus `application_id=nan`.
df_chunk.head()

Unnamed: 0,date,device_id,ref_type_id,source_id,last_seen,application_id=1,application_id=10,application_id=100,application_id=101,application_id=102,...,application_id=90,application_id=91,application_id=93,application_id=94,application_id=95,application_id=96,application_id=97,application_id=98,application_id=99,application_id=nan
0,2019-04-18 00:00:00.015050,1826643666390887030,7,0,0.01505,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2019-04-18 00:00:00.029014,7037174172278258682,1,0,0.029014,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2019-04-18 00:00:00.057540,3392065368947589877,1,1,0.05754,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2019-04-18 00:00:00.126828,1228982273563226229,1,1,0.126828,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2019-04-18 00:00:00.132510,4123059034628125459,1,8,0.13251,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# random search
# NOTE(review): data_x_1, data_y_1 and cachedir are not defined in any cell
# visible in this notebook - confirm they are created before this cell runs.

# reset_index(drop=True) returns a new frame: the source frame data_x_1 is
# left untouched, the old index is not injected as an extra feature column,
# and the cell can be re-run safely.
data_x_11 = data_x_1.reset_index(drop=True)
#param_grid = {'select__k': np.arange(1, data_x_11.shape[1] + 1)}
param_grid = {'reduce_dim__n_components': np.arange(3, data_x_11.shape[1] + 1)}
# cv must describe at least two folds (cv=1 is rejected by scikit-learn);
# cv=3 matches the grid searches above. random_state makes the sampled
# candidates reproducible.
rcv = RandomizedSearchCV(pipe, param_grid, return_train_score=True, cv=3, iid=True, n_iter=10, n_jobs=2, random_state=0)

gc.collect() #release cache

rcv.fit(data_x_11, data_y_1)

# Delete the temporary cache before exiting
rmtree(cachedir)

gc.collect() #release cache

pd.DataFrame(rcv.cv_results_).sort_values(by='mean_test_score', ascending=False).head(20)