# Introduction

## Imports

In [1]:
import pandas as pd

import numpy as np

import os

In [2]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Data info

In [3]:
for file in os.listdir('../data/'):
    print(file + '\t\t\t\t\t\t' + str(os.stat("../data/" + file).st_size/1000000))

auctions.csv						2412.11824
.ipynb_checkpoints						0.004096
target_competencia_ids.csv						0.200915
installs.csv						123.502317
desc.json						0.009146
Recomendaciones y aclaraciones.docx						0.006909
clicks.csv						16.147446
events.csv						2252.988966


# Useful functions

## General

In [4]:
# days to consider
all_days = [18,19,20,21,22,23,24,25,26]

secs_in_3_days = 3*24*60*60

In [5]:
def get_n_3_days(n):
    """
    get nth block of 3 consecutive days
    n can go from 1 to 7.
    If n == 8, then last two days are given.
    If n == 9, then last day is given.
    """
    n -= 1
    return all_days[n:n+3]

### Target related

In [6]:
# load target
def load_target():
    target = pd.read_csv('../data/target_competencia_ids.csv')

    # to avoid misunderstandings with data when predicting, and avoid accidentally predicting value zero
#     target.obj = np.nan
    
    return target

# para que quede cargado desde el principio
target = load_target()

# target ids related
def get_target_ids():
    """ get all target ids """
    return target['ref_hash'].apply(lambda x: x[:-3]).unique()

def get_target_ids_chunk(chunk_num):
    """ chunk num can go from 1 to 41 """
    chunk_size = 100
    start = (chunk_num - 1) * chunk_size
    stop = chunk_size * chunk_num
    return get_target_ids()[start:stop]

### Predictions related

In [7]:
# para guardar predicciones
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="mati", description = "no description"):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=False)
    _save_description(authors_name, timestamp, description)

In [8]:
def store_predictions(target_df, new_values, value_column_name, suffix):
    """
    adds predictions from value_column_name from new_values df
    to target_df merging by ref_hash and the given suffix 
    suffix: "_st" for auction prediction
            "_sc" for conversion prediction
    """
    new_values['ref_hash'] = new_values['ref_hash'] + suffix
    
    target_df = target_df.merge(new_values[['ref_hash',value_column_name]], how='left', on='ref_hash')
    
    target_df.fillna(0, inplace=True)
    
#     assign values to 'obj' column and remove the column added on merge.
#     after sum, fillna is needed because there are values which are left as NaNs.
    target_df['obj'] = target_df['obj'] + target_df[value_column_name]
    
    target_df.drop([value_column_name], axis='columns', inplace=True)
    return target_df

In [9]:
# play a sound
import os
def ring(duration = 1, freq = 1500):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

### ML related

In [10]:
def set_source_col(df, source):
    """
    for a given dataframe, create a column indicating
    from which csv file it originated
    """
    df['source_csv'] = source

    
def process_time_diffs(df):
    """
    create column indicating difference between two
    consecutive registers for each device id
    """
    asdf = df
    
    asdf = asdf.sort_values(['date'])
    
    asdf['diff'] = asdf.groupby(['ref_hash'])['date'].diff()

    asdf['diff'].fillna(value=asdf['date']-asdf['date'].dt.floor('d'), inplace=True)

    asdf['diff_in_sec'] = asdf['diff'].dt.total_seconds()

    asdf.drop('diff', axis='columns', inplace=True)
    
    return asdf


def process_time_diffs_vs_min_day(df):
    """
    create column indicating difference between
    time in registers and min day in df
    """
    asdf = df

    min_timestamp = asdf['date'].min().floor('d')

    asdf['diff'] = asdf['date'] - min_timestamp

    asdf['diff_in_sec'] = asdf['diff'].dt.total_seconds()

    asdf.drop('diff', axis='columns', inplace=True)
    
    return asdf

def is_source_that_defines_death(data, source_that_defines_death):
    return data == source_that_defines_death

def set_observed_column(df, csv_source_that_defines_death):
    """
    create column indicating if death has been observed or not.
    """
    asdf = df
    
    asdf['observed'] = asdf.source_csv.apply(lambda x: is_source_that_defines_death(x,csv_source_that_defines_death))
    
    return asdf


def fill_with_mode(x):
    """
    If there is any value present in group, fill nans with the mode of the group. 
    If there are all nans, leave them all as nans.
    """
    if x.isnull().all():
        return np.nan
    else:
        mode = x.mode()[0]
        return x.fillna(mode)

    
def fill_nans(df):
    """ Fill nan spaces with the mode of the group by ref_hash. """
    nans_filled = df
    nans_filled = nans_filled.sort_values(by=['ref_hash','date'])

    asdf = nans_filled.groupby('ref_hash', as_index=False, sort=False).transform(lambda x: x.ffill().bfill())

    asdf['ref_hash'] = nans_filled['ref_hash']
    
    return asdf

#     nans_filled = df.groupby('ref_hash', as_index=False, sort=False).transform(lambda x: x.ffill().bfill())
    
#     for col in nans_filled.columns:
#         print("filling column " + col)
# #         col_mode = nans_filled[col].mode()[0]
#         nans_filled[col].fillna(col_mode, inplace=True)
    
#     nans_filled['ref_hash'] = df['ref_hash']
#     return nans_filled

# def fill_all_nans_but_diff_in_sec(df):
#     nans_filled = fill_all_nans(df.drop(['diff_in_sec'],axis='columns'))
#     nans_filled['diff_in_sec'] = df['diff_in_sec']
#     return nans_filled

# def object_to_categorical(df):
#     """
#     Transform all 'object' dtypes to 'category'.
#     The following answers helped address the issue:
#         - https://stackoverflow.com/a/46762926
#         - https://stackoverflow.com/a/39092877
#     """
#     asdf = df
    
#     for col in asdf.columns:
#         if asdf[col].dtype.kind == 'O':
#             print(col)
#             asdf[col] = asdf[col].astype('category')
        
#     return asdf

def col_to_bool(df, cols):
    for col in cols:
        df[col] = df[col].astype('bool')
    return df

def get_time_to_event_of_interest(df, source_of_interest):
    asdf = df
    
    asdf = asdf.sort_values(['ref_hash','date'])

    asdf['timestamp_of_next_occurrence'] = asdf['date']
    asdf.loc[asdf['source_csv'] != source_of_interest, 'timestamp_of_next_occurrence'] = np.nan

    # asdf = nans_filled.groupby('ref_hash', as_index=False, sort=False).transform(lambda x: x.ffill().bfill())

    asdf['timestamp_of_next_occurrence'] = asdf['timestamp_of_next_occurrence'].bfill().ffill()

    asdf['diff_in_sec'] = asdf['date']-asdf['timestamp_of_next_occurrence']
    
    asdf['diff_in_sec'] = asdf['diff_in_sec'].dt.total_seconds()

    asdf['diff_in_sec'] = asdf['diff_in_sec'].apply(lambda x: x if x <= secs_in_3_days else secs_in_3_days)
    asdf['diff_in_sec'] = asdf['diff_in_sec'].apply(lambda x: x if x > 0 else secs_in_3_days)
    
    asdf = asdf.drop('timestamp_of_next_occurrence', axis='columns')
    
    return asdf

## Dfs loading functions

Luego de appendear los 4 dfs y rellenar los nans con la moda por cada grupo, se ha observado, con una muestra de device ids (ref_hashes), la siguiente proporcion de nans:

> source_id has 0.0% of nans.<br>
date has 0.0% of nans.<br>
latitude has 68.65% of nans.<br>
longitude has 68.65% of nans.<br>
wifi_connection has 68.65% of nans.<br>
carrier_id has 68.65% of nans.<br>
os_minor has 68.65% of nans.<br>
os_major has 68.65% of nans.<br>
specs_brand has 68.65% of nans.<br>
timeToClick has 68.65% of nans.<br>
touchX has 68.65% of nans.<br>
touchY has 68.65% of nans.<br>
ref_type has 68.65% of nans.<br>
diff_in_sec has 0.0% of nans.<br>
source_csv has 0.0% of nans.<br>
application_id has 0.0% of nans.<br>
attributed has 0.0% of nans.<br>
implicit has 0.0% of nans.<br>
device_brand has 34.18% of nans.<br>
device_model has 2.14% of nans.<br>
session_user_agent has 0.13% of nans.<br>
device_language has 3.31% of nans.<br>
ip_address has 16.25% of nans.<br>
ref_type_id has 0.0% of nans.<br>
ref_hash has 0.0% of nans.<br>

Se decide no trabajar con las columnas que tengan mas de 50% de nans.

Codigo ejecutado:
```python
for col in nans_filled.columns:
    total_rows = nans_filled.shape[0]
    print(str(col) + " has " + str(100*nans_filled[col].isna().sum()/total_rows) + "% of nans.")
```

### Clicks

In [11]:
clicks_cols = ['source_id','created','ref_hash',]

clicks_dtypes = {
#     'advertiser_id':'category',
#                  'action_id':'category',
                 'source_id':'category',
#                  'country_code':'category',
#                  'latitude':'float64',
#                  'longitude':'float64',
#                  'wifi_connection':'bool',
#                  'carrier_id':'category',
#                  'trans_id':'category',
#                  'os_minor':'category',
#                  'agent_device':'category',
#                  'os_major':'category',
#                  'specs_brand':'category',
#                  'brand':'category',
#                  'timeToClick':'float64',
#                  'touchX':'object',
#                  'touchY':'object',
#                  'ref_type':'category',
                 'ref_hash':'category'}

def load_clicks(users=get_target_ids(), days=all_days):
    """
    load clicks csv, only users and days specified in users and days lists.
    If lists left empty, consider whole set of users and days respectively.
    """
    
    df_clicks = pd.read_csv('../data/clicks.csv', engine='c', dtype=clicks_dtypes, parse_dates=['created'], usecols=clicks_cols)
    
    def load_condition(chunk):
        return chunk['ref_hash'].isin(users) & chunk['created'].dt.day.isin(days)
    
    df = df_clicks.loc[load_condition(df_clicks)].copy()
    
    df.rename(columns={'created':'date'}, inplace=True)
    
    df['date'] = df['date'].dt.tz_localize(None)
    
    return df

### Installs

In [12]:
installs_cols = ['created','application_id','ref_hash','attributed','implicit','device_brand','device_model','session_user_agent','device_language']

installs_dtypes = {"application_id":          "category",
#                    "ref_type":                "category",
                   "ref_hash":                "object",
#                    "click_hash":             "category",
                   "attributed":               "bool",
                   "implicit":                 "bool",
#                    "device_countrycode":      "category",
                   "device_brand":          "category",
                   "device_model":          "category",
                   "session_user_agent":     "category",
#                    "user_agent":             "category",
#                    "event_uuid":             "category",
#                    "kind":                   "category",
#                    "wifi":                   "category",
#                    "trans_id":               "category",
#                    "ip_address":              "category",
                   "device_language":       "category"}

def load_installs(users=get_target_ids(), days=all_days):
    """
    load installs csv, only users and days specified in users and days lists.
    If lists left empty, consider whole set of users and days respectively.
    """
    
    df_installs = pd.read_csv('../data/installs.csv', engine='c', dtype=installs_dtypes, parse_dates=['created'], usecols=installs_cols)
    
    def load_condition(df):
        return df['ref_hash'].isin(users) & df['created'].dt.day.isin(days)
    
    df = df_installs.loc[load_condition(df_installs)].copy()
    
    df.rename(columns={'created':'date'}, inplace=True)
    
    return df

### Events

In [13]:
events_cols = ['date','ref_hash','application_id','attributed','device_model','ip_address']

events_dtypes = {
#     "index":                   "category",
#                  "event_id":                "category",
#                  "ref_type":                "category",
                 "ref_hash":                "category",
                 "application_id":          "category",
                 "attributed":               "bool",
#                  "device_countrycode":      "category",
#                  "device_os_version":     "category",
#                  "device_brand":          "category",
                 "device_model":          "category",
#                  "device_city":           "category",
#                  "session_user_agent":    "category",
#                  "trans_id":               "category",
#                  "user_agent":            "category",
#                  "event_uuid":             "category",
#                  "carrier":               "category",
#                  "kind":                  "category",
#                  "device_os":             "category",
#                  "wifi":                     "bool",
#                  "connection_type":        "category",
                 "ip_address":              "category",
#                  "device_language":       "category"
}

def load_events(users=get_target_ids(), days=all_days):
    """
    load events csv, only users and days specified in users and days lists.
    If lists left empty, consider whole set of users and days respectively.
    """
    
    iter_events = pd.read_csv('../data/events.csv', engine='c', dtype=events_dtypes, parse_dates=['date'], chunksize=10000, usecols=events_cols)
    
    def load_condition(chunk):
        return chunk['ref_hash'].isin(users) & chunk['date'].dt.day.isin(days)
    
    df = pd.concat(chunk.loc[load_condition(chunk)] for chunk in iter_events)
    
    return df

### Auctions

In [14]:
auction_cols = ['date','device_id']

auctions_dtypes = {'device_id':'category',
#                  'ref_type_id':'category',
#                  'source_id':'category'
                  }

def load_auctions(users=get_target_ids(), days=all_days):
    """
    load auctions csv, only users and days specified in users and days lists.
    If lists left empty, consider whole set of users and days respectively.
    """
    iter_auctions = pd.read_csv('../data/auctions.csv', engine='c', dtype=auctions_dtypes, parse_dates=['date'], chunksize=10000, usecols=auction_cols)
    def load_condition(chunk):
        return chunk['device_id'].isin(users) & chunk['date'].dt.day.isin(days)
    df = pd.concat(chunk.loc[load_condition(chunk)] for chunk in iter_auctions)
    
    df.rename(columns={'device_id':'ref_hash'}, inplace=True)
    
    return df

# Recommended bibliography

#### Pseudo resumen de cosas utiles para aplicar
Para survival analysis se necesitan dos cosas:
- an array of durations
- either a boolean or binary array representing whether the “death” was observed or not (alternatively an individual can be censored).

# Un vistazo sobre los dfs

## Clicks

In [14]:
df = load_clicks(get_target_ids_chunk(1), all_days[:3])

In [15]:
df.head()

Unnamed: 0,source_id,date,ref_hash
11899,1,2019-04-20 04:23:39.214,1102680423242413676
11922,1,2019-04-20 04:30:28.785,1102680423242413676
25085,1,2019-04-18 16:55:31.227,1102680423242413676
61291,1,2019-04-20 19:53:18.984,1058525390691423513
63806,1,2019-04-19 12:46:43.763,1058525390691423513


## Installs

In [16]:
df = load_installs(get_target_ids_chunk(1), all_days[:3])

In [17]:
df.head()

Unnamed: 0,date,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,device_language
7016,2019-04-19 02:37:34.033,14,1128814228344083814,False,False,2.208834667126999e+18,4.445013666528814e+18,,3.3013777759777e+18
7022,2019-04-19 02:37:34.101,14,1128814228344083814,False,True,2.208834667126999e+18,4.445013666528814e+18,HasOffers Mobile AppTracking v1.0,3.3013777759777e+18
28381,2019-04-18 22:23:29.656,49,1048782984015604883,False,True,,6.794880020077885e+18,http-kit/2.0,8.441417429938962e+18
48676,2019-04-20 12:35:20.625,77,1054881396892383323,False,False,,1.3445980799392305e+18,adjust.com,4.060929664968129e+18
50673,2019-04-20 02:31:26.269,78,1010265377387765028,False,False,6.115025880051902e+18,1.670346184923358e+18,http-kit/2.0,6.977049253562486e+18


## Events

In [18]:
df = load_events(get_target_ids_chunk(1), all_days[:3])

In [19]:
df.head()

Unnamed: 0,date,ref_hash,application_id,attributed,device_model,ip_address
170519,2019-04-18 03:43:09.185,1106971792117053344,65,False,5.186986602616849e+18,7209709704711395089
170521,2019-04-18 03:43:25.436,1106971792117053344,65,False,5.186986602616849e+18,7209709704711395089
170525,2019-04-18 03:43:21.320,1106971792117053344,65,False,5.186986602616849e+18,7209709704711395089
170529,2019-04-18 03:43:08.996,1106971792117053344,65,False,5.186986602616849e+18,7209709704711395089
171835,2019-04-19 17:01:38.181,1117811498061299916,65,False,3.0574023248014715e+18,3382054713307838865


## Auctions

In [20]:
df = load_auctions(get_target_ids_chunk(1), all_days[:3])

In [21]:
df.head()

Unnamed: 0,date,ref_hash,ref_type_id,source_id
289649,2019-04-18 19:58:47.826462,1114026657194419748,1,0
317697,2019-04-18 23:34:28.216676,1102680423242413676,1,1
317713,2019-04-18 23:34:36.035822,1102680423242413676,1,1
317978,2019-04-18 23:42:35.347774,1102680423242413676,1,1
318297,2019-04-18 23:44:54.822177,1102680423242413676,1,1


# ML: Approaches

***

## Approach 1: mean value per device

### Auctions

In [48]:
df_auctions = load_auctions()

df_auctions2 = process_time_diffs(df_auctions)

current_predictions = df_auctions2.groupby('ref_hash', as_index=False)[['diff_in_sec']].mean()

target = store_predictions(target_df=target, new_values=current_predictions, value_column_name='diff_in_sec', suffix='_st')

In [49]:
del df_auctions
del df_auctions2

### Installs

In [51]:
df_installs = load_installs()

df_installs2 = process_time_diffs(df_installs)

current_predictions = df_installs2[['ref_hash','diff_in_sec']].groupby('ref_hash', as_index=False).mean()

target = store_predictions(target_df=target, new_values=current_predictions, value_column_name='diff_in_sec', suffix='_sc')

In [50]:
del df_installs
del df_installs2

***

In [57]:
save_submission(target, description="por cada grupo, avg. de los tiempos entre cada registro.")

## End of Approach 1: mean value per device

***

## Approach 2: mean value per device taking time from one common start

### Auctions

In [15]:
df_auctions = load_auctions(days=get_n_3_days(1))

df_auctions2 = process_time_diffs_vs_min_day(df_auctions)

current_predictions = df_auctions2.groupby('ref_hash', as_index=False)[['diff_in_sec']].mean()

target = store_predictions(target_df=target, new_values=current_predictions, value_column_name='diff_in_sec', suffix='_st')

In [16]:
del df_auctions
del df_auctions2

### Installs

In [17]:
df_installs = load_installs(days=get_n_3_days(1))

df_installs2 = process_time_diffs_vs_min_day(df_installs)

current_predictions = df_installs2[['ref_hash','diff_in_sec']].groupby('ref_hash', as_index=False).mean()

target = store_predictions(target_df=target, new_values=current_predictions, value_column_name='diff_in_sec', suffix='_sc')

In [18]:
del df_installs
del df_installs2

***

In [19]:
save_submission(target, description="por cada grupo, avg. de los tiempos tomados a partir del primer dia de los elegidos.")

## End of Approach 2: mean value per device taking time from one common start

***

## Approach 3:

### Define current users/days

In [15]:
current_users = get_target_ids()
current_days = get_n_3_days(1)

In [16]:
df_clicks = load_clicks(current_users, current_days)
df_installs = load_installs(current_users, current_days)
df_events = load_events(current_users, current_days)
df_auctions = load_auctions(current_users, current_days)

In [17]:
print(df_clicks.memory_usage(deep=True).sum()/1e6)
print(df_installs.memory_usage(deep=True).sum()/1e6)
print(df_events.memory_usage(deep=True).sum()/1e6)
print(df_auctions.memory_usage(deep=True).sum()/1e6)

3.727891
2.071247
6.177316
25.952431


### Calculate time diffs per ref hash

In [18]:
df_clicks = process_time_diffs(df_clicks)
df_installs = process_time_diffs(df_installs)
df_events = process_time_diffs(df_events)
df_auctions = process_time_diffs(df_auctions)

### Append 'em!

In [19]:
set_source_col(df_clicks, "clicks")
set_source_col(df_installs, "installs")
set_source_col(df_events, "events")
set_source_col(df_auctions, "auctions")

In [31]:
appended = df_clicks.append(df_installs, sort=False).append(df_events, sort=False).append(df_auctions, sort=False)

In [32]:
appended.diff_in_sec = appended.diff_in_sec.astype('float')

976372

In [21]:
appended = fill_all_nans(appended)

In [24]:
appended = set_observed_column(appended, "auctions")

In [27]:
appended.shape

(185944, 15)

In [33]:
appended.head()

Unnamed: 0,source_id,date,ref_hash,source_csv,application_id,attributed,implicit,device_brand,device_model,session_user_agent,device_language,ip_address,diff_in_sec
1001,1,2019-04-18 14:21:13.536,7425652559562776089,clicks,,,,,,,,,
10276,1,2019-04-18 04:26:38.599,9001894065986101363,clicks,,,,,,,,,
10279,3,2019-04-18 04:31:18.746,7906060474484893014,clicks,,,,,,,,,
10296,1,2019-04-18 04:26:56.196,4164621178487252757,clicks,,,,,,,,,
10297,1,2019-04-18 04:27:00.527,4164621178487252757,clicks,,,,,,,,,


In [39]:
X_train = appended.drop(['diff_in_sec','observed','date'], axis='columns')

In [None]:
# construct y_train array of tuples as needed.
# To understand this, see: https://scikit-survival.readthedocs.io/en/latest/generated/sksurv.util.Surv.html#sksurv.util.Surv.from_dataframe
from sksurv.util import Surv

helper = Surv()

y_train = helper.from_dataframe('observed','diff_in_sec', appended)

In [None]:
from sksurv.preprocessing import OneHotEncoder

X_train_numeric = OneHotEncoder().fit_transform(X_train)
X_train_numeric.head()

In [None]:
from sksurv.linear_model import CoxPHSurvivalAnalysis

estimator = CoxPHSurvivalAnalysis(alpha=0.001)
estimator.fit(X_train_numeric, y_train)

In [111]:
# pd.Series(estimator.coef_, index=X_train_numeric.columns)

***
El approach con `scikit-survival` fue abandonado debido a las incesantes complicaciones debidas a la pobre implementacion de la libreria.

## End of Approach 3

## Approach 4

### Define current users/days

In [15]:
current_users = get_target_ids()
current_days = get_n_3_days(1)

In [16]:
df_clicks = load_clicks(current_users, current_days)
df_installs = load_installs(current_users, current_days)
df_events = load_events(current_users, current_days)

In [17]:
df_auctions = load_auctions(current_users, current_days)

In [18]:
print(df_clicks.memory_usage(deep=True).sum()/1e6)
print(df_installs.memory_usage(deep=True).sum()/1e6)
print(df_events.memory_usage(deep=True).sum()/1e6)
print(df_auctions.memory_usage(deep=True).sum()/1e6)

3.727891
2.071247
6.177316
15.009189


### Calculate time diffs per ref hash

In [19]:
# df_auctions = process_time_diffs(df_auctions)

# df_auctions.drop('diff_in_sec',axis='columns',inplace=True)

# df_auctions.head()

### Append 'em!

In [20]:
set_source_col(df_clicks, "clicks")
set_source_col(df_installs, "installs")
set_source_col(df_events, "events")
set_source_col(df_auctions, "auctions")

In [21]:
appended = df_clicks.append(df_installs, sort=False).append(df_events, sort=False).append(df_auctions, sort=False)

# appended.diff_in_sec = appended.diff_in_sec.astype('float')

In [22]:
appended.head()

Unnamed: 0,source_id,date,ref_hash,source_csv,application_id,attributed,implicit,device_brand,device_model,session_user_agent,device_language,ip_address
1001,1,2019-04-18 14:21:13.536,7425652559562776089,clicks,,,,,,,,
10276,1,2019-04-18 04:26:38.599,9001894065986101363,clicks,,,,,,,,
10279,3,2019-04-18 04:31:18.746,7906060474484893014,clicks,,,,,,,,
10296,1,2019-04-18 04:26:56.196,4164621178487252757,clicks,,,,,,,,
10297,1,2019-04-18 04:27:00.527,4164621178487252757,clicks,,,,,,,,


In [23]:
appended = get_time_to_event_of_interest(appended, "auctions")

In [24]:
appended = appended.loc[appended['source_csv'] != "auctions"]

In [25]:
# asdf2 = pd.get_dummies(data=asdf.drop(['date','diff_in_sec','ref_hash'], axis='columns'), dummy_na=True)
# asdf2[['date','diff_in_sec','ref_hash']] = asdf[['date','diff_in_sec','ref_hash']]

# asdf2.attributed.isna().sum()

In [26]:
appended.head()

Unnamed: 0,source_id,date,ref_hash,source_csv,application_id,attributed,implicit,device_brand,device_model,session_user_agent,device_language,ip_address,diff_in_sec
4623047,,2019-04-20 21:17:43.910,1000169251625791246,events,21,False,,,1.805456287734329e+18,,,8443069915943166930,183861.9991
4025197,,2019-04-20 21:17:53.248,1000169251625791246,events,21,False,,,1.805456287734329e+18,,,8443069915943166930,183871.3371
3193009,,2019-04-20 23:41:55.279,1000169251625791246,events,21,False,,,1.805456287734329e+18,,,9169560242611875077,192513.3681
3193107,,2019-04-20 23:41:55.996,1000169251625791246,events,21,False,,,1.805456287734329e+18,,,9169560242611875077,192514.0851
4461154,,2019-04-20 23:44:35.185,1000169251625791246,events,21,False,,,1.805456287734329e+18,,,9169560242611875077,192673.2741
