# Code For Setting Days, Hours, Minutes and Seconds

In [1]:
import pandas as pd
import numpy as np
import gc

# Important preprocessing command that reduces the size of the csv: sed  s/2017-11-//g < train.csv > train_reduced.csv
# It strips the year and month, which are identical for every observation, reducing the file size by 15-20%
# Reference: https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/51347

In [2]:
# Declaring the column dtypes up front (rather than letting pandas infer them)
# can speed up the import and lower RAM usage.
# Reference: https://www.kaggle.com/yuliagm/how-to-work-with-big-datasets-on-16g-ram-dask
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

In [3]:
# Columns to load. attributed_time is deliberately left out: it has a lot of
# missing values and might not be very informative.
# Reference: https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/51411
cols = [
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'click_time',
    'is_attributed',
]

In [4]:
# Read only the columns listed in `cols`, using the dtypes declared in `dtypes`.
full_data = pd.read_csv(
    '/Users/Raul/Dropbox/Code/raul-mmd/kaggle/train_sample_reduced.csv',
    usecols=cols,
    dtype=dtypes,
)

In [5]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
ip               100000 non-null uint32
app              100000 non-null uint16
device           100000 non-null uint16
os               100000 non-null uint16
channel          100000 non-null uint16
click_time       100000 non-null object
is_attributed    100000 non-null uint8
dtypes: object(1), uint16(4), uint32(1), uint8(1)
memory usage: 2.0+ MB


In [6]:
# Cleaning the environment: force a garbage-collection pass after loading.
# The value displayed below the cell is the return value of gc.collect().
gc.collect()

7

The test data comes from a single day, different from the days we have for training. This implies that, at least ideally, we can also ignore the day information. We now split click_time into day, hour, minute and second.

In [7]:
# This will take a while!!!
# Split click_time on whitespace: the first token is parsed as the day number,
# the last token is parsed as a time-of-day via pd.to_timedelta.
v = full_data['click_time'].str.split()
full_data['days'] = v.str[0].astype('uint8')

# Positions 1..3 of the timedelta components are stored as hours, minutes, seconds.
clock_parts = pd.to_timedelta(v.str[-1]).dt.components
full_data[['hours', 'minutes', 'seconds']] = clock_parts.iloc[:, 1:4].astype('uint8')

In [8]:
# click_time has been decomposed into days/hours/minutes/seconds, so the raw
# string column is no longer needed.
# The keyword form is used because passing `axis` positionally to
# DataFrame.drop was deprecated and later removed from pandas (TypeError in 2.x).
full_data = full_data.drop(columns=['click_time'])

In [9]:
# Run the garbage collector again now that the click_time column has been dropped.
gc.collect()

53

In [10]:
# Preview the first rows to check the new days/hours/minutes/seconds columns.
full_data.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,days,hours,minutes,seconds
0,87540,12,1,13,497,0,7,9,30,38
1,105560,25,1,17,259,0,7,13,40,27
2,101424,12,1,19,212,0,7,18,5,24
3,94584,13,1,13,477,0,7,4,58,8
4,68413,12,1,1,178,0,9,9,0,9


In [11]:
# Re-check dtypes and memory usage with the derived time columns in place.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
ip               100000 non-null uint32
app              100000 non-null uint16
device           100000 non-null uint16
os               100000 non-null uint16
channel          100000 non-null uint16
is_attributed    100000 non-null uint8
days             100000 non-null uint8
hours            100000 non-null uint8
minutes          100000 non-null uint8
seconds          100000 non-null uint8
dtypes: uint16(4), uint32(1), uint8(5)
memory usage: 1.6 MB


In [12]:
# Saving the results!
# NOTE(review): the row index is written as well (pandas default index=True);
# confirm downstream readers expect that extra leading column.
full_data.to_csv('/Users/Raul/Dropbox/Code/raul-mmd/kaggle/train_sample_timed.csv')