# Modelo Preditivo - Competição "Talking Data"
## Alunos: Alessandro Rivello e Raul Guarini

Tentativa de usar um modelo com blacklists de features, tal qual a EDA sugeriu ser promissor, e usando dados de timestamp.

In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Definindo o estilo dos plots
sns.set_style("whitegrid")
sns.set(font_scale=1.5)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

path_raul = '/Users/Raul/Desktop/'

In [32]:
# Definir os datatypes antes da importação otimiza o uso de memória pelo Pandas.
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint8',
        'device'        : 'uint8',
        'os'            : 'uint8',
        'channel'       : 'uint8',
        'is_attributed' : 'uint8',
        'days'          : 'uint8',
        'hours'         : 'uint8',
        'minutes'       : 'uint8',
        'seconds'       : 'uint8',
        }

In [33]:
# Forçando o Pandas a ler as mesmas colunas, sem necessidade de ler click_time, click_id e days, uma vez que os dados
# de teste e treinamento são de dias diferentes. Logo essa feature não é informativa.

cols_train = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'hours', 'minutes', 'seconds']
cols_test = ['ip', 'app', 'device', 'os', 'channel', 'hours', 'minutes', 'seconds']

In [34]:
train = pd.read_csv(path_raul + '10_train_timed.csv', dtype = dtypes, usecols = cols_train)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18490388 entries, 0 to 18490387
Data columns (total 9 columns):
ip               uint32
app              uint8
device           uint8
os               uint8
channel          uint8
is_attributed    uint8
hours            uint8
minutes          uint8
seconds          uint8
dtypes: uint32(1), uint8(8)
memory usage: 211.6 MB


### Timestamps
Vamos fazer one-hot-encoding das horas e minutos e iniciar nossa engenharia de features.

In [5]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,hours,minutes,seconds
0,103022,3,1,23,379,0,14,37,44
1,114221,3,1,19,379,0,14,37,59
2,47902,3,1,17,379,0,14,48,7
3,23550,3,1,13,379,0,14,53,39
4,84644,3,1,19,379,0,14,56,49


In [58]:
# Instanciando o Encoder
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder
#mlb = MultiLabelBinarizer()
encoder = OneHotEncoder(sparse = False, dtype = 'uint8')

In [85]:
# Fazendo encoding das horas
cols = ['h'+str(hour) for hour in np.sort(train.hours.unique())]
H = pd.DataFrame(encoder.fit_transform(train.hours.values.reshape(-1,1)), columns = cols)

In [86]:
H.head()

Unnamed: 0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,h14,h15,h16,h17,h18,h19,h20,h21,h22,h23
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [87]:
train = pd.concat([train, H], axis=1)
train.head()

In [94]:
# Fazendo encoding dos minutos
cols = ['m'+str(m) for m in np.sort(train.minutes.unique())]
M = pd.DataFrame(encoder.fit_transform(train.minutes.values.reshape(-1,1)), columns = cols)

In [95]:
M.head()

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,...,m50,m51,m52,m53,m54,m55,m56,m57,m58,m59
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [96]:
train = pd.concat([train, M], axis = 1)
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,hours,minutes,seconds,h0,...,m50,m51,m52,m53,m54,m55,m56,m57,m58,m59
0,103022,3,1,23,123,0,14,37,44,0,...,0,0,0,0,0,0,0,0,0,0
1,114221,3,1,19,123,0,14,37,59,0,...,0,0,0,0,0,0,0,0,0,0
2,47902,3,1,17,123,0,14,48,7,0,...,0,0,0,0,0,0,0,0,0,0
3,23550,3,1,13,123,0,14,53,39,0,...,0,0,0,1,0,0,0,0,0,0
4,84644,3,1,19,123,0,14,56,49,0,...,0,0,0,0,0,0,1,0,0,0


In [97]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18490388 entries, 0 to 18490387
Data columns (total 93 columns):
ip               uint32
app              uint8
device           uint8
os               uint8
channel          uint8
is_attributed    uint8
hours            uint8
minutes          uint8
seconds          uint8
h0               uint8
h1               uint8
h2               uint8
h3               uint8
h4               uint8
h5               uint8
h6               uint8
h7               uint8
h8               uint8
h9               uint8
h10              uint8
h11              uint8
h12              uint8
h13              uint8
h14              uint8
h15              uint8
h16              uint8
h17              uint8
h18              uint8
h19              uint8
h20              uint8
h21              uint8
h22              uint8
h23              uint8
m0               uint8
m1               uint8
m2               uint8
m3               uint8
m4               uint8
m5        

In [103]:
# Limpando a memória
del M, H
gc.collect()

0