# Modelo Preditivo - Competição "Talking Data"
## Alunos: Alessandro Rivello e Raul Guarini

Tentativa de usar um modelo com blacklists de features, tal qual a EDA sugeriu ser promissor, e usando dados de timestamp.

In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Definindo o estilo dos plots
sns.set_style("whitegrid")
sns.set(font_scale=1.5)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

path_raul = '/Users/Raul/Desktop/'

In [2]:
# Definir os datatypes antes da importação otimiza o uso de memória pelo Pandas.
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint8',
        'device'        : 'uint8',
        'os'            : 'uint8',
        'channel'       : 'uint8',
        'is_attributed' : 'uint8',
        'days'          : 'uint8',
        'hours'         : 'uint8',
        'minutes'       : 'uint8',
        'seconds'       : 'uint8',
        }

In [3]:
# Forçando o Pandas a ler as mesmas colunas, sem necessidade de ler click_time, click_id e days, uma vez que os dados
# de teste e treinamento são de dias diferentes. Logo essa feature não é informativa.

cols_train = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'hours', 'minutes']
cols_test = ['ip', 'app', 'device', 'os', 'channel', 'hours', 'minutes']

In [4]:
train = pd.read_csv(path_raul + '10_train_timed.csv', dtype = dtypes, usecols = cols_train)
test = pd.read_csv(path_raul + 'test_timed.csv', dtype = dtypes, usecols = cols_test)

### Timestamps
Vamos fazer one-hot-encoding das horas e minutos e iniciar nossa engenharia de features.

In [5]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,hours,minutes
0,103022,3,1,23,123,0,14,37
1,114221,3,1,19,123,0,14,37
2,47902,3,1,17,123,0,14,48
3,23550,3,1,13,123,0,14,53
4,84644,3,1,19,123,0,14,56


In [6]:
# Instanciando o Encoder
from sklearn.preprocessing import OneHotEncoder
#mlb = MultiLabelBinarizer()
encoder = OneHotEncoder(sparse = False, dtype = 'uint8')

In [7]:
# Fazendo encoding das horas
cols = ['h'+str(hour) for hour in np.sort(train.hours.unique())]
H = pd.DataFrame(encoder.fit_transform(train.hours.values.reshape(-1,1)), columns = cols)

In [8]:
#H.head()

In [9]:
train = pd.concat([train, H], axis=1)
#train.head()

In [10]:
# Fazendo encoding dos minutos
cols = ['m'+str(m) for m in np.sort(train.minutes.unique())]
M = pd.DataFrame(encoder.fit_transform(train.minutes.values.reshape(-1,1)), columns = cols)

In [11]:
#M.head()

In [12]:
train = pd.concat([train, M], axis = 1)
#train.head()

In [13]:
#train.info()

In [14]:
# Limpando a memória
del M, H
gc.collect()

41

Fazendo o mesmo para os dados de teste:

In [15]:
# Fazendo encoding das horas
cols = ['h'+str(hour) for hour in np.sort(test.hours.unique())]
H = pd.DataFrame(encoder.fit_transform(test.hours.values.reshape(-1,1)), columns = cols)

In [16]:
test = pd.concat([test, H], axis=1)
# test.head()
# Note que devemos preencher algumas colunas pois os dados de teste não possuem observações para algumas horas

In [17]:
# Fazendo encoding dos minutos
cols = ['m'+str(m) for m in np.sort(test.minutes.unique())]
M = pd.DataFrame(encoder.fit_transform(test.minutes.values.reshape(-1,1)), columns = cols)

In [18]:
test = pd.concat([test, M], axis=1)
#test.head()

In [19]:
# Limpando a memória
del M, H
gc.collect();

In [20]:
# Preenchendo as colunas de H com zeros onde necessário
cols = [h for h in np.sort(test.hours.unique())]
rem = [i for i in [j for j in range(0, 24)] if i not in cols]

fill = pd.DataFrame(np.zeros((len(test.hours), len(rem)), dtype = 'uint8'), columns = ['h'+str(h) for h in rem])
test = pd.concat([test, fill], axis = 1)

In [21]:
#test.info()

In [22]:
del fill, cols, rem
gc.collect();

Vamos rearranjar agora os dados de teste para ficarem com a mesma ordenação dos dados de treinamento (melhoria estética).

In [23]:
cols = list(train.columns)
cols.remove('is_attributed')
test = test[cols]
test.head()

Unnamed: 0,ip,app,device,os,channel,hours,minutes,h0,h1,h2,...,m50,m51,m52,m53,m54,m55,m56,m57,m58,m59
0,5744,9,1,3,107,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,119901,9,1,3,210,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,72287,21,1,19,128,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,78477,15,1,13,111,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,123080,12,1,13,72,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,hours,minutes,h0,h1,...,m50,m51,m52,m53,m54,m55,m56,m57,m58,m59
0,103022,3,1,23,123,0,14,37,0,0,...,0,0,0,0,0,0,0,0,0,0
1,114221,3,1,19,123,0,14,37,0,0,...,0,0,0,0,0,0,0,0,0,0
2,47902,3,1,17,123,0,14,48,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23550,3,1,13,123,0,14,53,0,0,...,0,0,0,1,0,0,0,0,0,0
4,84644,3,1,19,123,0,14,56,0,0,...,0,0,0,0,0,0,1,0,0,0


### Blacklists
Nossa EDA sugeriu que alguns poucos IPs geravam muitos cliques. Um padrão similiar ficou evidenciado no caso das outras features catgóricas.

In [25]:
def blacklist(feature, lim_conv, lim_count, dataframe):
    # Esta função cria, dada uma feature, um blacklist de low converters e high clickers na forma de uma 
    # Pandas Series
    
    conversion = dataframe[[feature, 'is_attributed']].groupby(feature, as_index=False).mean().sort_values('is_attributed', 
                                                                                             ascending=True)
    counts = dataframe[[feature, 'is_attributed']].groupby(feature, as_index=False).count().sort_values('is_attributed', 
                                                                                             ascending=True)
    merge = counts.merge(conversion, on = feature , how ='left')
    merge.columns = [feature , 'counts', 'conversion']
    blacklist = merge[(merge.counts > lim_count) & (merge.conversion < lim_conv)][feature]
    return blacklist

In [61]:
%%time
# Gerando as features de blacklist nos dados de treino e teste -> leva um tempinho! Em torno de 1m.

# Definindo o limite de taxa de conversão para as blacklists
lim_conv = 0.01

# Definindo um limite de contagem para cada variável pois elas tem número de contagens em escalas um pouco diferentes
# Estes parametros devem ser bem importantes para o desempenho preditivo
cont_dict = {'ip': 1e4, 'os': 1e6, 'device': 1e3, 'app': 1e5, 'channel': 1e5}

for feature in cont_dict.keys():
    black = blacklist(feature, lim_conv, lim_count = cont_dict[feature], dataframe = train)
    train['black_'+feature] = train[feature].isin(black).astype('uint8')
    test['black_'+feature] = test[feature].isin(black).astype('uint8')

CPU times: user 13.8 s, sys: 4.88 s, total: 18.7 s
Wall time: 19.2 s


### Machine Learning
A primeira tentativa de modelo preditivo será uma regressão logística com regularização Elastic Net.

In [49]:
import sklearn
from sklearn.linear_model import SGDClassifier

from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score

In [44]:
# Dividingo o DataFrame train em treino e cross-validation. Leva um tempinho também!
%%time
predictors = [cols for cols in train.columns if cols not in cols_train]
X = train[predictors]
y = train['is_attributed']

X_train, X_cv, y_train, y_cv = model_selection.train_test_split(X,y, test_size = 0.3)
gc.collect();

In [76]:
mae_dina = SGDClassifier(loss = 'log', penalty = 'elasticnet', alpha = 0.001, l1_ratio = 0)

In [78]:
# Fitar o modelo leva um tempo também. Em torno de 7 minutos.
%%time
modelo_treinado = mae_dina.fit(X_train, y_train)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 16.9 µs




In [79]:
%%time
pred = modelo_treinado.predict(test[predictors])

CPU times: user 12.7 s, sys: 23.3 s, total: 36 s
Wall time: 30.3 s


In [80]:
pred = pd.DataFrame(pred, index = test.index)

In [81]:
pred.describe()

Unnamed: 0,0
count,18790469.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0
