In [11]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Definindo o estilo dos plots
sns.set_style("whitegrid")
sns.set(font_scale=1.5)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

path_raul = '/Users/alessandrorivello/Kaggle/dados_kaggle/'


In [12]:
# Declaring the column dtypes before loading lets pandas use compact integer
# columns and keeps memory usage down.
# NOTE(review): uint8 can only hold 0..255 -- confirm that no id column
# (app, device, os, channel) exceeds that range in the CSV files.
dtypes = {
    'ip': 'uint32',
    **dict.fromkeys(
        [
            'app',
            'device',
            'os',
            'channel',
            'is_attributed',
            'days',
            'hours',
            'minutes',
            'seconds',
        ],
        'uint8',
    ),
}

In [13]:
# Read the same feature columns from both files. click_time, click_id and days
# are skipped: the train and test data come from different days, so the day is
# not an informative feature.

cols_train = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'hours', 'minutes']
cols_test = ['ip', 'app', 'device', 'os', 'channel', 'hours', 'minutes']

train = pd.read_csv(
    path_raul + '10_train_timed.csv',
    usecols=cols_train,
    dtype=dtypes,
)
test = pd.read_csv(
    path_raul + 'test_timed.csv',
    usecols=cols_test,
    dtype=dtypes,
)

In [110]:
def _quantile_rank(values, numb_class):
    """Return the 1-based quantile bin of every entry of ``values`` (a Series).

    Duplicate bin edges are merged, so fewer than ``numb_class`` distinct bins
    may come back. Empty or constant input, for which no quantile edges can be
    built, is placed entirely in bin 1.
    """
    if values.nunique() <= 1:
        return pd.Series(1, index=values.index, dtype='int64')
    codes = pd.qcut(values, numb_class, labels=False, duplicates='drop')
    return codes.astype('int64') + 1


def blacklist(features, nick_names, dataframe, numb_class=4):
    """Append quantile-bucket features for click volume and conversion rate.

    For every ``feature`` / ``nick_name`` pair two columns are added:

    * ``<nick_name>_conv``  -- bin (1..numb_class) of the feature value's mean
      ``is_attributed`` among all values whose rate is above 0.001. Values at
      or below that threshold fall into bin 1. Stored as ``uint8``.
    * ``<nick_name>_count`` -- bin (1..numb_class) of the feature value's number
      of rows within the distribution of row counts per value. Stored as an
      ordered categorical over ``1..numb_class``.

    Tied bin edges are merged instead of raising, so heavily skewed
    distributions may use fewer than ``numb_class`` bins.

    Parameters
    ----------
    features : iterable of str
        Columns of ``dataframe`` to encode.
    nick_names : iterable of str
        Prefixes for the new columns, paired positionally with ``features``.
    dataframe : pandas.DataFrame
        Must contain every feature column and ``is_attributed``. Not modified.
    numb_class : int, default 4
        Number of quantile bins requested (4 = quartiles).

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with the new columns appended. The left merges
        used to attach them give the result a fresh ``RangeIndex``.

    Notes
    -----
    The ``*_conv`` columns are derived from the target of the very rows they
    are attached to; a model validated on rows that took part in this
    computation sees leaked label information.
    """
    categories = list(range(1, numb_class + 1))
    df = dataframe.copy()

    for feature, nick_name in zip(features, nick_names):
        conv_col = nick_name + '_conv'
        count_col = nick_name + '_count'

        # One pass over the groups yields both the conversion rate and the row count.
        stats = dataframe.groupby(feature)['is_attributed'].agg(['mean', 'count'])

        # Near-zero rates are excluded before binning (they would otherwise
        # collapse the quantile edges); those values are given bin 1 below.
        rates = stats['mean']
        rates = rates[rates > 0.001]
        conversion = _quantile_rank(rates, numb_class).rename(conv_col).reset_index()

        count_rank = _quantile_rank(stats['count'], numb_class)
        counts = pd.DataFrame({
            feature: stats.index,
            count_col: pd.Categorical(count_rank.values, categories=categories, ordered=True),
        })

        df = df.merge(conversion, on=feature, how='left')
        # NaN here means the value was filtered out by the minimum-rate threshold.
        df[conv_col] = df[conv_col].fillna(value=1).astype('uint8')

        df = df.merge(counts, on=feature, how='left')

    return df

In [111]:
# Add the conversion/count quantile features for ip, app and channel to the training data.
train_eng = blacklist(
    features=['ip', 'app', 'channel'],
    nick_names=['ip', 'app', 'ch'],
    dataframe=train,
)

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


In [112]:
# Preview the first five rows, including the newly engineered columns.
train_eng.head(n=5)

Unnamed: 0,ip,app,device,os,channel,is_attributed,hours,minutes,ip_conv,ip_count,app_conv,app_count,ch_conv,ch_count
0,103022,3,1,23,123,0,14,37,1,4,1,4,1,4
1,114221,3,1,19,123,0,14,37,2,4,1,4,1,4
2,47902,3,1,17,123,0,14,48,1,4,1,4,1,4
3,23550,3,1,13,123,0,14,53,1,4,1,4,1,4
4,84644,3,1,19,123,0,14,56,1,4,1,4,1,4


In [115]:
# Run the garbage collector to free memory held by unreachable objects;
# the trailing semicolon hides the returned count from the cell output.
gc.collect();

# Machine Learning

Com essas novas features criadas, vamos tentar usar um Random Forest para prever `is_attributed`.

In [118]:
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score

In [119]:
# Split the training data into a training set and a 30% cross-validation set.
# random_state is fixed (same seed as the classifier) so the split -- and every
# score computed on it -- is reproducible across runs.
# NOTE(review): the *_conv / *_count columns were computed by blacklist() on the
# whole of `train`, labels included, before this split; the rows that end up in
# X_cv therefore contributed their targets to the features, so scores on X_cv
# will be optimistic. Consider fitting the encoding on the training part only.

X = train_eng[['ip_conv', 'ip_count', 'app_conv', 'app_count', 'ch_conv', 'ch_count', 'os', 'hours', 'minutes']]
Y = train_eng['is_attributed']
X_train, X_cv, Y_train, Y_cv = model_selection.train_test_split(X, Y, test_size=0.3, random_state=0)

In [120]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the engineered features.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# deprecated and rejected by recent scikit-learn releases.
# NOTE(review): 300 unbounded-depth trees on the full training set is slow (the
# recorded run of this cell was interrupted); consider fewer trees, a max_depth,
# or a row sample while iterating.
clf_rf = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=None,
    min_samples_split=3,  # scikit-learn default is 2
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,  # use every available core
    random_state=0,
    verbose=0,
    warm_start=False,
    class_weight=None,
)

clf_rf.fit(X_train, Y_train)

KeyboardInterrupt: 