# Naive Elastic Net Implementation for Talking Data
## Alessandro Rivello and Raul Guarini - "Econometristas"
Versão com undersampling

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
# Root folder of the data files; the CSV names used below are appended to it.
path = "../input/"
# Show which files are present in the input folder.
print(os.listdir("../input"))

import gc, time
# Column name -> unsigned-integer dtype, passed as `dtype=` to pd.read_csv so
# the columns are parsed directly into compact integer types.
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32',
}

def handleClickHour(df):
    """Return a copy of ``df`` with ``click_time`` replaced by ``click_hour``.

    ``click_time`` is parsed as a datetime and rounded to the nearest hour;
    the hour-of-day of the rounded value is stored as ``uint8`` in a new
    ``click_hour`` column appended after the remaining columns. Because the
    value is rounded (not floored), a click late in hour 23 maps to hour 0.

    The frame passed in is left untouched: it keeps ``click_time`` and does
    not gain a ``click_hour`` column.

    Args:
        df: DataFrame holding a ``click_time`` column parseable by
            ``pd.to_datetime``.

    Returns:
        A new DataFrame without ``click_time`` and with ``click_hour``.
    """
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in
    # recent pandas releases.
    click_hour = pd.to_datetime(df['click_time']).dt.round('h').dt.hour.astype('uint8')

    # drop() returns a new frame, so adding the column here cannot leak back
    # into the caller's DataFrame.
    result = df.drop(['click_time'], axis=1)
    result['click_hour'] = click_hour
    return result

In [2]:
# Columns of train.csv that are read.
train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

# NOTE(review): `start` and `size` are assigned but never used, and their
# values are the reverse of the skip/read literals given to read_csv below.
# Confirm which window of the file was actually intended.
start = 133333333
size = 33333333

# Load a window of the training file: row 0 (the header) is kept, file rows
# 1..33333332 are skipped, and at most 133333333 rows are read after that.
# Dropping `skiprows` and `nrows` would load the whole file instead.
start_time = time.time()
df_train_30m = pd.read_csv(
    path + 'train.csv',
    usecols=train_columns,
    dtype=dtypes,
    skiprows=range(1, 33333333),
    nrows=133333333,
)
print('Load df_train_30m with {} seconds'.format(round(time.time() - start_time, 3)))

Load df_train_30m with 140.21 seconds


In [3]:
# Read the whole test file and report how long it took.
start_time = time.time()
df_test = pd.read_csv(
    path + 'test.csv',
    dtype=dtypes,
)
print('Load df_test with {} seconds'.format(round(time.time() - start_time, 3)))

# Row count of the training frame, used later to split the stacked
# train+test frame back into its two parts.
train_record_index = len(df_train_30m)

Load df_test with 21.816 seconds


In [6]:
# Swap click_time for the derived click_hour column in both frames.
df_train_30m = handleClickHour(df_train_30m)
df_test = handleClickHour(df_test)
gc.collect()
print('ClickTime data correctly handled.')

# Submission frame: starts out with only the click ids of the test rows.
df_submit = pd.DataFrame({'click_id': df_test['click_id']})

# Keep the training target before its column is dropped below.
Learning_Y = df_train_30m['is_attributed']
print('Training target correctly extracted.')

# Drop zone: remove the id column from test and the target column from train.
df_test = df_test.drop('click_id', axis=1)
df_train_30m = df_train_30m.drop('is_attributed', axis=1)
gc.collect();

ClickTime data correctly handled.


In [None]:
# Stack the training rows on top of the test rows, then release both sources.
df_merge = pd.concat([df_train_30m, df_test])
del df_train_30m
del df_test
gc.collect()
print('Data was correctly concatenated')

# Number of rows per ip over the stacked train + test frame.
start_time = time.time()
df_ip_count = (
    df_merge['ip']
    .value_counts()
    .reset_index(name='ip_count')
)
# Force the column labels so that the frame can be joined back on 'ip'.
df_ip_count.columns = ['ip', 'ip_count']
print('Loaded df_ip_count with {} seconds'.format(round(time.time() - start_time, 3)))

In [None]:
# Attach to every row the number of rows that share its ip (train + test).
print('Starting to merge with main dataset...')
df_merge = df_merge.merge(df_ip_count, on='ip', how='left', sort=False)
# uint16 tops out at 65,535 and astype() wraps larger values without raising.
# The stacked frame holds well over 100 million rows, so a single busy ip can
# exceed that limit and end up with a corrupted count. uint32 (maximum
# 4,294,967,295) is larger than any count a frame of this size can produce.
df_merge['ip_count'] = df_merge['ip_count'].astype('uint32')
print('Merging operation completed.')

In [None]:
# The raw ip column is dropped now that ip_count has been attached.
df_merge = df_merge.drop('ip', axis=1)
del df_ip_count
gc.collect()

# The training rows were stacked first, so the first train_record_index rows
# are the training data and everything after them is the test data.
df_train = df_merge.iloc[:train_record_index]
df_test = df_merge.iloc[train_record_index:]

del df_merge
gc.collect();

In [None]:
# Undersample the training data with imbalanced-learn's RandomUnderSampler.
# NOTE(review): newer imbalanced-learn releases appear to expose
# `fit_resample` and a `sample_indices_` attribute in place of `fit_sample`
# and `return_indices` -- confirm against the installed version.
from imblearn.under_sampling import RandomUnderSampler

# With return_indices=True the call below yields three values; the third is
# presumably the indices of the rows that were kept.
rus = RandomUnderSampler(return_indices=True)
start_time = time.time()
train_resampled, Y_resampled, idx_resampled = rus.fit_sample(
    X=df_train,
    y=Learning_Y,
)
print("Elapsed time until undersampling was complete: {:.2f} seconds".format(time.time() - start_time))

In [None]:
# Wrap the undersampled features and target in DataFrames; the features keep
# the column labels of df_train and the target gets a single named column.
X_resampled = pd.DataFrame(data=train_resampled, columns=df_train.columns)
Y_resampled = pd.DataFrame(data=Y_resampled, columns=['is_attributed'])

# The raw sampler outputs are not needed past this point.
del train_resampled
del idx_resampled
gc.collect();

In [None]:
# Hold-out split for a simple evaluation: 70% of the resampled rows go to
# training, the rest to the validation set.
from sklearn import model_selection

X_train, X_cv, Y_train, Y_cv = model_selection.train_test_split(
    X_resampled,
    Y_resampled,
    train_size=0.7,
)
gc.collect()
print('Data splitting into training and cross validation is done.')

In [None]:
# Creating a way to evaluate the model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score

def clf_eval(y_true, y_pred):
    """Print and return the ROC AUC of ``y_pred`` measured against ``y_true``.

    Args:
        y_true: Ground-truth labels, passed unchanged to ``roc_auc_score``.
        y_pred: Predicted scores, passed unchanged to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score(y_true, y_pred)``.
    """
    print('Classification Report')

    # Compute the score a single time and reuse it for both the printout and
    # the return value instead of evaluating the metric twice.
    roc_score = roc_auc_score(y_true, y_pred)
    print('ROC Score: {}'.format(roc_score))

    return roc_score

In [None]:
# Elastic-net-penalised logistic model trained with stochastic gradient descent.
from sklearn.linear_model import SGDClassifier

print('Starting to fit the model to the training dataset... The machine is learning...')

start_time = time.time()
# NOTE(review): newer scikit-learn releases appear to name this loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
mae_dina = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=50,
    l1_ratio=0.001,
    n_jobs=5,
    verbose=0,
)
# The target is flattened to a 1-D array before fitting: handing the
# estimator a one-column frame makes scikit-learn emit a conversion warning
# and flatten it itself.
trained_model = mae_dina.fit(X_train, np.ravel(Y_train))

print('Machine has learned. Elapsed time: {:.2f} seconds'.format(time.time() - start_time))

In [None]:
# Score the held-out rows with the fitted model and time the prediction.
print('Starting cross-validation prediction phase...')
start_time = time.time()
# Keep only column 1 of the probability matrix (presumably the positive class).
predictions = trained_model.predict_proba(X_cv)[:, 1]
print('Prediction done. Elapsed time: {:.2f} seconds'.format(time.time() - start_time))

# Report the ROC AUC on the held-out rows; the trailing semicolon keeps the
# notebook from echoing the returned score.
clf_eval(Y_cv, predictions);

Acabamos por nem submeter o csv gerado desta forma pois o desempenho foi bem abaixo do esperado.