# Naive Elastic Net Implementation for Talking Data
## Alessandro Rivello and Raul Guarini - "Econometristas'' Team

In [1]:
# This is hardly inspired by the work of Raven Ron on https://www.kaggle.com/codeastar/random-forest-classification-on-talkingdata
# Here, I try to implement Elastic Net logistic estimator
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# print(os.listdir("../input"))

import gc, time

dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'device'        : 'uint16',
    'os'            : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8',
    'click_id'      : 'uint32'
}

# Correcting path to data inside EMAp servers
path = '/dados/Dados/Kaggle/'

def handleClickHour(df):
    df['click_hour']= (pd.to_datetime(df['click_time']).dt.round('H')).dt.hour
    df['click_hour'] = df['click_hour'].astype('uint8')
    df = df.drop(['click_time'], axis=1)   
    return df

In [2]:
# Importing data
train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

# Load training df (partly)
start_time = time.time()
df_train_30m = pd.read_csv(path + 'train.csv', dtype=dtypes, skiprows=range(1,133333333), nrows=33333333, usecols=train_columns)
print('Load df_train_30m with {} seconds'.format(round(time.time() - start_time, 3)))

Load df_train_30m with 138.148 seconds


In [3]:
# Load testing df
start_time = time.time()
df_test = pd.read_csv(path + 'test.csv', dtype=dtypes)
print('Load df_test with {} seconds'.format(round(time.time() - start_time, 3)))

train_record_index = df_train_30m.shape[0]

Load df_test with 21.781 seconds


In [4]:
# Handle click hour 
df_train_30m = handleClickHour(df_train_30m)
df_test = handleClickHour(df_test)
gc.collect();
print('ClickTime data correctly handled.')

ClickTime data correctly handled.


In [5]:
df_train_30m.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,click_hour
0,89278,9,1,13,134,0,0
1,112296,21,1,22,128,0,0
2,119724,11,1,19,137,0,0
3,165053,12,1,13,19,0,0
4,338996,9,1,26,466,0,0


In [6]:
df_test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_hour
0,0,5744,9,1,3,107,4
1,1,119901,9,1,3,466,4
2,2,72287,21,1,19,128,4
3,3,78477,15,1,13,111,4
4,4,123080,12,1,13,328,4


In [7]:
#df for submit
df_submit = pd.DataFrame()
df_submit['click_id'] = df_test['click_id']

In [8]:
#Extracting learning data
Learning_Y = df_train_30m['is_attributed']
print('Training target correctly extracted.')

#drop zone
df_test = df_test.drop(['click_id'], axis=1)
df_train_30m = df_train_30m.drop(['is_attributed'], axis=1)
gc.collect();

Training target correctly extracted.


In [9]:
df_merge = pd.concat([df_train_30m, df_test])
del df_train_30m, df_test
gc.collect();
print('Data was correctly concatenated')

Data was correctly concatenated


In [11]:
# Count ip for both train and test df 
start_time = time.time()
df_ip_count = df_merge['ip'].value_counts().reset_index(name = 'ip_count')
df_ip_count.columns = ['ip', 'ip_count']
print('Loaded df_ip_count with {} seconds'.format(round(time.time() - start_time, 3)))

Loaded df_ip_count with 7.429 seconds


In [12]:
df_ip_count.head()

Unnamed: 0,ip,ip_count
0,5348,346892
1,5314,314366
2,73516,164026
3,73487,163330
4,53454,128719


In [13]:
print('Starting to merge with main dataset...')
df_merge = df_merge.merge(df_ip_count, on='ip', how='left', sort=False)
df_merge['ip_count'] = df_merge['ip_count'].astype('uint16')
print('Merging operation completed.')

Starting to merge with main dataset...
Merging operation completed.


In [15]:
# We don't need the ip information anymore
df_merge = df_merge.drop(['ip'], axis=1)
del df_ip_count
gc.collect();

df_train = df_merge[:train_record_index]
df_test = df_merge[train_record_index:]

del df_merge
gc.collect();

df_train['Download'] = Learning_Y

In [26]:
df_train.head()

Unnamed: 0,app,device,os,channel,click_hour,ip_count,Download
0,9,1,13,134,0,4368,0
1,21,1,22,128,0,3719,0
2,11,1,19,137,0,1191,0
3,12,1,13,19,0,1170,0
4,9,1,26,466,0,549,0


In [27]:
df_test.head()

Unnamed: 0,app,device,os,channel,click_hour,ip_count
33333333,9,1,3,107,4,383
33333334,9,1,3,466,4,6161
33333335,21,1,19,128,4,4615
33333336,15,1,13,111,4,3549
33333337,12,1,13,328,4,484


In [31]:
# Now I split the training data in a real training dataset and cross validation dataset
from sklearn import model_selection

cols = list(df_train.columns)
cols.remove('Download')
X_train, X_cv, y_train, y_cv = model_selection.train_test_split(df_train[cols], 
                                                                df_train['Download'], train_size = 0.7)

del cols
gc.collect();

print('Data splitting into training and cross validation is done.)



In [59]:
# Creating a way to evaluate the model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score

def clf_eval(y_true, y_pred):
    print('Classification Report')
    
    print('ROC Score: {}'.format(roc_auc_score(y_true, y_pred)))
    
    return roc_auc_score(y_true, y_pred)

#### Machine Learning Phase

In [88]:
# Starting Elastic Net Implementation
from sklearn.linear_model import SGDClassifier

print('Starting to fit the model to the training dataset... The machine is learning...')

start_time = time.time()
mae_dina = SGDClassifier(loss = 'log', penalty = 'elasticnet', alpha = 0.01, l1_ratio = 0.1, n_jobs = 5,
                        verbose = 1, )
trained_model = mae_dina.fit(X_train, y_train)

print('Machine has learned. Elapsed time: {} seconds'.format(round(time.time() - start_time, 3)))

Starting to fit the model to the training dataset... The machine is learning...




-- Epoch 1
Norm: 7.64, NNZs: 5, Bias: -1.121971, T: 23333333, Avg. loss: 4.794118
Total training time: 12.71 seconds.
-- Epoch 2
Norm: 5.42, NNZs: 4, Bias: -1.185583, T: 46666666, Avg. loss: 0.042739
Total training time: 25.56 seconds.
-- Epoch 3
Norm: 4.41, NNZs: 4, Bias: -1.229895, T: 69999999, Avg. loss: 0.033771
Total training time: 38.30 seconds.
-- Epoch 4
Norm: 3.80, NNZs: 5, Bias: -1.264082, T: 93333332, Avg. loss: 0.030369
Total training time: 51.10 seconds.
-- Epoch 5
Norm: 3.38, NNZs: 5, Bias: -1.292020, T: 116666665, Avg. loss: 0.028634
Total training time: 64.02 seconds.
Machine has learned. Elapsed time: 70.609 seconds


In [89]:
# Predicting
print('Starting cross-validation prediction phase...')
start_time = time.time()
predictions = trained_model.predict_proba(X_cv)[:,1]
print('Prediction done. Elapsed time: {} seconds'.format(round(time.time() - start_time, 3)))

Starting cross-validation prediction phase...
Prediction done. Elapsed time: 1.784 seconds


In [90]:
# Evaluating the model
clf_eval(y_cv, predictions);

Classification Report
ROC Score: 0.7669744078835646


#### Code for submission!

In [96]:
# Predicting
print('Starting prediction phase on test data...')
start_time = time.time()
predictions = trained_model.predict_proba(df_test)[:,1]
print('Prediction done. Elapsed time: {} seconds'.format(round(time.time() - start_time, 3)))

Starting prediction phase on test data...
Prediction done. Elapsed time: 3.299 seconds


In [97]:
# Creating the submission dataset
df_submit['is_attributed'] = predictions

print('Submission dataset created.')

Submission dataset created.


In [98]:
# Preparing submssion
df_submit.to_csv('elasticnet_talking_data.csv', index=False)
print('Submission dataset saved correctly.')

Submission dataset saved correctly.
