Random Forest com mais features e, talvez, algum undersampling.

In [3]:
import numpy as np # linear algebra
import pandas as pd

import os
# print(os.listdir("../input"))

import gc, time

dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'device'        : 'uint16',
    'os'            : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8',
    'click_id'      : 'uint32'
}

# Correcting path to data inside EMAp's server
path = '/dados/Dados/Kaggle/'

def handleClickHour_raul(df):
    df['click_hour']= pd.to_datetime(df['click_time']).dt.hour.astype('uint8')
    df['click_minute'] = pd.to_datetime(df['click_time']).dt.minute.astype('uint8')
    df['click_second'] = pd.to_datetime(df['click_time']).dt.second.astype('uint8')
    df = df.drop(['click_time'], axis=1)   
    return df

In [4]:
# Importing data
train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

# Load training df (partly) - this takes 2-4 minutes
start_time = time.time()
df_train_30m = pd.read_csv(path + 'train.csv', dtype=dtypes, skiprows=range(1,133333333), 
                           nrows=35000000, usecols=train_columns)
print('Loaded df_train_30m with {} seconds'.format(round(time.time() - start_time, 3)))

Loaded df_train_30m with 137.732 seconds


In [5]:
# Load testing df
start_time = time.time()
df_test = pd.read_csv(path + 'test.csv', dtype=dtypes)
print('Loaded df_test with {} seconds'.format(round(time.time() - start_time, 3)))

train_record_index = df_train_30m.shape[0]

Loaded df_test with 22.245 seconds


In [6]:
# Handle click hour 
df_train_30m = handleClickHour_raul(df_train_30m)
df_test = handleClickHour_raul(df_test)
gc.collect();
print('ClickTime data correctly handled.')

ClickTime data correctly handled.


In [7]:
#df for submit
df_submit = pd.DataFrame()
df_submit['click_id'] = df_test['click_id']

In [8]:
#Extracting learning data
Learning_Y = df_train_30m['is_attributed']
print('Training target correctly extracted.')

#drop zone
df_test = df_test.drop(['click_id'], axis=1)
df_train_30m = df_train_30m.drop(['is_attributed'], axis=1)
gc.collect();

Training target correctly extracted.


In [9]:
# The df_merge just places training data and testing data together so we can create features together
df_merge = pd.concat([df_train_30m, df_test])

# We have already extracted the target column and merged everything, so we can delete these dataframes and trace them
# back as we need!!
del df_train_30m, df_test
gc.collect();
print('Data was correctly concatenated')

Data was correctly concatenated


In [10]:
# Count ip for both train and test df 
start_time = time.time()
df_ip_count = df_merge['ip'].value_counts().reset_index(name = 'ip_count')
df_ip_count.columns = ['ip', 'ip_count']
print('Loaded df_ip_count with {} seconds'.format(round(time.time() - start_time, 3)))

Loaded df_ip_count with 6.311 seconds


In [11]:
print('Starting to merge with main dataset...')
df_merge = df_merge.merge(df_ip_count, on='ip', how='left', sort=False)
df_merge['ip_count'] = df_merge['ip_count'].astype('uint16')
print('Merging operation completed.')

Starting to merge with main dataset...
Merging operation completed.


In [12]:
# Doing the same for OS
start_time = time.time()
df_os_count = df_merge['os'].value_counts().reset_index(name = 'os_count')
df_os_count.columns = ['os', 'os_count']
print('Loaded df_os_count with {:.2f} seconds'.format(time.time() - start_time))
print('Starting to merge with main dataset...')
df_merge = df_merge.merge(df_os_count, on='os', how='left', sort=False).astype('uint8')
print('Merging operation completed.')
del df_os_count
gc.collect();

Loaded df_os_count with 1.29 seconds
Starting to merge with main dataset...
Merging operation completed.


In [13]:
# Doing the same for APP
start_time = time.time()
df_app_count = df_merge['app'].value_counts().reset_index(name = 'app_count')
df_app_count.columns = ['app', 'app_count']
print('Loaded df_app_count with {:.2f} seconds'.format(time.time() - start_time))
print('Starting to merge with main dataset...')
df_merge = df_merge.merge(df_app_count, on='app', how='left', sort=False).astype('uint8')
print('Merging operation completed.')
del df_app_count
gc.collect();

Loaded df_app_count with 0.70 seconds
Starting to merge with main dataset...
Merging operation completed.


In [14]:
# Doing the same for DEVICE
start_time = time.time()
df_dev_count = df_merge['device'].value_counts().reset_index(name = 'dev_count')
df_dev_count.columns = ['device', 'dev_count']
print('Loaded df_dev_count with {:.2f} seconds'.format(time.time() - start_time))
print('Starting to merge with main dataset...')
df_merge = df_merge.merge(df_dev_count, on='device', how='left', sort=False).astype('uint8')
print('Merging operation completed.')
del df_dev_count
gc.collect();

Loaded df_dev_count with 1.25 seconds
Starting to merge with main dataset...
Merging operation completed.


In [15]:
df_merge.head()

Unnamed: 0,ip,app,device,os,channel,click_hour,click_minute,click_second,ip_count,os_count,app_count,dev_count
0,190,9,1,13,134,0,25,59,144,205,219,24
1,168,21,1,22,128,0,25,59,251,211,195,24
2,172,11,1,19,137,0,25,59,215,9,221,24
3,189,12,1,13,19,0,25,59,194,205,157,24
4,52,9,1,26,210,0,25,59,61,125,219,24


### Preparing everything for some Machine Learning.

In [16]:
# Creating a way to evaluate the model
from sklearn.metrics import roc_auc_score
#from sklearn.metrics import accuracy_score
#from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score

def clf_eval(y_true, y_pred):
    print('Classification Report')
   # print('F-1 Score: {}'.format(f1_score(y_true, y_pred)))
    print('ROC Score: {}'.format(roc_auc_score(y_true, y_pred)))
    return roc_auc_score(y_true, y_pred)

In [17]:
# Tracing back
df_train = df_merge[:train_record_index].copy()
df_test = df_merge[train_record_index:].copy()

# Creating a cross-validation set
from sklearn import model_selection
X_train, X_cv, Y_train, Y_cv = model_selection.train_test_split(df_train, Learning_Y, train_size = 0.7)
gc.collect();
print('Data splitting into training and cross validation is done.')
del df_merge
gc.collect();



Data splitting into training and cross validation is done.


### Naive implementation of Random Forest

In [18]:
# Start of Random Forest Implementation with no fancy stuff
from sklearn.ensemble import RandomForestClassifier

print('Starting to fit Random Forest Model... The machine is learning...')
start_time = time.time()

rf = RandomForestClassifier(n_estimators=13, max_depth=13, random_state=13, verbose=2, n_jobs = 13)
rf.fit(X_train, Y_train)
print('The machine has learned.')
print('RandomForest has fitted df_train_30m with {:.2f} seconds'.format(time.time() - start_time))

Starting to fit Random Forest Model... The machine is learning...
building tree 1 of 13building tree 2 of 13
building tree 3 of 13
building tree 4 of 13building tree 5 of 13
building tree 6 of 13
building tree 7 of 13building tree 8 of 13
building tree 9 of 13
building tree 10 of 13building tree 11 of 13

building tree 12 of 13building tree 13 of 13






[Parallel(n_jobs=13)]: Done   2 out of  13 | elapsed:  6.9min remaining: 38.0min
[Parallel(n_jobs=13)]: Done   9 out of  13 | elapsed:  7.3min remaining:  3.2min


The machine has learned.
RandomForest has fitted df_train_30m with 445.49 seconds


[Parallel(n_jobs=13)]: Done  13 out of  13 | elapsed:  7.4min finished


In [19]:
print('Starting cross-validation prediction phase...')
start_time = time.time()
predictions = rf.predict_proba(X_cv)[:,1]
print('Prediction done. Elapsed time: {:.2f} seconds'.format(time.time() - start_time))

Starting cross-validation prediction phase...


[Parallel(n_jobs=13)]: Done   2 out of  13 | elapsed:    3.4s remaining:   18.8s
[Parallel(n_jobs=13)]: Done   9 out of  13 | elapsed:    3.9s remaining:    1.7s
[Parallel(n_jobs=13)]: Done  13 out of  13 | elapsed:    4.2s finished


Prediction done. Elapsed time: 4.98 seconds


In [20]:
# Evaluating the model
clf_eval(Y_cv, predictions)

Classification Report
ROC Score: 0.9483143345829378


0.9483143345829378

### UnderSampling to see what happens

In [26]:
# Using the RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(return_indices=True)
start_time = time.time()

train_resampled, Y_resampled, idx_resampled = rus.fit_sample(X = X_train, y = Y_train)

In [27]:
# Transforming the undersampled data in DataFrames
X_resampled = pd.DataFrame(train_resampled, columns=df_train.columns)
Y_resampled = pd.DataFrame(Y_resampled, columns=['is_attributed'])

del train_resampled, idx_resampled
gc.collect();

In [50]:
# Running Random Forest again
start_time = time.time()
rf_resampled = RandomForestClassifier(n_estimators=13, 
                                      max_depth=13, 
                                      random_state=13, 
                                      verbose=1, 
                                      n_jobs = 18)


rf_resampled.fit(X_resampled, Y_resampled)
print('The machine has learned.')
print('RandomForest has fitted X_train with {} seconds'.format(time.time() - start_time))

print('Starting cross-validation prediction phase...')
start_time = time.time()
predictions = rf_resampled.predict_proba(X_cv)[:,1]
print('Prediction done. Elapsed time: {} seconds'.format(time.time() - start_time))

# Evaluating the model
clf_eval(Y_cv, predictions);

  # Remove the CWD from sys.path while we load stuff.
[Parallel(n_jobs=18)]: Done   6 out of  13 | elapsed:    0.5s remaining:    0.6s
[Parallel(n_jobs=18)]: Done  13 out of  13 | elapsed:    0.5s finished


The machine has learned.
RandomForest has fitted X_train with 0.6496496200561523 seconds
Starting cross-validation prediction phase...


[Parallel(n_jobs=13)]: Done   2 out of  13 | elapsed:    3.5s remaining:   19.3s
[Parallel(n_jobs=13)]: Done  13 out of  13 | elapsed:    4.3s finished


Prediction done. Elapsed time: 5.129887819290161 seconds
Classification Report
ROC Score: 0.957046765680315
