# Data Importing

## Import Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

from code.common import *

## Set Up Spark Application

In [2]:
# Initialize Spark for this notebook under the application name 'prep'.
# NOTE(review): `workingSet` is not defined in any visible cell — presumably it
# is provided by `from code.common import *`; confirm.
initSpark(
    workingSet,
    application='prep',
)


--------------------------------------------------------------------------------
Starting Spark Initializing
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Spark Initializing in 6.482 Seconds
--------------------------------------------------------------------------------


## Load Criteo Dataset

In [3]:
# Run the data-loading step against the 'data' location (the log below shows
# it resolved to an absolute path).
# NOTE(review): what `clean=True` does is not visible from this notebook —
# check loadData before relying on it.
loadData(
    workingSet,
    data='data',
    clean=True,
)


--------------------------------------------------------------------------------
Starting Data Loading at /home/jovyan/work/notebooks/data
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Data Loading in 66.340 Seconds
--------------------------------------------------------------------------------


## Split Data into Training, Test, and Dev Subsets

In [4]:
# Split the loaded data with an 80% / 10% / 10% ratio.
# NOTE(review): the section title suggests the order is train, test, dev —
# confirm against splitData. No random seed is passed here either; verify that
# the split is reproducible across runs.
splitData(
    workingSet,
    ratios=[.8, .1, .1],
)


--------------------------------------------------------------------------------
Starting Data Splitting at [0.8, 0.1, 0.1]
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Data Splitting in 95.907 Seconds
--------------------------------------------------------------------------------


# Feature Engineering : Numerical Variables

In [5]:
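# TODO(review): numerical feature engineering is currently disabled — the import
# below is commented out and this cell runs nothing. Re-enable it or remove the cell.
# from code.engineering import numFillUndefined, numStandardize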

# Feature Engineering : Categorical Variables

In [6]:
from code.engineering import catFillUndefined, catFindFrequent, catMaskUncommon, catPickFeatures, catCodeFeatures, catHashFeatures

## Replace Undefined Values with Special Term

In [7]:
# Apply the same undefined-value placeholder to every subset, in the order
# train -> test -> dev.
# NOTE(review): 'deadbeef' is itself a valid 8-digit hex string, the same shape
# as the category values in this dataset (e.g. '05db9164'), so it could collide
# with a real category — confirm it never occurs in the raw data.
for subset_name in ('train', 'test', 'dev'):
    catFillUndefined(workingSet, subset=subset_name, term='deadbeef')


--------------------------------------------------------------------------------
Starting Categorical Fill Undefined Terms on train
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Categorical Fill Undefined Terms in 44.238 Seconds
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Starting Categorical Fill Undefined Terms on test
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Categorical Fill Undefined Terms in 7.677 Seconds
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Starting Categorical Fill Undefined Terms on d

## Find Uncommon Values for Each Feature

In [8]:
# For each categorical feature of the training subset, report how many
# categories are uncommon vs. frequent at a threshold of 180000.
# NOTE(review): `fit=True` presumably stores the frequent-category lists for
# the later masking step — confirm in catFindFrequent. The literal 180000 is
# repeated in later cells and appears in workingSet key names; keep them in sync.
catFindFrequent(
    workingSet,
    subset='train',
    threshold=180000,
    fit=True,
)


--------------------------------------------------------------------------------
Starting Categorical Find Frequent Terms on train
--------------------------------------------------------------------------------
s01 found     1447 uncommon categories of     1460 distinct categories ->  13 frequent categories = ['05db9164', '68fd1e64', '5a9ed9b0', '8cf07265', 'be589b51', '5bfa8ab5', '87552397', 'f473b8dc', '39af2607', 'ae82ea21', '9a89b36c', '17f69355', '241546e0']
s02 found      541 uncommon categories of      581 distinct categories ->  40 frequent categories = ['38a947a1', '207b2d81', '38d50e09', '1cfdf714', '287130e0', '4f25e98b', '09e68b86', '421b43cd', '58e67aaf', '80e26c9b', '89ddfee8', '08d6d899', '2c16a946', '95e2d337', '04e09220', 'f0cf0024', 'a796837e', '0468d672', 'e112a9de', '6887a43c', '78ccd99e', 'd833535f', '8084ee93', '8947f767', 'b56822db', '3f0d3f28', '90081f33', 'e5fb1af3', 'd7988e72', 'ae46a29d', '6e638bbc', 'b961056b', 'e77e5e6e', '4c2bc594', 'f6f4fe4b', '39dfaa0d

## Replace Uncommon Values with Special Term

In [10]:
# Mask uncommon categories with the 'rarebeef' term in every subset, in the
# order train -> test -> dev, using the same threshold as the frequency scan.
# NOTE(review): presumably test and dev are masked with the categories found on
# train (fit=True above) rather than re-derived per subset — confirm.
for subset_name in ('train', 'test', 'dev'):
    catMaskUncommon(
        workingSet,
        subset=subset_name,
        threshold=180000,
        term='rarebeef',
    )


--------------------------------------------------------------------------------
Starting Categorical Mask Uncommon Terms on train
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Categorical Mask Uncommon Terms in 63.349 Seconds
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Starting Categorical Mask Uncommon Terms on test
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Categorical Mask Uncommon Terms in 11.728 Seconds
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Starting Categorical Mask Uncommon Terms on dev
-

## Explore Replaced Features

In [11]:
# Re-run the frequency scan on the masked training subset as a sanity check:
# the log should report 0 uncommon categories per feature, with 'rarebeef'
# listed among the frequent ones.
# NOTE(review): `fit=False` presumably leaves the previously fitted lists
# untouched — confirm in catFindFrequent.
catFindFrequent(
    workingSet,
    subset='train',
    threshold=180000,
    fit=False,
)


--------------------------------------------------------------------------------
Starting Categorical Find Frequent Terms on train
--------------------------------------------------------------------------------
s01 found        0 uncommon categories of       14 distinct categories ->  14 frequent categories = ['05db9164', '68fd1e64', '5a9ed9b0', 'rarebeef', '8cf07265', 'be589b51', '5bfa8ab5', '87552397', 'f473b8dc', '39af2607', 'ae82ea21', '9a89b36c', '17f69355', '241546e0']
s02 found        0 uncommon categories of       41 distinct categories ->  41 frequent categories = ['rarebeef', '38a947a1', '207b2d81', '38d50e09', '1cfdf714', '287130e0', '4f25e98b', '09e68b86', '421b43cd', '58e67aaf', '80e26c9b', '89ddfee8', '08d6d899', '2c16a946', '95e2d337', '04e09220', 'f0cf0024', 'a796837e', '0468d672', 'e112a9de', '6887a43c', '78ccd99e', 'd833535f', '8084ee93', '8947f767', 'b56822db', '3f0d3f28', '90081f33', 'e5fb1af3', 'd7988e72', 'ae46a29d', '6e638bbc', 'b961056b', 'e77e5e6e', '4c2bc594

## Encode Categorical Features

In [12]:
# Encode the categorical features of the training subset.
# NOTE(review): the log reports "512 numFeatures" while the vectors displayed
# further down have size 401 — confirm which figure reflects the encoder.
catCodeFeatures(
    workingSet,
    'train',
    threshold=180000,
    fit=True,
)


--------------------------------------------------------------------------------
Starting Categorical Feature Encoding on train with 512 numFeatures
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Categorical Feature Encoding in 118.969 Seconds
--------------------------------------------------------------------------------


In [30]:
# Display encoded training rows whose label equals '1.0', without truncating
# the feature vectors.
# NOTE(review): the saved output under this cell has a `ctr` column rather than
# `label`, and the cell's execution count is out of sequence with its
# neighbours — the output looks stale; re-run from a fresh kernel to confirm.
(
    workingSet['df_train_180000_encode']
    .filter(col('label') == '1.0')
    .show(truncate=False)
)

+---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ctr|features                                                                                                                                                                                                            |
+---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1.0|(401,[6,42,55,70,89,99,106,119,129,132,139,160,176,202,212,239,257,290,304,313,316,334,345,350,376,389],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])  |
|1.0|(401,[0,14,55,70,89,102,106,118,129,133,139,160,176,203,212,239,258,267,304,313,316,334,341,350,377,389],[1.0,1.0,1.0,1

In [31]:
# Display encoded training rows whose label equals '0.0', without truncating
# the feature vectors.
# NOTE(review): the saved output under this cell has a `ctr` column rather than
# `label`, and the cell's execution count is out of sequence with its
# neighbours — the output looks stale; re-run from a fresh kernel to confirm.
(
    workingSet['df_train_180000_encode']
    .filter(col('label') == '0.0')
    .show(truncate=False)
)

+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ctr|features                                                                                                                                                                                                           |
+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0|(401,[0,14,55,70,89,100,106,118,129,133,139,160,176,205,212,239,261,267,303,312,316,334,340,350,374,388],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|0.0|(401,[0,29,64,81,89,98,106,121,129,132,151,170,192,202,238,250,257,282,303,312,327,335,342,359,374,388],[1.0,1.0,1.0,1.0,1.

## Pick Categorical Features

In [15]:
# Run feature selection on the encoded training subset, keeping 300 features
# (the picked vectors displayed afterwards have size 300).
# NOTE(review): only 'train' is processed in the visible cells — confirm that
# test/dev receive the same selection elsewhere.
catPickFeatures(
    workingSet,
    'train',
    threshold=180000,
    features=300,
)


--------------------------------------------------------------------------------
Starting Feature Selection on train
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Feature Selection in 298.104 Seconds
--------------------------------------------------------------------------------


In [17]:
# Display the picked training rows (label + selected feature vector) without
# truncating the vector column.
(
    workingSet['df_train_180000_picked']
    .show(truncate=False)
)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                              |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(300,[23,28,31,32,39,95,105,120,137,162,164,167,176,177,195,197,203,211,222,227,248,252,263,272,289],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])           |
|0.0  |(300,[12,87,91,103,105,120,130,140,143,162,163,164,166,167,181,194,211,227,230,253,260,262,263,291,29