# Data Importing

## Import Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

from code.common import *

## Setup Spark Application

In [2]:
# Initialise Spark for this notebook under the application name 'prep'.
# NOTE(review): initSpark and workingSet are not defined in this notebook;
# presumably both arrive via the star import from code.common — confirm.
initSpark(workingSet, application = 'prep')


--------------------------------------------------------------------------------
Starting Spark Initializing
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Spark Initializing in 6.184 Seconds
--------------------------------------------------------------------------------


## Load Criteo Dataset

In [3]:
# Load the dataset from the 'data' location (the log output resolves it to an absolute path).
# NOTE(review): what clean = True does is not visible here — confirm in code.common.
loadData(workingSet, data = 'data', clean = True)


--------------------------------------------------------------------------------
Starting Data Loading at /home/jovyan/work/notebooks/data
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Data Loading in 64.787 Seconds
--------------------------------------------------------------------------------


## Split Data into Training, Test, and Dev Subsets

In [4]:
# Split the loaded data using 80% / 10% / 10% ratios.
# NOTE(review): presumably ordered train, test, dev to match the section title, and
# no seed is passed here, so reproducibility depends on splitData — confirm both.
splitData(workingSet, ratios = [.8,.1,.1])


--------------------------------------------------------------------------------
Starting Data Splitting at [0.8, 0.1, 0.1]
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Data Splitting in 125.804 Seconds
--------------------------------------------------------------------------------


# Feature Engineering : Categorical Variables

In [5]:
from code.engineering import catFillUndefined, catFindFrequent, catMaskUncommon

## Replace Undefined Values with Special Term

In [6]:
# Fill undefined categorical values with the same placeholder term in each subset.
for subset in ('train', 'test', 'dev'):
    catFillUndefined(workingSet, subset = subset, term = 'deadbeef')


--------------------------------------------------------------------------------
Starting Categorical Fill Undefined Terms on train
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Categorical Fill Undefined Terms in 39.873 Seconds
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Starting Categorical Fill Undefined Terms on test
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Categorical Fill Undefined Terms in 7.521 Seconds
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Starting Categorical Fill Undefined Terms on d

## Find Uncommon Values for Each Feature

In [7]:
# Scan the training subset and report, per categorical feature, which categories are frequent.
# NOTE(review): threshold = 360000 looks like a minimum occurrence count — confirm its
# meaning and consider naming it as a configuration constant.
catFindFrequent(workingSet, subset = 'train', threshold = 360000)


--------------------------------------------------------------------------------
Starting Categorical Find Frequent Terms on train
--------------------------------------------------------------------------------
s01 found     1451 uncommon categories of     1460 distinct categories ->   9 frequent categories = ['05db9164', '68fd1e64', '5a9ed9b0', '8cf07265', 'be589b51', '5bfa8ab5', '87552397', 'f473b8dc', '39af2607']
s02 found      562 uncommon categories of      581 distinct categories ->  19 frequent categories = ['38a947a1', '207b2d81', '38d50e09', '1cfdf714', '287130e0', '4f25e98b', '09e68b86', '421b43cd', '58e67aaf', '80e26c9b', '89ddfee8', '08d6d899', '2c16a946', '95e2d337', '04e09220', 'f0cf0024', 'a796837e', '0468d672', 'e112a9de']
s03 found  8382435 uncommon categories of  8382439 distinct categories ->   4 frequent categories = ['deadbeef', 'd032c263', '02cf9876', 'aa8c1539']
s04 found  1885229 uncommon categories of  1885236 distinct categories ->   7 frequent categories = 

## Replace Uncommon Values with Special Term

In [None]:
# Mask uncommon categorical values with the same replacement term in each subset.
for subset in ('train', 'test', 'dev'):
    catMaskUncommon(workingSet, subset = subset, term = 'rarebeef')


--------------------------------------------------------------------------------
Starting Categorical Mask Uncommon Terms on train
--------------------------------------------------------------------------------


In [None]:
# Repeat the frequent-category scan on the training subset with the same threshold.
# NOTE(review): remember = False presumably prevents the result from being stored on
# workingSet, making this a read-only check after masking — confirm.
catFindFrequent(workingSet, subset = 'train', threshold = 360000, remember = False)

##### Look at a few columns to confirm the masking worked

In [None]:
# Category frequencies for s02, most common first.
# NOTE(review): df_rare is not assigned anywhere above this cell in the notebook — confirm where it comes from.
df_rare.groupBy('s02').count().orderBy('count', ascending = False).toPandas()

In [None]:
# Category frequencies for s03, most common first.
# NOTE(review): df_rare is not assigned anywhere above this cell in the notebook — confirm where it comes from.
df_rare.groupBy('s03').count().orderBy('count', ascending = False).toPandas()

In [None]:
# Category frequencies for s23, most common first.
# NOTE(review): df_rare is not assigned anywhere above this cell in the notebook — confirm where it comes from.
df_rare.groupBy('s23').count().orderBy('count', ascending = False).toPandas()

In [None]:
# Total number of rows in df_rare (triggers a full Spark count).
df_rare.count()

## Perform One-Hot Encoding

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorIndexer

In [None]:
# Encode the columns.
# The output names must differ from the input names: Spark refuses to add an output
# column whose name already exists in the frame, so reusing cat_features for
# outputCols can never fit.
# NOTE(review): the encoder expects numeric category indices; if cat_features names
# the raw string columns they need a StringIndexer pass first — confirm.
encoder = OneHotEncoderEstimator(inputCols = cat_features, outputCols = [f'{f}_encoded' for f in cat_features])

In [None]:
# Fit the one-hot encoder on df_rare.
# NOTE(review): the name `model` is rebound by later cells (the StringIndexer pipeline
# fit and the Imputer fit), so what it refers to depends on execution order.
model = encoder.fit(df_rare)

In [None]:
# One StringIndexer per categorical feature, each writing to '<feature>_index'.
# NOTE(review): StringIndexer and Pipeline are not imported in this notebook;
# presumably they come in through the star import from code.common — confirm.
stages   = [StringIndexer(inputCol = f, outputCol= f'{f}_index') for f in cat_features]
pipeline = Pipeline(stages = stages)
# Fit all indexers on df_rare, then append the index columns to produce df_indexed.
model    = pipeline.fit(df_rare)
df_indexed       = model.transform(df_rare)

In [None]:
# Drop every column whose name does not contain the letter 'i'.
# NOTE(review): presumably meant to discard the raw 's..' string columns while keeping
# the '*_index' columns; the substring test also drops any other column without an 'i'
# (e.g. a label column such as 'ctr', if present) — confirm this is intended.
df_indexed = df_indexed.drop(*[col for col in df_indexed.columns if 'i' not in col])

In [None]:
# The encoder consumes numeric category indices, so feed it the StringIndexer
# outputs ('<feature>_index') rather than the raw columns named in cat_features.
indexed_features = [f'{feature}_index'   for feature in cat_features]
encoded_features = [f'{feature}_encoded' for feature in cat_features ]

# Encode the columns
encoder = OneHotEncoderEstimator(inputCols = indexed_features, outputCols = encoded_features)

## Scratch: Replace Uncommon Values Manually

In [None]:
# Replace the values listed in rb_values['s03'] with 'rarebeef' in column s03 only.
# NOTE(review): df_filled and rb_values are not assigned anywhere in the visible
# notebook — confirm which earlier step is expected to create them.
df_rare = df_filled.replace(rb_values['s03'], 'rarebeef', 's03')

In [None]:
# Preview the first 10 rows, transposed so each column of df_rare becomes a row of the display.
pd.DataFrame(df_rare.take(10), columns = df_rare.columns).T

In [None]:
# Row count per distinct s01 value (unsorted).
df_rare.groupBy('s01').count().toPandas()

In [None]:
%%time

# Timed repeat of the s03 replacement, followed by the per-value row counts.
# NOTE(review): this rebinds df_rare with the same expression used in an earlier cell;
# df_filled and rb_values must already exist in the kernel.
df_rare = df_filled.replace(rb_values['s03'], 'rarebeef', 's03')
df_rare.groupBy('s03').count().toPandas()

## Index Categorical Features

In [None]:
%%time

if  not exists('../data/train.parquet.indexed'):

    stages   = [StringIndexer(inputCol = f, outputCol= f'{f}_index').setHandleInvalid('keep') for f in cat_columns]
    pipeline = Pipeline(stages = stages)
    model    = pipeline.fit(df)
    df       = model.transform(df)

    """
    for c in cat_indexes:
        df = df.withColumn(c, col(c).cast('float'))
    
    df = df.select(['ctr'] + num_columns + cat_indexes)
    """
    df.write.parquet('../data/train.parquet.indexed')
    
df = ss.read.parquet('../data/train.parquet.indexed')
tf = df.sample(fraction = 0.01, seed = 2019)

In [None]:
# Keep a second handle on the current frame; xf keeps pointing at this frame
# even after df is rebound by the cast loop in the next cell.
xf = df

In [None]:
# Cast every '<feature>_index' column to float, replacing the column under the same name.
index_columns = [f'{f}_index' for f in cat_features]
for name in index_columns:
    df = df.withColumn(name, col(name).cast('float'))

In [None]:
# describe() only builds a lazy summary frame, so a bare call shows just its schema.
# Collect it to pandas (transposed, as in the other summary cells) so the statistics
# are actually displayed.
# NOTE(review): this runs a full aggregation over every column of xf.
xf.describe().toPandas().T

In [None]:
# Persist the current frame as parquet.
# NOTE(review): no save mode is given; with Spark's default mode the write raises if
# the path already exists, so this cell is not re-runnable — confirm whether an
# exists() guard like the one in the indexed-parquet cell is wanted here.
df.write.parquet('../data/train.parquet')

In [None]:
# Reload the training frame from the parquet written in the previous cell, rebinding df.
df = ss.read.parquet('../data/train.parquet')

In [None]:
# Distinct-value count per categorical feature, with each Spark job timed separately.
# NOTE(review): `ti` is presumably the time module under an alias — confirm.
cat_distinct = {}

for f in cat_features:
    started = ti.time()
    rows    = df.agg(countDistinct(f)).collect()
    cat_distinct[f] = rows[0][0]
    elapsed = ti.time() - started
    print( f'{f} : {cat_distinct[f]:>8} : {elapsed:.3f}' )

total = sum(cat_distinct.values())
print( f'sum : {total:>8}' )

In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding distinct-count cell;
# running it resets cat_distinct and repeats one Spark aggregation per feature.
cat_distinct = {}

for f in cat_features:
    # Time each per-feature countDistinct job individually.
    s  = ti.time()
    cat_distinct[f] = df.agg(countDistinct(f)).collect()[0][0]
    print( f'{f} : {cat_distinct[f]:>8} : {ti.time() - s:.3f}' )

# Total number of distinct categories across all features.
print( f'sum : {sum(cat_distinct.values()):>8}' )

In [None]:
%%time
# Impute missing numeric values, writing results back under the same column names
# (inputCols and outputCols are both num_features).
# NOTE(review): no strategy is passed, so the Imputer's default applies — confirm that is intended.
imputer = Imputer(inputCols = num_features, outputCols = num_features)
model   = imputer.fit(df)
xf      = model.transform(df)
# Summary statistics of the imputed numeric columns, one row per feature.
xf.describe(num_features).toPandas().T

In [None]:
%%time
# Assemble the numeric columns into a single 'num_features' vector column.
num_assembler = VectorAssembler(inputCols = num_features, outputCol = 'num_features')
# cat_assembler is constructed but never applied: its transform below is commented out.
cat_assembler = VectorAssembler(inputCols = cat_features, outputCol = 'cat_features')
# NOTE(review): xf is rebound from itself, so re-running this cell would presumably
# fail because the 'num_features' column already exists — confirm.
xf            = num_assembler.transform(xf)
#xf            = cat_assembler.transform(xf)

# Summarises the individual numeric columns, not the assembled vector column.
xf.describe(num_features).toPandas().T

In [None]:
# Display the current value of frequent_features.
# NOTE(review): this name is not assigned anywhere in the visible notebook; presumably
# it is exported by code.common or set as a side effect of catFindFrequent — confirm.
frequent_features