# Data Importing

## Import Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

from code.common import workingSet

## Setup Spark Application

In [4]:
from code.common import initSpark

# Initialise Spark for this notebook under the application name 'prep'.
# workingSet is the shared object imported in the first cell; what initSpark
# stores in it is defined in code.common (not shown here).
initSpark(workingSet, application = 'prep')


--------------------------------------------------------------------------------
Starting Spark Initializing
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Spark Initializing in 3.009 Seconds
--------------------------------------------------------------------------------


# Load Criteo Dataset

In [6]:
from code.common import loadData

# Load the dataset through the project helper, handing it the shared workingSet.
# NOTE(review): presumably the loaded DataFrame is kept inside workingSet — confirm in code.common.
loadData(workingSet)


--------------------------------------------------------------------------------
Starting Data Loading
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Data Loading in 5.439 Seconds
--------------------------------------------------------------------------------


## Split Data into Training, Test, and Dev Subsets

In [8]:
from code.common import splitData

# Split the loaded data through the project helper.
# NOTE(review): workingSet['df_train'], read in a later cell, is presumably populated here — confirm in code.common.
splitData(workingSet)


--------------------------------------------------------------------------------
Starting Data Splitting
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
Finished Data Splitting in 3.513 Seconds
--------------------------------------------------------------------------------


# Feature Engineering: Categorical Variables

In [11]:
import time as ti
from os.path import exists

import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, StringIndexer, VectorAssembler
from pyspark.sql.functions import countDistinct, col, when, isnan, count, lit

In [9]:
# Short aliases for the objects this section works on.
df           = workingSet['df_train'    ]   # Spark DataFrame stored under 'df_train' (presumably the training split)
cat_features = workingSet['cat_features']   # iterable of categorical column names, used as column names below

In [None]:
%%time

df.agg(*(countDistinct(col(c)).alias(c) for c in df.columns)).toPandas().T

In [None]:
%%time

df_filled = df.fillna('deadbeef', cat_features).cache()

In [None]:
%%time

rb_thresh = 360000
rb_values = {}

for feature in cat_features :
    df_counts          = df_filled.groupBy(feature).count()
    rb_values[feature] = df_counts.filter(df_counts['count'] < rb_thresh).select(feature).rdd.flatMap(list).collect()

    print(feature, f'found {len(rb_values[feature]):>8} rare categories')

In [None]:
# Start from the null-filled frame and, one feature at a time, collapse every
# rare category collected in rb_values into the single bucket 'rarebeef'.
df_rare = df_filled

for feature, rare_categories in rb_values.items():
    df_rare = df_rare.replace(to_replace = rare_categories, value = 'rarebeef', subset = feature)

In [None]:
# Replace only the rare categories of 's03', starting again from df_filled.
# NOTE(review): this rebinds df_rare and so discards the all-feature replacement
# built by the preceding loop — confirm that is intended.
df_rare = df_filled.replace(rb_values['s03'], 'rarebeef', 's03')

In [None]:
# Peek at the first 10 rows of df_rare, transposed so each column becomes a row.
pd.DataFrame(df_rare.take(10), columns = df_rare.columns).T

In [None]:
# Category frequencies of 's01' in df_rare, pulled into pandas for display.
# NOTE(review): the assignment to df_rare just above only replaces 's03', so
# 's01' still carries its original categories here.
df_rare.groupBy('s01').count().toPandas()

In [None]:
%%time

df_rare = df_filled.replace(rb_values['s03'], 'rarebeef', 's03')
df_rare.groupBy('s03').count().toPandas()

In [None]:
# Display the current df_rare; as a bare expression this shows the DataFrame's repr.
df_rare

## Index Categorical Features

In [None]:
%%time

if  not exists('../data/train.parquet.indexed'):

    stages   = [StringIndexer(inputCol = f, outputCol= f'{f}_index').setHandleInvalid('keep') for f in cat_columns]
    pipeline = Pipeline(stages = stages)
    model    = pipeline.fit(df)
    df       = model.transform(df)

    """
    for c in cat_indexes:
        df = df.withColumn(c, col(c).cast('float'))
    
    df = df.select(['ctr'] + num_columns + cat_indexes)
    """
    df.write.parquet('../data/train.parquet.indexed')
    
df = ss.read.parquet('../data/train.parquet.indexed')
tf = df.sample(fraction = 0.01, seed = 2019)

In [None]:
# Keep a second reference to the indexed frame before df is rebound by the cast loop below.
xf = df

In [None]:
# Cast each '<feature>_index' column to float, rebinding df after every column.
for c in map('{}_index'.format, cat_features):
    df = df.withColumn(c, df[c].cast('float'))

In [None]:
# Summary statistics of xf, i.e. the frame as it was before the float cast.
# NOTE(review): describe() returns another DataFrame; unless eager evaluation is
# enabled this cell only displays its repr — other cells here append .toPandas().T.
xf.describe()

In [None]:
# Persist the float-cast frame. Guarded with the same exists() check as the
# indexed parquet above: the default save mode raises when the target already
# exists, which would break every re-run of the notebook.
if not exists('../data/train.parquet'):
    df.write.parquet('../data/train.parquet')

In [None]:
# Reload the training frame from the parquet written above.
# NOTE(review): `ss` is not defined in the visible cells — presumably the
# SparkSession created by initSpark; confirm.
df = ss.read.parquet('../data/train.parquet')

In [None]:
# Distinct-value count of each categorical feature (one Spark job per feature),
# printed together with the seconds that count took, followed by the total.
cat_distinct = {}

for f in cat_features:
    s               = ti.time()
    row             = df.agg(countDistinct(f)).collect()[0]
    cat_distinct[f] = row[0]
    print( f'{f} : {cat_distinct[f]:>8} : {ti.time() - s:.3f}' )

print( f'sum : {sum(cat_distinct.values()):>8}' )

In [None]:
# NOTE(review): this cell repeats the per-feature distinct-count computation of
# the preceding cell on the same df — consider dropping one of the two.
cat_distinct = {}

for f in cat_features:
    s  = ti.time()
    # df.groupBy().agg(...) is the long form of df.agg(...): a global aggregate.
    cat_distinct[f] = df.groupBy().agg(countDistinct(f)).collect()[0][0]
    print( f'{f} : {cat_distinct[f]:>8} : {ti.time() - s:.3f}' )

print( f'sum : {sum(cat_distinct.values()):>8}' )

In [None]:
%%time
imputer = Imputer(inputCols = num_features, outputCols = num_features)
model   = imputer.fit(df)
xf      = model.transform(df)
xf.describe(num_features).toPandas().T

In [None]:
%%time
num_assembler = VectorAssembler(inputCols = num_features, outputCol = 'num_features')
cat_assembler = VectorAssembler(inputCols = cat_features, outputCol = 'cat_features')
xf            = num_assembler.transform(xf)
#xf            = cat_assembler.transform(xf)

xf.describe(num_features).toPandas().T