In [22]:
import pandas  as pd
import numpy   as np
import time    as ti

import seaborn           as sns
import matplotlib.pyplot as plt
import ipywidgets        as widgets

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg         import Vectors
from pyspark.ml.feature        import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler, Imputer
from pyspark.ml                import Pipeline

from pyspark.sql               import SparkSession, SQLContext
from pyspark.sql.types         import StructType, StructField, StringType, FloatType
from pyspark.sql.functions     import countDistinct, col, desc, count

from os.path                   import exists

In [24]:
# Print the built-in documentation of Vectors.sparse (the accepted ways to build a sparse vector).
help(Vectors.sparse)

Help on function sparse in module pyspark.ml.linalg:

sparse(size, *args)
    Create a sparse vector, using either a dictionary, a list of
    (index, value) pairs, or two separate arrays of indices and
    values (sorted by index).
    
    :param size: Size of the vector.
    :param args: Non-zero entries, as a dictionary, list of tuples,
                 or two sorted lists containing indices and values.
    
    >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
    SparseVector(4, {1: 1.0, 3: 5.5})
    >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
    SparseVector(4, {1: 1.0, 3: 5.5})
    >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
    SparseVector(4, {1: 1.0, 3: 5.5})



In [50]:
from pyspark.ml.feature import FeatureHasher

# Get the active SparkSession, or start one with default settings if none is running.
spark = SparkSession.builder.getOrCreate()

In [80]:
# Toy data: four rows of purely categorical (string) features.
hash_columns = ["s01", "s02", "s03", "s04"]
toy_rows     = [
    ("a1", "b1", "c1", "d4"),
    ("a1", "b2", "c2", "d2"),
    ("a3", "b2", "c3", "d3"),
    ("a4", "b4", "c3", "d4"),
]
dataset = spark.createDataFrame(toy_rows, hash_columns)

# Hash the four columns into one sparse vector with 16 slots.
hasher = FeatureHasher(numFeatures = 16, inputCols = hash_columns, outputCol = "features")

# Print the Spark view first; the pandas view is the cell's displayed result.
featurized = hasher.transform(dataset)
featurized.show(truncate = False)
featurized.toPandas()

+---+---+---+---+---------------------------------+
|s01|s02|s03|s04|features                         |
+---+---+---+---+---------------------------------+
|a1 |b1 |c1 |d4 |(16,[5,6,9,13],[1.0,1.0,1.0,1.0])|
|a1 |b2 |c2 |d2 |(16,[3,8,9,10],[1.0,1.0,1.0,1.0])|
|a3 |b2 |c3 |d3 |(16,[1,3,4,13],[1.0,1.0,1.0,1.0])|
|a4 |b4 |c3 |d4 |(16,[0,1,5,15],[1.0,1.0,1.0,1.0])|
+---+---+---+---+---------------------------------+



Unnamed: 0,s01,s02,s03,s04,features
0,a1,b1,c1,d4,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
1,a1,b2,c2,d2,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,a3,b2,c3,d3,"(0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,a4,b4,c3,d4,"(1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [82]:
# Same experiment with only 2 hash slots: several features land in the same slot and
# their values add up (entries of 2.0, 3.0 and 4.0 in the output).
hash_columns = ["s01", "s02", "s03", "s04"]
toy_rows     = [
    ("a1", "a1", "c1", "d4"),
    ("a1", "a2", "c2", "d2"),
    ("a3", "a2", "c3", "d3"),
    ("a4", "a4", "c3", "d4"),
]
dataset = spark.createDataFrame(toy_rows, hash_columns)

hasher = FeatureHasher(numFeatures = 2, inputCols = hash_columns, outputCol = "features")

# Print the Spark view first; the pandas view is the cell's displayed result.
featurized = hasher.transform(dataset)
featurized.show(truncate = False)
featurized.toPandas()

+---+---+---+---+-------------------+
|s01|s02|s03|s04|features           |
+---+---+---+---+-------------------+
|a1 |a1 |c1 |d4 |(2,[0,1],[2.0,2.0])|
|a1 |a2 |c2 |d2 |(2,[0,1],[2.0,2.0])|
|a3 |a2 |c3 |d3 |(2,[0,1],[1.0,3.0])|
|a4 |a4 |c3 |d4 |(2,[1],[4.0])      |
+---+---+---+---+-------------------+



Unnamed: 0,s01,s02,s03,s04,features
0,a1,a1,c1,d4,"(2.0, 2.0)"
1,a1,a2,c2,d2,"(2.0, 2.0)"
2,a3,a2,c3,d3,"(1.0, 3.0)"
3,a4,a4,c3,d4,"(0.0, 4.0)"


In [None]:
def initSpark(workingSet, driverMemory = '240G'):
    """
    Create (or reuse) a SparkSession and store the Spark entry points in workingSet.

    Parameters
    ----------
    workingSet : dict
        Shared notebook state. Three keys are written:
        'ss' (SparkSession), 'sc' (its SparkContext) and 'sq' (a SQLContext on that context).
    driverMemory : str, optional
        Value passed as 'spark.driver.memory'. Defaults to '240G', the value that
        used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    None. The function only mutates workingSet.
    """

    # NOTE(review): an earlier cell already calls SparkSession.builder.getOrCreate();
    # if that session is still active, getOrCreate() returns it and the driver-memory
    # setting below may not take effect — confirm.
    workingSet['ss'] = SparkSession.builder \
                                   .config('spark.driver.memory', driverMemory) \
                                   .getOrCreate()
    workingSet['sc'] = workingSet['ss'].sparkContext
    workingSet['sq'] = SQLContext(workingSet['sc'])

In [None]:
def loadData(workingSet):
    """
    Load the full data set into workingSet, converting the raw text file to Parquet on first use.

    If '../data/criteo.parquet.full' does not exist, '../data/train.txt' is read as a
    tab-delimited file with an explicit schema (label 'ctr', numeric 'i01'..'i13',
    string 's01'..'s26') and written out as Parquet. The Parquet copy is then read back.

    Parameters
    ----------
    workingSet : dict
        Shared notebook state. Reads workingSet['ss'] (SparkSession) and writes:
        'df_full'     - the complete DataFrame
        'df_toy'      - a seeded 1% sample of it
        'num_columns' - names of the numeric columns (prefix 'i')
        'cat_columns' - names of the categorical columns (prefix 's')
        'all_columns' - every column except the label 'ctr'

    Returns
    -------
    None. The elapsed time is printed.
    """

    start = ti.time()

    rawPath     = '../data/train.txt'
    parquetPath = '../data/criteo.parquet.full'

    # One handle for both reads, so the function depends only on workingSet['ss'].
    session     = workingSet['ss']

    if  not exists(parquetPath):

        ds = StructType([StructField('ctr'     ,  FloatType(), True)                      ] + \
                        [StructField(f'i{f:02}',  FloatType(), True) for f in range(1, 14)] + \
                        [StructField(f's{f:02}', StringType(), True) for f in range(1, 27)])

        # NOTE(review): header = 'true' makes Spark consume the first line of train.txt
        # as a header row — confirm the file really starts with one.
        raw = session.read.format('csv') \
                     .options(header = 'true', delimiter = '\t') \
                     .schema(ds) \
                     .load(rawPath)

        raw.write.parquet(parquetPath)

    df = session.read.parquet(parquetPath)

    workingSet['df_full'    ] = df
    workingSet['df_toy'     ] = df.sample(fraction = 0.01, seed = 2019)

    # Select by name prefix rather than by substring, so a column is classified only
    # by its leading letter and never by a letter that happens to occur inside its name.
    workingSet['num_columns'] = [c for c in df.columns if c.startswith('i')]
    workingSet['cat_columns'] = [c for c in df.columns if c.startswith('s')]
    workingSet['all_columns'] = [c for c in df.columns if c != 'ctr']

    print(f'\nFinished DataFrame Loading in {ti.time()-start:.3f} Seconds\n')

In [None]:
def splitData(workingSet):
    """
    Split the full data set into train / test / dev Parquet files and load them into workingSet.

    If any of the three Parquet directories is missing, workingSet['df_full'] is split
    80/10/10 with a fixed seed and all three are (re)written. The three files are then
    read back from disk.

    Parameters
    ----------
    workingSet : dict
        Shared notebook state. Reads 'df_full' (only when a split must be created) and
        'ss' (SparkSession); writes 'df_train', 'df_test' and 'df_dev'.

    Returns
    -------
    None. The elapsed time is printed.
    """

    start = ti.time()

    # Order matters: it matches the [0.8, 0.1, 0.1] weights passed to randomSplit.
    splitPaths = (('train', '../data/criteo.parquet.train'),
                  ('test' , '../data/criteo.parquet.test' ),
                  ('dev'  , '../data/criteo.parquet.dev'  ))

    if  not all(exists(path) for _, path in splitPaths):

        parts = workingSet['df_full'].randomSplit([0.8, 0.1, 0.1], seed = 2019)

        # Overwrite so that a partially present set of splits is replaced as a whole
        # instead of failing on the directories that already exist.
        for (_, path), part in zip(splitPaths, parts):
            part.write.mode('overwrite').parquet(path)

    # Keys are 'df_train', 'df_test', 'df_dev' (the old 'df_test ' key carried a stray trailing space).
    for name, path in splitPaths:
        workingSet[f'df_{name}'] = workingSet['ss'].read.parquet(path)

    print(f'\nFinished DataFrame Splitting in {ti.time()-start:.3f} Seconds\n')

In [None]:
%%time
# Share of rows with ctr == 1.
# The denominator is computed from df itself so the cell does not rely on an `rc`
# variable that no visible cell assigns.
df.filter(col('ctr') == 1).count() / df.count()

In [None]:
%%time
# Fraction of non-null values per column: count(c) skips NULLs, so dividing by the
# total number of rows gives each column's fill rate.
# The total is computed here because no visible cell assigns `train_total`.
total_rows = df.count()
df.agg(*[(count(c) / total_rows).alias(c) for c in df.columns]).toPandas().T

In [None]:
%%time
# Summary statistics for the label and the numeric columns only.
# The schema built in loadData uses lower-case names ('ctr', 'i01'..'i13'), so the
# names are matched case-insensitively; testing for upper-case 'I' / 'CTR' selects nothing.
label_and_numeric = [c for c in df.columns if c.lower() == 'ctr' or c.lower().startswith('i')]
df.select(*label_and_numeric).describe().toPandas().T

In [None]:
# Number of distinct values in every column, computed in a single aggregation.
distinct_exprs = [countDistinct(col(name)).alias(name) for name in df.columns]
df.agg(*distinct_exprs).show()

In [None]:
%%time
# Row counts per label value: ctr == 1 first, then ctr == 0.
rc_1 = df.filter(df.ctr == 1).count()
rc_0 = df.filter(df.ctr == 0).count()

In [None]:
# Categorical columns only, split by label value.
df_1 = df.filter(df.ctr == 1).select(*cat_features)
df_0 = df.filter(df.ctr == 0).select(*cat_features)

In [None]:
# Row count per distinct 's17' value for each label subset; both results are cached.
xf_1 = df_1.groupBy('s17').count().cache()
xf_0 = df_0.groupBy('s17').count().cache()

In [None]:
%%time
# Bring all rows of xf_1 to the driver; the surrounding %%time reports how long that takes.
xf_1.collect()

In [None]:
%%time
# Bring all rows of xf_0 to the driver; the surrounding %%time reports how long that takes.
xf_0.collect()

In [None]:
# Ratio of ctr == 1 count to ctr == 0 count for each 's17' value.
# The two grouped frames are joined on 's17' so both counts in a ratio belong to the
# same category; pairing two collect() results by position does not guarantee that,
# because grouped output has no defined row order and the category sets may differ.
# Categories present in only one frame (and NULL keys) are left out by the inner join.
paired = xf_1.withColumnRenamed('count', 'count_1') \
             .join(xf_0.withColumnRenamed('count', 'count_0'), on = 's17', how = 'inner')

for row in paired.collect():
    print(row['count_1']/row['count_0'])

In [None]:
# Convert the categorical columns of df into a pandas DataFrame on the driver.
# NOTE(review): toPandas() materialises every row locally — confirm df is small enough at this point.
pf = df.select(cat_features).toPandas()

In [None]:
# Rebind pf to the Spark DataFrame stored at '../data/train.parquet'.
# NOTE(review): `ss` is not assigned at notebook level in the visible cells
# (initSpark stores the session in workingSet['ss']) — confirm where it comes from.
pf = ss.read.parquet('../data/train.parquet')

In [None]:
%%time
# Same per-label row counts as before, taken from df_columnar.
rc_1 = df_columnar.filter(df_columnar.ctr == 1).count()
rc_0 = df_columnar.filter(df_columnar.ctr == 0).count()

In [None]:
%%time
# Per-'s17' counts for both label subsets, this time without caching;
# only the ctr == 1 result is collected (and therefore timed).
xf_1 = df_1.groupBy('s17').count()
xf_0 = df_0.groupBy('s17').count()
xf_1.collect()

In [None]:
# Distinct-value count of every categorical feature in pf, one aggregation per
# feature so that each one can be timed separately.
cat_distinct = {}

for f in cat_features:
    s  = ti.time()
    distinct_row    = pf.agg(countDistinct(f)).collect()[0]
    cat_distinct[f] = distinct_row[0]
    print( f'{f} : {cat_distinct[f]:>8} : {ti.time() - s:.3f}' )

# Total number of distinct categorical values across all features.
print( f'sum : {sum(cat_distinct.values()):>8}' )

In [None]:
%%time
# describe() statistics of the numeric features for rows with ctr == 1, as a pandas frame.
positives     = pf.filter(pf.ctr == 1)
num_summary_1 = positives.describe( num_features ).toPandas()

In [None]:
# Number of rows in pf with ctr == 1.
pf.filter(pf.ctr == 1).count()

In [None]:
%%time
# describe() statistics of the numeric features for rows with ctr == 0, as a pandas frame.
negatives     = pf.filter(pf.ctr == 0)
num_summary_0 = negatives.describe( num_features ).toPandas()

In [None]:
# Display num_summary transposed.
# NOTE(review): only num_summary_1 and num_summary_0 are assigned in the visible cells —
# confirm where num_summary is defined.
num_summary.T

In [None]:
# Display the ctr == 1 summary transposed.
num_summary_1.T

In [None]:
# Display the ctr == 0 summary transposed.
num_summary_0.T

In [None]:
# One plot per numeric feature: the number of rows for each distinct value.
# The groups are ordered by the feature value (grouped output otherwise has no defined
# order, which makes the line shape arbitrary) and each figure is titled with its feature.
for f in num_features:
    pf.groupBy(f).count().orderBy(f).select('count').toPandas().plot(title = f)

In [None]:
# Fit a StandardScaler (centering and scaling both enabled) on `features`.
# NOTE(review): `features` is only assigned in the following cell, where it is an RDD built
# through pyspark.mllib, while StandardScaler is imported from pyspark.ml.feature at the top —
# this call looks like it targets the mllib scaler; confirm which API and cell order are intended.
scaler = StandardScaler( withMean = True, withStd = True).fit( features )

In [None]:
from pyspark.mllib.util import MLUtils
# Load the sample LIBSVM file as an RDD and split it into a label RDD and a feature RDD.
# NOTE(review): `sc` is not assigned at notebook level in the visible cells — confirm its origin.
data     = MLUtils.loadLibSVMFile(sc, "/usr/local/spark/data/mllib/sample_libsvm_data.txt")
label    = data.map(lambda point: point.label)
features = data.map(lambda point: point.features)


In [None]:
# Replace missing values in every categorical column with the placeholder '00000000'.
xf = df.na.fill(dict.fromkeys(cat_features, '00000000'))

In [None]:
# Preview the first 10 rows of xf (transposed so the columns read top to bottom).
# collect() would ship every row to the driver and dump it into the cell output;
# a bounded take() is enough for inspection.
pd.DataFrame(xf.take(10), columns = xf.columns).T

In [None]:
# Standardise the numeric column 'i01' (zero mean, unit standard deviation) into 'n01'.
# StandardScaler takes single column *names* (strings, not lists) and operates on a
# vector column, so 'i01' is first wrapped into a one-element vector. Rows where 'i01'
# is null are dropped before assembling so they cannot break the assembler or the fit.
vectorized = VectorAssembler(inputCols = ['i01'], outputCol = 'i01_vec') \
                 .transform(pf.dropna(subset = ['i01']))

scaler = StandardScaler(inputCol = 'i01_vec', outputCol = 'n01', withStd = True, withMean = True)
model  = scaler.fit(vectorized)

In [None]:
# Read the sample LIBSVM file as a DataFrame.
xf = ss.read.load("/usr/local/spark/data/mllib/sample_libsvm_data.txt", format = "libsvm")

In [None]:
# Show the first row of xf.
xf.head()

In [17]:
!cd ../data;tar xzf criteo.kaggle2014.svm.tar.gz

In [21]:
# Report free space on the filesystem holding the current directory (human-readable sizes).
!df -h .

Filesystem      Size  Used Avail Use% Mounted on
/dev/xvda2       99G   72G   23G  77% /home/jovyan/work


In [20]:
# Long listing of ../data sorted by size, smallest first, with human-readable sizes.
!ls -lhSr ../data

total 39G
-rw-r--r-- 1 jovyan users 9.2K Nov 20 00:14 ToyData.pkl
drwxr-xr-x 2 jovyan users  20K Nov 20 02:30 train_w261.parquet
drwxr-xr-x 2 jovyan users  20K Nov 20 02:32 test_w261.parquet
drwxr-xr-x 2 jovyan users  20K Nov 20 02:31 dev_w261.parquet
drwxr-xr-x 2 jovyan users  20K Nov 25 23:57 criteo.parquet.train
drwxr-xr-x 2 jovyan users  20K Nov 25 23:57 criteo.parquet.test
drwxr-xr-x 2 jovyan users  20K Nov 25 23:37 criteo.parquet.full
drwxr-xr-x 2 jovyan users  24K Nov 25 23:58 criteo.parquet.dev
-r--r--r-- 1 jovyan users 3.3G Sep 26  2017 criteo.kaggle2014.test.svm
-rwxrwxrwx 1 root   root   11G May 12  2014 train.txt
-r--r--r-- 1 jovyan users  25G Sep 26  2017 criteo.kaggle2014.train.svm


## Perform One-Hot Encoding

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorIndexer

In [None]:
# Encode the columns
# The encoder reads the '<feature>_index' columns (the names the StringIndexer stages in
# this notebook produce) and writes to separate '<feature>_encoded' columns; using the
# same names for input and output would collide with the existing columns.
encoder = OneHotEncoderEstimator(inputCols  = [f'{f}_index'   for f in cat_features],
                                 outputCols = [f'{f}_encoded' for f in cat_features])

In [None]:
# Fit the one-hot encoder on df_rare.
# NOTE(review): df_rare is first assigned further down the notebook — confirm the intended cell order.
model = encoder.fit(df_rare)

In [None]:
# Index every categorical column of df_rare: one StringIndexer per feature, each writing
# to '<feature>_index', run together as a single pipeline.
stages = [StringIndexer(inputCol = name, outputCol = f'{name}_index')
          for name in cat_features]

pipeline   = Pipeline(stages = stages)
model      = pipeline.fit(df_rare)
df_indexed = model.transform(df_rare)

In [None]:
# Keep only the columns whose name contains the letter 'i' (the 'iNN' numeric columns and
# the '<feature>_index' columns); every other column is dropped.
# NOTE(review): a column named 'ctr' contains no 'i' and is dropped as well — confirm this is intended.
columns_to_drop = [name for name in df_indexed.columns if 'i' not in name]
df_indexed      = df_indexed.drop(*columns_to_drop)

In [None]:
# Input names are the '<feature>_index' columns written by the StringIndexer stages;
# output names get the '_encoded' suffix.
indexed_features = [f'{feature}_index'   for feature in cat_features ]
encoded_features = [f'{feature}_encoded' for feature in cat_features ]

# Encode the columns
# The encoder works on the numeric category indices, not on the raw string columns.
encoder = OneHotEncoderEstimator(inputCols= indexed_features , outputCols=encoded_features)

## -----------------

In [None]:
# In column 's03', replace every value listed in rb_values['s03'] with the token 'rarebeef'.
df_rare = df_filled.replace(to_replace = rb_values['s03'], value = 'rarebeef', subset = 's03')

In [None]:
# Preview the first 10 rows of df_rare, transposed so the columns read top to bottom.
preview_rows = df_rare.take(10)
pd.DataFrame(preview_rows, columns = df_rare.columns).T

In [None]:
# Row count per distinct 's01' value in df_rare, shown as a pandas frame.
df_rare.groupBy('s01').count().toPandas()

In [None]:
%%time

# Redo the 's03' replacement and show the resulting row count per 's03' value.
df_rare = df_filled.replace(to_replace = rb_values['s03'], value = 'rarebeef', subset = 's03')

s03_counts = df_rare.groupBy('s03').count()
s03_counts.toPandas()

## Index Categorical Features

In [None]:
%%time

indexed_path = '../data/train.parquet.indexed'

# Build the indexed data set only when no copy exists on disk yet.
if  not exists(indexed_path):

    # One StringIndexer per categorical column, writing to '<column>_index',
    # with handleInvalid set to 'keep'.
    stages   = [StringIndexer(inputCol = f, outputCol = f'{f}_index', handleInvalid = 'keep')
                for f in cat_columns]
    pipeline = Pipeline(stages = stages)
    model    = pipeline.fit(df)
    df       = model.transform(df)

    # Disabled step, kept for reference: cast each column in cat_indexes to float and
    # keep only ['ctr'] + num_columns + cat_indexes before writing.
    df.write.parquet(indexed_path)

# Always continue from the on-disk copy, and draw a seeded 1% sample from it.
df = ss.read.parquet(indexed_path)
tf = df.sample(fraction = 0.01, seed = 2019)

In [None]:
# Bind xf to the same DataFrame object as df (no copy is made).
xf = df

In [None]:
# Cast every '<feature>_index' column to float, rebinding df after each column.
index_columns = [f'{f}_index' for f in cat_features]

for c in index_columns:
    as_float = col(c).cast('float')
    df       = df.withColumn(c, as_float)

In [None]:
# Build the describe() summary of xf; as the cell's last expression its repr is displayed.
xf.describe()

In [None]:
# Persist the current df as Parquet at '../data/train.parquet'.
df.write.parquet('../data/train.parquet')

In [None]:
# Rebind df to the DataFrame read back from '../data/train.parquet'.
# NOTE(review): `ss` is not assigned at notebook level in the visible cells — confirm its origin.
df = ss.read.parquet('../data/train.parquet')

In [None]:
# Distinct-value count of every categorical feature in df, one aggregation per
# feature so that each one can be timed separately.
cat_distinct = {}

for f in cat_features:
    s  = ti.time()
    distinct_row    = df.agg(countDistinct(f)).collect()[0]
    cat_distinct[f] = distinct_row[0]
    print( f'{f} : {cat_distinct[f]:>8} : {ti.time() - s:.3f}' )

# Total number of distinct categorical values across all features.
print( f'sum : {sum(cat_distinct.values()):>8}' )

In [None]:
# Repeat of the per-feature distinct-value count on df (same computation as the cell before).
cat_distinct = {}

for f in cat_features:
    s  = ti.time()
    distinct_row    = df.agg(countDistinct(f)).collect()[0]
    cat_distinct[f] = distinct_row[0]
    print( f'{f} : {cat_distinct[f]:>8} : {ti.time() - s:.3f}' )

# Total number of distinct categorical values across all features.
print( f'sum : {sum(cat_distinct.values()):>8}' )

In [None]:
%%time
# Impute missing values in the numeric features; the output columns reuse the input names.
imputer = Imputer().setInputCols(num_features).setOutputCols(num_features)
model   = imputer.fit(df)
xf      = model.transform(df)

# Summary statistics of the imputed numeric columns, one row per feature.
imputed_summary = xf.describe(num_features)
imputed_summary.toPandas().T

In [None]:
%%time
# Assemblers that pack the numeric / categorical columns into one vector column each.
num_assembler = VectorAssembler().setInputCols(num_features).setOutputCol('num_features')
cat_assembler = VectorAssembler().setInputCols(cat_features).setOutputCol('cat_features')

# Only the numeric assembler is applied; the categorical one stays disabled.
xf = num_assembler.transform(xf)
#xf = cat_assembler.transform(xf)

# Summary statistics of the numeric columns after assembling, one row per feature.
assembled_summary = xf.describe(num_features)
assembled_summary.toPandas().T

In [None]:
# Display the current value of frequent_features.
# NOTE(review): frequent_features is not assigned in any visible cell — confirm where it is defined.
frequent_features