# Create Toy Data Set

In [1]:
import pandas  as pd
import numpy   as np
import seaborn as sns
import time    as ti
import itertools
import matplotlib.pyplot as plt

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg         import Vectors
from pyspark.ml.feature        import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml                import Pipeline

from pyspark.sql               import SparkSession, SQLContext
from pyspark.sql.types         import StructType, StructField, BooleanType, IntegerType, StringType, DoubleType, BinaryType
from pyspark.sql.functions     import countDistinct, col, desc, log

In [2]:
# Build the Spark session, or reuse one that is already active.
# NOTE(review): if a session/JVM already exists, getOrCreate() returns it and
# the memory settings below may not be applied -- confirm on a fresh kernel.
ss = (
    SparkSession.builder
    .config('spark.executor.memory', '4G')
    .config('spark.driver.memory', '40G')
    .config('spark.driver.maxResultSize', '10G')
    .getOrCreate()
)

# Handles derived from the session: the low-level context and a SQLContext.
sc = ss.sparkContext
sq = SQLContext(sc)

In [4]:
# Load the train and dev splits from parquet (file names suggest these are the
# already normalised / encoded / packed Criteo frames).
pf_train = ss.read.parquet(
    '../data/criteo.parquet.df.train.normed.filled.masked-60000.encode.picked-1000.packed'
)
pf_dev = ss.read.parquet(
    '../data/criteo.parquet.df.dev.normed.filled.masked-60000.encode.picked-1000.packed'
)

In [5]:
# Draw a small random sample of the training frame and mark it for caching.
# `fraction` is the expected share of rows kept, so the resulting row count is
# approximate; the fixed seed keeps the draw repeatable.
toy_train = (
    pf_train
    .sample(fraction=0.00013, seed=2019)
    .cache()
)

In [6]:
# Draw a small random sample of the dev frame and mark it for caching.
# `fraction` is the expected share of rows kept, so the resulting row count is
# approximate; the fixed seed keeps the draw repeatable.
toy_dev = (
    pf_dev
    .sample(fraction=0.00013, seed=2019)
    .cache()
)

In [7]:
# Row count of the toy training set. count() is a Spark action, so this is
# also the point where the lazily defined sample is actually computed.
toy_train.count()

4657

In [8]:
# Row count of the toy dev set. count() is a Spark action, so this is
# also the point where the lazily defined sample is actually computed.
toy_dev.count()

570

In [9]:
# Peek at a single training row to sanity-check its columns and values.
toy_train.take(1)

[Row(label=0.0, features=SparseVector(414, {0: 0.0529, 1: -0.0102, 2: 0.0176, 4: 0.0151, 5: 0.0193, 6: 0.0227, 8: -0.1134, 9: 0.0909, 13: 1.0, 27: 1.0, 68: 1.0, 83: 1.0, 103: 1.0, 112: 1.0, 119: 1.0, 131: 1.0, 142: 1.0, 145: 1.0, 152: 1.0, 173: 1.0, 189: 1.0, 215: 1.0, 225: 1.0, 252: 1.0, 270: 1.0, 280: 1.0, 316: 1.0, 325: 1.0, 329: 1.0, 347: 1.0, 353: 1.0, 363: 1.0, 387: 1.0, 401: 1.0}), weight=0.2561962362853416)]

In [10]:
# Peek at a single dev row to sanity-check its columns and values.
toy_dev.take(1)

[Row(label=0.0, features=SparseVector(414, {0: 0.0529, 1: -0.0077, 2: 0.0227, 3: -0.1138, 4: 2.8185, 5: 0.0451, 7: 0.09, 8: -0.1588, 9: 0.0909, 10: -0.1924, 12: -0.0617, 13: 1.0, 33: 1.0, 68: 1.0, 83: 1.0, 102: 1.0, 116: 1.0, 119: 1.0, 132: 1.0, 143: 1.0, 145: 1.0, 157: 1.0, 173: 1.0, 193: 1.0, 216: 1.0, 225: 1.0, 252: 1.0, 271: 1.0, 299: 1.0, 318: 1.0, 326: 1.0, 329: 1.0, 347: 1.0, 353: 1.0, 363: 1.0, 388: 1.0, 402: 1.0}), weight=0.2561962362853416)]

In [12]:
# Write out the toy sets.
# mode('overwrite') keeps this cell re-runnable: with the default save mode
# the write raises an error when the target path already exists, which breaks
# Restart & Run All after the first successful run.
toy_train.write.mode('overwrite').parquet('../data/toy_train.parquet')
toy_dev.write.mode('overwrite').parquet('../data/toy_dev.parquet')