In [68]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [26]:
# Optional: prints the versions of the imported libraries so the notebook
# records what it was run with. Needs the `watermark` package
# (pip install watermark); comment this cell out if it is not installed.
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
pyspark 2.4.1



In [27]:
# Local-mode settings. Comment these out to run on a cluster, and adjust the
# driver memory to what your laptop can spare.
pyspark.sql.SparkSession.builder.config('spark.driver.memory', '10g')
# The key used to be spelled 'spark.sql.shuffle.paritions', which is not a
# setting Spark knows about, so the intended partition count never took effect.
pyspark.sql.SparkSession.builder.config('spark.sql.shuffle.partitions', 5)

<pyspark.sql.session.SparkSession.Builder at 0x11060c358>

In [28]:
# Create the Spark session used by every cell below (or reuse an existing one).
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Checkpoint 1 

Read the CSV file, drop `attributed_time` (it wasn't used in the MVP),
and downsample the 0 class by 50% because I'm still working on my laptop.

In [29]:
# Flip the condition to True to rebuild checkpoint 1 from the raw CSV;
# otherwise the previously written parquet file is loaded.
if False:
    df = spark.read.csv('../data/train.csv',
                        header=True, inferSchema=True)

    df = df.drop('attributed_time')
    # Keep every attributed click and half of the non-attributed ones.
    # The seed is fixed so that rebuilding the checkpoint from the same input
    # draws the same sample instead of a different one on every run.
    df = df.sampleBy('is_attributed', fractions={0: .5, 1: 1.}, seed=42)

    df.write.parquet('../data/checkpoint1.parquet', mode='overwrite')
else:
    df = spark.read.parquet('../data/checkpoint1.parquet')

In [30]:
# Column names and Spark types of the checkpoint 1 data.
df.dtypes

[('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('is_attributed', 'int')]

# Checkpoint 2: Time bins

Putting the times into bins with KMeans is also a demo of how to do
machine learning in Spark

In [31]:

# Flip the condition to True to refit the time-bin model and rebuild
# checkpoint 2; otherwise the previously written parquet file is loaded.
if False:
    # Minute of the day as a float: the single feature KMeans clusters on.
    df = df.withColumn('minute',
             (F.hour('click_time') * 60 + F.minute('click_time')).cast(T.FloatType()) )
    df = df.withColumn('doy', F.dayofyear('click_time'))

    vec_assember = VectorAssembler(inputCols=['minute'], outputCol='features')
    df = vec_assember.transform(df)

    # Fixed seed so that refitting does not shuffle which cluster id each
    # time of day ends up in.
    time_binarizer = KMeans( featuresCol='features', predictionCol='time_bin', k=10, seed=42)
    model = time_binarizer.fit(df.select('features'))
    # Plain save() raises if the target path already exists, which made this
    # cell fail on a second rebuild; overwrite to match the parquet write below.
    model.write().overwrite().save('../data/fitted_time_binarizer.ml')

    # Replace the raw time columns with the cluster id in 'time_bin'.
    df = model.transform(df)
    df = df.drop('minute')
    df = df.drop('features')
    df = df.drop('click_time')

    df.write.parquet('../data/checkpoint2.parquet', mode='overwrite')
else:
    # TODO: Need to read the saved model back in?
    # (Only required to bin new data such as the test set; presumably via
    # pyspark.ml.clustering.KMeansModel.load -- confirm before relying on it.)
    df = spark.read.parquet('../data/checkpoint2.parquet')

In [32]:
# Preview the data after time binning.
df.show()

+------+---+------+---+-------+-------------+---+--------+
|    ip|app|device| os|channel|is_attributed|doy|time_bin|
+------+---+------+---+-------+-------------+---+--------+
| 65252| 12|     1| 32|    328|            0|311|       9|
| 20173|  3|     1| 19|    280|            0|311|       9|
|124715|  8|     1| 19|    145|            0|311|       9|
| 81731|  2|     1| 17|    469|            0|311|       9|
|147957| 24|     1| 13|    105|            0|311|       9|
| 92820| 18|     1| 20|    121|            0|311|       9|
|178873|  9|     1| 13|    445|            0|311|       9|
|  7059|  2|     1| 36|    435|            0|311|       9|
|178023|  2|     1| 25|    477|            0|311|       9|
|  1700| 12|     1| 15|    328|            0|311|       9|
| 55786| 15|     1| 31|    315|            0|311|       9|
| 78223| 18|     1| 18|    107|            0|311|       9|
| 77041|  2|     1| 19|    435|            0|311|       9|
|203075| 18|     1| 11|    134|            0|311|       

# Checkpoint 3 
## Counting and downsampling

Really taking stuff away from the MVP model here because it's too hard to 
make it run on my laptop.  Maybe next time...

My first three predictors are:

 - device/os pair
 - app
 - channel
 
Time is also implicitly part of the model because I'm binning by 
calendar day and time before normalizing the counts.

In [76]:
# Flip the condition to True to rebuild checkpoint 3; otherwise the previously
# written parquet file is loaded.
if False:
    # Click counts per predictor value within each (time_bin, doy) cell.
    device_os_count = (
        df.groupby(['device', 'os', 'time_bin', 'doy'])
          .count()
          .withColumnRenamed("count", "dev_os_count")
    )

    app_count = (
        df.groupby(['app', 'time_bin', 'doy'])
          .count()
          .withColumnRenamed("count", "app_count")
    )

    channel_count = (
        df.groupby(['channel', 'time_bin', 'doy'])
          .count()
          .withColumnRenamed("count", "channel_count")
    )

    # Total clicks per (time_bin, doy) cell; this column keeps the name 'count'.
    bin_size_count = df.groupby(['time_bin', 'doy']).count()

    # I should have all the information I need for IDF at this point, so throw
    # away even more data so I can do a join.
    # The seed is fixed so a rebuild from the same input draws the same sample.
    df = df.sampleBy('is_attributed', fractions={0: .005, 1: 1.}, seed=42)
    # show() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    df.groupby('is_attributed').count().show()

    # Merge the munged data and save it as checkpoint 3
    device_os_count = device_os_count.join(bin_size_count, on=['doy', 'time_bin'])
    app_count = app_count.join(bin_size_count, on=['doy', 'time_bin'])
    channel_count = channel_count.join(bin_size_count, on=['doy', 'time_bin'])

    # The probabilistic form of IDF (see Wikipedia page) is less sensitive to
    # the size of the corpus: log(N - n) - log(n), with N the cell total and
    # n the count for the value.
    # NOTE(review): when one value accounts for a whole cell, N - n is 0 and
    # the expression takes log(0) -- check how that row behaves downstream.
    device_os_count = device_os_count.withColumn(
        'device_os_idf',
        F.log(F.column('count') - F.column('dev_os_count')) - F.log('dev_os_count'))
    app_count = app_count.withColumn(
        'app_idf',
        F.log(F.column('count') - F.column('app_count')) - F.log('app_count'))
    channel_count = channel_count.withColumn(
        'channel_idf',
        F.log(F.column('count') - F.column('channel_count')) - F.log('channel_count'))

    df.createOrReplaceTempView('data')
    device_os_count.createOrReplaceTempView('device_os_count')
    app_count.createOrReplaceTempView('app_count')
    channel_count.createOrReplaceTempView('channel_count')
    bin_size_count.createOrReplaceTempView('bin_size_count')

    # device_os_count has one row per (doy, time_bin, device, os), so the join
    # has to match on os as well. Matching on device alone repeated every click
    # once for each os seen with that device in the cell.
    df = spark.sql("""
SELECT is_attributed, device_os_idf, app_idf, channel_idf
FROM data
JOIN device_os_count 
ON   data.doy = device_os_count.doy
AND  data.time_bin = device_os_count.time_bin
AND  data.device = device_os_count.device
AND  data.os = device_os_count.os
JOIN app_count
ON   data.doy = app_count.doy
AND  data.time_bin = app_count.time_bin
AND  data.app = app_count.app
JOIN channel_count
ON   data.doy = channel_count.doy
AND  data.time_bin = channel_count.time_bin
AND  data.channel = channel_count.channel
""")

    df.write.parquet('../data/checkpoint3.parquet', mode='overwrite')

else:
    df = spark.read.parquet('../data/checkpoint3.parquet')

In [77]:
# Register the checkpoint 3 data as the 'data' temp view and preview it.
df.createOrReplaceTempView('data')
df.show()

+-------------+------------------+------------------+------------------+
|is_attributed|     device_os_idf|           app_idf|       channel_idf|
+-------------+------------------+------------------+------------------+
|            1|3.8025177087881463|1.7919873757069489|10.097685973738646|
|            1| 9.669209323473428|1.7919873757069489|10.097685973738646|
|            1| 7.200170037028189|1.7919873757069489|10.097685973738646|
|            1| 9.337050591247532|1.7919873757069489|10.097685973738646|
|            1|10.563064544138223|1.7919873757069489|10.097685973738646|
|            1|4.7103701220476815|1.7919873757069489|10.097685973738646|
|            1| 11.77947806150139|1.7919873757069489|10.097685973738646|
|            1| 11.77947806150139|1.7919873757069489|10.097685973738646|
|            1|11.556332595146063|1.7919873757069489|10.097685973738646|
|            1| 4.890197679790861|1.7919873757069489|10.097685973738646|
|            1|13.165778167722625|1.791987375706948

# Train Random Forest model

I'm still not doing full cross-validation here, but one split is better than nothing!

In [88]:
# Flip the condition to True to retrain the random forest and save the best
# model; otherwise this cell does nothing.
if False:
    vec_assember = VectorAssembler(inputCols=['device_os_idf', 'app_idf', 'channel_idf'],
                                   outputCol='features')
    df = vec_assember.transform(df)

    # Default metric in pyspark is ROC AUC
    evaluator = BinaryClassificationEvaluator(labelCol = 'is_attributed')

    # Seeds are fixed on the forest and on the split so a rerun is repeatable.
    rfc = RandomForestClassifier(
        featuresCol = 'features',
        labelCol = 'is_attributed',
        numTrees = 20,
        seed = 42
    )

    pg = ParamGridBuilder(
       ).addGrid(
                rfc.minInstancesPerNode, [100]
       ).addGrid(
                rfc.maxDepth, [4,5]
       ).addGrid(
                rfc.featureSubsetStrategy, ['2','3']
       ).addGrid(
                rfc.subsamplingRate, [.75, .87,  1.]
       ).build(
       )

    tvs = TrainValidationSplit(
            estimator = rfc,
            estimatorParamMaps = pg,
            evaluator = evaluator,
            trainRatio = .8,
            seed = 42
    )

    tvs_model = tvs.fit(df)
    # validationMetrics holds one score per parameter map, measured on the
    # held-out 20%; the largest one belongs to the selected model.
    print('Best validation AUC:', max(tvs_model.validationMetrics))

    df = tvs_model.transform(df)
    # The result of evaluate() used to be discarded (an expression inside an
    # if-block is not displayed). Note that this score is computed on all rows,
    # training rows included, so read it as an upper bound.
    print('AUC on all rows (train + validation):', evaluator.evaluate(df))

    print(tvs_model.bestModel.explainParams())
    # Notes from an earlier run:
    # minIntances = 100, maxDepth =4 (increased to 5 for next time), featureSubsetStrategy = 2 (add 3 as a choice next time),
    # subsamplingRate wasn't set, add some choices for next time.

    # Plain save() raises if the target path already exists, so a retrain
    # would fail at the very last step; overwrite the previous model instead.
    tvs_model.bestModel.write().overwrite().save('../data/TrainedSparkRF')
else:
    pass
    # TODO : Figure out how to read the model back in
    # (presumably pyspark.ml.classification.RandomForestClassificationModel.load
    # on '../data/TrainedSparkRF' -- confirm before relying on it.)

# Next up -- 

- Load the test set 
- Feature engineering
- See if training and test data seem to be of similar distribution
- If so, run a prediction and upload it to Kaggle