# What am I doing here?

1. Create all the bigram counts
2. Cut them off at the top 100 each
3. Merge them back in 
4. Run through a random forest
5. See if it cross-validates
6. See if you can upload a guess to Kaggle and see if it generalizes at all

In [20]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
from itertools import combinations

In [22]:
# This is optional stuff - either pip install watermark
# or just comment it out (it just keeps track of what library
# versions I have)
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
pyspark 2.4.1



In [23]:
# Local-mode tuning only: comment these out to run on a cluster, and size the
# driver memory to the machine you are running on.
# Both calls configure the shared SparkSession.builder, so they are meant to
# run before the session is created in the next cell.
pyspark.sql.SparkSession.builder.config('spark.driver.memory', '10g')
# The key has to be spelled 'spark.sql.shuffle.partitions' exactly; a
# misspelled key is not rejected, it just leaves the default partition count
# in effect.
pyspark.sql.SparkSession.builder.config('spark.sql.shuffle.partitions', 5)

<pyspark.sql.session.SparkSession.Builder at 0x11f19b320>

In [24]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Checkpoint 1 

Read the csv file, drop the attributed_time (because I didn't use it in the MVP),
and downsample the 0 class to 25% because I'm still on my laptop 

In [25]:
# Set to True to rebuild the checkpoint-1 parquet files from the raw CSVs;
# leave False to reuse the files written by a previous run.
REBUILD_CHECKPOINT1 = False
# Fixed seed so that rebuilding the checkpoint draws the same sample again
# (for the same input files and partitioning).
CHECKPOINT1_SEED = 42

if REBUILD_CHECKPOINT1:
    df = spark.read.csv('../data/train.csv', header=True, inferSchema=True)

    # attributed_time is not used by this notebook.
    df = df.drop('attributed_time')
    # Keep every is_attributed == 1 row and about 25% of the 0 rows.
    df = df.sampleBy('is_attributed',
                     fractions={0: .25, 1: 1.},
                     seed=CHECKPOINT1_SEED)

    test = spark.read.csv('../data/test.csv', header=True, inferSchema=True)

    df.write.parquet('../data/checkpoint1.parquet', mode='overwrite')
    test.write.parquet('../data/test_checkpoint1.parquet', mode='overwrite')
else:
    df = spark.read.parquet('../data/checkpoint1.parquet')
    test = spark.read.parquet('../data/test_checkpoint1.parquet')

In [26]:
df.dtypes

[('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('is_attributed', 'int')]

In [27]:
test.dtypes

[('click_id', 'int'),
 ('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp')]

In [28]:
df.count()

46575441

In [29]:

test.count()

18790469

# Checkpoint 2
## Counting 


In [30]:
def make_count_column(sdf, groupby_clause, top_n=100):
    """Attach a relative-frequency feature for a column or column combination.

    Counts the rows of ``sdf`` per distinct value of ``groupby_clause``, keeps
    the ``top_n`` most frequent groups, divides each count by the largest
    count (so the most frequent group scores 1.0) and left-joins that ratio
    back onto ``sdf``. Rows whose group is not among the kept groups get a
    null in the new column.

    Side effect: the ratio table is registered as the temp view
    ``'table_' + <new column name>`` so the same ratios can later be joined
    onto another DataFrame.

    Args:
        sdf: Spark DataFrame that contains the grouping column(s).
        groupby_clause: a single column name (str) or a sequence of column
            names. A str yields a new column called ``<name>_pct``; a
            sequence yields a column named by joining the names with ``_``
            (note: no ``_pct`` suffix in that case).
        top_n: how many of the most frequent groups to keep (default 100).

    Returns:
        ``sdf`` with the extra ratio column added.
    """
    if isinstance(groupby_clause, str):
        column_name = groupby_clause + '_pct'  # for example: ip_pct
        join_clause = [groupby_clause]
    else:
        join_clause = list(groupby_clause)
        column_name = '_'.join(join_clause)  # for example: device_os

    counts_sdf = (
        sdf.groupby(join_clause)
        .count()
        .orderBy('count', ascending=False)
        .limit(top_n)  # so we don't chase the "long tail"
    )

    maxcnt = counts_sdf.select(F.max('count').alias('maxcnt')).collect()
    maxcnt = maxcnt[0].maxcnt
    # An empty input has no groups, so the max comes back as None. There are
    # no rows to scale in that case, so any non-zero divisor is fine.
    divisor = float(maxcnt) if maxcnt else 1.0

    ratio = F.col('count').astype(T.DoubleType()) / divisor
    counts_sdf = counts_sdf.select(*join_clause, ratio.alias(column_name))

    table_name = 'table_' + column_name
    counts_sdf.createOrReplaceTempView(table_name)

    return sdf.join(counts_sdf, join_clause, how='left')

In [31]:
# Build the frequency-ratio features from the training data, then attach the
# same train-derived ratios to the test data.

columns = ['device', 'os', 'ip', 'channel', 'app']
bigrams = [list(b) for b in combinations(columns, 2)]

# Set to True to recompute the features and rewrite the parquet checkpoints;
# leave False to load the checkpoints written by a previous run.
REBUILD_COUNTS = False

if REBUILD_COUNTS:

    # Create the count columns with the training data. Each call adds one
    # column to df and registers the matching 'table_<column>' temp view.
    for group in columns + bigrams:
        df = make_count_column(df, group)

    # Groups outside the top list come back null from the left join.
    df = df.fillna(0)

    # Merge them with the test data, using the temp views registered above.
    # Unigram views carry the '_pct' suffix; bigram views do not.
    view_names = ['table_' + c + '_pct' for c in columns]
    view_names += ['table_' + '_'.join(b) for b in bigrams]
    join_keys = [[c] for c in columns] + bigrams

    for view_name, keys in zip(view_names, join_keys):
        test = test.join(spark.table(view_name), keys, how='left')

    test = test.fillna(0)

    # Checkpoint the train and test counts.
    df.write.parquet('../data/df_counts.parquet', mode='overwrite')
    test.write.parquet('../data/test_counts.parquet', mode='overwrite')
else:
    df = spark.read.parquet('../data/df_counts.parquet')
    test = spark.read.parquet('../data/test_counts.parquet')

In [32]:
df.dtypes

[('channel', 'int'),
 ('app', 'int'),
 ('ip', 'int'),
 ('os', 'int'),
 ('device', 'int'),
 ('click_time', 'timestamp'),
 ('is_attributed', 'int'),
 ('device_pct', 'double'),
 ('os_pct', 'double'),
 ('ip_pct', 'double'),
 ('channel_pct', 'double'),
 ('app_pct', 'double'),
 ('device_os', 'double'),
 ('device_ip', 'double'),
 ('device_channel', 'double'),
 ('device_app', 'double'),
 ('os_ip', 'double'),
 ('os_channel', 'double'),
 ('os_app', 'double'),
 ('ip_channel', 'double'),
 ('ip_app', 'double'),
 ('channel_app', 'double')]

In [33]:
test.dtypes

[('channel', 'int'),
 ('app', 'int'),
 ('ip', 'int'),
 ('os', 'int'),
 ('device', 'int'),
 ('click_id', 'int'),
 ('click_time', 'timestamp'),
 ('device_pct', 'double'),
 ('os_pct', 'double'),
 ('ip_pct', 'double'),
 ('channel_pct', 'double'),
 ('app_pct', 'double'),
 ('device_os', 'double'),
 ('device_ip', 'double'),
 ('device_channel', 'double'),
 ('device_app', 'double'),
 ('os_ip', 'double'),
 ('os_channel', 'double'),
 ('os_app', 'double'),
 ('ip_channel', 'double'),
 ('ip_app', 'double'),
 ('channel_app', 'double')]

# A little class balancing on the train set

In [34]:
df.groupby('is_attributed').count().show()

+-------------+--------+
|is_attributed|   count|
+-------------+--------+
|            1|  456846|
|            0|46118595|
+-------------+--------+



In [35]:
# Undersample the majority class: keep roughly 11% of the is_attributed == 0
# rows. Only stratum 0 is given a fraction, and the count shown below
# contains class 0 only.
class0 = df.sampleBy('is_attributed', fractions={0: .11})
(class0
    .groupby('is_attributed')
    .count()
    .show())

+-------------+-------+
|is_attributed|  count|
+-------------+-------+
|            0|5072437|
+-------------+-------+



In [36]:
# Oversample the minority class: draw the is_attributed == 1 rows with
# replacement at 4x their original number.
# NOTE(review): duplicated positives can end up on both sides of a later
# random train/validation split, which would flatter the validation score.
# Confirm this is acceptable.
positives = df.where(F.col('is_attributed') == 1)
class1 = positives.sample(withReplacement=True, fraction=4.0)

In [37]:
class1.count()

1825575

In [38]:
class0.count()

5072437

In [39]:
# Stack the oversampled positives on top of the undersampled negatives.
# union() is the non-deprecated spelling of unionAll(); like unionAll it
# matches columns by position and keeps duplicate rows.
df = class1.union(class0)

# And now we model! 

In [40]:
# IP is left out of the model inputs because other teams have found it
# doesn't generalize well to the test set. (The ip columns are still present
# in the checkpoints; removing them there would save time.)

columns = ['device', 'os', 'channel', 'app']
bigrams = [list(pair) for pair in combinations(columns, 2)]

# Single-column features carry the '_pct' suffix; pair features are named
# '<first>_<second>' with no suffix.
unigram_features = [name + '_pct' for name in columns]
bigram_features = ['_'.join(pair) for pair in bigrams]
input_cols = unigram_features + bigram_features

vec_assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol='features',
)

In [41]:
model_data = vec_assembler.transform(df)

In [42]:
model_data.dtypes

[('channel', 'int'),
 ('app', 'int'),
 ('ip', 'int'),
 ('os', 'int'),
 ('device', 'int'),
 ('click_time', 'timestamp'),
 ('is_attributed', 'int'),
 ('device_pct', 'double'),
 ('os_pct', 'double'),
 ('ip_pct', 'double'),
 ('channel_pct', 'double'),
 ('app_pct', 'double'),
 ('device_os', 'double'),
 ('device_ip', 'double'),
 ('device_channel', 'double'),
 ('device_app', 'double'),
 ('os_ip', 'double'),
 ('os_channel', 'double'),
 ('os_app', 'double'),
 ('ip_channel', 'double'),
 ('ip_app', 'double'),
 ('channel_app', 'double'),
 ('features', 'vector')]

In [43]:
# Evaluator and classifier both read the label from is_attributed; every
# other setting is left at its PySpark default.
evaluator = BinaryClassificationEvaluator(labelCol='is_attributed')
rfc = RandomForestClassifier(labelCol='is_attributed')

# Preparing for future hyperparameter tuning: for now the grid only varies
# maxDepth (two values), with numTrees and subsamplingRate pinned.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(rfc.numTrees, [25])
grid_builder.addGrid(rfc.maxDepth, [5, 7])
grid_builder.addGrid(rfc.subsamplingRate, [.55])
pg = grid_builder.build()

# Fit each grid point on 80% of model_data and score it on the other 20%.
tvs = TrainValidationSplit(
    estimator=rfc,
    estimatorParamMaps=pg,
    evaluator=evaluator,
    trainRatio=.8,
)

tvs_model = tvs.fit(model_data)

In [44]:
# Score model_data, the same DataFrame the model was fitted on, so any
# metric computed from `results` here is an in-sample figure rather than a
# held-out one.
# NOTE(review): the held-out score per grid point should be available as
# tvs_model.validationMetrics; confirm and report that instead.
results = tvs_model.transform(model_data)

In [45]:
evaluator.evaluate(results)

0.9475162656537023

# Let's bring the test set in here

In [46]:
# Assemble the same feature vector for the test set, then score it with the
# fitted model. `results` is rebound here: from this point on it holds the
# test-set predictions, not the training-set ones.
test_model = vec_assembler.transform(test)
results = tvs_model.transform(test_model)


In [47]:
results.dtypes

[('channel', 'int'),
 ('app', 'int'),
 ('ip', 'int'),
 ('os', 'int'),
 ('device', 'int'),
 ('click_id', 'int'),
 ('click_time', 'timestamp'),
 ('device_pct', 'double'),
 ('os_pct', 'double'),
 ('ip_pct', 'double'),
 ('channel_pct', 'double'),
 ('app_pct', 'double'),
 ('device_os', 'double'),
 ('device_ip', 'double'),
 ('device_channel', 'double'),
 ('device_app', 'double'),
 ('os_ip', 'double'),
 ('os_channel', 'double'),
 ('os_app', 'double'),
 ('ip_channel', 'double'),
 ('ip_app', 'double'),
 ('channel_app', 'double'),
 ('features', 'vector'),
 ('rawPrediction', 'vector'),
 ('probability', 'vector'),
 ('prediction', 'double')]

In [52]:
# Keep only the id, the predicted label (narrowed from double to a short
# integer) and the probability vector.
prediction_as_short = F.col('prediction').astype(T.ShortType())
results = results.select('click_id', prediction_as_short, 'probability')

In [53]:
results.dtypes

[('click_id', 'int'), ('prediction', 'smallint'), ('probability', 'vector')]

In [54]:
results.groupby('prediction').count().show()

+----------+--------+
|prediction|   count|
+----------+--------+
|         1|  623912|
|         0|18166557|
+----------+--------+



In [55]:
results.write.parquet('../data/results.parquet', mode='overwrite')

# Extract probabilities 

In [132]:
# Schema of the flattened predictions: the probability vector is replaced by
# a single float column, pclass1.
mySchema = T.StructType(
    [
        T.StructField('click_id', T.IntegerType()),
        T.StructField('prediction', T.ShortType()),
        T.StructField('pclass1', T.FloatType()),
    ]
)


def save_stuff(x):
    """Flatten one scored row.

    Copies click_id and prediction through unchanged and stores element 1 of
    the row's probability vector as a plain Python float under pclass1.
    """
    second_entry = float(x.probability[1])
    return T.Row(
        click_id=x.click_id,
        prediction=x.prediction,
        pclass1=second_entry,
    )


# Go through the RDD API to unpack the vector, then back to a DataFrame.
flattened_rows = results.rdd.map(save_stuff)
extract_vector = flattened_rows.toDF(schema=mySchema)

In [133]:
extract_vector.show(5)

+--------+----------+-----------+
|click_id|prediction|    pclass1|
+--------+----------+-----------+
|14219477|         0| 0.04365348|
|14219478|         0|0.038334113|
|14219479|         0| 0.06678616|
|14219480|         0|0.070457734|
|14219481|         0| 0.07859021|
+--------+----------+-----------+
only showing top 5 rows



In [135]:
# Write the submission with an explicit header row, so the is_attributed
# alias actually ends up in the file (Spark's CSV writer omits the header
# unless asked for it).
# mode='overwrite' matches the other writes in this notebook, so re-running
# the cell does not fail because the path already exists.
# NOTE: Spark writes a directory of part files at this path, not a single
# CSV file; the parts still need to be combined before uploading.
# NOTE(review): confirm the expected header (click_id,is_attributed) against
# the competition's sample submission.
submission = extract_vector.select(
    'click_id',
    F.col('pclass1').alias('is_attributed'),
)
submission.write.csv('../data/submit.csv', header=True, mode='overwrite')

In [36]:
spark.stop()