# What am I doing here?

- GBT model: everything is hardcoded to the features and hyperparameters
chosen during grid search.
- Doing three runs on slightly different subsets of training data.
- Taking the median of the three models as the answer to upload.

TODO:
- Is there a way to do probability calibration in PySpark? 
- Add any good features from Leila's model and see if I can 
improve my score a bit more

In [1]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
# This is optional stuff - either pip install watermark
# or just comment it out (it just keeps track of what library
# versions I have)
%load_ext watermark
%watermark -iv

pyspark 2.4.1



In [3]:
# Comment the two .config(...) lines out to run on a cluster. Also, adjust the
# driver memory to the size of your laptop.
#
# Fixed: the shuffle setting was spelled 'spark.sql.shuffle.paritions', which is
# not the key Spark reads, so the intended value of 5 was never applied.
spark = (
    pyspark.sql.SparkSession.builder
    .config('spark.driver.memory', '8g')
    .config('spark.sql.shuffle.partitions', 5)
    .getOrCreate()
)

# Global Variables 

In [4]:
# Single columns; each one contributes a '<name>_pct' model input.
unigrams = ['os', 'channel', 'app']

# Column pairs; each pair contributes a '<first>_<second>' model input.
bigrams = [
    ['device', 'app'],
    ['channel', 'app'],
]

# Checkpoint 1 

Read the csv file, drop the attributed_time (because I didn't use it in the MVP),
and downsample the 0 class to 25% because I'm still on my laptop 

In [5]:
# Hand-toggled switch: flip to True to rebuild the checkpoint from the raw CSV
# files; leave it False to reuse the parquet files written by an earlier run.
if False:
    df = spark.read.csv('../data/train.csv', 
                    header=True, inferSchema=True)

    # attributed_time is not used by this model
    df = df.drop('attributed_time')
    # keep 25% of class 0 and all of class 1
    # NOTE(review): no seed is passed, so a rebuild presumably draws a
    # different sample each time - confirm whether that matters.
    df = df.sampleBy('is_attributed', fractions={0:.25,1:1.})
    
    test = spark.read.csv('../data/test.csv', 
                         header= True, inferSchema=True)

    # persist both frames so later runs can take the else branch
    df.write.parquet('../data/checkpoint1.parquet', mode='overwrite')
    test.write.parquet('../data/test_checkpoint1.parquet', mode='overwrite')
else:
    df = spark.read.parquet('../data/checkpoint1.parquet')
    test = spark.read.parquet('../data/test_checkpoint1.parquet')

In [6]:
df.dtypes

[('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('is_attributed', 'int')]

In [7]:
test.dtypes

[('click_id', 'int'),
 ('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp')]

In [8]:
df.count()

46575441

In [9]:

test.count()

18790469

# Daily IP prevalence 
Because IP addresses get reassigned, IP prevalence is computed per day, and this
feature engineering has to be done on the train and test sets separately.
(See the link Elyse posted on the slack.)

In [10]:
# Day of year of each click; used below as a join key together with 'ip'.
df = df.withColumn('doy', F.dayofyear('click_time'))
test = test.withColumn('doy', F.dayofyear('click_time'))

In [11]:
# Precomputed IP tables, one for the training data and one for the test data.
# They are joined below on ('doy', 'ip') to pick up the ip_pct column.
df_ip_counts = spark.read.parquet('../data/train_ip.parquet')
test_ip_counts = spark.read.parquet('../data/test_ip.parquet')

In [12]:
# Attach ip_pct to every click, matching on day of year and IP. Train and test
# each use their own table; the left join keeps clicks that have no match.
df = df.join(
    df_ip_counts.select('doy', 'ip', 'ip_pct'),
    on=['doy', 'ip'],
    how='left',
)
test = test.join(
    test_ip_counts.select('doy', 'ip', 'ip_pct'),
    on=['doy', 'ip'],
    how='left',
)

In [13]:
test.dtypes

[('doy', 'int'),
 ('ip', 'int'),
 ('click_id', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('ip_pct', 'double')]

## Class balancing 

Downsample the majority class and upsample the minority class. 

Todo: Should this downsample just be random or by day or by...? 

In [14]:
# Oversample the positive class (is_attributed == 1) with replacement.
# One independent draw per seed, one for each of the three training subsets.
class1_a = df.where(F.col('is_attributed') == 1).sample(
    withReplacement=True, fraction=4.0, seed=111,
)
class1_b = df.where(F.col('is_attributed') == 1).sample(
    withReplacement=True, fraction=4.0, seed=222,
)
class1_c = df.where(F.col('is_attributed') == 1).sample(
    withReplacement=True, fraction=4.0, seed=333,
)

In [15]:
# Each training subset = 11% of class 0 (drawn with its own seed) followed by
# the matching oversampled class-1 frame. Only stratum 0 is listed in the
# fractions, so the class-1 rows come from the class1_* frames.
df_a = (
    df.sampleBy('is_attributed', fractions={0: 0.11}, seed=111)
    .unionAll(class1_a)
)
df_b = (
    df.sampleBy('is_attributed', fractions={0: 0.11}, seed=222)
    .unionAll(class1_b)
)
df_c = (
    df.sampleBy('is_attributed', fractions={0: 0.11}, seed=333)
    .unionAll(class1_c)
)

## Counting 

All count tables except the IP one were built from the full training set rather
than the subset. These tables were created in Spark_count_table.ipynb

In [16]:
def get_count_table(group):
    """Load the precomputed count table for one column or a group of columns.

    Args:
        group: a single column name (str), or a sequence of column names.

    Returns:
        The Spark DataFrame read from '../data/table_<name>.parquet', where
        <name> is '<group>_pct' for a single column and the column names
        joined with '_' for a sequence.
    """
    # isinstance instead of an exact type comparison, so str subclasses are
    # treated as a single column name too.
    if isinstance(group, str):
        column_name = group + '_pct'  # for example: ip_pct
    else:
        column_name = '_'.join(group)  # for example: device_os

    return spark.read.parquet(f'../data/table_{column_name}.parquet')

In [17]:
def join_table(sdf, count_table, group):
    """Left-join `count_table` onto `sdf` using the column name(s) in `group`.

    Returns the joined frame; rows of `sdf` without a match are kept.
    """
    return sdf.join(count_table, on=group, how='left')

# Add hour column for Leila's new features

In [18]:
def add_hour(sdf):
    """Return `sdf` with an 'hour' column: hour of click_time plus minute / 60."""
    whole_hours = F.hour('click_time').astype(T.FloatType())
    minutes = F.minute('click_time').astype(T.FloatType())
    return sdf.withColumn('hour', whole_hours + minutes / 60.)
    
# Add the 'hour' column to the three training subsets and to the test set.
df_a, df_b, df_c, test = [
    add_hour(frame) for frame in (df_a, df_b, df_c, test)
]

In [19]:
# create the count columns with the training data 
# write everything out to disk so we don't have to redo 
# feature engineering when all I want to do is tune hyperparameters
#
# Hand-toggled switch: True recomputes the joins, False reloads saved frames.
# NOTE(review): the writes below are commented out, so the else branch reads
# whatever an earlier run left on disk - re-enable them before flipping to False.
if True:
    # single-column tables: looked up by name, joined on that one column
    for c in unigrams:
        ct   = get_count_table( c )
        df_a   = join_table(df_a, ct, [c])
        df_b   = join_table(df_b, ct, [c])
        df_c   = join_table(df_c, ct, [c])
        # the test set is joined to the same table as the training subsets
        test = join_table(test, ct, [c])
    
    # column-pair tables: joined on both columns of the pair
    for bigram in bigrams:
        ct = get_count_table( bigram )
        df_a = join_table(df_a, ct, bigram)
        df_b = join_table(df_b, ct, bigram)
        df_c = join_table(df_c, ct, bigram)
        test = join_table(test, ct, bigram)

#    df_a.write.parquet('../data/dfa.parquet', mode='overwrite')
#    df_b.write.parquet('../data/dfb.parquet', mode='overwrite')
#    df_c.write.parquet('../data/dfc.parquet', mode='overwrite')
#    test.write.parquet('../data/test_stack.parquet', mode='overwrite')
else:
    df_a = spark.read.parquet('../data/dfa.parquet')
    df_b = spark.read.parquet('../data/dfb.parquet')
    df_c = spark.read.parquet('../data/dfc.parquet')
    test = spark.read.parquet('../data/test_stack.parquet')
    

In [20]:
# Replace nulls in the test set (e.g. rows unmatched by a left join) with 0.
test = test.na.fill(0)

In [21]:

# Replace nulls in each training subset (e.g. rows unmatched by a left join) with 0.
df_a = df_a.na.fill(0)
df_b = df_b.na.fill(0)
df_c = df_c.na.fill(0)

In [22]:
# Print the number of rows per class for each training subset.
for sdf in (df_a, df_b, df_c):
    per_class = sdf.groupBy('is_attributed').count()
    per_class.show()

+-------------+-------+
|is_attributed|  count|
+-------------+-------+
|            1|1827229|
|            0|5069713|
+-------------+-------+

+-------------+-------+
|is_attributed|  count|
+-------------+-------+
|            1|1828581|
|            0|5075130|
+-------------+-------+

+-------------+-------+
|is_attributed|  count|
+-------------+-------+
|            1|1825068|
|            0|5070512|
+-------------+-------+



In [23]:
test.count()

18790469

# Create model data in format expected by Spark

In [24]:
# Model inputs: one '<name>_pct' column per unigram, one '<a>_<b>' column per
# bigram, plus ip_pct.
input_cols = (
    [f'{name}_pct' for name in unigrams]
    + ['_'.join(pair) for pair in bigrams]
    + ['ip_pct']
)
input_cols

['os_pct', 'channel_pct', 'app_pct', 'device_app', 'channel_app', 'ip_pct']

In [25]:
# Packs the input_cols columns into a single 'features' vector column.
# The same assembler is reused later for the test set.
vec_assembler = VectorAssembler(inputCols=input_cols, outputCol = 'features')
    
# Hand-toggled switch: True rebuilds the model frames, False reloads saved ones.
# NOTE(review): the writes below are commented out, so the else branch reads
# whatever an earlier run left on disk.
if True:
    # keep only the label and the assembled feature vector
    model_a = vec_assembler.transform(df_a).select('is_attributed', 'features')
    model_b = vec_assembler.transform(df_b).select('is_attributed', 'features')
    model_c = vec_assembler.transform(df_c).select('is_attributed', 'features')
    
 #   model_a.write.parquet('../data/model_a.parquet', mode='overwrite')
 #   model_b.write.parquet('../data/model_b.parquet', mode='overwrite')
 #   model_c.write.parquet('../data/model_c.parquet', mode='overwrite')
else:
    model_a = spark.read.parquet('../data/model_a.parquet')
    model_b = spark.read.parquet('../data/model_b.parquet')
    model_c = spark.read.parquet('../data/model_c.parquet')

In [26]:
# Evaluator shared by all three runs; the label is is_attributed. No metricName
# is passed, so the evaluator's default metric is used (areaUnderROC according
# to the PySpark documentation).
evaluator = BinaryClassificationEvaluator(labelCol = 'is_attributed')

# GBT Classifier

In [57]:
# Gradient-boosted tree classifier predicting is_attributed.
gbtc = GBTClassifier(labelCol='is_attributed')

# Every hyperparameter is pinned to a single value, so the "grid" built here
# contains exactly one parameter combination.
pg = ParamGridBuilder().baseOn({
    gbtc.maxDepth: 8,
    gbtc.subsamplingRate: .8,
    gbtc.featureSubsetStrategy: '5',
    gbtc.maxBins: 64,
    gbtc.stepSize: .15,
    gbtc.maxIter: 12,
    gbtc.minInstancesPerNode: 10,
}).build()

In [58]:
# Train/validation split (80% train) over the parameter grid, scored with the
# shared evaluator.
tvs = TrainValidationSplit(
    estimator=gbtc,
    estimatorParamMaps=pg,
    evaluator=evaluator,
    trainRatio=0.8,
)

In [59]:
# Fit run A on the first training subset.
tvs_a = tvs.fit(model_a)

In [60]:
# NOTE(review): this scores model_a, the same frame tvs_a was fitted on, so the
# number below is a training-set score rather than a held-out estimate.
results_a = tvs_a.transform(model_a)
evaluator.evaluate(results_a)

0.9709555119579643

In [61]:
tvs_a.bestModel.featureImportances

SparseVector(6, {0: 0.0369, 1: 0.0396, 2: 0.5941, 3: 0.1941, 4: 0.0503, 5: 0.085})

In [62]:
input_cols

['os_pct', 'channel_pct', 'app_pct', 'device_app', 'channel_app', 'ip_pct']

In [63]:
# Run B: fit on the second training subset, then score that same subset.
# NOTE(review): scored on the rows used for fitting, so this is a training-set score.
tvs_b = tvs.fit(model_b)
results_b = tvs_b.transform(model_b)
evaluator.evaluate(results_b)

0.9708689128214322

In [64]:
# Run C: fit on the third training subset, then score that same subset.
# NOTE(review): scored on the rows used for fitting, so this is a training-set score.
tvs_c = tvs.fit(model_c)
results_c = tvs_c.transform(model_c)
evaluator.evaluate(results_c)

0.970945178729162

In [66]:
# Persist the fitted models for runs A and B.
# write().overwrite() replaces a model saved by an earlier run; a plain save()
# fails when the target path already exists, unlike the parquet/csv writes in
# this notebook, which all use mode='overwrite'.
tvs_a.bestModel.write().overwrite().save('../data/tvs_a.model')
tvs_b.bestModel.write().overwrite().save('../data/tvs_b.model')

In [67]:
# Persist the fitted model for run C.
# write().overwrite() replaces a model saved by an earlier run; a plain save()
# fails when the target path already exists.
tvs_c.bestModel.write().overwrite().save('../data/tvs_c.model')

# Let's bring the test set in here

In [44]:
# Build the 'features' vector for the test rows with the same assembler (and
# therefore the same input columns) used for the training subsets.
test_model = vec_assembler.transform(test)

In [45]:
# Score the test rows with each of the three fitted models.
test_a = tvs_a.transform(test_model)
test_b = tvs_b.transform(test_model)
test_c = tvs_c.transform(test_model)

In [46]:
def get_prediction(sdf):
    """Reduce a scored frame to click_id, prediction and probability.

    The prediction column is cast to a short integer. As a side effect, the
    number of rows per predicted value is printed. Returns the reduced frame.
    """
    predicted_class = F.col('prediction').astype(T.ShortType())
    trimmed = sdf.select('click_id', predicted_class, 'probability')
    trimmed.groupby('prediction').count().show()
    return trimmed

In [52]:
test_a.dtypes

[('channel', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('doy', 'int'),
 ('ip', 'int'),
 ('click_id', 'int'),
 ('click_time', 'timestamp'),
 ('ip_pct', 'double'),
 ('hour', 'double'),
 ('os_pct', 'double'),
 ('channel_pct', 'double'),
 ('app_pct', 'double'),
 ('device_app', 'double'),
 ('channel_app', 'double'),
 ('features', 'vector'),
 ('rawPrediction', 'vector'),
 ('probability', 'vector'),
 ('prediction', 'double')]

# Extract probabilities 

In [53]:
# Schema of the per-model frames: click id, predicted label, and the
# probability taken from index 1 of the model's probability vector.
mySchema = (
    T.StructType()
    .add('click_id', T.IntegerType())
    .add('prediction', T.FloatType())
    .add('pclass1', T.FloatType())
)

def save_stuff(x):
    """Turn one scored row into a Row of click_id, prediction and pclass1.

    pclass1 is element 1 of the row's probability vector, converted to a
    plain Python float.
    """
    positive_prob = float(x.probability[1])
    return T.Row(click_id=x.click_id,
                 prediction=x.prediction,
                 pclass1=positive_prob)

# One (click_id, prediction, pclass1) frame per model.
# Fixed: vec_b and vec_c were both built from test_a, so all three frames held
# model A's probabilities and the median computed from them was just model A.
vec_a = test_a.rdd.map(save_stuff).toDF(schema=mySchema)
vec_b = test_b.rdd.map(save_stuff).toDF(schema=mySchema)
vec_c = test_c.rdd.map(save_stuff).toDF(schema=mySchema)

# Take the median of the three models as my final answer

In [54]:
# Keep only click_id and the probability from each frame, renaming the
# probability after its source so the three can sit side by side.
vec_a = vec_a.select('click_id', vec_a['pclass1'].alias('vec_a'))
vec_b = vec_b.select('click_id', vec_b['pclass1'].alias('vec_b'))
vec_c = vec_c.select('click_id', vec_c['pclass1'].alias('vec_c'))

# One row per click_id carrying all three probabilities.
joined = vec_a.join(vec_b, on='click_id').join(vec_c, on='click_id')

# Schema of the final output: click id and the combined probability.
# (Rebinds mySchema; the per-model frames above were already built with the old one.)
mySchema = (
    T.StructType()
    .add('click_id', T.IntegerType())
    .add('is_attributed', T.FloatType())
)

from statistics import median
def get_predict(x):
    """Build the output Row for one click.

    is_attributed is the median of the row's vec_a, vec_b and vec_c values.
    """
    votes = [x.vec_a, x.vec_b, x.vec_c]
    return T.Row(click_id=x.click_id, is_attributed=median(votes))

# Collapse the three per-model probabilities of every click into their median.
median_rows = joined.rdd.map(get_predict)
joined = median_rows.toDF(schema=mySchema)

In [55]:
# Write the final (click_id, is_attributed) rows as CSV, replacing earlier output.
# NOTE(review): no header option is passed, so the output presumably has no
# header row - confirm whether the upload needs one (header=True would add it).
joined.write.csv('../data/one_last_tiimmee.csv', mode='overwrite')

In [68]:
# Shut the Spark session down now that the output has been written.
spark.stop()