# What am I doing here?

- GBT model: everything is hardcoded to the features and hyperparameters
chosen during grid search.
- Doing three runs on slightly different subsets of training data.
- Taking the median of the three models as the answer to upload.

TODO:
- Is there a way to do probability calibration in PySpark? 
- Add any good features from Leila's model and see if I can 
improve my score a bit more

In [121]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [123]:
# This is optional stuff - either pip install watermark
# or just comment it out (it just keeps track of what library
# versions I have)
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
pyspark 2.4.1



In [125]:
# Local-mode settings: comment out the two `.config(...)` lines to run on a
# cluster, and adjust the driver memory to the size of your laptop.
# The builder is bound once and every call's return value is kept, so the
# options are guaranteed to sit on the builder that creates the session.
builder = pyspark.sql.SparkSession.builder
builder = builder.config('spark.driver.memory', '8g')
# Typo fix: the key used to be 'spark.sql.shuffle.paritions', which is not a
# Spark setting, so the shuffle partition count was never actually changed.
builder = builder.config('spark.sql.shuffle.partitions', 5)

spark = builder.getOrCreate()

# Global Variables 

In [126]:
# Single columns that each have a count table joined onto the data.
unigrams = ['os', 'channel', 'app']

# Column pairs that each have a count table joined onto the data.
# Each pair stays a list because it is passed directly as the join key.
bigrams = [
    ['device', 'os'],
    ['device', 'channel'],
    ['device', 'app'],
    ['channel', 'app'],
]

# Checkpoint 1 

Read the CSV file, drop the attributed_time column (because I didn't use it in the MVP),
and downsample the 0 class to 25% because I'm still on my laptop.

In [127]:
# Checkpoint 1 toggle: set to True to rebuild the parquet checkpoints from the
# raw CSV files; leave False to load the checkpoints written by an earlier run.
rebuild_checkpoint1 = False

if not rebuild_checkpoint1:
    df = spark.read.parquet('../data/checkpoint1.parquet')
    test = spark.read.parquet('../data/test_checkpoint1.parquet')
else:
    raw_train = spark.read.csv('../data/train.csv', header=True, inferSchema=True)

    # Drop attributed_time, keep every class-1 row and 25% of class-0 rows.
    df = raw_train.drop('attributed_time').sampleBy(
        'is_attributed', fractions={0: .25, 1: 1.})

    test = spark.read.csv('../data/test.csv', header=True, inferSchema=True)

    df.write.parquet('../data/checkpoint1.parquet', mode='overwrite')
    test.write.parquet('../data/test_checkpoint1.parquet', mode='overwrite')

In [128]:
# Show the column names and types of the training frame.
df.dtypes

[('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('is_attributed', 'int')]

In [129]:
# Show the column names and types of the test frame.
test.dtypes

[('click_id', 'int'),
 ('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp')]

In [130]:
# Row count of the (already downsampled) training frame.
df.count()

46575441

In [131]:

# Row count of the test frame.
test.count()

18790469

# Daily IP prevalence 
Because IP addresses get reassigned, the IP prevalence features need to be engineered
on the train and test sets separately.
(See the link Elyse posted on Slack.)

In [132]:
# Tag every click with its day of year; the IP tables are joined on (doy, ip).
day_of_year = F.dayofyear('click_time')
df = df.withColumn('doy', day_of_year)
test = test.withColumn('doy', day_of_year)

In [133]:
# Load the IP tables that were written to parquet earlier; train and test
# each have their own table.
ip_count_paths = ('../data/train_ip.parquet', '../data/test_ip.parquet')
df_ip_counts, test_ip_counts = (
    spark.read.parquet(path) for path in ip_count_paths
)

In [134]:
# Left-join ip_pct onto each frame by (doy, ip); train and test each use
# their own IP table.
ip_join_keys = ['doy', 'ip']

df = df.join(
    df_ip_counts.select('doy', 'ip', 'ip_pct'),
    on=ip_join_keys,
    how='left',
)
test = test.join(
    test_ip_counts.select('doy', 'ip', 'ip_pct'),
    on=ip_join_keys,
    how='left',
)

In [136]:
# Check the test schema after the IP join (doy and ip_pct should now be present).
test.dtypes

[('doy', 'int'),
 ('ip', 'int'),
 ('click_id', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('ip_pct', 'double')]

## Class balancing 

Downsample the majority class and upsample the minority class. 

Todo: Should this downsample just be random or by day or by...? 

In [137]:
# Oversample the positive class once per model run: sample with replacement
# at fraction 4.0, using a different seed for each of the three runs.
positives = df.filter(df.is_attributed == 1)
class1_a, class1_b, class1_c = (
    positives.sample(withReplacement=True, fraction=4.0, seed=run_seed)
    for run_seed in (111, 222, 333)
)

In [138]:
# Build the three training subsets: sample class 0 at 11% (no fraction is
# given for class 1 here) and append the matching oversampled class-1 frame.
negative_fractions = {0: .11}
df_a = df.sampleBy('is_attributed', negative_fractions, seed=111).unionAll(class1_a)
df_b = df.sampleBy('is_attributed', negative_fractions, seed=222).unionAll(class1_b)
df_c = df.sampleBy('is_attributed', negative_fractions, seed=333).unionAll(class1_c)

## Counting 

The count tables (except the IP one) were built from the full training set rather than the
subset. These tables were created in Spark_count_table.ipynb.

In [139]:
def get_count_table(group):
    """Load the stored count table for a column or a group of columns.

    Args:
        group: Either a single column name (``str``) or a sequence of
            column names.

    Returns:
        The DataFrame read from ``../data/table_<name>.parquet``, where
        ``<name>`` is ``<group>_pct`` for a single column name and the
        column names joined with underscores otherwise.
    """
    # isinstance rather than an exact type comparison, so that str
    # subclasses are also treated as a single column name.
    if isinstance(group, str):
        column_name = group + '_pct'  # for example: ip_pct
    else:
        column_name = "_".join(group)  # for example: device_os

    table_name = 'table_' + column_name
    return spark.read.parquet(f'../data/{table_name}.parquet')

In [140]:
def join_table(sdf, count_table, group):
    """Left-join ``count_table`` onto ``sdf`` using the columns in ``group``.

    Args:
        sdf: The frame to enrich.
        count_table: The table to join in.
        group: The join key column name(s).

    Returns:
        The result of ``sdf.join`` with ``how='left'``.
    """
    return sdf.join(count_table, group, how='left')

In [144]:
# Join the count tables onto the three training subsets and the test set,
# then persist the results so that feature engineering does not have to be
# redone when only the hyperparameters change. Set the flag to False to load
# the frames written by an earlier run instead.
rebuild_features = True

if not rebuild_features:
    df_a = spark.read.parquet('../data/dfa.parquet')
    df_b = spark.read.parquet('../data/dfb.parquet')
    df_c = spark.read.parquet('../data/dfc.parquet')
    test = spark.read.parquet('../data/test_stack.parquet')
else:
    # Single-column tables: looked up by name, joined on a one-element key.
    for column in unigrams:
        counts = get_count_table(column)
        df_a, df_b, df_c, test = (
            join_table(frame, counts, [column])
            for frame in (df_a, df_b, df_c, test)
        )

    # Two-column tables: the pair is both the lookup key and the join key.
    for pair in bigrams:
        counts = get_count_table(pair)
        df_a, df_b, df_c, test = (
            join_table(frame, counts, pair)
            for frame in (df_a, df_b, df_c, test)
        )

    df_a.write.parquet('../data/dfa.parquet', mode='overwrite')
    df_b.write.parquet('../data/dfb.parquet', mode='overwrite')
    df_c.write.parquet('../data/dfc.parquet', mode='overwrite')
    test.write.parquet('../data/test_stack.parquet', mode='overwrite')
    

In [145]:
# Fill null values in the test frame with 0.
test = test.na.fill(0)

In [151]:

# Fill null values in each training subset with 0.
df_a, df_b, df_c = (frame.na.fill(0) for frame in (df_a, df_b, df_c))

In [146]:
# Print the per-class row counts of each training subset.
for subset in (df_a, df_b, df_c):
    class_counts = subset.groupby('is_attributed').count()
    class_counts.show()

+-------------+-------+
|is_attributed|  count|
+-------------+-------+
|            1|1827229|
|            0|5069713|
+-------------+-------+

+-------------+-------+
|is_attributed|  count|
+-------------+-------+
|            1|1828581|
|            0|5075130|
+-------------+-------+

+-------------+-------+
|is_attributed|  count|
+-------------+-------+
|            1|1825068|
|            0|5070512|
+-------------+-------+



In [147]:
# Re-check the test row count after the left joins.
test.count()

18790469

In [149]:
# Put the hour column back in because the score went down

def add_hour(sdf):
    """Return ``sdf`` with an added ``hour`` column.

    The value is the hour of ``click_time`` plus its minutes divided by 60,
    with both parts cast to float before combining.
    """
    click_hour = F.hour('click_time').astype(T.FloatType())
    click_minute = F.minute('click_time').astype(T.FloatType())
    return sdf.withColumn('hour', click_hour + click_minute / 60.)
    
# Add the hour column to every training subset and to the test set.
df_a, df_b, df_c, test = (
    add_hour(frame) for frame in (df_a, df_b, df_c, test)
)

# Create model data in format expected by Spark

In [152]:
# Feature columns fed to the model: one *_pct column per unigram, two of the
# bigram columns, ip_pct and hour.
input_cols = (
    [name + '_pct' for name in unigrams]
    + ['device_app', 'channel_app']
    + ['ip_pct', 'hour']
)

vec_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

# Set to False to load the model-ready frames written by an earlier run.
rebuild_model_data = True

if not rebuild_model_data:
    model_a = spark.read.parquet('../data/model_a.parquet')
    model_b = spark.read.parquet('../data/model_b.parquet')
    model_c = spark.read.parquet('../data/model_c.parquet')
else:
    # Keep only the label and the assembled feature vector.
    model_a, model_b, model_c = (
        vec_assembler.transform(frame).select('is_attributed', 'features')
        for frame in (df_a, df_b, df_c)
    )

    model_a.write.parquet('../data/model_a.parquet', mode='overwrite')
    model_b.write.parquet('../data/model_b.parquet', mode='overwrite')
    model_c.write.parquet('../data/model_c.parquet', mode='overwrite')

In [153]:
# Binary-classification evaluator that reads its labels from is_attributed.
evaluator = BinaryClassificationEvaluator(labelCol = 'is_attributed')

# GBT Classifier

In [154]:
# Fixed GBT hyperparameters (no search is run in this notebook).
gbt_params = {
    'labelCol': 'is_attributed',
    'maxDepth': 8,
    'subsamplingRate': 0.8,
    'featureSubsetStrategy': '5',
    'maxBins': 64,
    'stepSize': .16,
    'maxIter': 10,
    'minInstancesPerNode': 10,
}
gbtc = GBTClassifier(**gbt_params)

In [155]:
# Fit one model per training subset, all with the same classifier settings.
results_a, results_b, results_c = (
    gbtc.fit(training_frame)
    for training_frame in (model_a, model_b, model_c)
)

In [None]:
# Evaluator on the is_attributed label; same construction as the earlier
# evaluator cell, and not used by the cells shown in this notebook.
evaluator = BinaryClassificationEvaluator(labelCol='is_attributed')

# Let's bring the test set in here

In [156]:
# Assemble the test-set feature vector with the same assembler used for the training subsets.
test_model = vec_assembler.transform(test)

In [157]:
# Score the assembled test set with each of the three fitted models.
test_a, test_b, test_c = (
    fitted.transform(test_model)
    for fitted in (results_a, results_b, results_c)
)

In [158]:
def get_prediction(sdf):
    """Trim a scored frame to the columns needed downstream.

    Keeps ``click_id``, ``prediction`` (cast to short) and ``probability``,
    prints the row count per predicted class, and returns the trimmed frame.
    """
    predicted_class = F.col('prediction').astype(T.ShortType())
    trimmed = sdf.select('click_id', predicted_class, 'probability')
    trimmed.groupby('prediction').count().show()
    return trimmed

In [159]:
# Trim each model's output and print its predicted-class counts (a, b, c order).
count_a, count_b, count_c = (
    get_prediction(scored) for scored in (test_a, test_b, test_c)
)

+----------+--------+
|prediction|   count|
+----------+--------+
|         1|  507111|
|         0|18283358|
+----------+--------+

+----------+--------+
|prediction|   count|
+----------+--------+
|         1|  503658|
|         0|18286811|
+----------+--------+

+----------+--------+
|prediction|   count|
+----------+--------+
|         1|  526701|
|         0|18263768|
+----------+--------+



# Extract probabilities 

In [160]:
# Schema of the flattened per-model predictions produced by save_stuff.
mySchema = (
    T.StructType()
    .add('click_id', T.IntegerType())
    .add('prediction', T.ShortType())
    .add('pclass1', T.FloatType())
)

def save_stuff(x):
    """Flatten one scored row.

    Copies ``click_id`` and ``prediction`` and stores element 1 of
    ``probability`` as a plain Python float in ``pclass1``.
    """
    class1_probability = float(x.probability[1])
    return T.Row(
        click_id=x.click_id,
        prediction=x.prediction,
        pclass1=class1_probability,
    )

# Flatten every model's predictions into (click_id, prediction, pclass1) rows.
vec_a, vec_b, vec_c = (
    predictions.rdd.map(save_stuff).toDF(schema=mySchema)
    for predictions in (count_a, count_b, count_c)
)

# Take the median of the three models as my final answer

In [161]:
# Give each model's pclass1 its own column name, then line the three models
# up side by side by joining on click_id.
vec_a = vec_a.select('click_id', F.col('pclass1').alias('vec_a'))
vec_b = vec_b.select('click_id', F.col('pclass1').alias('vec_b'))
vec_c = vec_c.select('click_id', F.col('pclass1').alias('vec_c'))

click_key = ['click_id']
joined = vec_a.join(vec_b, click_key)
joined = joined.join(vec_c, click_key)

# Schema of the final output rows: one probability per click_id.
mySchema = (
    T.StructType()
    .add('click_id', T.IntegerType())
    .add('is_attributed', T.FloatType())
)

from statistics import median
def get_predict(x):
    """Build the output row for one click.

    ``is_attributed`` is the median of the three per-model probabilities
    (``vec_a``, ``vec_b``, ``vec_c``) on the joined row.
    """
    model_probabilities = [x.vec_a, x.vec_b, x.vec_c]
    return T.Row(click_id=x.click_id,
                 is_attributed=median(model_probabilities))

# Collapse the three per-model columns into the final (click_id, is_attributed) frame.
joined = joined.rdd.map(get_predict).toDF(schema=mySchema)

In [162]:
# Write the (click_id, is_attributed) rows out as CSV, replacing any previous output.
# NOTE(review): no `header` option is passed here — confirm whether the
# upload step needs a header row added.
joined.write.csv('../data/vote_results.csv', mode='overwrite')

In [163]:
# Shut down the Spark session now that the results have been written.
spark.stop()