# What am I doing here?

- Loading the prefitted models (each trained on a subset of the training data)
and evaluating them on all 185 million rows of the training set.

TODO: A lot of cut-n-paste from the model-building notebook - should refactor.

In [1]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
# This is optional stuff - either pip install watermark
# or just comment it out (it just keeps track of what library
# versions I have)
%load_ext watermark
%watermark -iv

pyspark 2.4.1



In [3]:
# Local-mode settings: comment out the two config lines to run on a cluster,
# and adjust the driver memory to the size of your laptop.
pyspark.sql.SparkSession.builder.config('spark.driver.memory', '8g')
# Number of partitions Spark SQL uses when shuffling for joins/aggregations.
# NOTE(review): the key is now spelled correctly, so this value actually takes
# effect; raise it if the full-data joins become too slow or spill heavily.
pyspark.sql.SparkSession.builder.config('spark.sql.shuffle.partitions', 5)

spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Global Variables 

In [4]:
# Single columns that each have a count table loaded and joined onto the data.
unigrams = ['os', 'channel', 'app']

# Column pairs that each have a count table loaded and joined onto the data.
bigrams = [
    ['device', 'app'],
    ['channel', 'app'],
]

# Read Data

In [5]:
# Load the training CSV (with a header row and inferred column types) and
# discard the 'attributed_time' column, which is not used in this notebook.
df = (
    spark.read
    .csv('../data/train.csv', header=True, inferSchema=True)
    .drop('attributed_time')
)

In [6]:
df.dtypes

[('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('is_attributed', 'int')]

# Daily IP prevalence 

In [7]:
# Day of year of each click; 'doy' is one of the two keys of the daily IP join.
df = df.withColumn('doy', F.dayofyear('click_time'))

In [8]:
# Previously saved IP table; its doy, ip and ip_pct columns are joined onto df.
df_ip_counts = spark.read.parquet('../data/train_ip.parquet')

In [9]:
# Attach ip_pct to every click by (doy, ip). A left join keeps clicks that have
# no matching row in the IP table.
ip_features = df_ip_counts.select('doy', 'ip', 'ip_pct')
df = df.join(ip_features, on=['doy', 'ip'], how='left')

# All the other count tables

In [10]:
def get_count_table( group ):
    """Load the saved count table for one column or for a group of columns.

    Parameters
    ----------
    group : str or sequence of str
        A single column name (for example 'os') or several column names
        (for example ['device', 'app']).

    Returns
    -------
    The DataFrame read from '../data/table_<name>.parquet', where <name> is
    '<column>_pct' for a single column and the column names joined with '_'
    for a group.
    """
    # isinstance rather than an exact type comparison, so that str subclasses
    # are still treated as one column name instead of being joined character
    # by character.
    if isinstance(group, str):
        column_name = group + '_pct' # for example: ip_pct
    else:
        column_name = "_".join(group)  # for example: device_os

    table_name = 'table_' + column_name
    return spark.read.parquet(f'../data/{table_name}.parquet')

In [11]:
def join_table( sdf, count_table, group ):
    """Left-join `count_table` onto `sdf`, using `group` as the join key(s).

    Returns the joined DataFrame; `sdf` itself is not modified.
    """
    return sdf.join(count_table, group, how='left')

In [12]:
# Attach the saved count columns to the full data set: one table per single
# column first, then one table per column pair. Each table is left-joined on
# the column(s) it was built from.
for feature in unigrams:
    df = join_table(df, get_count_table(feature), [feature])

for pair in bigrams:
    df = join_table(df, get_count_table(pair), pair)

In [13]:
# The joins above are all left joins, so rows without a match in a count table
# have nulls in the joined columns; replace those nulls with 0.
df = df.fillna(0)

In [14]:
df.groupby('is_attributed').count().show()

+-------------+---------+
|is_attributed|    count|
+-------------+---------+
|            1|   456846|
|            0|184447044|
+-------------+---------+



# Create input in format expected by Spark

In [15]:
# Feature columns, in the order they are fed to the assembler:
# unigram '<col>_pct' columns, bigram '<col>_<col>' columns, then ip_pct.
unigram_features = [f'{name}_pct' for name in unigrams]
bigram_features = ['_'.join(pair) for pair in bigrams]
input_cols = unigram_features + bigram_features + ['ip_pct']
input_cols

['os_pct', 'channel_pct', 'app_pct', 'device_app', 'channel_app', 'ip_pct']

In [16]:
# Pack the feature columns into a single 'features' vector column, then keep
# only the label and that vector.
vec_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
df_assembled = vec_assembler.transform(df)
df_in = df_assembled.select('is_attributed', 'features')

In [17]:
# Scores predictions against the 'is_attributed' label. No metricName is
# passed, so the evaluator's default metric applies.
evaluator = BinaryClassificationEvaluator(labelCol = 'is_attributed')

# GBT models written to disk by Bestof.ipynb

In [43]:
# Load the three saved GBT models, one per file.
# NOTE(review): the files are named tvs_* but are loaded as plain
# GBTClassificationModel objects - confirm this matches how they were saved.
model_paths = (
    '../data/tvs_a.model',
    '../data/tvs_b.model',
    '../data/tvs_c.model',
)
gbtc_a, gbtc_b, gbtc_c = [
    GBTClassificationModel.read().load(path) for path in model_paths
]

In [44]:
# Score every row with model A, then display the evaluator's metric for it.
results_a = gbtc_a.transform(df_in)
evaluator.evaluate(results_a)

0.9707573820497113

In [45]:
# Score every row with model B, then display the evaluator's metric for it.
results_b = gbtc_b.transform(df_in)
evaluator.evaluate(results_b)

0.9707904513351963

In [46]:
# Score every row with model C, then display the evaluator's metric for it.
results_c = gbtc_c.transform(df_in)
evaluator.evaluate(results_c)

0.9708539571530767

In [53]:
# Average of the three evaluator scores displayed above (values copied by hand).
# `mean` is imported in this cell so that it also works on a fresh
# top-to-bottom run, where no earlier cell has imported it yet.
from statistics import mean

mean([0.9707573820497113, 0.9707904513351963, 0.9708539571530767])

0.9708005968459947

In [47]:
def gather_stats(sdf):
    """Compute precision, recall and accuracy from a scored DataFrame.

    Parameters
    ----------
    sdf : DataFrame
        Must contain an 'is_attributed' label column and a 'prediction'
        column. Predictions are cast to integers; only rows whose label and
        prediction are both 0 or 1 are counted.

    Returns
    -------
    tuple of float
        (precision, recall, accuracy).

    Raises
    ------
    ValueError
        If a metric is undefined: no counted rows at all, no predicted
        positives (precision) or no actual positives (recall).
    """
    rows = (
        sdf.select('is_attributed',
                   F.col('prediction').astype(T.IntegerType()))
        .groupby(['is_attributed', 'prediction'])
        .count()
        .collect()
    )

    # One collected row per (label, prediction) pair. The count column is read
    # by key, exactly as the aggregation names it.
    cells = {(row.is_attributed, row.prediction): row['count'] for row in rows}

    # A pair that never occurs simply has a count of 0; that is a legitimate
    # result (e.g. no false positives), not an error.
    tp = cells.get((1, 1), 0)
    fp = cells.get((0, 1), 0)
    tn = cells.get((0, 0), 0)
    fn = cells.get((1, 0), 0)

    total = tp + fp + tn + fn
    if total == 0:
        raise ValueError('no rows with 0/1 labels and predictions to score')
    if tp + fp == 0:
        raise ValueError('precision is undefined: no positive predictions')
    if tp + fn == 0:
        raise ValueError('recall is undefined: no positive labels')

    # precision = true_pos / (true_pos + false_pos)
    precision = tp / (tp + fp)
    # recall = true_pos / pos
    recall = tp / (tp + fn)
    # accuracy = correct / all counted rows
    acc = (tp + tn) / total
    return precision, recall, acc
                    

In [48]:
# Slow: each call below aggregates the scored data set, so expect a long wait.
# The dots are a crude progress indicator, one more per model.
print(".")
stats_a = gather_stats(results_a)
pre_a, rec_a, acc_a = stats_a
print("..")
stats_b = gather_stats(results_b)
pre_b, rec_b, acc_b = stats_b
print("...")
stats_c = gather_stats(results_c)
pre_c, rec_c, acc_c = stats_c

In [49]:
pre_a, pre_b, pre_c

(0.11577750911463025, 0.11838944905689125, 0.11741209049192722)

In [50]:
rec_a, rec_b, rec_c

(0.8581863472592515, 0.8572100883010905, 0.8566300241219141)

In [51]:
acc_a, acc_b, acc_c

(0.9834560430286242, 0.9838756393929841, 0.9837360641790717)

In [52]:
from statistics import mean

# Average each metric over the three models before reporting it.
avg_precision = mean([pre_a, pre_b, pre_c])
avg_recall = mean([rec_a, rec_b, rec_c])
avg_accuracy = mean([acc_a, acc_b, acc_c])

print('avg precision', avg_precision)
print('avg recall', avg_recall)
print('avg accuracy', avg_accuracy)

avg precision 0.11719301622114957
avg recall 0.8573421532274187
avg accuracy 0.9836892488668934


In [None]:
# Deliberate failure: stops a run of consecutive cells here, so the cell that
# shuts down the Spark session is only ever executed on purpose.
assert(False)

In [54]:
spark.stop()