# What am I doing here?

- Counting the prevalence of various features (the ones that were predictive in earlier MVP models)
- Each count is normalized by the max count found (for the IP counts, the max within each day)
- All the count tables are written out as parquet files so I don't have to recount them for each model

In [51]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [52]:
from itertools import combinations

In [53]:
# This is optional stuff - either pip install watermark
# or just comment it out (it just keeps track of what library
# versions I have)
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
pyspark 2.4.1



In [54]:
# Comment these out to run on a cluster. Also, adjust memory to the size of your laptop.
# The shuffle key must be spelled exactly 'spark.sql.shuffle.partitions';
# a misspelled key does not change the number of shuffle partitions.
pyspark.sql.SparkSession.builder.config('spark.driver.memory', '8g')
pyspark.sql.SparkSession.builder.config('spark.sql.shuffle.partitions', 5)

<pyspark.sql.session.SparkSession.Builder at 0x10f913c18>

In [55]:
# Get the SparkSession used by every cell below (created if none exists yet).
spark = pyspark.sql.SparkSession.builder.getOrCreate()


## Define global variables for my features

In [56]:
# Single columns that are counted on their own.
unigrams = ['os', 'channel', 'app']

# Column pairs that are counted jointly (each pair is a list of column names).
bigrams = [
    ['device', 'os'],
    ['device', 'channel'],
    ['device', 'app'],
    ['channel', 'app'],
]

# Read in all the stuff I want to count

In [57]:
# Load the training clicks; the first row is the header and column
# types are inferred from the data.
train = spark.read.csv(
    '../data/train.csv',
    header=True,
    inferSchema=True,
)

In [58]:
# Neither of these columns is used by any of the counts below.
train = train.drop('attributed_time', 'is_attributed')
all_data = train
all_data.dtypes

[('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp')]

# Make count tables

Just using the columns from the best-so-far model. 

In [59]:
def make_count_table(groupby_clause):
    """Register a temp view of normalized group counts over ``all_data``.

    Rows of the module-level ``all_data`` DataFrame are grouped by the given
    column(s) and counted; every count is then divided by the largest count,
    so the most frequent group gets the value 1.0.

    The result is registered as a temp view called ``'table_' + column_name``
    where ``column_name`` is:

    * ``<column>_pct`` for a single column name (for example ``os_pct``),
    * the column names joined with ``_`` for several columns
      (for example ``device_os``).

    The view holds the grouping column(s) plus one ``column_name`` column
    with the ratio, sorted by descending count.

    Args:
        groupby_clause: a single column name (``str``) or an iterable of
            column names.

    Returns:
        None. The only effect is the registered temp view.

    Raises:
        ValueError: if ``all_data`` has no rows, so there is no maximum
            count to normalize by.
    """
    # isinstance() rather than type() == str, so str subclasses are also
    # treated as a single column name.
    if isinstance(groupby_clause, str):
        group_cols = [groupby_clause]
        column_name = groupby_clause + '_pct'  # for example: os_pct
    else:
        # Materialize once so tuples and other iterables work for both the
        # groupby call and the name construction.
        group_cols = list(groupby_clause)
        column_name = "_".join(group_cols)  # for example: device_os

    counts_sdf = all_data.groupby(group_cols).count()

    # Take the max on the unsorted counts: the ordering is irrelevant for it.
    maxcnt = counts_sdf.agg(F.max('count').alias('maxcnt')).first()['maxcnt']
    if maxcnt is None:
        raise ValueError(
            'cannot normalize counts for %r: all_data has no rows'
            % (groupby_clause,)
        )

    ratio = F.col('count').astype(T.DoubleType()) / float(maxcnt)
    counts_sdf = (
        counts_sdf
        .orderBy('count', ascending=False)
        .withColumn(column_name, ratio)
        .drop('count')
    )

    table_name = 'table_' + column_name
    counts_sdf.createOrReplaceTempView(table_name)
    

## Create all the parquet files

In [10]:
# Register one normalized-count temp view per single-column feature.
for c in unigrams:
    make_count_table( c )

In [11]:
# Register one normalized-count temp view per column pair.
for bigram in bigrams:
    make_count_table( bigram )

In [12]:
# Rebuild the temp view names registered by make_count_table: single
# columns carry a '_pct' suffix, column pairs are joined with '_'.
unigram_tables = [f'table_{c}_pct' for c in unigrams]
big_tables = ['table_' + '_'.join(b) for b in bigrams]
all_tables = [*unigram_tables, *big_tables]

# Write every view to its own parquet file, replacing any earlier output.
for table in all_tables:
    df = spark.table(table)
    df.write.parquet(
        f'../data/{table}.parquet',
        mode='overwrite',
    )

# IP counting

In [60]:
# Load the test clicks the same way as the training clicks.
test = spark.read.csv(
    '../data/test.csv',
    header=True,
    inferSchema=True,
)

# Add the day of year of each click as 'doy' to both data sets.
train = train.withColumn(
    'doy',
    F.dayofyear('click_time'),
)
test = test.withColumn(
    'doy',
    F.dayofyear('click_time'),
)

In [61]:
# Display the column types to check that 'doy' is present.
train.dtypes

[('ip', 'int'),
 ('app', 'int'),
 ('device', 'int'),
 ('os', 'int'),
 ('channel', 'int'),
 ('click_time', 'timestamp'),
 ('doy', 'int')]

In [7]:
def make_ip_counts(sdf):
    """Return per-day ip frequencies scaled by each day's largest count.

    Args:
        sdf: Spark DataFrame with at least the columns 'doy' and 'ip'.

    Returns:
        Spark DataFrame with the columns 'doy', 'ip' and 'ip_pct', where
        'ip_pct' is the number of rows for that ip on that day divided by
        the highest such number among all ips on the same day.
    """
    keys = ['doy', 'ip']

    # Number of rows per (day, ip) pair.
    per_ip = sdf.select(keys).groupBy(keys).count()

    # Largest per-ip count seen on each day.
    per_day_peak = per_ip.groupBy('doy').agg(F.max('count').alias('day_max'))

    # Attach each day's peak to every (day, ip) row of that day.
    joined = per_ip.join(per_day_peak, on=['doy'], how='left')

    # Both operands are cast to float before dividing.
    share = (
        F.col('count').cast(T.FloatType())
        / F.col('day_max').cast(T.FloatType())
    )
    return joined.withColumn('ip_pct', share).drop('count', 'day_max')

In [15]:
# Per-day normalized ip counts for the training data.
train_ip = make_ip_counts(train)

In [16]:
# Per-day normalized ip counts for the test data.
test_ip = make_ip_counts(test)

In [17]:
# Write both ip tables to parquet, replacing any earlier output.
train_ip.write.parquet(
    '../data/train_ip.parquet',
    mode='overwrite',
)
test_ip.write.parquet(
    '../data/test_ip.parquet',
    mode='overwrite',
)

In [34]:
# Shut down the Spark session; this is the last cell of the notebook.
spark.stop()