In [12]:
import findspark
import os
findspark.init()
import pyspark
from pyspark import SparkFiles
from pyspark import sql
from pyspark import SparkConf

from pyspark.sql import SQLContext, HiveContext
from pyspark import SparkContext


In [13]:
from pyspark.sql import Row
from pyspark.sql import DataFrameWriter
from pyspark.sql import DataFrameReader
from pyspark.sql import GroupedData

from pyspark import StorageLevel

from pyspark.sql import functions as F
from pyspark.sql.functions import struct
from pyspark.sql import GroupedData

from pyspark.ml.feature import *

In [14]:
from functools import partial
import collections
import numpy as np

from datasu.auc import *

from datasu.dicts import *
from datasu.files import *
from datasu.pandas import *
from datasu.persist import *
from datasu.spark import *

import pandas as pd

In [22]:
# Build the Spark configuration for this session. Every SparkConf setter
# returns the conf itself, so the settings are written as one chain.
conf = (
    SparkConf()
    .set('spark.executor.memory', '4g')
    # NOTE(review): Spark's configuration docs say driver memory cannot be
    # changed through SparkConf in client mode once the driver JVM is running;
    # confirm this setting actually takes effect here.
    .set('spark.driver.memory', '12g')
    .set('spark.python.worker.memory', '2g')
    .set("spark.driver.maxResultSize", "5g")
    # NOTE(review): `spark.executor.max` does not look like a documented Spark
    # property; confirm the intended key.
    .set("spark.executor.max", 3)
    .set("spark.cores.max", 28)
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+UseCompressedOops')
    .setAppName('prepare features')
)
# Display the effective key/value pairs.
conf.getAll()

[(u'spark.master', u'spark://spark1.ea.lab:7077'),
 (u'spark.executor.max', u'3'),
 (u'spark.driver.memory', u'12g'),
 (u'spark.submit.pyFiles',
  u'/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.executor.memory', u'4g'),
 (u'spark.jars',
  u'file:/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,file:/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,file:/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.app.name', u'prepare features'),
 (u'spark.driver.maxResultSize', u'5g'),
 (u'spark.files',
  u'file:/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,file:/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,file:/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.serializer', u'org.apache.spark.serializer.KryoSerializer'),
 (u'spark.cores.max', u'28'),
 (u's

In [23]:
# Best-effort shutdown of a SparkContext left over from a previous run of this
# notebook, so the next cell can create a fresh one.
try:
    sc.stop()
except Exception:
    # Typically a NameError because `sc` has not been created in this kernel.
    # `Exception` (rather than a bare `except:`) keeps the best-effort
    # behaviour without swallowing KeyboardInterrupt / SystemExit.
    print('spark context not exists')

In [24]:
   
# Create the SparkContext from `conf` and a SQLContext on top of it for
# DataFrame operations.
sc = pyspark.SparkContext(conf=conf)
sqc = pyspark.SQLContext(sc)
# psc = PSparkContext(sc)

In [None]:
# Display the context's default parallelism and default minimum partitions.
sc.defaultParallelism, sc.defaultMinPartitions

In [None]:
# Reusable DataFrameReader for CSV input through the spark-csv data source,
# with the header and schema-inference options switched on.
csv_reader = sqc.read.format('com.databricks.spark.csv').options(header='true', inferschema='true')

## LOAD DATA

In [None]:
# Locations of the input files and of the Spark output directory.
# NOTE(review): `base_data_path` is an absolute, machine-specific path.
base_data_path = '/home/ds/dev/data/Kagle-ValuesShoppers/'
# Output location is given an explicit `file://` scheme.
spark_data_path = 'file://'+ base_data_path + 'spark_data/'
# File name of the transactions input, also used as the RDD name after loading.
transactions_name = 'transactions'

In [None]:
# Load the small offer tables with pandas.
# `offers` is reduced to the six columns used downstream.
df_coupons = pd.read_csv(base_data_path+'offers')[['offer','category','company','brand','offervalue','quantity']]
# Train and test history: `id` is renamed to `customer_id` so every table
# shares the same key name.
df_offers_ids = pd.read_csv(base_data_path+'trainHistory').rename(columns={'id': 'customer_id'})
df_offers_ids_subm = pd.read_csv(base_data_path+'testHistory').rename(columns={'id': 'customer_id'})
# df_trans_all = pd.read_csv(base_data_path+'transactions_reduced_category').rename(columns={'id': 'customer_id'})

In [None]:
# Attach the offer attributes to each training history row (pandas default
# inner merge on `offer`) and fix the column order; the training frame keeps
# the `repeattrips` / `repeater` columns.
df_offers_hist = pd.merge(df_offers_ids, df_coupons, on=['offer'])
df_offers_hist = df_offers_hist[['customer_id','chain','offer','market','category','company','brand','offerdate','offervalue','quantity','repeattrips','repeater']]

# Same for the submission (test) history, which has no target columns.
df_offers_subm = pd.merge(df_offers_ids_subm, df_coupons, on=['offer'])
df_offers_subm = df_offers_subm[['customer_id','chain','offer','market','category','company','brand','offerdate','offervalue','quantity']]

In [None]:
# Load the transactions file as a Spark DataFrame.
# NOTE(review): unlike `spark_data_path`, this path has no `file://` scheme;
# confirm it resolves to the intended filesystem on the cluster.
ddf_transactions = csv_reader.load(base_data_path+transactions_name, samplingRatio=None)
ddf_transactions.rdd.setName(transactions_name)
# NOTE(review): the result of `alias` is not assigned, so this line does not
# change `ddf_transactions`.
ddf_transactions.alias('transactions')

# Not the last expression of the cell, so the partition count is not displayed.
ddf_transactions.rdd.getNumPartitions()
# Use the same key name as the offer tables.
ddf_transactions = ddf_transactions.withColumnRenamed('id','customer_id')

In [None]:
# Offer columns that are appended to the VectorAssembler inputs further down,
# split into categorical identifiers and numeric values.
cat_cols = ['chain','market','category','company','brand']
num_cols = ['offervalue','quantity']

## EXPLORE DATA

In [None]:
# Print the first five transaction rows.
ddf_transactions.show(5)

In [None]:
# Number of distinct `dept` values in the transactions.
ddf_transactions.select('dept').distinct().count()

In [None]:
# Number of distinct `category` values in the transactions.
ddf_transactions.select('category').distinct().count()

In [None]:
# Number of distinct `company` values in the transactions.
ddf_transactions.select('company').distinct().count()

In [None]:
# Number of distinct `brand` values; this bounds the width of an unfiltered
# pivot on `brand`.
ddf_transactions.select('brand').distinct().count()

In [None]:
# Number of distinct customers present in the transactions.
ddf_transactions.select('customer_id').distinct().count()

## PREPARE DATA

In [None]:
# Distinct customer ids that appear in either the training or the submission
# offers, as a single-column Spark DataFrame.
ddf_offers_hist_ids = sqc.createDataFrame(df_offers_hist).select('customer_id')
ddf_offers_all_ids = ddf_offers_hist_ids.unionAll(sqc.createDataFrame(df_offers_subm).select('customer_id')).distinct()

In [None]:
# Keep only the transactions of customers that received an offer: a left-semi
# join filters the left side and adds no columns from the id table.
# NOTE(review): the cell reassigns `ddf_transactions`, so the unfiltered frame
# is no longer reachable under this name afterwards.
ddf_transactions = ddf_transactions.join(ddf_offers_all_ids, on='customer_id', how='leftsemi')
# NOTE(review): 10000 is a hard-coded partition count; confirm it suits the
# cluster (28 cores are requested in the conf).
ddf_transactions = ddf_transactions.repartition(10000)

In [None]:
# Display the partition count after the repartition.
ddf_transactions.rdd.getNumPartitions()


## PREPARE FEATURES

In [None]:
# Label -> Spark aggregate function; presumably the labels end up in the
# generated column names (depends on `get_ddf_aggs`, not visible here).
summ_grouping = {'total':F.sum, 'average':F.avg }
count_grouping = {'count':F.count }

# Pre-bind the column list, function map and prefix of `get_ddf_aggs`
# (presumably from the datasu star-imports; confirm). The remaining positional
# argument is supplied at call time.
count_agg = partial(get_ddf_aggs, agg_columns=['customer_id'], agg_funcs=count_grouping, prefix='agg_')
total_avg_agg = partial(get_ddf_aggs, agg_columns=['productsize','purchasequantity','purchaseamount'], agg_funcs=summ_grouping, prefix='agg_')

In [None]:
# grpby_columns = ['customer_id','brand', 'category', 'dept']
# Columns actually used for grouping.
grpby_columns = ['customer_id','brand','category']
# Columns passed to the aggregate builders.
# NOTE(review): this list omits `category` although the grouping includes it;
# confirm that is intended.
grpby_columns_name = ['customer_id','brand']


### agg customer_brand 

In [None]:
# Aggregate the transactions per `grpby_columns` group using the count and
# total/average expression lists concatenated together.
# NOTE(review): the variable is named customer_brand but the grouping also
# includes `category`.
ddf_trans_agg_customer_brand = ddf_transactions.groupBy(grpby_columns) \
                                        .agg(*(count_agg(grpby_columns_name) + total_avg_agg(grpby_columns_name)))

In [None]:
# Display the column names produced by the aggregation.
ddf_trans_agg_customer_brand.columns

In [None]:
# Display the transaction column names for comparison.
ddf_transactions.columns

### support = 0.1, i.e. 10% of rows (variables suffixed `support1` / `fs1`)

In [None]:
# Brands that `freqItems` reports as frequent at support=0.1 (a fraction of
# rows, so 10%). `freqItems` returns a one-row DataFrame whose
# `brand_freqItems` field holds the values.
frequent_brands_support1 = ddf_transactions.freqItems(['brand'], support=0.1).first().asDict()['brand_freqItems']
# Convert the values to strings for use as pivot values (a list on Python 2).
frequent_brands_support1 = map(str,frequent_brands_support1)

In [None]:
# One row per customer and one column per frequent brand, holding the number
# of that customer's transactions for the brand.
ddf_onehot_trans_agg_customer_brand_fs1 = ddf_transactions.groupBy('customer_id') \
                                                    .pivot('brand', frequent_brands_support1).count()

In [None]:
def pivot_aggregate(ddf, grpby_columns, pivot_column, aggs, pivot_filter_values=None, pivot_filter_support=None):
    """Group `ddf`, pivot on one column and aggregate every pivoted cell.

    Args:
        ddf: DataFrame to aggregate.
        grpby_columns: iterable of columns, unpacked into `groupBy`.
        pivot_column: name of the column whose values become output columns.
        aggs: iterable of aggregate expressions, unpacked into `agg`.
        pivot_filter_values: optional explicit values to pivot on. A non-empty
            value takes precedence over `pivot_filter_support`.
        pivot_filter_support: optional `support` passed to `freqItems`. Used
            only when no explicit values are given; the frequent values of
            `pivot_column` (converted with `str`) become the pivot values.

    Returns:
        The result of `groupBy(...).pivot(...).agg(...)`. When neither filter
        is given, `pivot` receives `None` as its value list.
    """
    if pivot_filter_support and not pivot_filter_values:
        frequent = ddf.freqItems([pivot_column], support=pivot_filter_support).first().asDict()[pivot_column + '_freqItems']
        # Build a real list: `map` is a lazy iterator on Python 3, while
        # `pivot` expects a list of values.
        pivot_filter_values = [str(value) for value in frequent]

    grouped = ddf.groupBy(*grpby_columns)
    pivoted = grouped.pivot(pivot_column, pivot_filter_values)
    return pivoted.agg(*aggs)
    

In [None]:
# Scratch inspection of the groupBy -> pivot chain.
# The grouped object gets its own name so it does not replace the fs1
# DataFrame that later cells call `.count()` and `rename_columns` on, and
# `pivot` receives its required pivot column (a bare `pivot()` raises
# TypeError).
grouped_by_customer = ddf_transactions.groupBy('customer_id')
grouped_by_customer.pivot('brand', frequent_brands_support1)

### support = 0.01, i.e. 1% of rows (variables suffixed `support01` / `fs01`)

In [None]:
# Brands that `freqItems` reports as frequent at support=0.01 (a fraction of
# rows, so 1%).
frequent_brands_support01 = ddf_transactions.freqItems(['brand'], support=0.01).first().asDict()['brand_freqItems']
# Convert the values to strings for use as pivot values (a list on Python 2).
frequent_brands_support01 = map(str,frequent_brands_support01)

In [None]:
# Build the fs01 per-customer brand counts through `pivot_aggregate`.
# The original call passed no arguments (TypeError) and assigned to
# `frequent_brands_support01`, which would have replaced the brand list the
# next cell pivots on. `count(lit(1))` is a plain row count per pivot cell.
ddf_onehot_trans_agg_customer_brand_fs01 = pivot_aggregate(
    ddf_transactions, ['customer_id'], 'brand', [F.count(F.lit(1))],
    pivot_filter_values=frequent_brands_support01)

In [None]:
# Per-customer transaction counts for each brand in `frequent_brands_support01`.
ddf_onehot_trans_agg_customer_brand_fs01 = ddf_transactions.groupBy('customer_id') \
                                                    .pivot('brand', frequent_brands_support01).count()

### support = 0.001, i.e. 0.1% of rows (variables suffixed `support001` / `fs001`)

In [None]:
# Brands that `freqItems` reports as frequent at support=0.001 (a fraction of
# rows, so 0.1%).
frequent_brands_support001 = ddf_transactions.freqItems(['brand'], support=0.001).first().asDict()['brand_freqItems']
# Convert the values to strings for use as pivot values (a list on Python 2).
frequent_brands_support001 = map(str,frequent_brands_support001)

In [None]:
# Per-customer transaction counts for each brand in `frequent_brands_support001`.
ddf_onehot_trans_agg_customer_brand_fs001 = ddf_transactions.groupBy('customer_id') \
                                                    .pivot('brand', frequent_brands_support001).count()

In [None]:
def pivot_aggs(df, aggs):
    """Unfinished placeholder taking a DataFrame and a list of aggregates.

    The cell originally held only the signature, which is a syntax error.
    Until the body is written, calling it fails loudly instead of silently
    returning None.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError('pivot_aggs is not implemented yet')

In [None]:
# Brand counts pivoted over every brand value (no pivot value filter).
# The chain is parenthesised so the `.pivot(...)` line is a valid
# continuation; the original lacked a trailing backslash and did not parse.
# NOTE(review): `grpby_columns` contains `brand`, the pivot column itself, and
# `category`, whereas the filtered variants group by `customer_id` only;
# confirm the grouping. An unfiltered pivot is also subject to Spark's limit
# on distinct pivot values (`spark.sql.pivotMaxValues`).
ddf_onehot_trans_agg_customer_brand_all = (
    ddf_transactions.groupBy(grpby_columns)
    .pivot('brand')
    .count()
)

In [None]:
# Persist the unfiltered pivot under `spark_data_path`.
# `write_ddf_to_csv` is presumably a datasu helper; its behaviour is not
# visible in this file.
write_ddf_to_csv(ddf_onehot_trans_agg_customer_brand_all, spark_data_path+'ddf_onehot_trans_agg_customer_brand_all')

In [None]:
# ddf_onehot_trans_agg_customer_brand_fs1 = csv_reader.load(spark_data_path+'ddf_onehot_trans_agg_customer_brand_fs1', samplingRatio=None)
# ddf_onehot_trans_agg_customer_brand_fs01 = csv_reader.load(spark_data_path+'ddf_onehot_trans_agg_customer_brand_fs01', samplingRatio=None)

In [None]:
# Row count of the fs1 pivot (forces its evaluation).
ddf_onehot_trans_agg_customer_brand_fs1.count()

In [None]:
# Row count of the fs01 pivot (forces its evaluation).
ddf_onehot_trans_agg_customer_brand_fs01.count()

## merge with offers history

In [None]:
def rename_columns(df, prefix='', suffix='', separator='_', columns=None):
    """Return a copy of `df` with a prefix and/or suffix added to column names.

    Args:
        df: DataFrame whose columns are renamed; it is not modified.
        prefix: text put in front of each name, followed by `separator`.
            An empty prefix adds nothing.
        suffix: text appended to each name, preceded by `separator`.
            An empty suffix adds nothing.
        separator: glue between prefix/suffix and the original name.
        columns: names to rename; `None` means every column of `df`.

    Returns:
        A new DataFrame obtained through successive `withColumnRenamed` calls.
    """
    if prefix:
        prefix = prefix + separator
    if suffix:
        suffix = separator + suffix
    if columns is None:
        columns = df.columns

    renamed = df.select('*')
    for old_name in columns:
        new_name = prefix + old_name + suffix
        renamed = renamed.withColumnRenamed(old_name, new_name)
    return renamed


def filter_columns(expr, df):
    """Return the column names of `df` that match the regular expression `expr`.

    Matching uses `re.match`, so the pattern is anchored at the start of the
    name only. `expr` is a regex: in `'left.*'` the dot matches any character.

    Args:
        expr: regular expression pattern.
        df: object exposing a `columns` sequence of names.

    Returns:
        A list of matching names in `df.columns` order. It is always a list,
        so the result can be reused or indexed on Python 3 as well, where
        `filter` would give a one-shot iterator.
    """
    import re
    pattern = re.compile(expr)
    return [name for name in df.columns if pattern.match(name)]

In [None]:
# Select the fs1 pivot as the active one-hot table and prefix any `cat_cols`
# it contains with `left.`.
# NOTE(review): the two following cells assign the same name from the fs01 and
# fs001 pivots, so on a top-to-bottom run the last assignment wins.
ddf_onehot_brand_fs1 = ddf_onehot_trans_agg_customer_brand_fs1
ddf_onehot_brand_fs1 = rename_columns(ddf_onehot_brand_fs1, prefix = 'left', separator='.', columns=cat_cols)

In [None]:
# Select the fs01 pivot as the active one-hot table and prefix any `cat_cols`
# it contains with `left.`.
# The rename takes the variable assigned on the line above; the original
# referenced `ddf_onehot_brand_fs01`, which is never defined (NameError).
ddf_onehot_brand_fs1 = ddf_onehot_trans_agg_customer_brand_fs01
ddf_onehot_brand_fs1 = rename_columns(ddf_onehot_brand_fs1, prefix = 'left', separator='.', columns=cat_cols)

In [None]:
# Select the fs001 pivot as the active one-hot table and prefix any `cat_cols`
# it contains with `left.`.
# NOTE(review): the variable keeps the `_fs1` suffix although it now holds the
# fs001 table.
ddf_onehot_brand_fs1 = ddf_onehot_trans_agg_customer_brand_fs001
ddf_onehot_brand_fs1 = rename_columns(ddf_onehot_brand_fs1, prefix = 'left', separator='.', columns=cat_cols)

#### train

In [None]:
# Training offers as a Spark DataFrame.
ddf_offers_hist = sqc.createDataFrame(df_offers_hist)

In [None]:
# Row count before the join, for comparison with the joined frame.
ddf_offers_hist.count()

In [None]:
# Left outer join: every training offer row is kept; customers without a row
# in the one-hot table get nulls in the brand columns.
ddf_offers__onehot_brand_fs1 = ddf_offers_hist.join(ddf_onehot_brand_fs1,
                                                     on=['customer_id'], how='left_outer')

In [None]:
# Row count after the join.
ddf_offers__onehot_brand_fs1.count()

In [None]:
# Feature columns are the columns the join added on top of the offer columns
# (minus any whose name matches 'left.*'), followed by the offer's categorical
# and numeric columns.
# They are taken in the DataFrame's own column order rather than from
# list(set(...)): set iteration order is arbitrary, so positions inside the
# `features` vector were not guaranteed to be stable or to match the
# submission vector.
# NOTE(review): the left outer join and the pivot can both leave nulls in the
# brand columns; confirm VectorAssembler accepts them in this Spark version,
# or fill them with 0 first.
excluded_cols = set(filter_columns('left.*', ddf_offers__onehot_brand_fs1)) | set(ddf_offers_hist.columns)
onehot_cols = [c for c in ddf_offers__onehot_brand_fs1.columns if c not in excluded_cols]
vecAssembler_fs1 = VectorAssembler(inputCols=onehot_cols + cat_cols + num_cols,
                                   outputCol="features")

# Keep only the key, the assembled vector and the target.
ddf_offers__onehot_brand_fs1 = vecAssembler_fs1.transform(ddf_offers__onehot_brand_fs1) \
                                                           .select('customer_id', 'features', 'repeater')

#### submission

In [None]:
# Submission offers as a Spark DataFrame.
ddf_offers_subm = sqc.createDataFrame(df_offers_subm)

In [None]:
# Display the columns of the pandas submission frame.
df_offers_subm.columns

In [None]:
# Left outer join: every submission offer row is kept; customers without a row
# in the one-hot table get nulls in the brand columns.
ddf_offers_subm__onehot_brand_fs1 = ddf_offers_subm.join(ddf_onehot_brand_fs1,
                                                     on=['customer_id'], how='left_outer')

In [None]:
# Assembler for the submission frame, with the same input recipe as the
# training one: columns added by the join (minus any matching 'left.*'),
# then the categorical and numeric offer columns.
# Columns are taken in DataFrame order rather than from list(set(...)), whose
# arbitrary iteration order could place features at different vector
# positions than in the training data.
subm_excluded_cols = set(filter_columns('left.*', ddf_offers_subm__onehot_brand_fs1)) | set(ddf_offers_subm.columns)
subm_onehot_cols = [c for c in ddf_offers_subm__onehot_brand_fs1.columns if c not in subm_excluded_cols]
vecAssembler_fs1 = VectorAssembler(inputCols=subm_onehot_cols + cat_cols + num_cols,
                                   outputCol="features")

In [None]:
# Assemble the feature vector and keep only the key and the vector (the
# submission data has no target column).
ddf_offers_subm__onehot_brand_fs1 = vecAssembler_fs1.transform(ddf_offers_subm__onehot_brand_fs1) \
                                                    .select('customer_id', 'features')

#### persist

In [None]:
# Persist the assembled train and submission frames under the `fs1` names.
# NOTE(review): the data written is whatever one-hot table was last assigned
# to `ddf_onehot_brand_fs1` before the joins ran.
write_ddf_to_csv(ddf_offers__onehot_brand_fs1, spark_data_path+'ddf_offers__onehot_brand_fs1')
write_ddf_to_csv(ddf_offers_subm__onehot_brand_fs1, spark_data_path+'ddf_offers_subm__onehot_brand_fs1')

In [None]:
# Persist under the `fs01` names.
# NOTE(review): this writes the same two variables as the `fs1` cell; the
# output only differs if the fs01 selection, join and assemble cells were
# re-run by hand beforehand.
write_ddf_to_csv(ddf_offers__onehot_brand_fs1, spark_data_path+'ddf_offers__onehot_brand_fs01')
write_ddf_to_csv(ddf_offers_subm__onehot_brand_fs1, spark_data_path+'ddf_offers_subm__onehot_brand_fs01')

In [None]:
# Persist under the `fs001` names.
# NOTE(review): this writes the same two variables as the `fs1` cell; the
# output only differs if the fs001 selection, join and assemble cells were
# re-run by hand beforehand.
write_ddf_to_csv(ddf_offers__onehot_brand_fs1, spark_data_path+'ddf_offers__onehot_brand_fs001')
write_ddf_to_csv(ddf_offers_subm__onehot_brand_fs1, spark_data_path+'ddf_offers_subm__onehot_brand_fs001')