In [1]:
import findspark
import os
findspark.init()
import pyspark
from pyspark import SparkFiles
from pyspark import sql
from pyspark import SparkConf

from pyspark.sql import SQLContext, HiveContext
from pyspark import SparkContext


In [2]:
from pyspark.sql import Row
from pyspark.sql import DataFrameWriter
from pyspark.sql import DataFrameReader
from pyspark import StorageLevel

from pyspark.sql import functions as F
from pyspark.sql.functions import struct
from pyspark.sql import GroupedData


In [2]:
from functools import partial
import numpy as np

from datasu.auc import *
from datasu.dicts import *
from datasu.files import *
from datasu.pandas import *
from datasu.persist import *
from datasu.spark import *

import pandas as pd

ImportError: No module named pyspark.sql

In [None]:
conf = SparkConf()
conf.set('spark.executor.memory', '8g')
conf.set('spark.driver.memory', '12g')
conf.set('spark.python.worker.memory', '2g')
conf.set("spark.driver.maxResultSize", "5g")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+UseCompressedOops')

conf.setAppName('prepare features')
conf.getAll()

In [None]:
try:
    sc.stop()
except:
    print 'spark context not exists'

In [6]:
   
sc = pyspark.SparkContext(conf=conf)
sqc = pyspark.SQLContext(sc)
# psc = PSparkContext(sc)

In [7]:
sc.defaultParallelism, sc.defaultMinPartitions

(2, 2)

In [8]:
csv_reader = sqc.read.format('com.databricks.spark.csv').options(header='true', inferschema='true')

## LOAD DATA

In [51]:
base_data_path = '/home/ds/dev/data/Kagle-ValuesShoppers/'
spark_data_path = 'file://'+ base_data_path + 'spark_data/'
transactions_name = 'transactions'

In [13]:
df_coupons = pd.read_csv(base_data_path+'offers')[['offer','category','company','brand','offervalue','quantity']]
df_offers_ids = pd.read_csv(base_data_path+'trainHistory').rename(columns={'id': 'customer_id'})
df_offers_ids_submission = pd.read_csv(base_data_path+'testHistory').rename(columns={'id': 'customer_id'})
# df_trans_all = pd.read_csv(base_data_path+'transactions_reduced_category').rename(columns={'id': 'customer_id'})

In [14]:
df_offers_all = pd.merge(df_offers_ids, df_coupons, on=['offer'])
df_offers_all = df_offers_all[['customer_id','chain','offer','market','category','company','brand','offerdate','offervalue','quantity','repeattrips','repeater']]

df_offers_all_submission = pd.merge(df_offers_ids_submission, df_coupons, on=['offer'])
df_offers_all_submission = df_offers_all_submission[['customer_id','chain','offer','market','category','company','brand','offerdate','offervalue','quantity']]

In [15]:
ddf_transactions = csv_reader.load(base_data_path+transactions_name, samplingRatio=None)
ddf_transactions.rdd.setName(transactions_name)
ddf_transactions.alias('transactions')

ddf_transactions.rdd.getNumPartitions()
ddf_transactions = ddf_transactions.withColumnRenamed('id','customer_id')

## EXPLORE DATA

In [None]:
ddf_transactions.show(5)

In [None]:
ddf_transactions.select('dept').distinct().count()

In [None]:
ddf_transactions.select('category').distinct().count()

In [None]:
ddf_transactions.select('company').distinct().count()

In [None]:
ddf_transactions.select('brand').distinct().count()

In [None]:
ddf_transactions.select('customer_id').distinct().count()

## PREPARE DATA

## PREPARE FEATURES

In [16]:
summ_grouping = {'total':F.sum, 'average':F.avg }
count_grouping = {'count':F.count }

count_agg = partial(get_ddf_aggs, agg_columns=['customer_id'], agg_funcs=count_grouping, prefix='agg_')
total_avg_agg = partial(get_ddf_aggs, agg_columns=['productsize','purchasequantity','purchaseamount'], agg_funcs=summ_grouping, prefix='agg_')

### agg customer_brand 

In [17]:
# grpby_columns = ['customer_id','brand', 'category', 'dept']
grpby_columns = ['customer_id','brand','category']
grpby_columns_name = ['customer_id','brand']
ddf_trans_agg_customer_brand = ddf_transactions.groupby(grpby_columns) \
                                        .agg(*(count_agg(grpby_columns_name) + total_avg_agg(grpby_columns_name)))

In [18]:
ddf_trans_agg_customer_brand.columns

['customer_id',
 'brand',
 'category',
 'agg_customer_id_brand_customer_id_count',
 'agg_customer_id_brand_productsize_average',
 'agg_customer_id_brand_productsize_total',
 'agg_customer_id_brand_purchasequantity_average',
 'agg_customer_id_brand_purchasequantity_total',
 'agg_customer_id_brand_purchaseamount_average',
 'agg_customer_id_brand_purchaseamount_total']

### agg customer_category

In [19]:
grpby_cid_ctgry_columns = ['customer_id','category']

ddf_trans_agg_customer_category = ddf_transactions.groupby(grpby_cid_ctgry_columns) \
                                        .agg(*(count_agg(grpby_cid_ctgry_columns) + total_avg_agg(grpby_cid_ctgry_columns)))

In [20]:
ddf_trans_agg_customer_category.columns

['customer_id',
 'category',
 'agg_customer_id_category_customer_id_count',
 'agg_customer_id_category_productsize_average',
 'agg_customer_id_category_productsize_total',
 'agg_customer_id_category_purchasequantity_average',
 'agg_customer_id_category_purchasequantity_total',
 'agg_customer_id_category_purchaseamount_average',
 'agg_customer_id_category_purchaseamount_total']

### merge

In [21]:
ddf_trans_agg_history = ddf_trans_agg_customer_brand.join(ddf_trans_agg_customer_category, 
                                                           on=['customer_id', 'category'], how='left_outer')

In [22]:
ddf_trans_agg_history.columns

['customer_id',
 'category',
 'brand',
 'agg_customer_id_brand_customer_id_count',
 'agg_customer_id_brand_productsize_average',
 'agg_customer_id_brand_productsize_total',
 'agg_customer_id_brand_purchasequantity_average',
 'agg_customer_id_brand_purchasequantity_total',
 'agg_customer_id_brand_purchaseamount_average',
 'agg_customer_id_brand_purchaseamount_total',
 'agg_customer_id_category_customer_id_count',
 'agg_customer_id_category_productsize_average',
 'agg_customer_id_category_productsize_total',
 'agg_customer_id_category_purchasequantity_average',
 'agg_customer_id_category_purchasequantity_total',
 'agg_customer_id_category_purchaseamount_average',
 'agg_customer_id_category_purchaseamount_total']

In [23]:
ddf_trans_agg_history = ddf_trans_agg_history \
                                    .withColumn('repeater_calc', ddf_trans_agg_history.agg_customer_id_brand_customer_id_count > 1)



In [24]:
ddf_trans_agg_history.columns

['customer_id',
 'category',
 'brand',
 'agg_customer_id_brand_customer_id_count',
 'agg_customer_id_brand_productsize_average',
 'agg_customer_id_brand_productsize_total',
 'agg_customer_id_brand_purchasequantity_average',
 'agg_customer_id_brand_purchasequantity_total',
 'agg_customer_id_brand_purchaseamount_average',
 'agg_customer_id_brand_purchaseamount_total',
 'agg_customer_id_category_customer_id_count',
 'agg_customer_id_category_productsize_average',
 'agg_customer_id_category_productsize_total',
 'agg_customer_id_category_purchasequantity_average',
 'agg_customer_id_category_purchasequantity_total',
 'agg_customer_id_category_purchaseamount_average',
 'agg_customer_id_category_purchaseamount_total',
 'repeater_calc']

In [None]:
ddf_trans_agg_history.count()

In [None]:
# ddf_trans_agg_history.persist(StorageLevel.DISK_ONLY)

In [None]:
# ddf_trans_agg_history.unpersist()

In [40]:
write_ddf_to_csv(ddf_trans_agg_history, spark_data_path+'ddf_trans_agg_history')

## merge with offers history

#### train

In [None]:
# ddf_trans_agg_history = csv_reader.load(spark_data_path+'ddf_trans_agg_history', samplingRatio=None)
# ddf_trans_agg_history.columns

In [25]:
df_offers_all.columns.values.tolist()

['customer_id',
 'chain',
 'offer',
 'market',
 'category',
 'company',
 'brand',
 'offerdate',
 'offervalue',
 'quantity',
 'repeattrips',
 'repeater']

In [26]:
ddf_offers_all = sqc.createDataFrame(df_offers_all)

In [27]:
ddf_offers_all.count()

160057

In [28]:
ddf_trans_agg_customer_brand.columns

['customer_id',
 'brand',
 'category',
 'agg_customer_id_brand_customer_id_count',
 'agg_customer_id_brand_productsize_average',
 'agg_customer_id_brand_productsize_total',
 'agg_customer_id_brand_purchasequantity_average',
 'agg_customer_id_brand_purchasequantity_total',
 'agg_customer_id_brand_purchaseamount_average',
 'agg_customer_id_brand_purchaseamount_total']

In [29]:
ddf_offers__trans_aggs = ddf_offers_all.join(ddf_trans_agg_customer_brand,
                                             on=['customer_id','brand','category'], how='left_outer')
ddf_offers__trans_aggs = ddf_offers__trans_aggs.drop('repeattrips')

In [30]:
ddf_offers__trans_aggs = ddf_offers__trans_aggs.join(ddf_trans_agg_customer_category, 
                                                     on=['customer_id','category'], how='left_outer')

In [31]:
ddf_offers__trans_aggs.columns

['customer_id',
 'category',
 'brand',
 'chain',
 'offer',
 'market',
 'company',
 'offerdate',
 'offervalue',
 'quantity',
 'repeater',
 'agg_customer_id_brand_customer_id_count',
 'agg_customer_id_brand_productsize_average',
 'agg_customer_id_brand_productsize_total',
 'agg_customer_id_brand_purchasequantity_average',
 'agg_customer_id_brand_purchasequantity_total',
 'agg_customer_id_brand_purchaseamount_average',
 'agg_customer_id_brand_purchaseamount_total',
 'agg_customer_id_category_customer_id_count',
 'agg_customer_id_category_productsize_average',
 'agg_customer_id_category_productsize_total',
 'agg_customer_id_category_purchasequantity_average',
 'agg_customer_id_category_purchasequantity_total',
 'agg_customer_id_category_purchaseamount_average',
 'agg_customer_id_category_purchaseamount_total']

In [53]:
df_offers__trans_aggs = ddf_offers__trans_aggs.toPandas()

#### submission

In [32]:
ddf_offers_all_submission = sqc.createDataFrame(df_offers_all_submission)

In [33]:
ddf_offers_all_submission.columns

['customer_id',
 'chain',
 'offer',
 'market',
 'category',
 'company',
 'brand',
 'offerdate',
 'offervalue',
 'quantity']

In [34]:
ddf_offers_submission__trans_aggs = ddf_offers_all_submission.join(ddf_trans_agg_customer_brand,
                                             on=['customer_id','brand','category'], how='left_outer')
ddf_offers_submission__trans_aggs = ddf_offers_submission__trans_aggs.drop('repeattrips')

In [35]:
ddf_offers_submission__trans_aggs = ddf_offers_submission__trans_aggs.join(ddf_trans_agg_customer_category, 
                                                     on=['customer_id','category'], how='left_outer')

In [36]:
ddf_offers_submission__trans_aggs.columns

['customer_id',
 'category',
 'brand',
 'chain',
 'offer',
 'market',
 'company',
 'offerdate',
 'offervalue',
 'quantity',
 'agg_customer_id_brand_customer_id_count',
 'agg_customer_id_brand_productsize_average',
 'agg_customer_id_brand_productsize_total',
 'agg_customer_id_brand_purchasequantity_average',
 'agg_customer_id_brand_purchasequantity_total',
 'agg_customer_id_brand_purchaseamount_average',
 'agg_customer_id_brand_purchaseamount_total',
 'agg_customer_id_category_customer_id_count',
 'agg_customer_id_category_productsize_average',
 'agg_customer_id_category_productsize_total',
 'agg_customer_id_category_purchasequantity_average',
 'agg_customer_id_category_purchasequantity_total',
 'agg_customer_id_category_purchaseamount_average',
 'agg_customer_id_category_purchaseamount_total']

In [37]:
df_offers_submission__trans_aggs = ddf_offers_submission__trans_aggs.toPandas()

#### persist

In [55]:
persist_variables(['df_offers_submission__trans_aggs','df_offers_submission__trans_aggs']
                  ,path=base_data_path+'working_data')

dumping df_offers_submission__trans_aggs to /home/ds/dev/data/Kagle-ValuesShoppers/working_data/df_offers_submission__trans_aggs.var
dumping df_offers_submission__trans_aggs to /home/ds/dev/data/Kagle-ValuesShoppers/working_data/df_offers_submission__trans_aggs.var
