In [1]:
import os
import numpy as np
import timeit
import pyspark
import pandas as pd
import s3fs
import time
import ast
from datetime import datetime

In [2]:
# Obtain the Spark entry points used by the rest of the notebook.
# Both calls are get-or-create, so re-running this cell does not build new ones.
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Read the raw event export from S3 as an RDD of text lines, then encode each
# line to ASCII, silently dropping any character that cannot be encoded.
# NOTE(review): on Python 3 `encode` yields bytes, which the later
# `ast.literal_eval` step would not accept -- presumably this notebook runs on
# a Python 2 kernel; confirm before porting.
events_rdd = (
    sc.textFile("s3a://msds630-kaggle-competition/events_adj_mn_v2.csv/part-00000")
      .map(lambda raw_line: raw_line.encode('ascii', 'ignore'))
)
# Last expression of the cell: display the number of lines read.
events_rdd.count()

5009421

In [6]:
# Functions

# adj2
def convertTime(ts_str):
    """Format an epoch-milliseconds timestamp as a UTC 'YYYY-MM-DD HH:MM:SS' string.

    ts_str: any value accepted by int(), read as milliseconds since the Unix epoch.
    Returns the formatted string; the format has no sub-second field, so
    fractions of a second do not appear in the result.
    """
    epoch_seconds = int(ts_str) / 1000.0
    utc_moment = datetime.utcfromtimestamp(epoch_seconds)
    return utc_moment.strftime('%Y-%m-%d %H:%M:%S')

# adj3
def get8index(event_list):
    """Return the positions in event_list whose entry equals the string '8'.

    Positions are returned in ascending order; the result is an empty list
    when no entry matches.
    """
    return [pos for pos, event in enumerate(event_list) if event == '8']

def getPurchaseTime(event_list, event_ts_list):
    """Pick the timestamps that line up with '8' events.

    event_list and event_ts_list are read as parallel sequences: for every
    position get8index() reports in event_list, the entry at the same position
    in event_ts_list is collected. Returns the collected entries as a list.
    """
    return [event_ts_list[pos] for pos in get8index(event_list)]

def getPurchaseValue(event_list, event_val_list):
    """Pick the values that line up with '8' events.

    event_list and event_val_list are read as parallel sequences: for every
    position get8index() reports in event_list, the entry at the same position
    in event_val_list is collected. Returns the collected entries as a list.
    """
    return [event_val_list[pos] for pos in get8index(event_list)]

# adj4
def inInterval(start_dt, end_dt, purchase_ts_list):
    """Flag whether any timestamp's date lies strictly between two dates.

    start_dt, end_dt: 'YYYY-MM-DD' strings; both bounds are exclusive.
    purchase_ts_list: timestamp strings whose first 10 characters are the date.
    The comparison is a plain string comparison on that 10-character prefix.
    Returns 1.0 if at least one timestamp qualifies, otherwise -1.0.
    """
    found = any(start_dt < ts[:10] < end_dt for ts in purchase_ts_list)
    return 1.0 if found else -1.0

# adj5
def is_weekend(sample_pts_list):
    """Classify the first timestamp of a list as weekday or weekend.

    sample_pts_list: sequence of 'YYYY-MM-DD HH:MM:SS' strings. Only the first
        entry is inspected, so later entries are never parsed.
    Returns:
        ""  if the sequence is empty,
        "0" if the first timestamp falls on Monday through Friday,
        "1" if it falls on Saturday or Sunday.
    Raises ValueError if the first entry does not match the expected format.
    """
    # Truthiness check covers any empty sequence, not only the literal [].
    if not sample_pts_list:
        return ""
    first_moment = datetime.strptime(sample_pts_list[0], '%Y-%m-%d %H:%M:%S')
    # datetime.weekday(): Monday is 0 ... Sunday is 6, so 0-4 are weekdays.
    return "0" if first_moment.weekday() < 5 else "1"
    
def sumPurVal(pur_val_list):
    """Total the purchase values, converting each entry with float() first.

    Returns the sum of the converted entries; an empty input gives 0.
    """
    return sum(map(float, pur_val_list))

In [8]:
# Stage 0: every text line is a Python-literal record; parse it into an object.
events_rdd_adj = events_rdd.map(lambda raw: ast.literal_eval(raw))

# Stage 1: fields 1-3 are " | "-delimited strings; split each into a list.
# Fields 0 and 4 are carried through untouched.
events_rdd_adj1 = events_rdd_adj.map(
    lambda rec: [rec[0]] + [rec[pos].split(" | ") for pos in (1, 2, 3)] + [rec[4]]
)

# Stage 2: reformat every timestamp in field 2 with convertTime.
events_rdd_adj2 = events_rdd_adj1.map(
    lambda rec: [rec[0], rec[1], list(map(convertTime, rec[2])), rec[3], rec[4]]
)

# Stage 3: append two new columns to the five existing ones:
#   field 5 - purchase_time_list (timestamps of the '8' events)
#   field 6 - purchase_val_list  (values of the '8' events)
events_rdd_adj3 = events_rdd_adj2.map(
    lambda rec: rec + [getPurchaseTime(rec[1], rec[2]),
                       getPurchaseValue(rec[1], rec[3])]
)

# Stage 4: append two flags (1.0 - yes; -1.0 - no):
#   field 7 - the session has a purchase during Dec. 2 - 8
#   field 8 - the session has a purchase during Dec. 2 - 15
# inInterval treats both bounds as exclusive, hence 12-01 / 12-09 / 12-16.
events_rdd_adj4 = events_rdd_adj3.map(
    lambda rec: rec + [inInterval("2018-12-01", "2018-12-09", rec[5]),
                       inInterval("2018-12-01", "2018-12-16", rec[5])]
)

# Stage 5: build the final row.
#   - field 1 becomes the number of events
#   - fields 2 and 3 (timestamps, values) are dropped
#   - the number of '8' events, the is_weekend flag and the summed purchase
#     value are added
events_rdd_adj5 = events_rdd_adj4.map(
    lambda rec: [rec[0],
                 len(rec[1]),
                 len(rec[5]),
                 is_weekend(rec[5]),
                 rec[4],
                 rec[5],
                 sumPurVal(rec[6]),
                 rec[7],
                 rec[8]]
)

### Data Dictionary  
0 - session_id  
1 - number of events  
2 - number of purchases (number of events with 8)  
3 - weekend flag based on the first purchase time ('' - no purchase; '0' - weekday; '1' - weekend)  
4 - flag indicating whether this session has a purchase (1.0 - yes; -1.0 - no)  
5 - list of purchase time  
6 - sum of purchase value  
7 - flag indicating whether this session has a purchase during Dec. 2 - 8, 2018 (1.0 - yes; -1.0 - no)  
8 - flag indicating whether this session has a purchase during Dec. 2 - 15, 2018 (1.0 - yes; -1.0 - no)

In [9]:
# Peek at two rows whose field 4 (the has-purchase flag) equals 1.0.
events_rdd_adj5.filter(lambda row: row[4] == 1.0).take(2)

[['6035474937349177697',
  58,
  1,
  '0',
  1.0,
  ['2018-11-23 23:19:41'],
  0.0,
  -1.0,
  -1.0],
 ['137347936755241518',
  19,
  1,
  '0',
  1.0,
  ['2018-11-09 05:02:20'],
  1.3930000066757202,
  -1.0,
  -1.0]]