In [35]:
import os
import numpy as np
import timeit
import pyspark
import pandas as pd
import s3fs
import time
import ast
import operator
from datetime import datetime
import statistics

In [2]:
# Reuse the active SparkContext / SparkSession if one exists, otherwise create one.
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

## Set up the data address book

In [3]:
s3_address = "s3a://msds630-kaggle-competition/"
dataset_name_list = ["events", "messages", "attributes", "sessions"]
# Map every dataset name to the full S3 URI of its CSV file.
dataset_addr_book = {
    dataset: s3_address + dataset + ".csv" for dataset in dataset_name_list
}

dataset_addr_book

{'attributes': 's3a://msds630-kaggle-competition/attributes.csv',
 'events': 's3a://msds630-kaggle-competition/events.csv',
 'messages': 's3a://msds630-kaggle-competition/messages.csv',
 'sessions': 's3a://msds630-kaggle-competition/sessions.csv'}

## Load events

In [4]:
# Read the events CSV as lines of text, discarding any non-ASCII characters.
# NOTE(review): the str-based split below relies on encode() returning str
# (Python 2); under Python 3 it returns bytes -- confirm the kernel version.
events_rdd = sc.textFile(dataset_addr_book["events"]).map(
    lambda raw_line: raw_line.encode('ascii', 'ignore'))
# Split the first line of the file into its comma-separated fields.
cols_events = events_rdd.take(1)[0].split(',')
print(cols_events)
print(len(cols_events))

['app_id', 'session_id', 'event', 'event_timestamp', 'event_value', 'user_id_hash']
6


In [5]:
# Drop the header row by exact match instead of by substring, so that a data
# row which merely contains the text 'app_id' is not discarded by accident.
# Re-joining the split header fields reproduces the original header line.
header_line = ','.join(cols_events)
events_data = events_rdd.filter(lambda line: line != header_line)\
                        .map(lambda line: line.split(','))
events_data.take(1)

[['4724682771660800',
  '5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [8]:
# Column names without the first one (app_id).
cols_events_adj = [col_name for pos, col_name in enumerate(cols_events) if pos > 0]
cols_events_adj

['session_id', 'event', 'event_timestamp', 'event_value', 'user_id_hash']

In [9]:
# Strip the first field (app_id) from every event row.
events_data_adj = events_data.map(lambda fields: fields[1:])
events_data_adj.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [10]:
def convertTime(ts_str):
    """Format an epoch timestamp given in milliseconds as 'YYYY-MM-DD HH:MM:SS' (UTC).

    ``ts_str`` is anything ``int()`` accepts, e.g. the string '1541638424150'.
    """
    epoch_seconds = int(ts_str) / 1000.0
    utc_moment = datetime.utcfromtimestamp(epoch_seconds)
    return utc_moment.strftime('%Y-%m-%d %H:%M:%S')
convertTime('1541638424150')

'2018-11-08 00:53:44'

## Train

In [11]:
# Training window: keep events whose UTC calendar date is before 2018-12-02.
# fields[2] is event_timestamp; the first 10 characters of the formatted
# timestamp are the 'YYYY-MM-DD' date, which compares correctly as a string.
events_data_train = events_data_adj.filter(
    lambda fields: convertTime(fields[2])[:10] < '2018-12-02')
events_data_train.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [12]:
# Build one (session_id, [values]) RDD per non-key column of the event rows.
# Row layout: 0 session_id, 1 event, 2 event_timestamp, 3 event_value,
# 4 user_id_hash -- so only columns 1..4 exist (the old range(6) also created
# an RDD for a non-existent column 5 that would fail as soon as it was used).
temp_pair_train_dict = {}
for i in range(1, 5):
    # Bind the column index as a default argument so each lambda keeps its own
    # value instead of looking up the loop variable when it is finally run.
    temp_pair = events_data_train.map(lambda x, col=i: (x[0], x[col]))\
                                 .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_train_dict[i] = temp_pair

In [13]:
# Gather all events of a session with ONE groupByKey over whole rows and then
# transpose the rows into per-column lists.  Grouping each column on its own
# and joining the results gives no guarantee that the i-th event code,
# timestamp and value of a session come from the same event, yet findPur /
# moneySum index across those lists.  Keeping each row intact fixes that and
# replaces four shuffles plus three joins with a single shuffle.
# Resulting record: (session_id, [events, timestamps, values, user_id_hashes]).
events_data_train_adj = events_data_train.map(lambda x: (x[0], (x[1], x[2], x[3], x[4])))\
                                         .groupByKey()\
                                         .mapValues(lambda rows: [list(col) for col in zip(*rows)])
events_data_train_adj.take(1)

[('8661029443064028323',
  [['.m6335456823869440', '45', '45'],
   ['1539520579389', '1539520621005', '1539520757941'],
   ['0.0', '0.0', '0.0'],
   ['cb0dcb8889258135825da0d3c099c1ac13525e4093fc8e040554e8e8a026e958',
    'cb0dcb8889258135825da0d3c099c1ac13525e4093fc8e040554e8e8a026e958',
    'cb0dcb8889258135825da0d3c099c1ac13525e4093fc8e040554e8e8a026e958']])]

In [14]:
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Occurrences are counted in a single pass (the previous version called
    ``lst.count`` once per distinct value).  When several elements share the
    highest count, the one that appears first in ``lst`` wins, so the result
    does not depend on set iteration order.

    Raises:
        ValueError: if ``lst`` is empty (same exception type as before).
    """
    if not lst:
        raise ValueError("most_common() arg is an empty sequence")
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    best_item = lst[0]
    for item in lst:
        # Strict '>' keeps the earliest element on ties.
        if counts[item] > counts[best_item]:
            best_item = item
    return best_item

In [15]:
# Flatten (session_id, [events, timestamps, values, user_ids]) into a 5-item
# list and reduce the per-event user ids to their most frequent value.
events_data_train_adj2 = events_data_train_adj.map(
    lambda rec: [rec[0]] + [rec[1][0], rec[1][1], rec[1][2]] + [most_common(rec[1][3])])
events_data_train_adj2.take(1)

[['8661029443064028323',
  ['.m6335456823869440', '45', '45'],
  ['1539520579389', '1539520621005', '1539520757941'],
  ['0.0', '0.0', '0.0'],
  'cb0dcb8889258135825da0d3c099c1ac13525e4093fc8e040554e8e8a026e958']]

In [16]:
def findPur(event_list):
    """Return the positions in ``event_list`` whose entry equals the string '8'.

    NOTE(review): '8' is presumably the purchase event code -- confirm.
    """
    return [pos for pos, event in enumerate(event_list) if event == '8']

In [20]:
# Per session: [session_id, positions of '8' events, number of events,
#               timestamps, values, user_id].
events_data_train_adj3 = events_data_train_adj2.map(
    lambda sess: [sess[0], findPur(sess[1]), len(sess[1])] + [sess[2], sess[3], sess[4]])
events_data_train_adj3.filter(lambda sess: sess[1]!=[]).take(1)

[['5206351256240882428',
  [17],
  71,
  ['1541440544923',
   '1541440546510',
   '1541440704884',
   '1541440766940',
   '1541440784587',
   '1541440856780',
   '1541440875425',
   '1541440914174',
   '1541440914184',
   '1541441145975',
   '1541441187897',
   '1541441242396',
   '1541441260609',
   '1541441337094',
   '1541441349977',
   '1541441400894',
   '1541441508103',
   '1541441576911',
   '1541441599454',
   '1541441599464',
   '1541441662104',
   '1541441671812',
   '1541441691773',
   '1541441705190',
   '1541441717321',
   '1541441733328',
   '1541441748924',
   '1541441765433',
   '1541441791761',
   '1541441808805',
   '1541441826782',
   '1541441848370',
   '1541441867950',
   '1541441881927',
   '1541441893226',
   '1541441915422',
   '1541441937738',
   '1541441961818',
   '1541442025527',
   '1541442027952',
   '1541442034922',
   '1541442036351',
   '1541442061742',
   '1541442063154',
   '1541442081941',
   '1541442150717',
   '1541442191852',
   '1541442244202',
 

In [21]:
def moneySum(value_list ,idx_list):
    """Sum, as floats, the entries of ``value_list`` at the positions in ``idx_list``.

    Returns 0.0 when ``idx_list`` contains no positions.
    """
    total = 0.0
    for pos in idx_list:
        total += float(value_list[pos])
    return total

a = ['1.0','1.0','5.0']
b = [0,1]
moneySum(a,b)

2.0

In [27]:
# Per session: [user_id, session_id, number of events, number of '8' events,
#               duration (last minus first timestamp), summed value of '8' events].
# Timestamps are converted to float BEFORE taking max/min: comparing the raw
# strings is lexicographic and only matches numeric order when every
# timestamp has the same number of digits.
events_data_train_adj4 = events_data_train_adj3.map(
    lambda x: [x[5], x[0], x[2], len(x[1]),
               max(map(float, x[3])) - min(map(float, x[3])),
               moneySum(x[4], x[1])])
events_data_train_adj4.filter(lambda x: x[3]>0).take(1)

[['9f9513e92b24a6aecee48f0367e7ca2b0b2a6a91c597406b7306ff7bcbdbc670',
  '141583788005273790',
  24,
  1,
  799680.0,
  3.492999792098999]]

In [30]:
# Build one (user_id, [values]) RDD per non-key column of the per-session rows
# (columns 1..5: session_id, event count, purchase count, duration, amount).
temp_pair_train_dict_2 = {}
for i in range(1, 6):
    # Bind the column index as a default argument so each lambda keeps its own
    # value instead of looking up the loop variable when it is finally run.
    temp_pair = events_data_train_adj4.map(lambda x, col=i: (x[0], x[col]))\
                                      .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_train_dict_2[i] = temp_pair

In [31]:
# Collect all per-session rows of a user with ONE groupByKey and transpose them
# into per-column lists, instead of grouping every column separately and
# chaining four leftOuterJoins (five shuffles plus four joins).  The rows stay
# intact, so the i-th entry of every list describes the same session.
# Resulting record: (user_id, [session_ids, event_counts, purchase_counts,
#                              session_durations, purchase_amounts]).
events_data_train_adj5 = events_data_train_adj4.map(lambda x: (x[0], (x[1], x[2], x[3], x[4], x[5])))\
                                               .groupByKey()\
                                               .mapValues(lambda rows: [list(col) for col in zip(*rows)])
events_data_train_adj5.take(1)

[('c39d635765ff135752a20fea4283e35edb76f5cd0babe1419a3ab82ac2195fd0',
  [['424204521664520092',
    '450771625013221031',
    '763256531593533297',
    '4839434791942809118'],
   [33, 1, 16, 29],
   [0, 0, 0, 0],
   [1728678.0, 1461763.0, 0.0, 701483.0],
   [0.0, 0.0, 0.0, 0.0]])]

In [52]:
def getMedian(lst):
    """Return the median of ``lst``; 0.0 when ``lst`` is empty.

    For an odd number of items the middle item is returned unchanged; for an
    even number the mean of the two middle items is returned as a float.
    Indices are computed with floor division so the function works under both
    Python 2 and Python 3 (``/`` yields a float index under true division).
    """
    if not lst:
        return 0.0
    ordered = sorted(lst)
    mid = len(ordered) // 2
    if len(ordered) % 2 == 1:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2.0

print(getMedian([0,1,2,]))
print(getMedian([0,1,2,3]))

1
1.5


In [59]:
# Collapse each user's per-session lists into a single feature row.
events_data_train_adj6 = events_data_train_adj5.map(
    lambda rec: [rec[0]] + [
        len(rec[1][0]),         # number of sessions
        sum(rec[1][1]),         # total number of events
        getMedian(rec[1][1]),   # median events per session
        sum(rec[1][2]),         # total number of purchase events
        sum(rec[1][4]),         # total purchase amount
        getMedian(rec[1][3]),   # median session duration
    ])
events_data_train_adj6.take(1)

[['c39d635765ff135752a20fea4283e35edb76f5cd0babe1419a3ab82ac2195fd0',
  4,
  79,
  22.5,
  0,
  0.0,
  1081623.0]]

## Data Dictionary
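0 - user_id  
1 - total number of sessions  
2 - total number of events (summed over all sessions)  
3 - median number of events per session  
4 - total number of purchase events (event code '8')  
5 - total amount spent on purchases (sum of `event_value` over purchase events)  
6 - median session duration in milliseconds (last minus first event timestamp of a session)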

In [60]:
# Render every feature row as one comma-separated text line.
events_data_train_adj7 = events_data_train_adj6.map(lambda row: ','.join(map(str, row)))
events_data_train_adj7.take(1)

['c39d635765ff135752a20fea4283e35edb76f5cd0babe1419a3ab82ac2195fd0,4,79,22.5,0,0.0,1081623.0']

In [61]:
# Bring all feature lines to the driver and write them to a local file,
# one line per user.
with open('train_total.csv', 'w') as out_file:
    out_file.writelines(row + '\n' for row in events_data_train_adj7.collect())

## Test

In [62]:
# Test window: keep events whose UTC calendar date is before 2018-12-16.
# There is no lower bound, so this window also contains every row selected
# for the training window (dates before 2018-12-02).
events_data_test = events_data_adj.filter(
    lambda fields: convertTime(fields[2])[:10] < '2018-12-16')
events_data_test.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [63]:
# Build one (session_id, [values]) RDD per non-key column of the event rows.
# Row layout: 0 session_id, 1 event, 2 event_timestamp, 3 event_value,
# 4 user_id_hash -- so only columns 1..4 exist (the old range(6) also created
# an RDD for a non-existent column 5 that would fail as soon as it was used).
temp_pair_test_dict = {}
for i in range(1, 5):
    # Bind the column index as a default argument so each lambda keeps its own
    # value instead of looking up the loop variable when it is finally run.
    temp_pair = events_data_test.map(lambda x, col=i: (x[0], x[col]))\
                                .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_test_dict[i] = temp_pair

In [64]:
# Gather all events of a session with ONE groupByKey over whole rows and then
# transpose the rows into per-column lists.  Grouping each column on its own
# and joining the results gives no guarantee that the i-th event code,
# timestamp and value of a session come from the same event, yet findPur /
# moneySum index across those lists.  Keeping each row intact fixes that and
# replaces four shuffles plus three joins with a single shuffle.
# Resulting record: (session_id, [events, timestamps, values, user_id_hashes]).
events_data_test_adj = events_data_test.map(lambda x: (x[0], (x[1], x[2], x[3], x[4])))\
                                       .groupByKey()\
                                       .mapValues(lambda rows: [list(col) for col in zip(*rows)])
events_data_test_adj.take(1)

[('6309264250257609843',
  [['1',
    '5',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '4',
    '6',
    '40',
    '41',
    '3',
    '42'],
   ['1543175110236',
    '1543175113178',
    '1543175188473',
    '1543175236995',
    '1543175242927',
    '1543175281611',
    '1543175322511',
    '1543175370581',
    '1543175381381',
    '1543175420159',
    '1543175444136',
    '1543175469816',
    '1543175478723',
    '1543175484426',
    '1543175488738',
    '1543175548012',
    '1543175624174',
    '1543175687406',
    '1543175696612',
    '1543175706829',
    '1543175710120',
    '1543175773504',
    '1543175775452'],
   ['0.0',
    '1.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0'],
   ['10596d

In [65]:
# Flatten (session_id, [events, timestamps, values, user_ids]) into a 5-item
# list and reduce the per-event user ids to their most frequent value.
events_data_test_adj2 = events_data_test_adj.map(
    lambda rec: [rec[0]] + [rec[1][0], rec[1][1], rec[1][2]] + [most_common(rec[1][3])])
events_data_test_adj2.take(1)

[['8661029443064028323',
  ['.m6335456823869440', '45', '45'],
  ['1539520579389', '1539520621005', '1539520757941'],
  ['0.0', '0.0', '0.0'],
  'cb0dcb8889258135825da0d3c099c1ac13525e4093fc8e040554e8e8a026e958']]

In [66]:
# Per session: [session_id, positions of '8' events, number of events,
#               timestamps, values, user_id].
events_data_test_adj3 = events_data_test_adj2.map(
    lambda sess: [sess[0], findPur(sess[1]), len(sess[1])] + [sess[2], sess[3], sess[4]])
events_data_test_adj3.filter(lambda sess: sess[1]!=[]).take(1)

[['5206351256240882428',
  [17],
  71,
  ['1541440544923',
   '1541440546510',
   '1541440704884',
   '1541440766940',
   '1541440784587',
   '1541440856780',
   '1541440875425',
   '1541440914174',
   '1541440914184',
   '1541441145975',
   '1541441187897',
   '1541441242396',
   '1541441260609',
   '1541441337094',
   '1541441349977',
   '1541441400894',
   '1541441508103',
   '1541441576911',
   '1541441599454',
   '1541441599464',
   '1541441662104',
   '1541441671812',
   '1541441691773',
   '1541441705190',
   '1541441717321',
   '1541441733328',
   '1541441748924',
   '1541441765433',
   '1541441791761',
   '1541441808805',
   '1541441826782',
   '1541441848370',
   '1541441867950',
   '1541441881927',
   '1541441893226',
   '1541441915422',
   '1541441937738',
   '1541441961818',
   '1541442025527',
   '1541442027952',
   '1541442034922',
   '1541442036351',
   '1541442061742',
   '1541442063154',
   '1541442081941',
   '1541442150717',
   '1541442191852',
   '1541442244202',
 

In [67]:
# Per session: [user_id, session_id, number of events, number of '8' events,
#               duration (last minus first timestamp), summed value of '8' events].
# Timestamps are converted to float BEFORE taking max/min: comparing the raw
# strings is lexicographic and only matches numeric order when every
# timestamp has the same number of digits.
events_data_test_adj4 = events_data_test_adj3.map(
    lambda x: [x[5], x[0], x[2], len(x[1]),
               max(map(float, x[3])) - min(map(float, x[3])),
               moneySum(x[4], x[1])])
events_data_test_adj4.filter(lambda x: x[3]>0).take(1)

[['9f9513e92b24a6aecee48f0367e7ca2b0b2a6a91c597406b7306ff7bcbdbc670',
  '141583788005273790',
  24,
  1,
  799680.0,
  3.492999792098999]]

In [68]:
# Build one (user_id, [values]) RDD per non-key column of the per-session rows
# (columns 1..5: session_id, event count, purchase count, duration, amount).
temp_pair_test_dict_2 = {}
for i in range(1, 6):
    # Bind the column index as a default argument so each lambda keeps its own
    # value instead of looking up the loop variable when it is finally run.
    temp_pair = events_data_test_adj4.map(lambda x, col=i: (x[0], x[col]))\
                                     .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_test_dict_2[i] = temp_pair

In [69]:
# Collect all per-session rows of a user with ONE groupByKey and transpose them
# into per-column lists, instead of grouping every column separately and
# chaining four leftOuterJoins (five shuffles plus four joins).  The rows stay
# intact, so the i-th entry of every list describes the same session.
# Resulting record: (user_id, [session_ids, event_counts, purchase_counts,
#                              session_durations, purchase_amounts]).
events_data_test_adj5 = events_data_test_adj4.map(lambda x: (x[0], (x[1], x[2], x[3], x[4], x[5])))\
                                             .groupByKey()\
                                             .mapValues(lambda rows: [list(col) for col in zip(*rows)])
events_data_test_adj5.take(1)

[('0d261313961125c71f330a8a8d59857bc48fefd0b4278c50eb06878993a31d95',
  [['4413133688754720943'], [35], [0], [1626343.0], [0.0]])]

In [70]:
# Collapse each user's per-session lists into a single feature row.
events_data_test_adj6 = events_data_test_adj5.map(
    lambda rec: [rec[0]] + [
        len(rec[1][0]),         # number of sessions
        sum(rec[1][1]),         # total number of events
        getMedian(rec[1][1]),   # median events per session
        sum(rec[1][2]),         # total number of purchase events
        sum(rec[1][4]),         # total purchase amount
        getMedian(rec[1][3]),   # median session duration
    ])
events_data_test_adj6.take(1)

[['c39d635765ff135752a20fea4283e35edb76f5cd0babe1419a3ab82ac2195fd0',
  4,
  79,
  22.5,
  0,
  0.0,
  1081623.0]]

In [71]:
# Render every feature row as one comma-separated text line.
events_data_test_adj7 = events_data_test_adj6.map(lambda row: ','.join(map(str, row)))
events_data_test_adj7.take(1)

['0d261313961125c71f330a8a8d59857bc48fefd0b4278c50eb06878993a31d95,1,35,35,0,0.0,1626343.0']

In [72]:
# Bring all feature lines to the driver and write them to a local file,
# one line per user.
with open('test_total.csv', 'w') as out_file:
    out_file.writelines(row + '\n' for row in events_data_test_adj7.collect())

In [73]:
# Number of feature rows (one per user) in the train and test outputs.
for features_rdd in (events_data_train_adj7, events_data_test_adj7):
    print(features_rdd.count())

620464
621001
