In [1]:
import os
import numpy as np
import timeit
import pyspark
import pandas as pd
import s3fs
import time
import ast
import operator
from datetime import datetime
import statistics

In [43]:
# Reuse the running SparkContext / SparkSession if one exists, otherwise create them.
# `sc` is the entry point for the RDD operations used throughout this notebook.
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

## Set up the data address book

In [44]:
# Map every dataset name to the full S3 path of its CSV file.
s3_address = "s3a://msds630-kaggle-competition/"
dataset_name_list = ["events", "messages", "attributes", "sessions"]
dataset_addr_book = {
    name: "{}{}.csv".format(s3_address, name)
    for name in dataset_name_list
}

dataset_addr_book

{'attributes': 's3a://msds630-kaggle-competition/attributes.csv',
 'events': 's3a://msds630-kaggle-competition/events.csv',
 'messages': 's3a://msds630-kaggle-competition/messages.csv',
 'sessions': 's3a://msds630-kaggle-competition/sessions.csv'}

## Load events

In [45]:
# Read the raw events file line by line and drop any non-ASCII characters.
raw_event_lines = sc.textFile(dataset_addr_book["events"])
events_rdd = raw_event_lines.map(lambda line : line.encode('ascii', 'ignore'))

# The first line of the file is the CSV header; split it into column names.
header_rows = events_rdd.map(lambda row: row.split(',')).take(1)
cols_events = header_rows[0]
print(cols_events)
print(len(cols_events))

['app_id', 'session_id', 'event', 'event_timestamp', 'event_value', 'user_id_hash']
6


In [46]:
# Drop the header line (any line containing the text 'app_id'), then split
# each remaining line into its comma-separated fields.
events_without_header = events_rdd.filter(lambda row: 'app_id' not in row)
events_data = events_without_header.map(lambda row: row.split(','))
events_data.take(1)

[['4724682771660800',
  '5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [47]:
# Column names without the leading app_id entry (first header field).
cols_events_adj = cols_events[1:] # Drop app_id
cols_events_adj

['session_id', 'event', 'event_timestamp', 'event_value', 'user_id_hash']

In [48]:
# Remove the first field (app_id) from every event row, leaving
# [session_id, event, event_timestamp, event_value, user_id_hash].
events_data_adj = events_data.map(lambda x: x[1:]) # Drop app_id
events_data_adj.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [49]:
def convertTime(ts_str):
    """Format an epoch timestamp in milliseconds as a UTC 'YYYY-MM-DD HH:MM:SS' string.

    ts_str: the timestamp as a string of digits (anything ``int()`` accepts).
    """
    epoch_seconds = int(ts_str) / 1000.0
    moment = datetime.utcfromtimestamp(epoch_seconds)
    return moment.strftime('%Y-%m-%d %H:%M:%S')
convertTime('1541638424150')

'2018-11-08 00:53:44'

## Train

In [50]:
# Training window: events whose UTC date falls in [2018-11-18, 2018-12-02).
# Dates are compared as 'YYYY-MM-DD' strings, which sort chronologically.
events_data_train = events_data_adj.filter(
    lambda row: '2018-11-18' <= convertTime(row[2])[:10] < '2018-12-02')
events_data_train.take(1)

[['346242741644068108',
  '45',
  '1543070431080',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [51]:
# For each non-key column of the training rows, build an RDD of
# [session_id, [all values of that column for the session]].
# Rows are [session_id, event, event_timestamp, event_value, user_id_hash],
# so only column indices 1..4 exist; index 5 would raise IndexError when evaluated.
temp_pair_train_dict = {}
for i in range(1, 5):
    # `i=i` freezes the column index in the lambda at definition time, so the
    # result does not depend on when Spark serializes the closure.
    temp_pair = events_data_train.map(lambda x, i=i: (x[0], x[i]))\
                                 .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_train_dict[i] = temp_pair

In [52]:
# Collect, per session, the column-wise lists
#   (session_id, [[events], [timestamps], [values], [user_id_hashes]]).
# The rows are grouped ONCE and then transposed, so position k in every list
# refers to the same original event. Later steps index the value list with
# positions found in the event list, which requires that alignment; grouping
# each column separately and joining does not guarantee it, because the order
# of values inside a groupByKey group is not guaranteed.
events_data_train_adj = events_data_train.map(lambda x: (x[0], x[1:]))\
                                         .groupByKey()\
                                         .mapValues(lambda rows: [list(col) for col in zip(*rows)])
events_data_train_adj.take(1)

[('6309264250257609843',
  [['1',
    '5',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '4',
    '6',
    '40',
    '41',
    '3',
    '42'],
   ['1543175110236',
    '1543175113178',
    '1543175188473',
    '1543175236995',
    '1543175242927',
    '1543175281611',
    '1543175322511',
    '1543175370581',
    '1543175381381',
    '1543175420159',
    '1543175444136',
    '1543175469816',
    '1543175478723',
    '1543175484426',
    '1543175488738',
    '1543175548012',
    '1543175624174',
    '1543175687406',
    '1543175696612',
    '1543175706829',
    '1543175710120',
    '1543175773504',
    '1543175775452'],
   ['0.0',
    '1.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0'],
   ['10596d

In [53]:
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Counts are gathered in a single pass instead of calling ``lst.count`` once
    per distinct value. When several elements share the highest count, the one
    that appears first in ``lst`` is returned, so the result is deterministic.

    Raises ValueError when ``lst`` is empty.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # max() keeps the first maximal element it meets, giving first-seen tie-breaking.
    return max(lst, key=counts.__getitem__)

In [54]:
# Flatten each session record to
#   [session_id, events, timestamps, values, user_id_hash]
# where the per-event user list is collapsed to its most frequent value.
events_data_train_adj2 = events_data_train_adj.map(
    lambda rec: [rec[0]] + list(rec[1][:3]) + [most_common(rec[1][3])])
events_data_train_adj2.take(1)

[['6309264250257609843',
  ['1',
   '5',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '4',
   '6',
   '40',
   '41',
   '3',
   '42'],
  ['1543175110236',
   '1543175113178',
   '1543175188473',
   '1543175236995',
   '1543175242927',
   '1543175281611',
   '1543175322511',
   '1543175370581',
   '1543175381381',
   '1543175420159',
   '1543175444136',
   '1543175469816',
   '1543175478723',
   '1543175484426',
   '1543175488738',
   '1543175548012',
   '1543175624174',
   '1543175687406',
   '1543175696612',
   '1543175706829',
   '1543175710120',
   '1543175773504',
   '1543175775452'],
  ['0.0',
   '1.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0'],
  '10596df08f16abc27259f399c1e71d5b591a931c22eeed1cd3fa8ba3e798d237']]

In [55]:
def findPur(event_list):
    """Return the positions in ``event_list`` whose event code equals '8'."""
    return [pos for pos, code in enumerate(event_list) if code == '8']

In [56]:
# Replace the event list by the positions of its '8' events and the total
# number of events:
#   [session_id, purchase_positions, n_events, timestamps, values, user_id_hash]
events_data_train_adj3 = events_data_train_adj2.map(
    lambda rec: [rec[0], findPur(rec[1]), len(rec[1])] + list(rec[2:5]))
# Peek at one session that contains at least one '8' event.
events_data_train_adj3.filter(lambda rec: len(rec[1]) > 0).take(1)

[['6035474937349177697',
  [23],
  58,
  ['1543012869863',
   '1543012872451',
   '1543012906922',
   '1543012972189',
   '1543013241606',
   '1543013255956',
   '1543013354066',
   '1543013990867',
   '1543014046797',
   '1543014127836',
   '1543014207784',
   '1543014338804',
   '1543014448169',
   '1543014559513',
   '1543014559543',
   '1543014604543',
   '1543014652194',
   '1543014693201',
   '1543014766370',
   '1543014815998',
   '1543014816019',
   '1543014920523',
   '1543015037412',
   '1543015181136',
   '1543015188876',
   '1543015188884',
   '1543015337010',
   '1543015437992',
   '1543015520929',
   '1543015688601',
   '1543015690779',
   '1543015698253',
   '1543015699574',
   '1543015734460',
   '1543015735660',
   '1543015743668',
   '1543015794578',
   '1543015794600',
   '1543015823413',
   '1543015902233',
   '1543015947016',
   '1543015991708',
   '1543016181470',
   '1543016245597',
   '1543016330325',
   '1543016374057',
   '1543016445136',
   '1543016502219',
 

In [57]:
def moneySum(value_list ,idx_list):
    """Add up ``float(value_list[i])`` for every index ``i`` in ``idx_list``.

    Returns 0.0 when ``idx_list`` is empty.
    """
    total = 0.0
    for position in idx_list:
        total = total + float(value_list[position])
    return total

a = ['1.0','1.0','5.0']
b = [0,1]
moneySum(a,b)

2.0

In [58]:
# Per-session features, keyed by user:
#   [user_id_hash, session_id, n_events, n_purchases, duration, purchase_amount]
# duration = latest minus earliest event timestamp. The timestamps are
# converted to float BEFORE taking max/min so they are compared numerically;
# max/min on the raw strings compares lexicographically, which is only
# correct while every timestamp has the same number of digits.
events_data_train_adj4 = events_data_train_adj3.map(lambda x: [x[5], x[0], x[2], len(x[1]),
                                                               max(map(float, x[3])) - min(map(float, x[3])),
                                                               moneySum(x[4], x[1])])
# Peek at one session with at least one purchase.
events_data_train_adj4.filter(lambda x: x[3]>0).take(1)

[['a7cc6e6f8c7b889a1272583505c240ce8c6f4196ba261c46c4de328d5ca4f739',
  '3395145758454465276',
  192,
  4,
  14665965.0,
  11.871999382972717]]

In [59]:
# For each per-session feature column (indices 1..5 of the adj4 rows), build an
# RDD of [user_id_hash, [values of that column across the user's sessions]].
temp_pair_train_dict_2 = {}
for i in range(1, 6):
    # `i=i` freezes the column index in the lambda at definition time, so the
    # result does not depend on when Spark serializes the closure.
    temp_pair = events_data_train_adj4.map(lambda x, i=i: (x[0], x[i]))\
                                .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_train_dict_2[i] = temp_pair

In [60]:
# Assemble, per user, the lists of per-session features:
#   (user_id_hash, [[session_ids], [n_events], [n_purchases], [durations], [amounts]])
# Start from the session-id lists (column 1) and join the remaining columns one by one.
events_data_train_adj5 = temp_pair_train_dict_2[1]
# After the first join the value is the pair (session_ids, n_events), turned into a list.
events_data_train_adj5 = events_data_train_adj5.leftOuterJoin(temp_pair_train_dict_2[2])\
                                             .map(lambda x: (x[0], list(x[1])))
# Each later join yields [previous_list, new_column]; the second map flattens
# it back into a single list with the new column appended at the end.
events_data_train_adj5 = events_data_train_adj5.leftOuterJoin(temp_pair_train_dict_2[3])\
                                             .map(lambda x: (x[0], list(x[1])))\
                                             .map(lambda x: (x[0], [x[1][0][0], x[1][0][1], x[1][1]]))
events_data_train_adj5 = events_data_train_adj5.leftOuterJoin(temp_pair_train_dict_2[4])\
                                             .map(lambda x: (x[0], list(x[1])))\
                                             .map(lambda x: (x[0], [x[1][0][0], x[1][0][1], x[1][0][2], x[1][1]]))
events_data_train_adj5 = events_data_train_adj5.leftOuterJoin(temp_pair_train_dict_2[5])\
                                             .map(lambda x: (x[0], list(x[1])))\
                                             .map(lambda x: (x[0], [x[1][0][0], x[1][0][1], x[1][0][2], 
                                                                    x[1][0][3], x[1][1]]))
events_data_train_adj5.take(1)

[('a7d68ea7dcf1bc9fc2d455c19d1ba409951609735a543bc97d3b51cd19b09f6f',
  [['69031578336701658'], [1], [0], [0.0], [0.0]])]

In [61]:
def getMedian(lst):
    """Return the median of the numbers in ``lst``.

    An empty sequence yields 0.0. For an odd number of items the middle
    element itself is returned (its type is preserved); for an even number the
    mean of the two middle elements is returned as a float.
    """
    if len(lst) == 0:
        return 0.0
    ordered = sorted(lst)
    count = len(ordered)
    # Floor division keeps the index an int on both Python 2 and Python 3;
    # plain `/` yields a float on Python 3 and cannot be used as a list index.
    mid = (count - 1) // 2
    if count % 2 == 1:
        return ordered[mid]
    return (ordered[mid] + ordered[mid + 1]) / 2.0

print(getMedian([0,1,2,]))
print(getMedian([0,1,2,3]))

1
1.5


In [62]:
def _train_user_features(record):
    """Reduce one user's per-session lists to the final feature row."""
    user_id, columns = record
    session_ids = columns[0]
    event_counts = columns[1]
    purchase_counts = columns[2]
    durations = columns[3]
    amounts = columns[4]
    return [user_id,
            len(session_ids),
            sum(event_counts),
            getMedian(event_counts),
            sum(purchase_counts),
            sum(amounts),
            getMedian(durations)]

events_data_train_adj6 = events_data_train_adj5.map(_train_user_features)
events_data_train_adj6.take(1)

[['a7d68ea7dcf1bc9fc2d455c19d1ba409951609735a543bc97d3b51cd19b09f6f',
  1,
  1,
  1,
  0,
  0.0,
  0.0]]

## Data Dictionary
0 - user_id  
1 - total number of sessions  
2 - total number of events  
3 - median number of events per session  
4 - total number of purchase events  
5 - total purchase amount  
6 - median session duration (milliseconds)

In [63]:
# Serialize every feature row as one comma-separated line.
events_data_train_adj7 = events_data_train_adj6.map(lambda row: ','.join(map(str, row)))
events_data_train_adj7.take(1)

['a7d68ea7dcf1bc9fc2d455c19d1ba409951609735a543bc97d3b51cd19b09f6f,1,1,1,0,0.0,0.0']

In [64]:
# Pull all training rows to the driver and write them to a local CSV, one row per line.
with open('train_last_14_days.csv', 'w') as out_file:
    for row in events_data_train_adj7.collect():
        out_file.write(row + '\n')

## Test

In [65]:
# Test window: events whose UTC date falls in [2018-12-02, 2018-12-16).
# Dates are compared as 'YYYY-MM-DD' strings, which sort chronologically.
events_data_test = events_data_adj.filter(
    lambda row: '2018-12-02' <= convertTime(row[2])[:10] < '2018-12-16')
events_data_test.take(1)

[['2201961907282901522',
  '4',
  '1543713091129',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [66]:
# For each non-key column of the test rows, build an RDD of
# [session_id, [all values of that column for the session]].
# Rows are [session_id, event, event_timestamp, event_value, user_id_hash],
# so only column indices 1..4 exist; index 5 would raise IndexError when evaluated.
temp_pair_test_dict = {}
for i in range(1, 5):
    # `i=i` freezes the column index in the lambda at definition time, so the
    # result does not depend on when Spark serializes the closure.
    temp_pair = events_data_test.map(lambda x, i=i: (x[0], x[i]))\
                                .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_test_dict[i] = temp_pair

In [67]:
# Collect, per session, the column-wise lists
#   (session_id, [[events], [timestamps], [values], [user_id_hashes]]).
# The rows are grouped ONCE and then transposed, so position k in every list
# refers to the same original event. Later steps index the value list with
# positions found in the event list, which requires that alignment; grouping
# each column separately and joining does not guarantee it, because the order
# of values inside a groupByKey group is not guaranteed.
events_data_test_adj = events_data_test.map(lambda x: (x[0], x[1:]))\
                                       .groupByKey()\
                                       .mapValues(lambda rows: [list(col) for col in zip(*rows)])
events_data_test_adj.take(1)

[('2110482880339331667',
  [['1',
    '5',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '45',
    '4',
    '6',
    '40',
    '41',
    '3',
    '42',
    '1',
    '1'],
   ['1544194347400',
    '1544194351427',
    '1544194423393',
    '1544194481805',
    '1544194519338',
    '1544194584780',
    '1544194652133',
    '1544194695390',
    '1544194771318',
    '1544194825036',
    '1544194926066',
    '1544194976711',
    '1544195017774',
    '1544195088559',
    '1544195124975',
    '1544195231198',
    '1544195255892',
    '1544195274452',
    '1544195278188',
    '1544195290584',
    '1544195292452',
    '1544195318555',
    '1544195320893',
    '1544195364243',
    '1544195378293'],
   ['0.0',
    '1.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.0',
    '0.

In [68]:
# Flatten each session record to
#   [session_id, events, timestamps, values, user_id_hash]
# where the per-event user list is collapsed to its most frequent value.
events_data_test_adj2 = events_data_test_adj.map(
    lambda rec: [rec[0]] + list(rec[1][:3]) + [most_common(rec[1][3])])
events_data_test_adj2.take(1)

[['7500399621368845766',
  ['45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '4',
   '6',
   '1',
   '5',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '7',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '4',
   '6',
   '40',
   '1',
   '5',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45',
   '45'],
  ['1543807781412',
   '1543807868888',
   '1543807891969',
   '1543807952363',
   '1543808000979',
   '1543808080755',
   '1543808135583',
   '1543808222467',
   '1543808224742',
   '1543805928549',
   '1543805932102',
   '1543806043738',
   '1543806090715',
   '1543806154773',
   '1543806235158',
   '1543806281883',
   '1543806310789',
   '1543806505729',
   '1543806612187',
   '1543806612218',
   '1543806632190',
   '1543806693806',
   '1543806721333',
   '1543806782611',
   '1543806826499',
   '1543806902142',
   '1543806933446',
   '1543806935698',
   '1543806946555',
   '1543806958347',
   '1543806960

In [69]:
# Replace the event list by the positions of its '8' events and the total
# number of events:
#   [session_id, purchase_positions, n_events, timestamps, values, user_id_hash]
events_data_test_adj3 = events_data_test_adj2.map(
    lambda rec: [rec[0], findPur(rec[1]), len(rec[1])] + list(rec[2:5]))
# Peek at one session that contains at least one '8' event.
events_data_test_adj3.filter(lambda rec: len(rec[1]) > 0).take(1)

[['3324869504369203093',
  [0],
  16,
  ['1543984333715',
   '1543984342092',
   '1543984342097',
   '1543984395785',
   '1543984427556',
   '1543984499133',
   '1543984587594',
   '1543984639267',
   '1543984710960',
   '1543984736530',
   '1543984798711',
   '1543984801475',
   '1543984810973',
   '1543984812794',
   '1543984861422',
   '1543984862638'],
  ['1.393',
   '20.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0',
   '0.0'],
  'c137d015d49e967f937597a9b50e6d5b8e5611e2ed1dc05b0c67c5230cd846d4']]

In [70]:
# Per-session features, keyed by user:
#   [user_id_hash, session_id, n_events, n_purchases, duration, purchase_amount]
# duration = latest minus earliest event timestamp. The timestamps are
# converted to float BEFORE taking max/min so they are compared numerically;
# max/min on the raw strings compares lexicographically, which is only
# correct while every timestamp has the same number of digits.
events_data_test_adj4 = events_data_test_adj3.map(lambda x: [x[5], x[0], x[2], len(x[1]),
                                                             max(map(float, x[3])) - min(map(float, x[3])),
                                                             moneySum(x[4], x[1])])
# Peek at one session with at least one purchase.
events_data_test_adj4.filter(lambda x: x[3]>0).take(1)

[['c137d015d49e967f937597a9b50e6d5b8e5611e2ed1dc05b0c67c5230cd846d4',
  '3324869504369203093',
  16,
  1,
  528923.0,
  1.393]]

In [71]:
# For each per-session feature column (indices 1..5 of the adj4 rows), build an
# RDD of [user_id_hash, [values of that column across the user's sessions]].
temp_pair_test_dict_2 = {}
for i in range(1, 6):
    # `i=i` freezes the column index in the lambda at definition time, so the
    # result does not depend on when Spark serializes the closure.
    temp_pair = events_data_test_adj4.map(lambda x, i=i: (x[0], x[i]))\
                                .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_test_dict_2[i] = temp_pair

In [72]:
# Assemble, per user, the lists of per-session features:
#   (user_id_hash, [[session_ids], [n_events], [n_purchases], [durations], [amounts]])
# Start from the session-id lists (column 1) and join the remaining columns one by one.
events_data_test_adj5 = temp_pair_test_dict_2[1]
# After the first join the value is the pair (session_ids, n_events), turned into a list.
events_data_test_adj5 = events_data_test_adj5.leftOuterJoin(temp_pair_test_dict_2[2])\
                                             .map(lambda x: (x[0], list(x[1])))
# Each later join yields [previous_list, new_column]; the second map flattens
# it back into a single list with the new column appended at the end.
events_data_test_adj5 = events_data_test_adj5.leftOuterJoin(temp_pair_test_dict_2[3])\
                                             .map(lambda x: (x[0], list(x[1])))\
                                             .map(lambda x: (x[0], [x[1][0][0], x[1][0][1], x[1][1]]))
events_data_test_adj5 = events_data_test_adj5.leftOuterJoin(temp_pair_test_dict_2[4])\
                                             .map(lambda x: (x[0], list(x[1])))\
                                             .map(lambda x: (x[0], [x[1][0][0], x[1][0][1], x[1][0][2], x[1][1]]))
events_data_test_adj5 = events_data_test_adj5.leftOuterJoin(temp_pair_test_dict_2[5])\
                                             .map(lambda x: (x[0], list(x[1])))\
                                             .map(lambda x: (x[0], [x[1][0][0], x[1][0][1], x[1][0][2], 
                                                                    x[1][0][3], x[1][1]]))
events_data_test_adj5.take(1)

[('a14d6729d06b71565cb88ac8d5f0b60970d68ba5831709391ce480d7fe79b06c',
  [['4064445969441574723',
    '2160171628568031558',
    '6480985702993295252',
    '841216543603380687'],
   [142, 6, 74, 2],
   [0, 0, 0, 0],
   [5193.0, 3199431.0, 6234163.0, 358229.0],
   [0.0, 0.0, 0.0, 0.0]])]

In [73]:
def _test_user_features(record):
    """Reduce one user's per-session lists to the final feature row."""
    user_id, columns = record
    session_ids = columns[0]
    event_counts = columns[1]
    purchase_counts = columns[2]
    durations = columns[3]
    amounts = columns[4]
    return [user_id,
            len(session_ids),
            sum(event_counts),
            getMedian(event_counts),
            sum(purchase_counts),
            sum(amounts),
            getMedian(durations)]

events_data_test_adj6 = events_data_test_adj5.map(_test_user_features)
events_data_test_adj6.take(1)

[['a14d6729d06b71565cb88ac8d5f0b60970d68ba5831709391ce480d7fe79b06c',
  4,
  224,
  40.0,
  0,
  0.0,
  1778830.0]]

In [74]:
# Serialize every feature row as one comma-separated line.
events_data_test_adj7 = events_data_test_adj6.map(lambda row: ','.join(map(str, row)))
events_data_test_adj7.take(1)

['a14d6729d06b71565cb88ac8d5f0b60970d68ba5831709391ce480d7fe79b06c,4,224,40.0,0,0.0,1778830.0']

In [75]:
# Pull all test rows to the driver and write them to a local CSV, one row per line.
with open('test_last_14_days.csv', 'w') as out_file:
    for row in events_data_test_adj7.collect():
        out_file.write(row + '\n')

In [76]:
# Number of user rows in the training and test feature sets, in that order.
for feature_rdd in (events_data_train_adj7, events_data_test_adj7):
    print(feature_rdd.count())

207601
81316


In [77]:
# Shut down the SparkContext now that both feature files have been written.
sc.stop()