In [1]:
import os
import numpy as np
import timeit
import pyspark
import pandas as pd
import s3fs

In [2]:
# Reuse the already-running SparkContext when there is one, otherwise start it.
sc = pyspark.SparkContext.getOrCreate()
# Same get-or-create pattern for the SparkSession, via its builder.
session_builder = pyspark.sql.SparkSession.builder
ss = session_builder.getOrCreate()

# Set up the data address book

In [3]:
# Base S3 location that holds every competition CSV.
s3_address = "s3a://msds630-kaggle-competition/"
dataset_name_list = ["events", "messages", "attributes", "sessions"]
# Map each dataset name to the full path of its CSV file.
dataset_addr_book = {
    dataset: s3_address + dataset + ".csv" for dataset in dataset_name_list
}

dataset_addr_book

{'attributes': 's3a://msds630-kaggle-competition/attributes.csv',
 'events': 's3a://msds630-kaggle-competition/events.csv',
 'messages': 's3a://msds630-kaggle-competition/messages.csv',
 'sessions': 's3a://msds630-kaggle-competition/sessions.csv'}

# Load events

In [4]:
# Read the events CSV line by line, dropping any non-ASCII characters.
# NOTE(review): on Python 3 `encode` returns bytes and the str-based split below
# would fail; the displayed output suggests a Python 2 kernel - confirm.
events_rdd = (
    sc.textFile(dataset_addr_book["events"])
      .map(lambda line: line.encode('ascii', 'ignore'))
)
# The first line of the file is the header: split it into column names.
cols_events = events_rdd.take(1)[0].split(',')
print(cols_events)
print(len(cols_events))

['app_id', 'session_id', 'event', 'event_timestamp', 'event_value', 'user_id_hash']
6


In [5]:
# Keep only data lines (any line containing 'app_id' is treated as the header)
# and split each remaining line into its comma-separated fields.
events_data = (
    events_rdd
    .filter(lambda line: 'app_id' not in line)
    .map(lambda line: line.split(','))
)
events_data.take(1)

[['4724682771660800',
  '5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [6]:
# Column names once app_id (the first column) is dropped, followed by the
# derived is_purchase flag.
cols_events_adj = cols_events[1:] + ['is_purchase']
cols_events_adj

['session_id',
 'event',
 'event_timestamp',
 'event_value',
 'user_id_hash',
 'is_purchase']

In [7]:
# Remove the leading app_id field from every row.
events_data_adj = events_data.map(lambda fields: fields[1:])
events_data_adj.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [8]:
def isPurchase(event):
    """Return 1.0 when `event` is the string '8', otherwise -1.0."""
    return 1.0 if event == '8' else -1.0

# Quick check: one matching and one non-matching event code.
for sample_event in ('8', '9'):
    print(isPurchase(sample_event))

1.0
-1.0


In [9]:
# Show the adjusted column names again as a reference for the indices used below.
cols_events_adj

['session_id',
 'event',
 'event_timestamp',
 'event_value',
 'user_id_hash',
 'is_purchase']

In [10]:
# Add is_purchase: keep the five existing fields of each row and append the
# flag computed from the event code (field 1).
events_data_adj = events_data_adj.map(
    lambda row: [row[pos] for pos in range(5)] + [isPurchase(row[1])]
)
events_data_adj.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006',
  -1.0]]

In [11]:
# Key each row by its event_timestamp (field 2, still a string at this point)
# and sort the whole RDD by that key.
events_data_adj_tsAsKey = events_data_adj.keyBy(lambda row: row[2]).sortByKey()
events_data_adj_tsAsKey.take(2)

[('1538377204526',
  ['4339488202460077611',
   '.a5027911885258752',
   '1538377204526',
   '0.0',
   'c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e',
   -1.0]),
 ('1538377204526',
  ['4339488202460077611',
   '.a5061295285075968',
   '1538377204526',
   '0.0',
   'c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e',
   -1.0])]

In [12]:
# Drop the timestamp keys again, keeping only the (now sorted) rows.
events_data_adj2 = events_data_adj_tsAsKey.map(lambda keyed_row: keyed_row[1])

In [13]:
# Column order reminder: index 0 is session_id, indices 1-5 are the remaining fields.
cols_events_adj

['session_id',
 'event',
 'event_timestamp',
 'event_value',
 'user_id_hash',
 'is_purchase']

In [14]:
# For every non-key column build a pair RDD of
# [session_id, [that column's values for the session]], stored under the
# column's index.
temp_pair_dict = {}
for i in range(1, 6):  # index 0 is session_id, which is the grouping key
    # Bind the current index as a default argument. A plain `lambda x: x[i]`
    # looks `i` up only when the closure is serialized/run, so every column
    # could silently end up using the last value of the loop variable.
    temp_pair = events_data_adj2.map(lambda x, col=i: (x[0], x[col]))\
                                .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_dict[i] = temp_pair

In [15]:
# Gather each session's events with a single shuffle and order them inside the
# group.
#
# Grouping every column in its own groupByKey and joining the pieces back
# together assumes the values come out in the same order in each independent
# shuffle. Spark does not guarantee the order of elements within a group, so
# the event / timestamp / value lists could stop lining up (and need not stay
# time-ordered). Grouping whole rows keeps the fields of one event together,
# and the explicit sort makes the ordering deterministic.
#
# Timestamps are compared as strings, exactly as the upstream sortByKey does.
#
# Result element (same shape as before):
#   (session_id, [events, timestamps, event_values, user_id_hashes, is_purchase flags])
events_data_adj3 = events_data_adj2.map(lambda x: (x[0], x[1:]))\
                                   .groupByKey()\
                                   .mapValues(lambda rows: [list(column) for column in
                                                            zip(*sorted(rows, key=lambda row: row[1]))])
# for i in range(4):
#     a = i+2
#     new_events_sample = new_events_sample.leftOuterJoin(temp_pair_dict[a]).map(lambda x: (x[0], list(x[1])))
#     new_events_sample = new_events_sample.map(lambda x: (x[0], [x[1][0][0],x[1][0][1],x[1][1]]))
#events_data_adj3.take(1)

In [16]:
# Flatten (session_id, [five per-column lists]) into one six-element list.
event_data_adj4 = events_data_adj3.map(
    lambda kv: [kv[0]] + [kv[1][pos] for pos in range(5)]
)
#event_data_adj4.take(1)

In [17]:
# Test user_id
# event_data_adj4.map(lambda x: (len(set(x[4])), 1))\
#                .reduceByKey(lambda x,y: x + y).collect()

In [18]:
def find_purchase(x):
    """Return 1.0 if the collection `x` contains 1.0, otherwise -1.0."""
    return 1.0 if 1.0 in x else -1.0

# Quick check: a session without and a session with a purchase flag.
for sample_flags in ([-1.0, -1.0], [-1.0, 1.0]):
    print(find_purchase(sample_flags))

-1.0
1.0


In [None]:
# event_data_adj5 = event_data_adj4.map(lambda x: [x[0], ' | '.join(x[1]), ' | '.join(x[2]), 
#                                                  ' | '.join(x[3]), 
#                                                  list(set(x[4]))[0], find_purchase(x[5])])
# event_data_adj5.take(1)                                                  

In [19]:
# One record per session: the id, the three ' | '-joined string columns
# (positions 1-3) and a single purchase flag derived from position 5.
event_data_adj5 = event_data_adj4.map(
    lambda rec: [rec[0]]
                + [' | '.join(rec[pos]) for pos in (1, 2, 3)]
                + [find_purchase(rec[5])]
)
event_data_adj5.take(3)

[['7310395194373542287',
  '45 | 1 | 1 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 4 | 6 | 40 | 41 | 3 | 42 | 5 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 4 | 6 | 40 | 41 | 3 | 42 | 5 | 14 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 4 | 6 | 40 | 41 | 3 | 42 | 1 | 1 | 1 | 5 | 45 | 14 | 14 | 14 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 45 | 6 | 1 | 7 | 45 | 45 | 45 | 4 | 6 | 40',
  '1541982498725 | 1541982505748 | 1541982543234 | 1541982672782 | 1541982714271 | 1541982865076 | 1541982869743 | 1541982875827 | 1541982897260 | 1541982900296 | 1541982904079 | 1541982976633 | 1541983010238 | 1541983026018 | 1541983028092 | 1541983035992 | 1541983037648 | 1541983060490 | 1541983061675 | 1541983071037 | 1541983134614 | 1541983178585 | 1541983276942 | 1541983354028 | 1541983431176 | 1541983505980 | 1541983590638 | 1541983653488 | 1541983735357

In [20]:
# event_data_adj5.coalesce(1).saveAsTextFile("s3a://msds630-kaggle-competition/events_adj.csv")
# Persist the per-session records to S3 as text.
# NOTE(review): each element is a Python list (built by the map that creates
# event_data_adj5) and saveAsTextFile writes the string representation of each
# element, so the lines are presumably list reprs rather than comma-separated
# values despite the .csv name - confirm downstream readers expect that.
# NOTE(review): re-running this cell likely fails if the target path already exists - verify.
event_data_adj5.saveAsTextFile("s3a://msds630-kaggle-competition/events_adj_mn.csv")

In [63]:
# experiment
# Pull the first 20 rows to the driver and re-distribute them as a tiny RDD.
first_rows = events_data_adj2.take(20)
events_sample = sc.parallelize(first_rows)
events_sample.collect()

[['4339488202460077611',
  '.a5027911885258752',
  '1538377204526',
  '0.0',
  'c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e',
  -1.0],
 ['4339488202460077611',
  '.a5061295285075968',
  '1538377204526',
  '0.0',
  'c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e',
  -1.0],
 ['7012480646309837595',
  '.a5027911885258752',
  '1538377208476',
  '0.0',
  '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee',
  -1.0],
 ['7012480646309837595',
  '44',
  '1538377240514',
  '0.0',
  '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee',
  -1.0],
 ['7012480646309837595',
  '5',
  '1538377242744',
  '0.0',
  '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee',
  -1.0],
 ['7012480646309837595',
  '45',
  '1538377266395',
  '0.0',
  '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee',
  -1.0],
 ['8483277250870486701',
  '.a5027911885258752',
  '1538377290694',
  '0.0',
  '4bc7b52d56b24d258ba1746994492f354

In [88]:
# Same per-column grouping as on the full data, run on the small sample:
# column index -> RDD of [session_id, [that column's values for the session]].
temp_pair_dict = {}
for i in range(1, 6):  # index 0 is session_id, which is the grouping key
    # Bind the current index as a default argument so the lambda does not
    # depend on the loop variable's value at serialization/execution time.
    temp_pair = events_sample.map(lambda x, col=i: (x[0], x[col]))\
                             .groupByKey().map(lambda x: [x[0], list(x[1])])
    temp_pair_dict[i] = temp_pair
    print(temp_pair.take(1))

[['7012480646309837595', ['.a5027911885258752', '44', '5', '45', '14', '14']]]
[['7012480646309837595', ['1538377208476', '1538377240514', '1538377242744', '1538377266395', '1538377304065', '1538377320349']]]
[['7012480646309837595', ['0.0', '0.0', '0.0', '0.0', '0.0', '0.0']]]
[['7012480646309837595', ['285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee', '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee', '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee', '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee', '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee', '285863e47df26bf47664c93d45c35ee3b9cfc8e253da23760fdbeacb2ba140ee']]]
[['7012480646309837595', [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0]]]


In [113]:
# Gather each sampled session's events with a single shuffle and order them
# inside the group.
#
# Grouping every column separately and joining the results assumes identical
# value order across independent groupByKey shuffles, which Spark does not
# guarantee. Grouping whole rows keeps one event's fields together and the
# explicit sort (timestamps compared as strings, like the upstream sortByKey)
# makes the order deterministic.
#
# Result element (same shape as before):
#   (session_id, [events, timestamps, event_values, user_id_hashes, is_purchase flags])
new_events_sample = events_sample.map(lambda x: (x[0], x[1:]))\
                                 .groupByKey()\
                                 .mapValues(lambda rows: [list(column) for column in
                                                          zip(*sorted(rows, key=lambda row: row[1]))])
# for i in range(4):
#     a = i+2
#     new_events_sample = new_events_sample.leftOuterJoin(temp_pair_dict[a]).map(lambda x: (x[0], list(x[1])))
#     new_events_sample = new_events_sample.map(lambda x: (x[0], [x[1][0][0],x[1][0][1],x[1][1]]))
# Bring the whole (small, sample-based) result back to the driver for inspection.
new_events_sample.collect()

[('2920311224184894615',
  [['45'],
   ['1538377326564'],
   ['0.0'],
   ['475f8f20576637055c5331513dda4db1d889f5e951c8e5f62f718a7fd871bd7b'],
   [-1.0]]),
 ('4339488202460077611',
  [['.a5027911885258752', '.a5061295285075968', '44', '5'],
   ['1538377204526', '1538377204526', '1538377314780', '1538377317525'],
   ['0.0', '0.0', '0.0', '0.0'],
   ['c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e',
    'c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e',
    'c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e',
    'c9bad12f865142f1b3938c9ab8a3003a7882136a2b4a121c2a20008cf44ed91e'],
   [-1.0, -1.0, -1.0, -1.0]]),
 ('8114762104527844914',
  [['11', '4', '6', '40'],
   ['1538377330697', '1538377332486', '1538377338829', '1538377345657'],
   ['5.0', '0.0', '0.0', '0.0'],
   ['82349b0d1a80b42e59519068ded46178778084b9602b9c2edcd4cd0f94941733',
    '82349b0d1a80b42e59519068ded46178778084b9602b9c2edcd4cd0f94941733',
    '82349b0d1a80b42e59519068de