In [1]:
import os
import numpy as np
import timeit
import pyspark
import pandas as pd
import s3fs
import time
import ast
import operator
from datetime import datetime

In [2]:
# Obtain the Spark entry points for this notebook. getOrCreate() hands back the
# already-running context/session when one exists instead of starting another.
# `sc` is used below to read the events file with sc.textFile; `ss` is not
# referenced in the cells visible here.
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Bucket prefix and the names of the CSV datasets stored under it.
s3_address = "s3a://msds630-kaggle-competition/"
dataset_name_list = ["events", "messages", "attributes", "sessions"]

# Map each dataset name to the full S3 URI of its CSV file
# (bucket prefix + dataset name + ".csv").
dataset_addr_book = {}
for name in dataset_name_list:
    dataset_addr_book[name] = s3_address + name + ".csv"

dataset_addr_book

{'attributes': 's3a://msds630-kaggle-competition/attributes.csv',
 'events': 's3a://msds630-kaggle-competition/events.csv',
 'messages': 's3a://msds630-kaggle-competition/messages.csv',
 'sessions': 's3a://msds630-kaggle-competition/sessions.csv'}

In [4]:
# Load the raw events CSV as an RDD of text lines.
# Non-ASCII characters are dropped, then the result is decoded back to text so
# that every line is a string on both Python 2 and Python 3. With a bare
# .encode() the lines are bytes on Python 3, and the str-based operations used
# on them afterwards (x.split(','), 'app_id' not in x) raise TypeError.
# On Python 2 the lines are unicode objects after this change.
events_rdd = sc.textFile(dataset_addr_book["events"])\
               .map(lambda line: line.encode('ascii', 'ignore').decode('ascii'))

# The first line of the file is the CSV header; split it into column names.
cols_events = events_rdd.first().split(',')
print(cols_events)
print(len(cols_events))

['app_id', 'session_id', 'event', 'event_timestamp', 'event_value', 'user_id_hash']
6


In [5]:
# Remove the header row by discarding every line that contains the text
# 'app_id', then split each remaining CSV line into a list of field strings.
events_no_header = events_rdd.filter(lambda line: 'app_id' not in line)
events_data = events_no_header.map(lambda line: line.split(','))
events_data.take(1)

[['4724682771660800',
  '5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [6]:
# Column names left after dropping app_id (the first column), followed by the
# derived is_purchase column.
cols_events_adj = cols_events[1:] + ['is_purchase']
cols_events_adj

['session_id',
 'event',
 'event_timestamp',
 'event_value',
 'user_id_hash',
 'is_purchase']

In [19]:
# Drop app_id: keep every field after the first one in each row.
events_data_adj = events_data.map(lambda fields: fields[1:])
events_data_adj.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [20]:
def isPurchase(event):
    """Return the purchase flag for an event code.

    The flag is 1.0 when `event` equals the string '8' and 0.0 for any
    other value.
    """
    return 1.0 if event == '8' else 0.0

# Test
for sample in ('8', '9'):
    print(isPurchase(sample))

1.0
0.0


In [21]:
# Show the adjusted column list again; the next cell builds rows in this order.
cols_events_adj

['session_id',
 'event',
 'event_timestamp',
 'event_value',
 'user_id_hash',
 'is_purchase']

In [22]:
# Add is_purchase
events_data_adj = events_data_adj.map(lambda x: [x[0], x[1], x[2], x[3], x[4], isPurchase(x[1])])
events_data_adj.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006',
  0.0]]

In [23]:
def convertTime(ts_str):
    """Format an epoch timestamp given in milliseconds as a UTC date-time string.

    `ts_str` is parsed with int(), converted from milliseconds to seconds, and
    rendered as 'YYYY-MM-DD HH:MM:SS'.
    """
    epoch_seconds = int(ts_str) / 1000.0
    utc_moment = datetime.utcfromtimestamp(epoch_seconds)
    return utc_moment.strftime('%Y-%m-%d %H:%M:%S')
convertTime('1541638424150')

'2018-11-08 00:53:44'

In [24]:
# The first ten characters of a convertTime() result are the YYYY-MM-DD date;
# the date filters further down slice timestamps the same way.
'2018-11-08 00:53:44'[:10]

'2018-11-08'

In [25]:
def _time_user_flag(row):
    """Reduce a six-field row to [formatted event time, user_id_hash, is_purchase]."""
    event_time = convertTime(row[2])
    return [event_time, row[4], row[5]]

# NOTE: this rebinds events_data_adj to three-field rows. Running the cell a
# second time would apply the map to those three-field rows, where indexes 4
# and 5 no longer exist.
events_data_adj = events_data_adj.map(_time_user_flag)
events_data_adj.take(1)

[['2018-11-14 17:09:57',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006',
  0.0]]

In [27]:
def _on_or_after_dec_02(rec):
    """True when the record's date (first ten characters of its timestamp) is 2018-12-02 or later."""
    event_day = rec[0][:10]
    return event_day >= '2018-12-02'

# Dates are compared as 'YYYY-MM-DD' strings, which order chronologically.
events_data_adj2 = events_data_adj.filter(_on_or_after_dec_02)
events_data_adj2.take(1)

[['2018-12-02 01:11:31',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006',
  0.0]]

In [28]:
# Number of events dated 2018-12-02 or later.
events_data_adj2.count()

12754394

In [29]:
def _before_dec_16(rec):
    """True when the record's date (first ten characters of its timestamp) is earlier than 2018-12-16."""
    event_day = rec[0][:10]
    return event_day < '2018-12-16'

# Upper bound of the window: together with the earlier lower bound this keeps
# events dated 2018-12-02 through 2018-12-15.
events_data_adj3 = events_data_adj2.filter(_before_dec_16)
events_data_adj3.count()

12754394

In [36]:
def getToOne(num):
    """Collapse a count into a binary indicator.

    Returns 1.0 when `num` is at least 1, and 0.0 otherwise (including when
    `num` is None).
    """
    if num is not None and num >= 1:
        return 1.0
    return 0.0
for probe in (None, 1, 2, 0):
    print(getToOne(probe))

0.0
1.0
1.0
0.0


In [37]:
# Per-user label for events dated 2018-12-02 through 2018-12-15:
# key each event by user_id_hash (index 1), add up the is_purchase flags
# (index 2), then clamp the total to 0.0/1.0 with getToOne.
# reduceByKey adds the flags within each partition before the shuffle, so only
# partial sums move between executors. The previous groupByKey + sum shuffled
# every individual flag and held all of a user's flags in memory before adding.
label_2_rdd = events_data_adj3.map(lambda x: (x[1], x[2]))\
                              .reduceByKey(operator.add)\
                              .map(lambda x: [x[0], getToOne(x[1])])
label_2_rdd.take(1)

[['94482eb06653391eabffeb27ff94b84dca88e0ad3ceaa61f6a26ff36e7130a36', 0.0]]

In [39]:
# Render each [user_id_hash, label] pair as one comma-separated text line.
label_2_rdd_2 = label_2_rdd.map(lambda pair: ','.join(map(str, pair)))
label_2_rdd_2.take(1)

['94482eb06653391eabffeb27ff94b84dca88e0ad3ceaa61f6a26ff36e7130a36,0.0']

In [40]:
# Pull every label line to the driver and write them to label_2.csv in the
# working directory, one line per user with no header row.
with open('label_2.csv', 'w') as out_file:
    out_file.writelines(row + '\n' for row in label_2_rdd_2.collect())

In [41]:
def _before_dec_09(rec):
    """True when the record's date (first ten characters of its timestamp) is earlier than 2018-12-09."""
    event_day = rec[0][:10]
    return event_day < '2018-12-09'

# Narrow the 2018-12-02 .. 2018-12-15 window to events dated before 2018-12-09.
events_data_adj4 = events_data_adj3.filter(_before_dec_09)
events_data_adj4.count()

7349848

In [42]:
# Per-user label for events dated 2018-12-02 through 2018-12-08:
# key each event by user_id_hash (index 1), add up the is_purchase flags
# (index 2), then clamp the total to 0.0/1.0 with getToOne.
# reduceByKey adds the flags within each partition before the shuffle, so only
# partial sums move between executors. The previous groupByKey + sum shuffled
# every individual flag and held all of a user's flags in memory before adding.
label_1_rdd = events_data_adj4.map(lambda x: (x[1], x[2]))\
                              .reduceByKey(operator.add)\
                              .map(lambda x: [x[0], getToOne(x[1])])
label_1_rdd.take(1)

[['bf676611754201cc93ca1e2bcce8ca2c7ff0105186ebc03e4945affc1f80192d', 1.0]]

In [43]:
# Render each [user_id_hash, label] pair as one comma-separated text line.
label_1_rdd_2 = label_1_rdd.map(lambda pair: ','.join(map(str, pair)))
label_1_rdd_2.take(1)

['16663074c2e49d54525924457622d0a80eba89a50a670f3b0767a96b392ae06d,0.0']

In [44]:
# Pull every label line to the driver and write them to label_1.csv in the
# working directory, one line per user with no header row.
with open('label_1.csv', 'w') as out_file:
    out_file.writelines(row + '\n' for row in label_1_rdd_2.collect())