In [52]:
import os
import numpy as np
import timeit
import pyspark
import pandas as pd
import s3fs
import time
import ast
from datetime import datetime

In [53]:
# Obtain the SparkContext (entry point for the RDD API used below), reusing an
# already-active one if it exists instead of creating a second context.
sc = pyspark.SparkContext.getOrCreate()
# Obtain the SparkSession the same way; `ss` is not referenced by the cells
# visible in this notebook section.
ss = pyspark.sql.SparkSession.builder.getOrCreate()

In [54]:
# Map every competition dataset name to the S3 location of its CSV file.
s3_address = "s3a://msds630-kaggle-competition/"
dataset_name_list = ["events", "messages", "attributes", "sessions"]
dataset_addr_book = {}
for name in dataset_name_list:
    # <bucket prefix><dataset name>.csv
    dataset_addr_book[name] = s3_address + name + ".csv"

dataset_addr_book

{'attributes': 's3a://msds630-kaggle-competition/attributes.csv',
 'events': 's3a://msds630-kaggle-competition/events.csv',
 'messages': 's3a://msds630-kaggle-competition/messages.csv',
 'sessions': 's3a://msds630-kaggle-competition/sessions.csv'}

In [55]:
# Load the raw events CSV as an RDD of text lines.
# Non-ASCII characters are stripped by the encode(..., 'ignore') step; the
# result is decoded back to text so that every line remains a text string.
# Without the decode, Python 3 would hand bytes objects to the later
# x.split(',') / 'app_id' not in x calls, which raise TypeError for a str
# argument on bytes.
events_rdd = sc.textFile(dataset_addr_book["events"])\
               .map(lambda line: line.encode('ascii', 'ignore').decode('ascii'))
# The first line of the file is the header row; split it into column names.
cols_events = events_rdd.map(lambda x: x.split(',')).take(1)[0]
print(cols_events)
print(len(cols_events))

['app_id', 'session_id', 'event', 'event_timestamp', 'event_value', 'user_id_hash']
6


In [56]:
# Drop every line that contains the text 'app_id' (i.e. the header row) and
# split each remaining CSV line into a list of string fields.
events_data = (
    events_rdd
    .filter(lambda line: 'app_id' not in line)
    .map(lambda line: line.split(','))
)
events_data.take(1)

[['4724682771660800',
  '5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [57]:
# Discard the leading app_id column; what remains per record is
# session_id, event, event_timestamp, event_value, user_id_hash.
events_data_adj = events_data.map(lambda fields: fields[1:])
events_data_adj.take(1)

[['5558845121177764917',
  '45',
  '1542215397132',
  '0.0',
  '9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006']]

In [58]:
def convertTime(ts_str):
    """Format an epoch timestamp given in milliseconds as a UTC date string.

    Args:
        ts_str: Milliseconds since the Unix epoch, as a string (or anything
            accepted by ``int()``).

    Returns:
        The corresponding UTC time formatted as 'YYYY-MM-DD HH:MM:SS'.
    """
    epoch_seconds = int(ts_str) / 1000.0  # milliseconds -> seconds
    utc_moment = datetime.utcfromtimestamp(epoch_seconds)
    return utc_moment.strftime('%Y-%m-%d %H:%M:%S')
convertTime('1541638424150')

'2018-11-08 00:53:44'

In [59]:
# Reduce every event record to a (user_id_hash, formatted event time) pair:
# index 4 is user_id_hash and index 2 is event_timestamp once app_id is dropped.
events_data_adj2 = events_data_adj.map(
    lambda fields: (fields[4], convertTime(fields[2]))
)
events_data_adj2.take(1)

[('9943447915df3a45fd6720a026af905b6da6b56a37701b8b2629802e9a541006',
  '2018-11-14 17:09:57')]

In [60]:
# Select events by time with plain string comparison; this works because the
# timestamps are 'YYYY-MM-DD HH:MM:SS' strings produced by convertTime.
# NOTE(review): the test filter keeps everything before 2018-12-16, so it also
# contains the whole training period — confirm this overlap is intended.
events_data_train = events_data_adj2.filter(lambda rec: rec[1] < '2018-12-02')
events_data_test = events_data_adj2.filter(lambda rec: rec[1] < '2018-12-16')

In [61]:
# Peek at the timestamp string (second tuple element) of the first training record.
events_data_train.take(1)[0][1]

'2018-11-14 17:09:57'

In [62]:
# Scratch check: parse a timestamp string in the convertTime output format
# back into a datetime object.
datetime.strptime('2018-11-14 17:09:57', '%Y-%m-%d %H:%M:%S')

datetime.datetime(2018, 11, 14, 17, 9, 57)

In [63]:
def getMaxDate(dt_list):
    """Return the most recent timestamp in ``dt_list`` as a datetime.

    Args:
        dt_list: Iterable of 'YYYY-MM-DD HH:MM:SS' strings.

    Returns:
        The latest of the parsed datetimes.

    Raises:
        ValueError: If ``dt_list`` is empty or an entry does not match the
            expected format.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    parsed_dates = (datetime.strptime(stamp, stamp_format) for stamp in dt_list)
    return max(parsed_dates)
getMaxDate(['2018-11-14 17:09:57','2018-12-14 17:09:57'])

datetime.datetime(2018, 12, 14, 17, 9, 57)

In [64]:
# Latest event time per user for each split, as [user_id_hash, datetime].
# reduceByKey(max) merges values inside each partition before the shuffle, so
# only one timestamp per user and partition is transferred; groupByKey would
# ship and materialise every user's complete list of timestamps just to take
# its maximum.
# The values are zero-padded 'YYYY-MM-DD HH:MM:SS' strings, so their
# lexicographic maximum is also the chronological maximum. getMaxDate is then
# applied to that single winning string to parse it into a datetime.
event_data_train_group = events_data_train.reduceByKey(max).map(lambda x: [x[0], getMaxDate([x[1]])])
event_data_test_group = events_data_test.reduceByKey(max).map(lambda x: [x[0], getMaxDate([x[1]])])

In [65]:
# Inspect one [user_id_hash, latest event datetime] record from the training split.
event_data_train_group.take(1)

[['f112a4a8617253cf392752d6cb8eaae910750852a2c95ec0b5255b090cd8ef2f',
  datetime.datetime(2018, 10, 31, 9, 2, 59)]]

In [66]:
# Scratch check: str() of a datetime yields the 'YYYY-MM-DD HH:MM:SS' text
# that is written to the output files below.
str(datetime(2018, 11, 5, 8, 0, 26))

'2018-11-05 08:00:26'

In [67]:
# Render each [user_id_hash, datetime] record as one "user_id_hash,timestamp"
# CSV line.
event_data_train_group_2 = event_data_train_group.map(lambda rec: rec[0] + ',' + str(rec[1]))
event_data_test_group_2 = event_data_test_group.map(lambda rec: rec[0] + ',' + str(rec[1]))

In [68]:
# Inspect one formatted "user_id_hash,timestamp" line from the training split.
event_data_train_group_2.take(1)

['f112a4a8617253cf392752d6cb8eaae910750852a2c95ec0b5255b090cd8ef2f,2018-10-31 09:02:59']

In [45]:
# event_data_train_group_2.coalesce(1).saveAsTextFile("train_most_recent_session.csv")

In [46]:
# event_data_test_group_2.coalesce(1).saveAsTextFile("test_most_recent_session.csv")

In [71]:
# Bring all training "user_id_hash,timestamp" lines to the driver with
# collect() and store them, one per line, in a local CSV file.
with open('most_recent_session_train.csv', 'w') as f:
    for line in event_data_train_group_2.collect():
        f.write(line + '\n')

In [72]:
# Inspect one formatted "user_id_hash,timestamp" line from the test split.
event_data_test_group_2.take(1)

['545bc18dece2ea9762fabe6d23507455e955c0aec1369f51a704d9b2c564b1f4,2018-11-05 08:00:26']

In [73]:
# Bring all test "user_id_hash,timestamp" lines to the driver with collect()
# and store them, one per line, in a local CSV file.
with open('most_recent_session_test.csv', 'w') as f:
    for line in event_data_test_group_2.collect():
        f.write(line + '\n')

In [None]:
# Number of lines in the training output, i.e. one per distinct user_id_hash.
event_data_train_group_2.count()