In [3]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql.functions import lit

# Obtain the Spark session for this notebook (getOrCreate hands back an
# already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("project")
    .getOrCreate()
)

sc = spark.sparkContext

# Column layout of the raw event CSVs as (name, Spark SQL type) pairs,
# rendered below into one comma-separated DDL string.
_event_columns = [
    ("event_time", "TIMESTAMP"),
    ("event_type", "STRING"),
    ("product_id", "INT"),
    ("category_id", "BIGINT"),
    ("category_code", "STRING"),
    ("brand", "STRING"),
    ("price", "DECIMAL(7,2)"),
    ("user_id", "INT"),
    ("user_session", "STRING"),
]
schema = ",".join(f"`{name}` {sql_type}" for name, sql_type in _event_columns)

# NOTE(review): _parse_datatype_string is an underscore-prefixed (private)
# PySpark helper; it is kept here because later cells pass `ddl_schema`
# to the CSV reader.
ddl_schema = T._parse_datatype_string(schema)

CPU times: user 2.55 ms, sys: 695 µs, total: 3.25 ms
Wall time: 8.97 ms


In [4]:
%%time
df01 = spark.read.option("header","true") \
        .schema(ddl_schema) \
        .csv("/project/ds5559/group12/raw_data/2020-01.csv") # January 2020

df02 = spark.read.option("header","true") \
        .schema(ddl_schema) \
        .csv("/project/ds5559/group12/raw_data/2020-02.csv") # February 2020
print(f'Time Period 1 Count: {df01.count()}')
print(f'Time Period 2 Count: {df02.count()}')

Time Period 1 Count: 55967041
Time Period 2 Count: 55318565
CPU times: user 5.44 ms, sys: 4.45 ms, total: 9.89 ms
Wall time: 7.98 s


#### Get the distinct `user_session` id list so that ids with many events are no more likely to be picked than ids with few events

In [22]:
%%time
ids = df01.select('user_session').distinct()
#ids.count()

CPU times: user 1.34 ms, sys: 1.28 ms, total: 2.62 ms
Wall time: 11.2 ms


#### Take a random 10% sample of session ids from the first time period

In [23]:
%%time
sample01 = ids.sample(.1, 18) #18 is a seed for reproduceability

CPU times: user 216 µs, sys: 204 µs, total: 420 µs
Wall time: 1.37 ms


In [24]:
#%%time
#sample01.count()

In [25]:
%%time
selected_t1 = df01.join(sample01, 'user_session', 'leftsemi')

CPU times: user 1.48 ms, sys: 0 ns, total: 1.48 ms
Wall time: 13.5 ms


In [26]:
%%time
print(selected_t1.count())

5592558
CPU times: user 3.57 ms, sys: 3.34 ms, total: 6.91 ms
Wall time: 40.8 s


In [27]:
%%time
selected_t2 = df02.join(sample01, 'user_session', 'leftsemi')
print(selected_t2.count())

15381
CPU times: user 5.47 ms, sys: 2.81 ms, total: 8.27 ms
Wall time: 39 s


#### Add time-period tags so the two periods can be separated later without date-time arithmetic

In [28]:
%%time
selected_t1 = selected_t1.withColumn("Period", lit(1))
selected_t2 = selected_t2.withColumn("Period", lit(2))

CPU times: user 1.56 ms, sys: 1.6 ms, total: 3.16 ms
Wall time: 34.3 ms


In [29]:
#Confirmed looks as expected - wall time 6 minutes
#%%time
#selected_t1.show(5)

In [30]:
#Confirmed looks as expected - wall time 6 mins
#%%time
#selected_t2.show(5)

#### Union the two into one dataframe

In [31]:
%%time
finaldf = selected_t1.union(selected_t2)

CPU times: user 411 µs, sys: 378 µs, total: 789 µs
Wall time: 5.14 ms


In [32]:
# %%time # Wall time 12 mins with show()
# print(finaldf.count())
# finaldf.show(5) # Looks as expected

#### Train Test Split

In [33]:
%%time
train_ids = sample01.sample(.8, 543)

CPU times: user 264 µs, sys: 241 µs, total: 505 µs
Wall time: 1.31 ms


In [34]:
%%time 
#Wall time 5 mins
training_data = finaldf.join(train_ids, 'user_session', 'leftsemi')
#print(training_data.count())

CPU times: user 874 µs, sys: 888 µs, total: 1.76 ms
Wall time: 14.6 ms


In [35]:
%%time 
# Wall time 1 min
test_data = finaldf.join(train_ids, 'user_session', 'leftanti')
#print(test_data.count())

CPU times: user 1.55 ms, sys: 474 µs, total: 2.03 ms
Wall time: 13.5 ms


In [36]:
#%%time 
# Wall time 11 mins
#training_data.show(5) #Looks as expected

In [37]:
# %%time 
# Wall time 11 mins
# test_data.show(5) #Looks as expected

In [40]:
# %%time
#Wall time 14 mins
# training_data.write.csv("/project/ds5559/group12/raw_data/train10percent.csv")

CPU times: user 60.7 ms, sys: 50.4 ms, total: 111 ms
Wall time: 13min 49s


In [39]:
#%%time 
#Wall time 14 mins
#test_data.write.csv("/project/ds5559/group12/raw_data/test10percent.csv")

CPU times: user 61 ms, sys: 50.9 ms, total: 112 ms
Wall time: 13min 49s


In [42]:
#%%time 
#Wall time 14 minutes
#training_data.write.parquet("/project/ds5559/group12/raw_data/train10percent.parquet")

CPU times: user 59.8 ms, sys: 52.5 ms, total: 112 ms
Wall time: 14min 3s


In [43]:
#%%time 
#Wall time 14 minutes
#test_data.write.parquet("/project/ds5559/group12/raw_data/test10percent.parquet")

CPU times: user 63.7 ms, sys: 50.2 ms, total: 114 ms
Wall time: 14min 25s


## Alternatively: keep only sessions that include a purchase in time period 1

In [6]:
%%time 
buyers = df01.filter(df01.event_type == 'purchase').select(df01.user_session).distinct() #700k
#buyers.count() # 15 seconds

CPU times: user 3.63 ms, sys: 2.63 ms, total: 6.26 ms
Wall time: 14.6 s


690930

In [8]:
%%time
buyer_data_1 = df01.join(buyers, 'user_session', 'leftsemi')
# print(buyer_data_1.count()) # 5.8 million
#buyer_data_1.show(5) # 8 mins, looks as expected

5853224
+--------------------+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+
|        user_session|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|
+--------------------+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+
|0001d6fa-2839-402...|2020-01-31 13:43:02|      view|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|
|0001d6fa-2839-402...|2020-01-31 13:45:06|      cart|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|
|0001d6fa-2839-402...|2020-01-31 13:46:41|  purchase|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|
|0001d6fa-2839-402...|2020-01-31 13:47:28|      view|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|
|0001d6fa-2839-402...|2020-01-31 13:47:38|      view|   1005212|22327320930

In [10]:
%%time
buyer_data_2 = df02.join(buyers, 'user_session', 'leftsemi')
print(buyer_data_2.count())

36473
CPU times: user 6.4 ms, sys: 3.49 ms, total: 9.89 ms
Wall time: 44.9 s


In [12]:
%%time
buyer_data_1 = buyer_data_1.withColumn("Period", lit(1)) # Add time period
buyer_data_2 = buyer_data_2.withColumn("Period", lit(2))

CPU times: user 1.16 ms, sys: 2.02 ms, total: 3.18 ms
Wall time: 34.7 ms


In [13]:
%%time
buyer_final_df = buyer_data_1.union(buyer_data_2)

CPU times: user 649 µs, sys: 1.17 ms, total: 1.82 ms
Wall time: 16.5 ms


In [14]:
# Train Test Split

In [17]:
%%time
buyer_train_ids = buyers.sample(.8, 543)

CPU times: user 1.13 ms, sys: 35 µs, total: 1.17 ms
Wall time: 1.87 ms


In [31]:
%%time 
#Wall time 1 mins
training_datab = buyer_final_df.join(buyer_train_ids, 'user_session', 'leftsemi')
#print(training_datab.count())

CPU times: user 2.09 ms, sys: 0 ns, total: 2.09 ms
Wall time: 15.3 ms


In [30]:
%%time 
#Wall time 1 mins
test_datab = buyer_final_df.join(buyer_train_ids, 'user_session', 'leftanti')
print(test_datab.count())

1177198
CPU times: user 12.2 ms, sys: 5.79 ms, total: 18 ms
Wall time: 1min 50s


In [35]:
%%time 
#Wall time 14 minutes
#training_datab.write.parquet("/project/ds5559/group12/raw_data/train_buyers.parquet")

CPU times: user 66.3 ms, sys: 49.9 ms, total: 116 ms
Wall time: 14min 10s


In [37]:
%%time 
#Wall time 13 minutes
#test_datab.write.parquet("/project/ds5559/group12/raw_data/test_buyers.parquet")

CPU times: user 61.5 ms, sys: 46.8 ms, total: 108 ms
Wall time: 12min 56s


In [32]:
%%time 
#Wall time 26 min
training_datab.show(5)

+--------------------+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+------+
|        user_session|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|Period|
+--------------------+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+------+
|0001d6fa-2839-402...|2020-01-31 13:43:02|      view|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|     1|
|0001d6fa-2839-402...|2020-01-31 13:45:06|      cart|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|     1|
|0001d6fa-2839-402...|2020-01-31 13:46:41|  purchase|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|     1|
|0001d6fa-2839-402...|2020-01-31 13:47:28|      view|   1005212|2232732093077520756|construction.tool...|samsung|164.06|599532510|     1|
|0001d6fa-2839-402...|2020-01-31 1

NameError: name 'trainig_datab' is not defined

In [34]:
# Row count of the buyer training set; the bare name on the last line makes
# the notebook display the value.
training_datab_rows = training_datab.count()
training_datab_rows

4712499

In [33]:
%%time
test_datab.show(5)
test_datab.count()

+--------------------+-------------------+----------+----------+-------------------+-------------+--------+-----+---------+------+
|        user_session|         event_time|event_type|product_id|        category_id|category_code|   brand|price|  user_id|Period|
+--------------------+-------------------+----------+----------+-------------------+-------------+--------+-----+---------+------+
|0007e40b-2d95-460...|2020-01-16 11:51:48|      view|   7004322|2232732079009824823|  kids.skates|babytime|61.78|579436960|     1|
|0007e40b-2d95-460...|2020-01-16 11:52:14|      view|   7005805|2232732079009824823|  kids.skates|babytime|61.60|579436960|     1|
|0007e40b-2d95-460...|2020-01-16 11:53:36|      view|   7005063|2232732079009824823|  kids.skates|babytime|63.06|579436960|     1|
|0007e40b-2d95-460...|2020-01-16 11:54:13|      view|   7006592|2232732079009824823|  kids.skates|    yoga|59.20|579436960|     1|
|0007e40b-2d95-460...|2020-01-16 11:54:30|      view|   7006592|2232732079009824823

1177198