In [3]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql.functions import lit

spark = SparkSession.builder \
        .appName("project") \
        .getOrCreate()

sc = spark.sparkContext

schema = "`event_time` TIMESTAMP,`event_type` STRING,`product_id` INT,`category_id` BIGINT,`category_code` STRING,`brand` STRING,`price` FLOAT,`user_id` INT,`user_session` STRING"
ddl_schema = T._parse_datatype_string(schema)

CPU times: user 846 µs, sys: 1.78 ms, total: 2.63 ms
Wall time: 8.69 ms


In [4]:
%%time
df01 = spark.read.option("header","true") \
        .schema(ddl_schema) \
        .csv("/project/ds5559/group12/raw_data/2020-01.csv") # January 2020

df02 = spark.read.option("header","true") \
        .schema(ddl_schema) \
        .csv("/project/ds5559/group12/raw_data/2020-02.csv") # February 2020
print(f'Time Period 1 Count: {df01.count()}')
print(f'Time Period 2 Count: {df02.count()}')

Time Period 1 Count: 55967041
Time Period 2 Count: 55318565
CPU times: user 5.18 ms, sys: 2.83 ms, total: 8.01 ms
Wall time: 7.82 s


In [None]:
df01.show(5)

#### Get distinct id list so more frequent visitors are no more likely to be picked than less frequent ones. 

In [None]:
%%time
ids = df01.select('user_id').distinct()
#ids.count()

#### Shuffle data & take random sample of first time period

In [None]:
%%time
sample01 = ids.sample(.1, 18) #18 is a seed for reproduceability

In [None]:
#%%time
#sample01.count()

In [None]:
%%time
selected_t1 = df01.join(sample01, 'user_id', 'leftsemi')

In [None]:
%%time
print(selected_t1.count())

In [None]:
%%time
selected_t2 = df02.join(sample01, 'user_id', 'leftsemi')
print(selected_t2.count())

#### Add time period tags so we don't have to deal with futzing with date-time when trying to separate time periods

In [None]:
%%time
selected_t1 = selected_t1.withColumn("Period", lit(1))
selected_t2 = selected_t2.withColumn("Period", lit(2))

In [None]:
#Confirmed looks as expected - wall time 6 minutes
#%%time
#selected_t1.show(5)

In [None]:
#Confirmed looks as expected - wall time 6 mins
#%%time
#selected_t2.show(5)

#### Union the two into one dataframe

In [None]:
%%time
finaldf = selected_t1.union(selected_t2)

In [None]:
# %%time # Wall time 12 mins with show()
# print(finaldf.count())
# finaldf.show(5) # Looks as expected

#### Train Test Split

In [None]:
%%time
train_ids = sample01.sample(.8, 543)

In [None]:
%%time 
#Wall time 5 mins
training_data = finaldf.join(train_ids, 'user_id', 'leftsemi')
#print(training_data.count())

In [None]:
%%time 
# Wall time 1 min
test_data = finaldf.join(train_ids, 'user_id', 'leftanti')
#print(test_data.count())

In [None]:
#%%time 
# Wall time 11 mins
#training_data.show(5) #Looks as expected

In [None]:
# %%time 
# Wall time 11 mins
# test_data.show(5) #Looks as expected

In [None]:
# %%time
#Wall time 14 mins
# training_data.write.csv("/project/ds5559/group12/raw_data/train10percent.csv")

In [None]:
#%%time 
#Wall time 14 mins
#test_data.write.csv("/project/ds5559/group12/raw_data/test10percent.csv")

In [None]:
#%%time 
#Wall time 14 minutes
#training_data.write.parquet("/project/ds5559/group12/raw_data/train10percent.parquet")

In [None]:
#%%time 
#Wall time 14 minutes
#test_data.write.parquet("/project/ds5559/group12/raw_data/test10percent.parquet")

## Alternatively... only people who made a purchase in time period 1

In [5]:
%%time 
buyers = df01.filter(df01.event_type == 'purchase').select(df01.user_id).distinct() #360k
#buyers.count() # 15 seconds

CPU times: user 2.05 ms, sys: 1.94 ms, total: 3.99 ms
Wall time: 143 ms


In [6]:
%%time
buyer_data_1 = df01.join(buyers, 'user_id', 'leftsemi')
#print(buyer_data_1.count()) # 5.8 million
#buyer_data_1.show(5) # 8 mins, looks as expected

CPU times: user 2.08 ms, sys: 985 µs, total: 3.07 ms
Wall time: 44.4 ms


In [8]:
#print(buyer_data_1.count()) # 5.8 million

In [10]:
%%time
buyer_data_2 = df02.join(buyers, 'user_id', 'leftsemi')
#print(buyer_data_2.count())

CPU times: user 1.01 ms, sys: 1.01 ms, total: 2.02 ms
Wall time: 16.2 ms


In [11]:
%%time
buyer_data_1 = buyer_data_1.withColumn("Period", lit(1)) # Add time period
buyer_data_2 = buyer_data_2.withColumn("Period", lit(2))

CPU times: user 1.39 ms, sys: 1.33 ms, total: 2.72 ms
Wall time: 93.4 ms


In [12]:
%%time
buyer_final_df = buyer_data_1.union(buyer_data_2)

CPU times: user 318 µs, sys: 1.28 ms, total: 1.6 ms
Wall time: 19.7 ms


In [13]:
# Train Test Split

In [14]:
%%time
buyer_train_ids = buyers.sample(.8, 543)

CPU times: user 1.05 ms, sys: 14 µs, total: 1.06 ms
Wall time: 4.39 ms


In [15]:
%%time 
#Wall time 1 mins
training_datab = buyer_final_df.join(buyer_train_ids, 'user_id', 'leftsemi')
#print(training_datab.count())

CPU times: user 1.23 ms, sys: 1.19 ms, total: 2.43 ms
Wall time: 26.5 ms


In [16]:
%%time 
#Wall time 1 mins
test_datab = buyer_final_df.join(buyer_train_ids, 'user_id', 'leftanti')
print(test_datab.count())

4748204
CPU times: user 7.33 ms, sys: 5.2 ms, total: 12.5 ms
Wall time: 1min 10s


In [17]:
%%time 
#Wall time 14 minutes
#training_datab.write.parquet("/project/ds5559/group12/raw_data/train_buyers.parquet")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.2 µs


In [18]:
%%time 
#Wall time 13 minutes
test_datab.write.parquet("/project/ds5559/group12/raw_data/test_buyers.parquet")

CPU times: user 51.6 ms, sys: 41 ms, total: 92.6 ms
Wall time: 11min 20s


In [None]:
%%time 
#Wall time 26 min
#training_datab.show(5)

In [None]:
#training_datab.count()

In [None]:
#%%time
#test_datab.show(5)
#test_datab.count()