## File 00 - Original Data Filtering
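#### This creates new datasets in the format of the original ones, filtered to our persons of interest (users who made a purchase in month 1). Each month is tagged with a `Period` column and saved as its own parquet file; combining all time periods into one dataframe is implemented below but currently not used.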

In [1]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql.functions import lit

# Start (or attach to) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("project")
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

# Column layout of the raw monthly CSV files, written as a DDL string
# (one "`name` TYPE" entry per column, comma-separated).
schema = (
    "`event_time` TIMESTAMP,"
    "`event_type` STRING,"
    "`product_id` INT,"
    "`category_id` BIGINT,"
    "`category_code` STRING,"
    "`brand` STRING,"
    "`price` FLOAT,"
    "`user_id` INT,"
    "`user_session` STRING"
)
# Parsed form of the DDL string; this is what the CSV readers are given.
# NOTE(review): _parse_datatype_string is an underscore-prefixed pyspark helper,
# and this call is placed after the session has been created - keep that order.
ddl_schema = T._parse_datatype_string(schema)

CPU times: user 228 ms, sys: 163 ms, total: 391 ms
Wall time: 5.34 s


#### Read in two months

In [2]:
%%time
# Load the two monthly event logs, applying the explicit schema and treating
# the first line of each CSV as a header row.
df01 = spark.read.csv(
    "/project/ds5559/group12/raw_data/2020-01.csv",  # January 2020
    schema=ddl_schema,
    header=True,
)

df02 = spark.read.csv(
    "/project/ds5559/group12/raw_data/2020-02.csv",  # February 2020
    schema=ddl_schema,
    header=True,
)

# Row counts per period; each count() runs a Spark job over the corresponding file.
print(f'Time Period 1 Count: {df01.count()}')
print(f'Time Period 2 Count: {df02.count()}')

Time Period 1 Count: 55967041
Time Period 2 Count: 55318565
CPU times: user 3.65 ms, sys: 5.85 ms, total: 9.5 ms
Wall time: 11.4 s


#### Get the distinct session id list so that more frequent visitors are no more likely to be picked than less frequent ones. (Note: `ids` is not referenced by the filtering steps below.)

In [3]:
%%time
# One row per distinct user_session value seen in month 1.
# NOTE(review): `ids` is not referenced by the later cells of this notebook.
ids = df01.select(df01['user_session']).distinct()
# Optional sanity check (left disabled): ids.count()

CPU times: user 1.98 ms, sys: 969 µs, total: 2.95 ms
Wall time: 51.8 ms


#### Filter to only those individuals who made a purchase in month 1

In [4]:
%%time 
# Distinct user_ids that have at least one 'purchase' event in month 1.
# NOTE(review): an earlier inline note here said "700k", while the recorded
# count below is 359105 - confirm which figure is current.
buyers = (
    df01
    .where(df01['event_type'] == 'purchase')
    .select('user_id')
    .distinct()
)
# Optional sanity check (left disabled): buyers.count()  -> 359105, about 15 seconds

CPU times: user 2.56 ms, sys: 2.41 ms, total: 4.97 ms
Wall time: 122 ms


In [5]:
%%time
# Keep only the month-1 events of users present in `buyers`.
# A left-semi join filters df01 by user_id without adding columns from `buyers`.
buyer_data_1 = df01.join(buyers, on='user_id', how='leftsemi')
# Optional checks (left disabled because they are slow):
#   print(buyer_data_1.count())  -> 15923973
#   buyer_data_1.show(5)         -> about 8 mins, looked as expected

CPU times: user 1.18 ms, sys: 1.19 ms, total: 2.37 ms
Wall time: 41.5 ms


In [6]:
%%time
# Keep only the month-2 events of the same month-1 buyers.
# A left-semi join filters df02 by user_id without adding columns from `buyers`.
buyer_data_2 = df02.join(buyers, on='user_id', how='leftsemi')
# Optional check (left disabled): print(buyer_data_2.count())  -> 7637487

CPU times: user 1.1 ms, sys: 1.02 ms, total: 2.12 ms
Wall time: 17.5 ms


#### Add time period marker so data can be merged but still easily separated

In [7]:
%%time
# Tag every row with the time period it came from (1 = first month, 2 = second
# month) so the two frames can later be merged and still be told apart.
buyer_data_1 = buyer_data_1.withColumn(colName="Period", col=lit(1))
buyer_data_2 = buyer_data_2.withColumn(colName="Period", col=lit(2))

CPU times: user 1.28 ms, sys: 1.37 ms, total: 2.65 ms
Wall time: 38 ms


#### Save as separate parquet files so the output is compatible with the basic preprocessing steps, which are built on 2 dataframes

In [11]:
%%time
# Persist the period-1 buyer events (with the "Period" marker) as parquet.
# mode("overwrite") replaces output left by a previous run: with the default
# save mode the write raises an error when the target path already exists, so
# this cell could not be re-run (e.g. Restart Kernel -> Run All).
# The recorded run of this write took several minutes.
buyer_data_1.write.mode("overwrite").parquet("/project/ds5559/group12/raw_data/FilteredJan2020M1.parquet")

CPU times: user 25.1 ms, sys: 21.6 ms, total: 46.6 ms
Wall time: 5min 43s


In [None]:
%%time
# Persist the period-2 buyer events (with the "Period" marker) as parquet.
# mode("overwrite") replaces output left by a previous run: with the default
# save mode the write raises an error when the target path already exists, so
# this cell could not be re-run (e.g. Restart Kernel -> Run All).
buyer_data_2.write.mode("overwrite").parquet("/project/ds5559/group12/raw_data/FilteredFeb2020M2.parquet")

#### Union months 1 & 2 into final dataset (maintains all columns of the original dataset, but it is filtered & months are combined)
#### CURRENTLY NOT USED

In [8]:
#%%time
# Disabled on purpose: the combined (period 1 + period 2) dataset is currently
# not used; the two periods are saved as separate parquet files instead.
#buyer_final_df = buyer_data_1.union(buyer_data_2)
# print(buyer_final_df.count()) # 23561460
# buyer_final_df.show(5)

CPU times: user 1.25 ms, sys: 258 µs, total: 1.51 ms
Wall time: 20.6 ms


In [10]:
#%%time 
#Wall time 13 minutes
# NOTE(review): buyer_final_df would hold both periods (union of 1 and 2), yet
# the file name below mentions only "Month1" - confirm the intended name before
# re-enabling this write.
#buyer_final_df.write.parquet("/project/ds5559/group12/raw_data/Filtered_Month1.parquet")