## File 02 - Feature Creation

In this file, we create new features from our interaction-level dataset.

### Set up Spark session and data schema

We can specify more options in the SparkSession creator, but currently the options are at the default settings.

In [1]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import *
import datetime as dt
from pyspark.sql.functions import translate

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors 
              
import copy
    
import sys
# Create (or reuse) the Spark session for this notebook; all options are left at their defaults
spark = (
    SparkSession.builder
    .appName("project")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext
sc = spark.sparkContext

#schema = "`event_time` TIMESTAMP,`event_type` STRING,`product_id` INT,`category_id` BIGINT,`category_code` STRING,`brand` STRING,`price` FLOAT,`user_id` INT,`user_session` STRING"
#ddl_schema = T._parse_datatype_string(schema)

CPU times: user 468 ms, sys: 328 ms, total: 796 ms
Wall time: 5.13 s


See https://docs.google.com/document/d/1NG4KGticBXn0D3PL5_zMxLV2Pr7A8PQtLcasxCOd1nA/edit for table of features.

### Read in data

In [2]:
%%time
# Per-user table (one row per user_id) that the new features are joined onto
full = spark.read.parquet("./processed_data/preprocessed_01.parquet")
# Interaction-level events; every feature below is derived from this table
m1 = spark.read.parquet("./processed_data/month_01_filtered.parquet") # This brings in the data we can create additional features from

CPU times: user 1.27 ms, sys: 2.28 ms, total: 3.55 ms
Wall time: 2.59 s


In [3]:
print(full.count())
full.show(5)

219080
+---------+--------------+------------------+------------+--------------+
|  user_id| T_total_spend|       total_spend|total_events|total_sessions|
+---------+--------------+------------------+------------+--------------+
|413580824|           0.0|               0.0|           3|             2|
|428963270|           0.0|               0.0|          68|             8|
|501050566|           0.0|               0.0|           2|             1|
|512378217|           0.0|               0.0|          25|             6|
|512394054|39439.12109375|236.82000732421875|          56|            16|
+---------+--------------+------------------+------------+--------------+
only showing top 5 rows



In [4]:
print(m1.count())
m1.show(5)

2807167
+---------+-------------------+----------+----------+-------------------+--------------------+--------+------+--------------------+
|  user_id|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|        user_session|
+---------+-------------------+----------+----------+-------------------+--------------------+--------+------+--------------------+
|416793411|2020-01-12 13:16:29|      view|   1004836|2232732093077520756|construction.tool...| samsung|231.38|315c1383-b002-4c3...|
|465783976|2020-01-04 10:36:20|      view|  13901213|2053013557343158789|construction.comp...|  blanco|218.65|75f6bddc-41f8-497...|
|465783976|2020-01-04 10:37:14|      view|  13902800|2053013561092866779|   computers.desktop|  blanco|140.35|75f6bddc-41f8-497...|
|465783976|2020-01-04 10:38:07|      view|  13902800|2053013561092866779|   computers.desktop|  blanco|140.35|75f6bddc-41f8-497...|
|465783976|2020-01-04 10:40:08|      view|  13902647|205301356109286

## Begin Creating Features
### Create each on an individual level, then join to full
##### NOTE: All features must be renamed so that they do not contain parentheses - such names are not compatible with saving to parquet

_________________

#### Average Session Duration (avg_session_length)

In [5]:
# Last and first event timestamp of every (user, session) pair
session_bounds = [max('event_time'), min('event_time')]
session_ends = m1.groupBy('user_id', 'user_session').agg(*session_bounds)

In [6]:
session_ends.show(5)

+---------+--------------------+-------------------+-------------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|
+---------+--------------------+-------------------+-------------------+
|514283509|ba808472-24ca-4a5...|2020-01-19 12:30:36|2020-01-19 12:30:36|
|524368563|c00a9199-98cb-412...|2020-01-08 11:43:08|2020-01-08 11:41:06|
|554064616|04d50e0d-b80c-44e...|2020-01-08 10:02:28|2020-01-08 10:02:28|
|554426664|a6aebb81-e68d-4fc...|2020-01-20 11:57:27|2020-01-20 11:48:18|
|560605592|535c77ff-60a7-432...|2020-01-21 13:03:42|2020-01-21 12:53:09|
+---------+--------------------+-------------------+-------------------+
only showing top 5 rows



In [7]:
# Session duration in seconds: cast both timestamps to epoch seconds and subtract
session_finish = col("max(event_time)").cast('long')
session_start = col("min(event_time)").cast('long')
session_ends = session_ends.withColumn('session_length', session_finish - session_start)

In [8]:
session_ends.orderBy(col("session_length").desc()).show(5)
# NOTE: Lots of these sessions are unreasonably long

+---------+--------------------+-------------------+-------------------+--------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|session_length|
+---------+--------------------+-------------------+-------------------+--------------+
|550527121|9124b2c1-02e4-4cc...|2020-01-31 22:39:05|2020-01-01 07:10:21|       2647724|
|593313269|bcaf86f2-1c1d-420...|2020-01-31 16:40:45|2020-01-01 05:36:13|       2631872|
|516733273|2b0fc08b-bd1d-439...|2020-01-31 14:24:46|2020-01-01 07:09:16|       2618130|
|566985224|9d0368d8-c6ac-42d...|2020-01-31 14:45:42|2020-01-01 07:41:22|       2617460|
|542394994|1b9f919f-f044-4b5...|2020-01-31 14:14:10|2020-01-01 08:02:12|       2614318|
+---------+--------------------+-------------------+-------------------+--------------+
only showing top 5 rows



In [9]:
# Mean session duration (seconds) per user, named directly via alias
avg_sess = (
    session_ends
    .groupBy('user_id')
    .agg(avg('session_length').alias('avg_session_length'))
)

In [10]:
avg_sess.show(5)

+---------+------------------+
|  user_id|avg_session_length|
+---------+------------------+
|512700240| 336.2857142857143|
|514013554|         44762.375|
|514338009| 161.1595744680851|
|514747635|               0.0|
|516092497|108.26666666666667|
+---------+------------------+
only showing top 5 rows



In [11]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(avg_sess, on='user_id')
print(full.count())
full.show(5)

219080
+---------+------------------+------------------+------------+--------------+------------------+
|  user_id|     T_total_spend|       total_spend|total_events|total_sessions|avg_session_length|
+---------+------------------+------------------+------------+--------------+------------------+
|512700240|14504.489959716797|1436.1799926757812|          66|             7| 336.2857142857143|
|514013554| 7735.519714355469|               0.0|          52|             8|         44762.375|
|514338009| 2786062.823135376| 269863.6780014038|       23472|            94| 161.1595744680851|
|514747635|               0.0|               0.0|           1|             1|               0.0|
|516092497|               0.0|               0.0|          38|            15|108.26666666666667|
+---------+------------------+------------------+------------+--------------+------------------+
only showing top 5 rows



#### Std Deviation of session duration by person (sd_session_length)

In [12]:
session_ends.show(5)

+---------+--------------------+-------------------+-------------------+--------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|session_length|
+---------+--------------------+-------------------+-------------------+--------------+
|514283509|ba808472-24ca-4a5...|2020-01-19 12:30:36|2020-01-19 12:30:36|             0|
|524368563|c00a9199-98cb-412...|2020-01-08 11:43:08|2020-01-08 11:41:06|           122|
|554064616|04d50e0d-b80c-44e...|2020-01-08 10:02:28|2020-01-08 10:02:28|             0|
|554426664|a6aebb81-e68d-4fc...|2020-01-20 11:57:27|2020-01-20 11:48:18|           549|
|560605592|535c77ff-60a7-432...|2020-01-21 13:03:42|2020-01-21 12:53:09|           633|
+---------+--------------------+-------------------+-------------------+--------------+
only showing top 5 rows



In [13]:
# Sample standard deviation of session duration per user.
# Users with only one session get null here (see the output below).
sd_session_length = (
    session_ends
    .groupBy('user_id')
    .agg(stddev('session_length').alias('sd_session_length'))
)

In [14]:
sd_session_length.show(5)

+---------+------------------+
|  user_id| sd_session_length|
+---------+------------------+
|512700240|358.21209466167494|
|514013554|126444.70285909568|
|514338009| 255.1480827963551|
|514747635|              null|
|516092497|201.15149467384978|
+---------+------------------+
only showing top 5 rows



In [15]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(sd_session_length, on='user_id')

#### UNFINISHED Distance from last interaction to end of month (seconds)


In [16]:
# new_month = dt.datetime(2020,2,1,0,0).timestamp() # This is the epoch seconds of Feb 1, 2020 at midnight
# new_month

In [17]:
# last_interaction = m1.groupBy('user_id').agg(max('event_time'))
# last_interaction.show()

In [18]:
## last_int_dist = last_interaction.withColumn('time_from_end_month', timestamp(1580533200) - col('max(event_time)'))

#### Average number of interactions per session (avg_interactions_per_session)

In [19]:
# Number of events in every (user, session) pair
interactions_per_session = (
    m1
    .groupBy('user_id', 'user_session')
    .agg(count(col('event_type')))
)

In [20]:
interactions_per_session.show(5)

+---------+--------------------+-----------------+
|  user_id|        user_session|count(event_type)|
+---------+--------------------+-----------------+
|514283509|ba808472-24ca-4a5...|                1|
|524368563|c00a9199-98cb-412...|                4|
|554064616|04d50e0d-b80c-44e...|                1|
|554426664|a6aebb81-e68d-4fc...|                9|
|560605592|535c77ff-60a7-432...|                6|
+---------+--------------------+-----------------+
only showing top 5 rows



In [21]:
# Mean number of events per session for each user (renamed in the next cell)
avg_interactions_per_session = (
    interactions_per_session
    .groupBy('user_id')
    .agg(avg('count(event_type)'))
)

In [22]:
# Replace the auto-generated aggregate name with one that contains no parentheses
avg_interactions_per_session = avg_interactions_per_session.withColumnRenamed('avg(count(event_type))', "avg_interactions_per_session")

In [23]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(avg_interactions_per_session, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+
|405614124|          0.0|        0.0|           2|             2|               0.0|               0.0|                         1.0|
|485991194|          0.0|        0.0|           3|             1|             398.0|              null|                         3.0|
|496765250|          0.0|        0.0|           1|             1|               0.0|              null|                         1.0|
|501980918|          0.0|        0.0|        2141|            40|          1955.925|3099.7057626077335|                      53.525|
|502621333|          0.0|        0.0|           5|             3|221.

#### Std Deviation of number of interactions per session per person (sd_interactions_per_session)

In [24]:
# Sample standard deviation of events-per-session for each user
# (null for users with a single session, as the output below shows)
std_interactions_per_session = (
    interactions_per_session
    .groupBy('user_id')
    .agg(stddev('count(event_type)').alias('sd_interactions_per_session'))
)
std_interactions_per_session.show(5)

+---------+---------------------------+
|  user_id|sd_interactions_per_session|
+---------+---------------------------+
|512700240|          2.138089935299395|
|514013554|         3.4121631178560534|
|514338009|           6.08820433161764|
|514747635|                       null|
|516092497|         1.8464895909600494|
+---------+---------------------------+
only showing top 5 rows



In [25]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(std_interactions_per_session, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+
|405614124|          0.0|        0.0|           2|             2|               0.0|               0.0|                         1.0|                        0.0|
|485991194|          0.0|        0.0|           3|             1|             398.0|              null|                         3.0|                       null|
|496765250|          0.0|        0.0|           1|             1|               0.0|              null|                         1.0|                       null|
|501980918|          0.0|        0

#### Max number of interactions within one session (max_interactions_per_session)

In [26]:
# Largest number of events any single session of the user contains (renamed in the next cell)
max_interactions_per_session = (
    interactions_per_session
    .groupBy('user_id')
    .agg(max('count(event_type)'))
)

In [27]:
# Replace the auto-generated aggregate name with one that contains no parentheses
max_interactions_per_session = max_interactions_per_session.withColumnRenamed('max(count(event_type))', "max_interactions_per_session")

In [28]:
max_interactions_per_session.show(1)

+---------+----------------------------+
|  user_id|max_interactions_per_session|
+---------+----------------------------+
|512700240|                           7|
+---------+----------------------------+
only showing top 1 row



In [29]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(max_interactions_per_session, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+
|405614124|          0.0|        0.0|           2|             2|               0.0|               0.0|                         1.0|                        0.0|                           1|
|485991194|          0.0|        0.0|           3|             1|             398.0|              null|                         3.0|                       null|                           3|
|496765250|          0.0|        0.0|           1|

#### Percent of total events that are x (Purchase, Cart, View) ('purchase_pct_of_total_events', 'cart_pct_of_total_events', 'view_pct_of_total_events')

In [30]:
# Count the events of each type per (user, session), one column per event type.
# The pivot values are listed explicitly: Spark then skips the extra job it would
# otherwise run to discover them, and the cart/purchase/view columns that later
# cells rely on always exist, even if one type is absent from the data.
event_counts = (
    m1
    .groupBy('user_id', 'user_session')
    .pivot('event_type', ['cart', 'purchase', 'view'])
    .agg(count('event_type'))
)

In [31]:
# A missing (session, event type) combination means zero events, so replace nulls with 0 for the arithmetic below
event_counts = event_counts.na.fill(0)
event_counts.show(5)

+---------+--------------------+----+--------+----+
|  user_id|        user_session|cart|purchase|view|
+---------+--------------------+----+--------+----+
|602491865|7ec0e1e6-94e0-493...|   0|       0|   4|
|552639168|d335b338-322b-4eb...|   1|       1|   2|
|581273021|847d49fa-06a5-438...|   0|       0|   3|
|515047041|591cd0ea-f290-47c...|   0|       0|   1|
|591332625|1f8c24dd-9574-47c...|   1|       0|  14|
+---------+--------------------+----+--------+----+
only showing top 5 rows



In [32]:
# Total number of events in each session (all three event types combined)
session_total = col('cart') + col('purchase') + col('view')
events_per_session = event_counts.withColumn('events_per_session_total', session_total)

In [33]:
events_per_session.show(5)

+---------+--------------------+----+--------+----+------------------------+
|  user_id|        user_session|cart|purchase|view|events_per_session_total|
+---------+--------------------+----+--------+----+------------------------+
|602491865|7ec0e1e6-94e0-493...|   0|       0|   4|                       4|
|552639168|d335b338-322b-4eb...|   1|       1|   2|                       4|
|581273021|847d49fa-06a5-438...|   0|       0|   3|                       3|
|515047041|591cd0ea-f290-47c...|   0|       0|   1|                       1|
|591332625|1f8c24dd-9574-47c...|   1|       0|  14|                      15|
+---------+--------------------+----+--------+----+------------------------+
only showing top 5 rows



In [34]:
# Per-user totals of each event type and of all events (the numeric columns, listed explicitly)
pct_events = (
    events_per_session
    .groupBy('user_id')
    .sum('cart', 'purchase', 'view', 'events_per_session_total')
)

In [35]:
# Share of each event type among all of a user's events (a fraction, not multiplied by 100)
user_event_total = col('sum(events_per_session_total)')
pct_totalevents = (
    pct_events
    .withColumn('purchase_pct_of_total_events', col('sum(purchase)') / user_event_total)
    .withColumn('view_pct_of_total_events', col('sum(view)') / user_event_total)
    .withColumn('cart_pct_of_total_events', col('sum(cart)') / user_event_total)
)

In [36]:
# Keep only the join key and the three share columns
pct_columns = ['purchase_pct_of_total_events', 'view_pct_of_total_events', 'cart_pct_of_total_events']
merge_me = pct_totalevents.select('user_id', *pct_columns)

In [37]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(merge_me, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+
|405614124|          0.0|        0.0|           2|             2|               0.0|               0.0|                         1.0|                        0.0|                           1|    

#### Average number of purchases per session (avg_purchases_per_session)

In [38]:
# Mean number of purchase events per session for each user
avg_purchases_per_session = (
    events_per_session
    .groupBy('user_id')
    .agg(avg('purchase').alias('avg_purchases_per_session'))
)

In [39]:
avg_purchases_per_session.show(5)

+---------+-------------------------+
|  user_id|avg_purchases_per_session|
+---------+-------------------------+
|512700240|       0.5714285714285714|
|539141084|                     0.04|
|606753158|                      0.0|
|575813444|                      0.0|
|581503547|                      0.0|
+---------+-------------------------+
only showing top 5 rows



In [40]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(avg_purchases_per_session, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+
|405614124|          0.0|        0.0|           2|             2|               0.0|               0.0|            

#### Std Deviation of number of purchases per session per person (sd_purchases_per_session)

In [41]:
# Sample standard deviation of purchases-per-session for each user
std_purchases_per_session = (
    events_per_session
    .groupBy('user_id')
    .agg(stddev('purchase').alias('sd_purchases_per_session'))
)
std_purchases_per_session.show(5)

+---------+------------------------+
|  user_id|sd_purchases_per_session|
+---------+------------------------+
|512700240|      0.5345224838248488|
|539141084|                     0.2|
|606753158|                     0.0|
|575813444|                     0.0|
|581503547|                     0.0|
+---------+------------------------+
only showing top 5 rows



In [42]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(std_purchases_per_session, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|sd_purchases_per_session|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+
|405614124|          0.0|        0.0|   

#### Total number of each type of event over whole month (cart_events, purchase_events, view_events)

In [43]:
# Per-user totals of each event type over the whole month, named directly via alias
event_counts_month = (
    event_counts
    .groupBy('user_id')
    .agg(
        sum('cart').alias('cart_events'),
        sum('purchase').alias('purchase_events'),
        sum('view').alias('view_events'),
    )
)

In [44]:
event_counts_month.show(5)

+---------+-----------+---------------+-----------+
|  user_id|cart_events|purchase_events|view_events|
+---------+-----------+---------------+-----------+
|512700240|          6|              4|         23|
|539141084|          2|              1|        184|
|606753158|          0|              0|          5|
|575813444|          0|              0|          9|
|581503547|          2|              0|         57|
+---------+-----------+---------------+-----------+
only showing top 5 rows



In [45]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(event_counts_month, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+-----------+---------------+-----------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|sd_purchases_per_session|cart_events|purchase_events|view_events|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+------------

#### Total number of sessions that contain event over whole month (sessions_with_purchase, sessions_with_cart, sessions_with_view)

In [46]:
# Add a 0/1 flag per event type: 1 when the session contains at least one such event.
# The flags are added in the same order as before: purchase, cart, view.
events_over_month = events_per_session
for event_name in ('purchase', 'cart', 'view'):
    events_over_month = events_over_month.withColumn(
        event_name + '_events',
        when(col(event_name) == 0, 0).otherwise(1),
    )

In [47]:
# Number of a user's sessions that contain at least one purchase / cart / view event
num_sesh_containing_event = (
    events_over_month
    .groupBy('user_id')
    .agg(
        sum('purchase_events').alias('sessions_with_purchase'),
        sum('cart_events').alias('sessions_with_cart'),
        sum('view_events').alias('sessions_with_view'),
    )
)

In [48]:
num_sesh_containing_event.show(5)

+---------+----------------------+------------------+------------------+
|  user_id|sessions_with_purchase|sessions_with_cart|sessions_with_view|
+---------+----------------------+------------------+------------------+
|512700240|                     4|                 4|                 7|
|539141084|                     1|                 1|                25|
|606753158|                     0|                 0|                 2|
|575813444|                     0|                 0|                 7|
|581503547|                     0|                 2|                10|
+---------+----------------------+------------------+------------------+
only showing top 5 rows



In [49]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(num_sesh_containing_event, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+-----------+---------------+-----------+----------------------+------------------+------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|sd_purchases_per_session|cart_events|purchase_events|view_events|sessions_with_purchase|sessions_with_cart|sessions_with_view|
+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+--------------------------

#### Percent of individual's sessions that end in cart/purchase (pct_sessions_end_purchase, pct_sessions_end_cart)

In [50]:
# Classify each session by the furthest funnel step it reached.
# NOTE: the flags test whether the session CONTAINS a purchase (or a cart event without
# any purchase); the order of events inside the session is not inspected.
has_purchase = col('purchase') != 0
cart_without_purchase = (col("purchase") == 0) & (col("cart") != 0)

session_ends2 = (
    event_counts
    .withColumn('end_purchase', when(has_purchase, 1).otherwise(0))
    .withColumn('end_cart', when(cart_without_purchase, 1).otherwise(0))
)
session_ends2.show(5)

+---------+--------------------+----+--------+----+------------+--------+
|  user_id|        user_session|cart|purchase|view|end_purchase|end_cart|
+---------+--------------------+----+--------+----+------------+--------+
|602491865|7ec0e1e6-94e0-493...|   0|       0|   4|           0|       0|
|552639168|d335b338-322b-4eb...|   1|       1|   2|           1|       0|
|581273021|847d49fa-06a5-438...|   0|       0|   3|           0|       0|
|515047041|591cd0ea-f290-47c...|   0|       0|   1|           0|       0|
|591332625|1f8c24dd-9574-47c...|   1|       0|  14|           0|       1|
+---------+--------------------+----+--------+----+------------+--------+
only showing top 5 rows



In [51]:
# Per user: number of sessions, and how many of them were flagged end_purchase / end_cart
per_user_aggregates = [count('user_session'), sum('end_purchase'), sum('end_cart')]
session_sum = session_ends2.groupBy('user_id').agg(*per_user_aggregates)
session_sum.show(5)

+---------+-------------------+-----------------+-------------+
|  user_id|count(user_session)|sum(end_purchase)|sum(end_cart)|
+---------+-------------------+-----------------+-------------+
|512700240|                  7|                4|            0|
|539141084|                 25|                1|            0|
|606753158|                  2|                0|            0|
|575813444|                  7|                0|            0|
|581503547|                 10|                0|            2|
+---------+-------------------+-----------------+-------------+
only showing top 5 rows



In [52]:
# Fraction of each user's sessions carrying the respective flag (not multiplied by 100)
n_sessions = col('count(user_session)')
session_sum = (
    session_sum
    .withColumn('pct_sessions_end_purchase', col('sum(end_purchase)') / n_sessions)
    .withColumn('pct_sessions_end_cart', col('sum(end_cart)') / n_sessions)
)
session_sum.show(5)

+---------+-------------------+-----------------+-------------+-------------------------+---------------------+
|  user_id|count(user_session)|sum(end_purchase)|sum(end_cart)|pct_sessions_end_purchase|pct_sessions_end_cart|
+---------+-------------------+-----------------+-------------+-------------------------+---------------------+
|512700240|                  7|                4|            0|       0.5714285714285714|                  0.0|
|539141084|                 25|                1|            0|                     0.04|                  0.0|
|606753158|                  2|                0|            0|                      0.0|                  0.0|
|575813444|                  7|                0|            0|                      0.0|                  0.0|
|581503547|                 10|                0|            2|                      0.0|                  0.2|
+---------+-------------------+-----------------+-------------+-------------------------+---------------

In [53]:
# Keep only the join key and the two fraction columns
end_of_session_columns = ["pct_sessions_end_purchase", "pct_sessions_end_cart"]
temp = session_sum.select('user_id', *end_of_session_columns)
temp.show(5)

+---------+-------------------------+---------------------+
|  user_id|pct_sessions_end_purchase|pct_sessions_end_cart|
+---------+-------------------------+---------------------+
|512700240|       0.5714285714285714|                  0.0|
|539141084|                     0.04|                  0.0|
|606753158|                      0.0|                  0.0|
|575813444|                      0.0|                  0.0|
|581503547|                      0.0|                  0.2|
+---------+-------------------------+---------------------+
only showing top 5 rows



In [54]:
# Inner join on the shared key; joining by column name keeps a single user_id column
full = full.join(temp, on='user_id')
full.show(5)

+---------+-------------+-----------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+-----------+---------------+-----------+----------------------+------------------+------------------+-------------------------+---------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|sd_purchases_per_session|cart_events|purchase_events|view_events|sessions_with_purchase|sessions_with_cart|sessions_with_view|pct_sessions_end_purchase|pct_sessions_end_cart|
+---------+-------------+-----------+------------+------------

### Preview full dataframe

In [55]:
full.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- T_total_spend: double (nullable = true)
 |-- total_spend: double (nullable = true)
 |-- total_events: long (nullable = true)
 |-- total_sessions: long (nullable = true)
 |-- avg_session_length: double (nullable = true)
 |-- sd_session_length: double (nullable = true)
 |-- avg_interactions_per_session: double (nullable = true)
 |-- sd_interactions_per_session: double (nullable = true)
 |-- max_interactions_per_session: long (nullable = true)
 |-- purchase_pct_of_total_events: double (nullable = true)
 |-- view_pct_of_total_events: double (nullable = true)
 |-- cart_pct_of_total_events: double (nullable = true)
 |-- avg_purchases_per_session: double (nullable = true)
 |-- sd_purchases_per_session: double (nullable = true)
 |-- cart_events: long (nullable = true)
 |-- purchase_events: long (nullable = true)
 |-- view_events: long (nullable = true)
 |-- sessions_with_purchase: long (nullable = true)
 |-- sessions_with_cart: long (nullable =

In [56]:
full.show(1)

+---------+-------------+-----------+------------+--------------+------------------+-----------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+-----------+---------------+-----------+----------------------+------------------+------------------+-------------------------+---------------------+
|  user_id|T_total_spend|total_spend|total_events|total_sessions|avg_session_length|sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|sd_purchases_per_session|cart_events|purchase_events|view_events|sessions_with_purchase|sessions_with_cart|sessions_with_view|pct_sessions_end_purchase|pct_sessions_end_cart|
+---------+-------------+-----------+------------+--------------

In [57]:
full.count()

219080

#### Save as parquet. (If saving in project group12 folder - Make sure to change permissions in bash using chmod 777 filename)

In [58]:
%%time
# Persist the engineered per-user feature table, replacing any previous output
full.write.parquet("./processed_data/engineered_features.parquet", mode="overwrite")

CPU times: user 3.8 ms, sys: 1.5 ms, total: 5.3 ms
Wall time: 25.6 s


In [59]:
%%time
# 80/20 train/test split of the per-user feature table, with a fixed seed
train, test = full.randomSplit(weights=[.8, .2], seed=42)

CPU times: user 0 ns, sys: 2.14 ms, total: 2.14 ms
Wall time: 20.4 ms


#### Purchased items in month 1, converted to PCA (pca_purchases)

Note: Unlike all of the other preprocessing, the PCA model must be fit on the training set only and then applied to the test set. For this reason it comes after the train/test split.

In [62]:
%%time

# Create a function that prepares a dataset for PCA.

def pca_prepare_on_subset(subset_df, limited_columns=None):
    """Build the per-user purchase-count vector used as PCA input.

    Restricts the month-1 interactions in the notebook-level ``m1`` DataFrame
    to the users present in ``subset_df``, counts purchase events per
    ``category_code`` for each user, and assembles those counts into a
    single vector column named ``to_pca_columns``.

    Parameters
    ----------
    subset_df : DataFrame
        Train or test split; only its ``user_id`` column is used, to select
        which users' interactions are kept.
    limited_columns : list of str, optional
        Leave empty (``None`` or ``[]``) for the training set: the category
        columns are then derived from the data. For the test set, pass the
        list returned by the training call so that the test vectors have
        exactly the same columns, in the same order, as the training vectors.

    Returns
    -------
    (limited_columns, pca_df)
        ``limited_columns`` is the list of category columns followed by
        ``'user_id'``; ``pca_df`` is the pivoted DataFrame with the extra
        ``to_pca_columns`` vector column.
    """
    # Keep only interactions of users in this subset, so nothing from the
    # other split leaks into the pivot (and hence into the PCA fit).
    m1_subset = m1.join(subset_df.select('user_id'), 'user_id', 'leftsemi')

    # Replace periods in category_code with dashes: periods in column names
    # are awkward to reference in PySpark once the values become columns.
    m1_stripped = m1_subset.withColumn(
        'category_code_s', translate('category_code', '.', '-'))

    # Pivot so that each purchased category becomes a column of counts
    cats = (m1_stripped
            .filter(col('event_type') == "purchase")
            .groupBy('user_id')
            .pivot('category_code_s')
            .count()
            .na.fill(0))

    if not limited_columns:
        # Training path: derive the category columns from the data, skipping
        # the key and the 'null' column produced by missing category codes.
        pca_input_cols = [c for c in cats.columns if c != 'user_id' and c != 'null']
        limited_columns = pca_input_cols + ['user_id']
    else:
        # Test path: use exactly the training columns, in training order, so
        # the vector layout matches what the fitted PCA model expects.
        pca_input_cols = [c for c in limited_columns if c != 'user_id']
        existing = set(cats.columns)
        for missing_col in pca_input_cols:
            if missing_col not in existing:
                # Category never purchased in this subset: count is zero
                cats = cats.withColumn(missing_col, lit(0).cast('long'))
        cats = cats.select(*limited_columns)

    # Combine the count columns into a single vector column (prepare for PCA)
    assembler = VectorAssembler(
        inputCols=pca_input_cols,
        outputCol="to_pca_columns")

    pca_df = assembler.transform(cats)
    return limited_columns, pca_df
    

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.87 µs


In [63]:
# Build the PCA input for the training split; this also yields the column
# list that the test split is then restricted to.
limited_columns, train_pre_pca = pca_prepare_on_subset(train)
_, test_pre_pca = pca_prepare_on_subset(test, limited_columns=limited_columns)

# Peek at two assembled rows
train_pre_pca.select("user_id", "to_pca_columns").show(n=2, truncate=False)

# Learn a 10-component projection from the training vectors only
pca = PCAml(inputCol="to_pca_columns", outputCol="pca_purchases", k=10)
model = pca.fit(train_pre_pca)

# Project both splits with the training-fitted model
train_with_pca = model.transform(train_pre_pca)
test_with_pca = model.transform(test_pre_pca)


+---------+---------------------------------------------------------------------------------------+
|user_id  |to_pca_columns                                                                         |
+---------+---------------------------------------------------------------------------------------+
|514338009|(129,[7,11,15,25,47,57,72,80,95,109,122],[2.0,3.0,1.0,4.0,2.0,4.0,4.0,7.0,1.0,1.0,1.0])|
|602481043|(129,[80],[2.0])                                                                       |
+---------+---------------------------------------------------------------------------------------+
only showing top 2 rows



In [64]:
# Attach each user's PCA vector to the training feature rows.
# NOTE(review): the default join type is inner, so users with no row in
# train_with_pca are dropped from train — confirm that is intended.
join_train_df = train_with_pca.select("user_id", "pca_purchases")
train_join_cond = train.user_id == join_train_df.user_id
train = train.join(join_train_df, on=train_join_cond).drop(join_train_df.user_id)

# Same merge for the test feature rows
join_test_df = test_with_pca.select("user_id", "pca_purchases")
test_join_cond = test.user_id == join_test_df.user_id
test = test.join(join_test_df, on=test_join_cond).drop(join_test_df.user_id)

In [66]:
# Inspect the first rows of each split, untruncated so the PCA vector is fully visible
train.show(n=5, truncate=False)
test.show(n=5, truncate=False)

+---------+------------------+------------------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+-----------+---------------+-----------+----------------------+------------------+------------------+-------------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id  |T_total_spend     |total_spend       |total_events|total_sessions|avg_session_length|sd_session_length |avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_sessio

#### Write train and test

In [None]:
%%time
# Persist both splits, replacing any previous copies
train.write.parquet("./processed_data/train.parquet", mode="overwrite")
test.write.parquet("./processed_data/test.parquet", mode="overwrite")