## File 04 - Feature Creation

In this file, we create new features from our data

### Set up Spark session and data schema

We can specify more options in the SparkSession creator, but currently the options are at the default settings.

In [3]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import *
import datetime as dt

import sys
spark = SparkSession.builder \
        .appName("project") \
        .getOrCreate()

sc = spark.sparkContext

#schema = "`event_time` TIMESTAMP,`event_type` STRING,`product_id` INT,`category_id` BIGINT,`category_code` STRING,`brand` STRING,`price` FLOAT,`user_id` INT,`user_session` STRING"
#ddl_schema = T._parse_datatype_string(schema)

CPU times: user 549 ms, sys: 317 ms, total: 866 ms
Wall time: 5.62 s


See https://docs.google.com/document/d/1NG4KGticBXn0D3PL5_zMxLV2Pr7A8PQtLcasxCOd1nA/edit for table of features.

### Read in data

In [4]:
%%time
full = spark.read.parquet("./processed_data/preprocessed_01.parquet")
m1 = spark.read.parquet("./processed_data/month_01_filtered.parquet") # This brings in the data we can create additional features from

CPU times: user 4.41 ms, sys: 1.35 ms, total: 5.76 ms
Wall time: 2.96 s


In [5]:
print(full.count())
full.show(5)

219080
+---------+--------------+------------------+---------------+------------------+----------------+
|  user_id|m2_total_spend|    m1_total_spend|m1_total_events|m1_purchase_events|m1_user_sessions|
+---------+--------------+------------------+---------------+------------------+----------------+
|413580824|           0.0|               0.0|              3|                 0|               2|
|428963270|           0.0|               0.0|             68|                 0|               8|
|501050566|           0.0|               0.0|              2|                 0|               1|
|512378217|           0.0|               0.0|             25|                 0|               6|
|512394054|39439.12109375|236.82000732421875|             56|                 2|              16|
+---------+--------------+------------------+---------------+------------------+----------------+
only showing top 5 rows



In [6]:
print(m1.count())
m1.show(5)

2807167
+---------+-------------------+----------+----------+-------------------+--------------------+--------+------+--------------------+
|  user_id|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|        user_session|
+---------+-------------------+----------+----------+-------------------+--------------------+--------+------+--------------------+
|416793411|2020-01-12 13:16:29|      view|   1004836|2232732093077520756|construction.tool...| samsung|231.38|315c1383-b002-4c3...|
|465783976|2020-01-04 10:36:20|      view|  13901213|2053013557343158789|construction.comp...|  blanco|218.65|75f6bddc-41f8-497...|
|465783976|2020-01-04 10:37:14|      view|  13902800|2053013561092866779|   computers.desktop|  blanco|140.35|75f6bddc-41f8-497...|
|465783976|2020-01-04 10:38:07|      view|  13902800|2053013561092866779|   computers.desktop|  blanco|140.35|75f6bddc-41f8-497...|
|465783976|2020-01-04 10:40:08|      view|  13902647|205301356109286

## Begin Creating Features
### Create each on an individual level, then join to full
##### NOTE: Must rename all features so that they do not contain parenthesis - not compatable with saving to parquet

_________________

#### Number of items purchased per person, per category
##### Commenting out for now - we'll have to think strategically about what 'huge' column sets we want

In [7]:
#%%time
#cats = m1.filter(m1.event_type == "purchase").groupBy('user_id').pivot('category_code').count()

In [8]:
#%%time
#cats.take(1)

In [9]:
#%%time
#full = full.join(cats, full.user_id == cats.user_id).drop(cats.user_id)

In [10]:
#full.take(1)

#### Number of sessions in month (num_sessions_month)

In [11]:
sessions_total = m1.groupBy('user_id') \
                .agg(count('user_session'))\
                .withColumnRenamed("count(user_session)", 'num_sessions_month')
                    

In [12]:
sessions_total.show()

+---------+------------------+
|  user_id|num_sessions_month|
+---------+------------------+
|405614124|                 2|
|485991194|                 3|
|496765250|                 1|
|501980918|              2141|
|502621333|                 5|
|509313388|                 8|
|512364164|                 4|
|512461742|                15|
|512489465|                13|
|512529571|                34|
|512541716|                53|
|512562919|                 4|
|512598380|                 7|
|512688457|                12|
|512700240|                33|
|512708582|                13|
|512719971|                 9|
|512720637|                30|
|512742043|                 5|
|512798925|                11|
+---------+------------------+
only showing top 20 rows



In [13]:
full = full.join(sessions_total, full.user_id == sessions_total.user_id).drop(sessions_total.user_id)

#### Average Session Duration (AvgSessLen)

In [14]:
session_ends = m1.groupBy('user_id', 'user_session').agg(max('event_time'), min('event_time'))

In [15]:
session_ends.show()

+---------+--------------------+-------------------+-------------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|
+---------+--------------------+-------------------+-------------------+
|514283509|ba808472-24ca-4a5...|2020-01-19 12:30:36|2020-01-19 12:30:36|
|524368563|c00a9199-98cb-412...|2020-01-08 11:43:08|2020-01-08 11:41:06|
|554064616|04d50e0d-b80c-44e...|2020-01-08 10:02:28|2020-01-08 10:02:28|
|554426664|a6aebb81-e68d-4fc...|2020-01-20 11:57:27|2020-01-20 11:48:18|
|560605592|535c77ff-60a7-432...|2020-01-21 13:03:42|2020-01-21 12:53:09|
|568789888|97820352-2275-4d5...|2020-01-30 13:32:14|2020-01-15 18:39:57|
|568789888|149a4323-02ba-b09...|2020-01-04 15:42:04|2020-01-04 15:42:04|
|568789888|235b7e77-1845-4fb...|2020-01-05 18:27:07|2020-01-05 18:27:07|
|568789888|ee138fc0-4703-4e5...|2020-01-06 06:45:35|2020-01-06 06:44:28|
|568789888|2c905dac-06d3-4a6...|2020-01-07 15:13:00|2020-01-07 15:13:00|
|568789888|8724cf8e-d770-f2b...|2020-01-13 13:36:21

In [16]:
session_ends = session_ends.withColumn('SessionLengthSecs', (col("max(event_time)").cast('long') - col("min(event_time)").cast('long')))

In [17]:
session_ends.show()

+---------+--------------------+-------------------+-------------------+-----------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|SessionLengthSecs|
+---------+--------------------+-------------------+-------------------+-----------------+
|514283509|ba808472-24ca-4a5...|2020-01-19 12:30:36|2020-01-19 12:30:36|                0|
|524368563|c00a9199-98cb-412...|2020-01-08 11:43:08|2020-01-08 11:41:06|              122|
|554064616|04d50e0d-b80c-44e...|2020-01-08 10:02:28|2020-01-08 10:02:28|                0|
|554426664|a6aebb81-e68d-4fc...|2020-01-20 11:57:27|2020-01-20 11:48:18|              549|
|560605592|535c77ff-60a7-432...|2020-01-21 13:03:42|2020-01-21 12:53:09|              633|
|568789888|97820352-2275-4d5...|2020-01-30 13:32:14|2020-01-15 18:39:57|          1277537|
|568789888|149a4323-02ba-b09...|2020-01-04 15:42:04|2020-01-04 15:42:04|                0|
|568789888|235b7e77-1845-4fb...|2020-01-05 18:27:07|2020-01-05 18:27:07|                0|

In [18]:
avg_sess = session_ends.groupBy('user_id').avg('SessionLengthSecs').withColumnRenamed('avg(SessionLengthSecs)', "AvgSessLen")

In [19]:
avg_sess.show()

+---------+------------------+
|  user_id|        AvgSessLen|
+---------+------------------+
|512700240| 336.2857142857143|
|514013554|         44762.375|
|514338009| 161.1595744680851|
|514747635|               0.0|
|516092497|108.26666666666667|
|530467296|              31.0|
|536666006|             128.2|
|538669655|             287.0|
|539141084|            565.32|
|567758692|            1038.0|
|569603970|23.282758620689656|
|575813444|25.142857142857142|
|581503547|             301.9|
|597517612|            214.75|
|598174578|              97.0|
|602329313|              66.0|
|605878179| 223.9090909090909|
|606191296|              84.0|
|606753158|              18.0|
|607546688|             226.0|
+---------+------------------+
only showing top 20 rows



In [20]:
full = full.join(avg_sess, full.user_id == avg_sess.user_id).drop(avg_sess.user_id)

#### Std Deviation of session duration by person (stddev_SessionLengthSecs)

In [21]:
session_ends.show(5)

+---------+--------------------+-------------------+-------------------+-----------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|SessionLengthSecs|
+---------+--------------------+-------------------+-------------------+-----------------+
|514283509|ba808472-24ca-4a5...|2020-01-19 12:30:36|2020-01-19 12:30:36|                0|
|524368563|c00a9199-98cb-412...|2020-01-08 11:43:08|2020-01-08 11:41:06|              122|
|554064616|04d50e0d-b80c-44e...|2020-01-08 10:02:28|2020-01-08 10:02:28|                0|
|554426664|a6aebb81-e68d-4fc...|2020-01-20 11:57:27|2020-01-20 11:48:18|              549|
|560605592|535c77ff-60a7-432...|2020-01-21 13:03:42|2020-01-21 12:53:09|              633|
+---------+--------------------+-------------------+-------------------+-----------------+
only showing top 5 rows



In [22]:
std_session_length = session_ends.groupBy('user_id') \
                                 .agg(stddev('SessionLengthSecs')) \
                                 .withColumnRenamed("stddev_samp(SessionLengthSecs)", 'stddev_SessionLengthSecs')

In [23]:
std_session_length.show(5)

+---------+------------------------+
|  user_id|stddev_SessionLengthSecs|
+---------+------------------------+
|512700240|      358.21209466167494|
|514013554|      126444.70285909568|
|514338009|       255.1480827963551|
|514747635|                    null|
|516092497|      201.15149467384978|
+---------+------------------------+
only showing top 5 rows



In [24]:
full = full.join(std_session_length, full.user_id == std_session_length.user_id).drop(std_session_length.user_id)

#### UNFINISHED Distance from last interaction to end of month (seconds)


In [25]:
new_month = dt.datetime(2020,2,1,0,0).timestamp() # This is the epoch seconds of Feb 1, 2020 at midnight
new_month

1580533200.0

In [26]:
last_interaction = m1.groupBy('user_id').agg(max('event_time'))
last_interaction.show()

+---------+-------------------+
|  user_id|    max(event_time)|
+---------+-------------------+
|405614124|2020-01-23 21:47:06|
|485991194|2020-01-06 18:42:02|
|496765250|2020-01-08 18:55:44|
|501980918|2020-01-31 11:38:47|
|502621333|2020-01-26 16:08:41|
|509313388|2020-01-30 13:24:17|
|512364164|2020-01-08 18:16:11|
|512461742|2020-01-30 12:13:15|
|512489465|2020-01-30 10:42:10|
|512529571|2020-01-29 16:43:41|
|512541716|2020-01-31 03:45:26|
|512562919|2020-01-18 11:50:59|
|512598380|2020-01-16 18:33:08|
|512688457|2020-01-28 15:02:45|
|512700240|2020-01-31 04:17:39|
|512708582|2020-01-25 11:16:52|
|512719971|2020-01-25 04:26:11|
|512720637|2020-01-31 16:29:26|
|512742043|2020-01-23 16:24:52|
|512798925|2020-01-27 08:51:30|
+---------+-------------------+
only showing top 20 rows



In [27]:
## last_int_dist = last_interaction.withColumn('time_from_end_month', timestamp(1580533200) - col('max(event_time)'))

#### Average number of interactions per session (avg_interactions_per_session)

In [28]:
interactions_per_session = m1.groupBy('user_id', 'user_session').agg(count('event_type'))

In [29]:
interactions_per_session.show()

+---------+--------------------+-----------------+
|  user_id|        user_session|count(event_type)|
+---------+--------------------+-----------------+
|514283509|ba808472-24ca-4a5...|                1|
|524368563|c00a9199-98cb-412...|                4|
|554064616|04d50e0d-b80c-44e...|                1|
|554426664|a6aebb81-e68d-4fc...|                9|
|560605592|535c77ff-60a7-432...|                6|
|568789888|97820352-2275-4d5...|               72|
|568789888|149a4323-02ba-b09...|                1|
|568789888|235b7e77-1845-4fb...|                1|
|568789888|ee138fc0-4703-4e5...|                2|
|568789888|2c905dac-06d3-4a6...|                1|
|568789888|8724cf8e-d770-f2b...|                1|
|568789888|0202f81f-070c-c27...|                1|
|568789888|a27b6171-ac12-27d...|                1|
|568789888|07243911-82b9-081...|                1|
|568789888|76f8cb11-d11b-f8c...|                1|
|568789888|15b5b692-e614-373...|                1|
|568789888|093187ef-db9e-3d2...

In [30]:
avg_interactions_per_session = interactions_per_session.groupBy('user_id').avg('count(event_type)')

In [31]:
avg_interactions_per_session = avg_interactions_per_session.withColumnRenamed('avg(count(event_type))', "avg_interactions_per_session")

In [32]:
full = full.join(avg_interactions_per_session, full.user_id == avg_interactions_per_session.user_id).drop(avg_interactions_per_session.user_id)

#### Std Deviation of number of interactions per session per person (stddev_int_per_session)

In [33]:
std_interactions_per_session = interactions_per_session.groupBy('user_id') \
                                                       .agg(stddev('count(event_type)')) \
                                                       .withColumnRenamed("stddev_samp(count(event_type))", 'stddev_int_per_session')
std_interactions_per_session.show(5)

+---------+----------------------+
|  user_id|stddev_int_per_session|
+---------+----------------------+
|512700240|     2.138089935299395|
|514013554|    3.4121631178560534|
|514338009|      6.08820433161764|
|514747635|                  null|
|516092497|    1.8464895909600494|
+---------+----------------------+
only showing top 5 rows



In [34]:
full = full.join(std_interactions_per_session, full.user_id == std_interactions_per_session.user_id).drop(std_interactions_per_session.user_id)

#### Max number of interactions within one session (max_interactions_one_session)

In [35]:
max_interactions_per_session = interactions_per_session.groupBy('user_id').max('count(event_type)')

In [36]:
max_interactions_per_session = max_interactions_per_session.withColumnRenamed('max(count(event_type))', "max_interactions_one_session")

In [37]:
max_interactions_per_session.show(1)

+---------+----------------------------+
|  user_id|max_interactions_one_session|
+---------+----------------------------+
|512700240|                           7|
+---------+----------------------------+
only showing top 1 row



In [38]:
full = full.join(max_interactions_per_session, full.user_id == max_interactions_per_session.user_id).drop(max_interactions_per_session.user_id)

#### Percent of total events that are x (Purchase, Cart, View) ('purchase_pct_of_total_events', 'cart_pct_of_total_events', 'view_pct_of_total_events')

In [39]:
event_counts = m1.groupBy('user_id', 'user_session').pivot('event_type').agg(count('event_type'))
# Here the three types of event count are pivoted out for later tabulation

In [40]:
event_counts = event_counts.fillna(0) #replace nulls with 0 for math
event_counts.show()

+---------+--------------------+----+--------+----+
|  user_id|        user_session|cart|purchase|view|
+---------+--------------------+----+--------+----+
|602491865|7ec0e1e6-94e0-493...|   0|       0|   4|
|552639168|d335b338-322b-4eb...|   1|       1|   2|
|581273021|847d49fa-06a5-438...|   0|       0|   3|
|515047041|591cd0ea-f290-47c...|   0|       0|   1|
|591332625|1f8c24dd-9574-47c...|   1|       0|  14|
|544146586|0b8b1f4e-6776-471...|   0|       0|   2|
|583400913|91b1a207-0025-40c...|   0|       0|   1|
|601537528|1e8169c6-4e52-488...|   0|       0|   1|
|596809045|a7e2f49c-40b0-42c...|   0|       0|   1|
|597031468|ec19bd73-8443-4c6...|   0|       0|   1|
|597616595|276d2a56-208a-4f0...|   3|       0|   4|
|595220818|3637a73f-7296-400...|   1|       0|   2|
|547438388|3dd53be2-45a1-4f0...|   2|       1|   7|
|594130277|5fa33f91-f26f-46e...|   0|       0|  10|
|594783921|86ed108f-a0c6-42e...|   0|       0|   1|
|543329028|714eca9f-3298-443...|   0|       0|   6|
|598732220|2

In [41]:
events_per_session = event_counts.withColumn('events_per_session_total', col('cart') + col('purchase') + col('view')) 
# Get total number of events per session

In [42]:
events_per_session.show()

+---------+--------------------+----+--------+----+------------------------+
|  user_id|        user_session|cart|purchase|view|events_per_session_total|
+---------+--------------------+----+--------+----+------------------------+
|602491865|7ec0e1e6-94e0-493...|   0|       0|   4|                       4|
|552639168|d335b338-322b-4eb...|   1|       1|   2|                       4|
|581273021|847d49fa-06a5-438...|   0|       0|   3|                       3|
|515047041|591cd0ea-f290-47c...|   0|       0|   1|                       1|
|591332625|1f8c24dd-9574-47c...|   1|       0|  14|                      15|
|544146586|0b8b1f4e-6776-471...|   0|       0|   2|                       2|
|583400913|91b1a207-0025-40c...|   0|       0|   1|                       1|
|601537528|1e8169c6-4e52-488...|   0|       0|   1|                       1|
|596809045|a7e2f49c-40b0-42c...|   0|       0|   1|                       1|
|597031468|ec19bd73-8443-4c6...|   0|       0|   1|                       1|

In [43]:
pct_events = events_per_session.groupBy('user_id').sum()

In [44]:
pct_totalevents = pct_events.withColumn('purchase_pct_of_total_events', col('sum(purchase)')/col('sum(events_per_session_total)')) \
                  .withColumn('view_pct_of_total_events', col('sum(view)')/col('sum(events_per_session_total)')) \
                  .withColumn('cart_pct_of_total_events', col('sum(cart)')/col('sum(events_per_session_total)'))

In [45]:
merge_me = pct_totalevents.select('user_id', 'purchase_pct_of_total_events', 'cart_pct_of_total_events', 'view_pct_of_total_events')

In [46]:
full = full.join(merge_me, full.user_id == merge_me.user_id).drop(merge_me.user_id)

#### Average number of purchases per session (avg_purchases_per_session)

In [47]:
avg_purchases_per_session = events_per_session.groupBy('user_id').avg('purchase').withColumnRenamed('avg(purchase)', "avg_purchases_per_session")

In [48]:
avg_purchases_per_session.show()

+---------+-------------------------+
|  user_id|avg_purchases_per_session|
+---------+-------------------------+
|512700240|       0.5714285714285714|
|539141084|                     0.04|
|606753158|                      0.0|
|575813444|                      0.0|
|581503547|                      0.0|
|598174578|                      0.0|
|569603970|                      0.0|
|607546688|                      0.0|
|605878179|       0.2727272727272727|
|536666006|                      0.0|
|567758692|                      0.5|
|607701094|                      0.0|
|514747635|                      0.0|
|530467296|                      0.0|
|602329313|                      0.0|
|516092497|                      0.0|
|597517612|                      0.0|
|514013554|                      0.0|
|606191296|                      0.0|
|514338009|      0.32978723404255317|
+---------+-------------------------+
only showing top 20 rows



In [49]:
full = full.join(avg_purchases_per_session, full.user_id == avg_purchases_per_session.user_id).drop(avg_purchases_per_session.user_id)

#### STD of number of purchases per session per person (std_purchases_per_session)

In [50]:
std_purchases_per_session = events_per_session.groupBy('user_id') \
                                              .agg(stddev('purchase')) \
                                              .withColumnRenamed('stddev_samp(purchase)', "std_purchases_per_session")
std_purchases_per_session.show(5)

+---------+-------------------------+
|  user_id|std_purchases_per_session|
+---------+-------------------------+
|512700240|       0.5345224838248488|
|539141084|                      0.2|
|606753158|                      0.0|
|575813444|                      0.0|
|581503547|                      0.0|
+---------+-------------------------+
only showing top 5 rows



In [51]:
full = full.join(std_purchases_per_session, full.user_id == std_purchases_per_session.user_id).drop(std_purchases_per_session.user_id)

#### Total number of each type of event over whole month (monthlyCartTotal, monthlyPurchaseTotal, monthlyViewTotal)

In [52]:
event_counts_month = event_counts.groupBy('user_id').sum('cart', 'purchase', 'view')\
                     .withColumnRenamed('sum(cart)', 'monthlyCartTotal') \
                     .withColumnRenamed('sum(purchase)', 'monthlyPurchaseTotal') \
                     .withColumnRenamed('sum(view)', 'monthlyViewTotal')

In [53]:
event_counts_month.show()

+---------+----------------+--------------------+----------------+
|  user_id|monthlyCartTotal|monthlyPurchaseTotal|monthlyViewTotal|
+---------+----------------+--------------------+----------------+
|512700240|               6|                   4|              23|
|539141084|               2|                   1|             184|
|606753158|               0|                   0|               5|
|575813444|               0|                   0|               9|
|581503547|               2|                   0|              57|
|598174578|               0|                   0|               2|
|569603970|               0|                   0|             148|
|607546688|               0|                   0|               2|
|605878179|               6|                   3|              71|
|536666006|               0|                   0|              14|
|567758692|               1|                   1|              27|
|607701094|               0|                   0|             

In [54]:
full = full.join(event_counts_month, full.user_id == event_counts_month.user_id).drop(event_counts_month.user_id)

#### Total number of sessions that contain event over whole month (NumSessWithPurchases, NumSessWithCart, NumSessWithView)

In [55]:
events_over_month = events_per_session.withColumn('purchaseInSession', when(col('purchase') == 0, 0).otherwise(1)) \
                                      .withColumn('cartInSession', when(col('cart')==0, 0).otherwise(1)) \
                                      .withColumn('viewInSession', when(col('view')==0, 0).otherwise(1))

In [56]:
num_sesh_containing_event = events_over_month.groupBy('user_id').sum('purchaseInSession', "cartInSession", "viewInSession") \
                            .withColumnRenamed("sum(purchaseInSession)", "NumSessWithPurchases") \
                            .withColumnRenamed("sum(cartInSession)", "NumSessWithCart") \
                            .withColumnRenamed("sum(viewInSession)", "NumSessWithView")

In [57]:
num_sesh_containing_event.show()

+---------+--------------------+---------------+---------------+
|  user_id|NumSessWithPurchases|NumSessWithCart|NumSessWithView|
+---------+--------------------+---------------+---------------+
|512700240|                   4|              4|              7|
|539141084|                   1|              1|             25|
|606753158|                   0|              0|              2|
|575813444|                   0|              0|              7|
|581503547|                   0|              2|             10|
|598174578|                   0|              0|              1|
|569603970|                   0|              0|            145|
|607546688|                   0|              0|              1|
|605878179|                   3|              5|             11|
|536666006|                   0|              0|              5|
|567758692|                   1|              1|              2|
|607701094|                   0|              0|              5|
|514747635|              

In [58]:
full = full.join(num_sesh_containing_event, full.user_id == num_sesh_containing_event.user_id).drop(num_sesh_containing_event.user_id)

#### Percent of individual's sessions that end in cart/purchase (ses_end_purch, ses_end_cart)

In [59]:
session_ends2 = event_counts.withColumn('sess_end_purchase', \
                                when(col('purchase') != 0, 1) \
                                .otherwise(0)) \
                            .withColumn('sess_end_cart', \
                                when((col("purchase") == 0) & (col("cart") != 0), 1) \
                                .otherwise(0))
session_ends2.show(5)

+---------+--------------------+----+--------+----+-----------------+-------------+
|  user_id|        user_session|cart|purchase|view|sess_end_purchase|sess_end_cart|
+---------+--------------------+----+--------+----+-----------------+-------------+
|602491865|7ec0e1e6-94e0-493...|   0|       0|   4|                0|            0|
|552639168|d335b338-322b-4eb...|   1|       1|   2|                1|            0|
|581273021|847d49fa-06a5-438...|   0|       0|   3|                0|            0|
|515047041|591cd0ea-f290-47c...|   0|       0|   1|                0|            0|
|591332625|1f8c24dd-9574-47c...|   1|       0|  14|                0|            1|
+---------+--------------------+----+--------+----+-----------------+-------------+
only showing top 5 rows



In [60]:
session_sum = session_ends2.groupBy('user_id').agg(count('user_session'), sum('sess_end_purchase'), sum('sess_end_cart'))
session_sum.show(5)

+---------+-------------------+----------------------+------------------+
|  user_id|count(user_session)|sum(sess_end_purchase)|sum(sess_end_cart)|
+---------+-------------------+----------------------+------------------+
|512700240|                  7|                     4|                 0|
|539141084|                 25|                     1|                 0|
|606753158|                  2|                     0|                 0|
|575813444|                  7|                     0|                 0|
|581503547|                 10|                     0|                 2|
+---------+-------------------+----------------------+------------------+
only showing top 5 rows



In [61]:
session_sum = session_sum.withColumn('ses_end_purch', col('sum(sess_end_purchase)')/col('count(user_session)')) \
                         .withColumn('ses_end_cart', col('sum(sess_end_cart)')/col('count(user_session)'))
session_sum.show(5)

+---------+-------------------+----------------------+------------------+------------------+------------+
|  user_id|count(user_session)|sum(sess_end_purchase)|sum(sess_end_cart)|     ses_end_purch|ses_end_cart|
+---------+-------------------+----------------------+------------------+------------------+------------+
|512700240|                  7|                     4|                 0|0.5714285714285714|         0.0|
|539141084|                 25|                     1|                 0|              0.04|         0.0|
|606753158|                  2|                     0|                 0|               0.0|         0.0|
|575813444|                  7|                     0|                 0|               0.0|         0.0|
|581503547|                 10|                     0|                 2|               0.0|         0.2|
+---------+-------------------+----------------------+------------------+------------------+------------+
only showing top 5 rows



In [62]:
temp = session_sum.select('user_id', "ses_end_purch", "ses_end_cart")
temp.show()

+---------+------------------+--------------------+
|  user_id|     ses_end_purch|        ses_end_cart|
+---------+------------------+--------------------+
|512700240|0.5714285714285714|                 0.0|
|539141084|              0.04|                 0.0|
|606753158|               0.0|                 0.0|
|575813444|               0.0|                 0.0|
|581503547|               0.0|                 0.2|
|598174578|               0.0|                 0.0|
|569603970|               0.0|                 0.0|
|607546688|               0.0|                 0.0|
|605878179|0.2727272727272727| 0.18181818181818182|
|536666006|               0.0|                 0.0|
|567758692|               0.5|                 0.0|
|607701094|               0.0|                 0.0|
|514747635|               0.0|                 0.0|
|530467296|               0.0|                 0.0|
|602329313|               0.0|                 0.0|
|516092497|               0.0|                 0.0|
|597517612| 

In [63]:
full = full.join(temp, full.user_id == temp.user_id).drop(temp.user_id)

### Preview full dataframe

In [64]:
full.schema

StructType(List(StructField(user_id,IntegerType,true),StructField(m2_total_spend,DoubleType,true),StructField(m1_total_spend,DoubleType,true),StructField(m1_total_events,LongType,true),StructField(m1_purchase_events,LongType,true),StructField(m1_user_sessions,LongType,true),StructField(num_sessions_month,LongType,false),StructField(AvgSessLen,DoubleType,true),StructField(stddev_SessionLengthSecs,DoubleType,true),StructField(avg_interactions_per_session,DoubleType,true),StructField(stddev_int_per_session,DoubleType,true),StructField(max_interactions_one_session,LongType,true),StructField(purchase_pct_of_total_events,DoubleType,true),StructField(cart_pct_of_total_events,DoubleType,true),StructField(view_pct_of_total_events,DoubleType,true),StructField(avg_purchases_per_session,DoubleType,true),StructField(std_purchases_per_session,DoubleType,true),StructField(monthlyCartTotal,LongType,true),StructField(monthlyPurchaseTotal,LongType,true),StructField(monthlyViewTotal,LongType,true),Struct

In [65]:
full.show(1)

+---------+--------------+--------------+---------------+------------------+----------------+------------------+----------+------------------------+----------------------------+----------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+-------------------------+----------------+--------------------+----------------+--------------------+---------------+---------------+-------------+------------+
|  user_id|m2_total_spend|m1_total_spend|m1_total_events|m1_purchase_events|m1_user_sessions|num_sessions_month|AvgSessLen|stddev_SessionLengthSecs|avg_interactions_per_session|stddev_int_per_session|max_interactions_one_session|purchase_pct_of_total_events|cart_pct_of_total_events|view_pct_of_total_events|avg_purchases_per_session|std_purchases_per_session|monthlyCartTotal|monthlyPurchaseTotal|monthlyViewTotal|NumSessWithPurchases|NumSessWithCart|NumSessWithView|ses_end_purch|ses_end_cart|
+-------

In [66]:
full.count()

219080

#### Save as parquet. (If saving in project group12 folder - Make sure to change permissions in bash using chmod 777 filename)

In [67]:
%%time
full.write.mode("overwrite").parquet("./processed_data/engineered_features.parquet")

CPU times: user 4.7 ms, sys: 2.63 ms, total: 7.33 ms
Wall time: 37.7 s


In [68]:
%%time
train, test = full.randomSplit([.8, .2], seed=42)

In [69]:
%%time
train.write.mode("overwrite").parquet("./processed_data/train.parquet")
test.write.mode("overwrite").parquet("./processed_data/test.parquet")

CPU times: user 8.06 ms, sys: 4.19 ms, total: 12.2 ms
Wall time: 1min 8s
