## File 04 - Feature Creation

In this file, we create new features from our data

### Set up Spark session and data schema

We can specify more options in the SparkSession creator, but currently the options are at the default settings.

In [1]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import *
import datetime as dt

import sys
spark = SparkSession.builder \
        .appName("project") \
        .getOrCreate()

sc = spark.sparkContext

#schema = "`event_time` TIMESTAMP,`event_type` STRING,`product_id` INT,`category_id` BIGINT,`category_code` STRING,`brand` STRING,`price` FLOAT,`user_id` INT,`user_session` STRING"
#ddl_schema = T._parse_datatype_string(schema)

CPU times: user 535 ms, sys: 352 ms, total: 887 ms
Wall time: 5.68 s


See https://docs.google.com/document/d/1NG4KGticBXn0D3PL5_zMxLV2Pr7A8PQtLcasxCOd1nA/edit for table of features.

### Read in data

In [2]:
%%time
full = spark.read.parquet("/project/ds5559/group12/processed_data/output_from_03.parquet")
m1 = spark.read.parquet("/project/ds5559/group12/raw_data/FilteredJan2020M1.parquet") # This brings in the data we can create additional features from

CPU times: user 2.88 ms, sys: 1.78 ms, total: 4.65 ms
Wall time: 2.88 s


In [3]:
print(full.count())
full.show(5)

359105
+---------+------------------+------------------+---------------+------------------+----------------+
|  user_id|    m2_total_spend|    m1_total_spend|m1_total_events|m1_purchase_events|m1_user_sessions|
+---------+------------------+------------------+---------------+------------------+----------------+
|512407417|               0.0| 805.9299926757812|             42|                 1|              11|
|512423995|               0.0|129.99000549316406|             88|                 1|              11|
|512445637|               0.0|27.799999237060547|             50|                 1|               7|
|512479812|50881.730712890625|166.19000244140625|             73|                 1|               9|
|512541716|               0.0| 250.7100067138672|             53|                 1|              16|
+---------+------------------+------------------+---------------+------------------+----------------+
only showing top 5 rows



In [4]:
print(m1.count())
m1.show(5)

15923973
+---------+-------------------+----------+----------+-------------------+--------------------+-------+------+--------------------+------+
|  user_id|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|        user_session|Period|
+---------+-------------------+----------+----------+-------------------+--------------------+-------+------+--------------------+------+
|512366912|2020-01-15 09:53:20|      view|   1004856|2232732093077520756|construction.tool...|samsung|128.45|93ce8365-4c50-4d3...|     1|
|512366912|2020-01-15 09:53:26|      view|   1004856|2232732093077520756|construction.tool...|samsung|128.45|93ce8365-4c50-4d3...|     1|
|512366912|2020-01-15 09:53:46|      view|   1004856|2232732093077520756|construction.tool...|samsung|128.45|93ce8365-4c50-4d3...|     1|
|512366912|2020-01-15 10:42:48|      view|   1004856|2232732093077520756|construction.tool...|samsung|128.45|c2f4ccfe-f0e9-4a7...|     1|
|512366912|2020-01-16 07:

## Begin Creating Features
### Create each on an individual level, then join to full
##### NOTE: Must rename all features so that they do not contain parenthesis - not compatable with saving to parquet

_________________

#### Number of items purchased per person, per category
##### Commenting out for now - we'll have to think strategically about what 'huge' column sets we want

In [5]:
#%%time
#cats = m1.filter(m1.event_type == "purchase").groupBy('user_id').pivot('category_code').count()

In [6]:
#%%time
#cats.take(1)

In [7]:
#%%time
#full = full.join(cats, full.user_id == cats.user_id).drop(cats.user_id)

In [8]:
#full.take(1)

#### Number of sessions in month (num_sessions_month)

In [9]:
sessions_total = m1.groupBy('user_id') \
                .agg(count('user_session'))\
                .withColumnRenamed("count(user_session)", 'num_sessions_month')
                    

In [10]:
sessions_total.show()

+---------+------------------+
|  user_id|num_sessions_month|
+---------+------------------+
|512372691|                33|
|512407417|                42|
|512423995|                88|
|512428523|                34|
|512445637|                50|
|512448189|                 9|
|512460113|                13|
|512479812|                73|
|512510580|                78|
|512513760|               160|
|512517137|                13|
|512541716|                53|
|512547480|                37|
|512550513|               223|
|512552482|                85|
|512555062|                30|
|512562561|                82|
|512562754|               322|
|512583155|                22|
|512596570|               143|
+---------+------------------+
only showing top 20 rows



In [11]:
full = full.join(sessions_total, full.user_id == sessions_total.user_id).drop(sessions_total.user_id)

#### Average Session Duration (AvgSessLen)

In [12]:
session_ends = m1.groupBy('user_id', 'user_session').agg(max('event_time'), min('event_time'))

In [13]:
session_ends.show()

+---------+--------------------+-------------------+-------------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|
+---------+--------------------+-------------------+-------------------+
|512459247|3465c2cc-9df4-4a5...|2020-01-30 09:22:21|2020-01-30 09:15:20|
|512501209|78c8cf4a-26f7-491...|2020-01-16 10:16:27|2020-01-16 10:14:19|
|513212967|6b8144b3-a1d0-471...|2020-01-18 18:08:02|2020-01-18 18:01:23|
|513212967|bfb38911-7ef3-401...|2020-01-25 09:29:57|2020-01-25 09:29:57|
|513235701|677abd5e-88a8-4ab...|2020-01-23 17:44:40|2020-01-23 17:41:35|
|513575696|c2ed7438-99d0-45b...|2020-01-09 07:36:52|2020-01-09 07:36:41|
|513873816|a7655679-3f21-481...|2020-01-18 08:20:24|2020-01-18 08:16:05|
|514052791|31a79da3-d7ae-458...|2020-01-26 02:21:16|2020-01-26 02:07:56|
|514423406|2de9cfba-6d1b-441...|2020-01-16 04:54:02|2020-01-16 04:54:02|
|514423406|80e604da-56b2-4ef...|2020-01-20 19:45:50|2020-01-20 19:45:40|
|514423406|60bea535-4244-434...|2020-01-22 03:27:07

In [14]:
session_ends = session_ends.withColumn('SessionLengthSecs', (col("max(event_time)").cast('long') - col("min(event_time)").cast('long')))

In [15]:
session_ends.show()

+---------+--------------------+-------------------+-------------------+-----------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|SessionLengthSecs|
+---------+--------------------+-------------------+-------------------+-----------------+
|512459247|3465c2cc-9df4-4a5...|2020-01-30 09:22:21|2020-01-30 09:15:20|              421|
|512501209|78c8cf4a-26f7-491...|2020-01-16 10:16:27|2020-01-16 10:14:19|              128|
|513212967|6b8144b3-a1d0-471...|2020-01-18 18:08:02|2020-01-18 18:01:23|              399|
|513212967|bfb38911-7ef3-401...|2020-01-25 09:29:57|2020-01-25 09:29:57|                0|
|513235701|677abd5e-88a8-4ab...|2020-01-23 17:44:40|2020-01-23 17:41:35|              185|
|513575696|c2ed7438-99d0-45b...|2020-01-09 07:36:52|2020-01-09 07:36:41|               11|
|513873816|a7655679-3f21-481...|2020-01-18 08:20:24|2020-01-18 08:16:05|              259|
|514052791|31a79da3-d7ae-458...|2020-01-26 02:21:16|2020-01-26 02:07:56|              800|

In [16]:
avg_sess = session_ends.groupBy('user_id').avg('SessionLengthSecs').withColumnRenamed('avg(SessionLengthSecs)', "AvgSessLen")

In [17]:
avg_sess.show()

+---------+------------------+
|  user_id|        AvgSessLen|
+---------+------------------+
|512700240| 336.2857142857143|
|512907846| 83.05555555555556|
|512939102|323.23809523809524|
|512975406|5124.0192307692305|
|513504459|              46.2|
|513615667| 497.3333333333333|
|514290304|217.54545454545453|
|514338009| 161.1595744680851|
|514447047|             605.0|
|515125070|            580.64|
|515409104|209.85714285714286|
|516873641|22868.166666666668|
|517713387|             244.8|
|518529350| 286.6666666666667|
|519121308|252.16666666666666|
|519298781|376.85714285714283|
|520771974|             322.6|
|524321301|14244.923076923076|
|524566828| 46979.03846153846|
|538147908|1074.3076923076924|
+---------+------------------+
only showing top 20 rows



In [18]:
full = full.join(avg_sess, full.user_id == avg_sess.user_id).drop(avg_sess.user_id)

#### Std Deviation of session duration by person (stddev_SessionLengthSecs)

In [19]:
session_ends.show(5)

+---------+--------------------+-------------------+-------------------+-----------------+
|  user_id|        user_session|    max(event_time)|    min(event_time)|SessionLengthSecs|
+---------+--------------------+-------------------+-------------------+-----------------+
|512459247|3465c2cc-9df4-4a5...|2020-01-30 09:22:21|2020-01-30 09:15:20|              421|
|512501209|78c8cf4a-26f7-491...|2020-01-16 10:16:27|2020-01-16 10:14:19|              128|
|513212967|6b8144b3-a1d0-471...|2020-01-18 18:08:02|2020-01-18 18:01:23|              399|
|513212967|bfb38911-7ef3-401...|2020-01-25 09:29:57|2020-01-25 09:29:57|                0|
|513235701|677abd5e-88a8-4ab...|2020-01-23 17:44:40|2020-01-23 17:41:35|              185|
+---------+--------------------+-------------------+-------------------+-----------------+
only showing top 5 rows



In [20]:
std_session_length = session_ends.groupBy('user_id') \
                                 .agg(stddev('SessionLengthSecs')) \
                                 .withColumnRenamed("stddev_samp(SessionLengthSecs)", 'stddev_SessionLengthSecs')

In [21]:
std_session_length.show(5)

+---------+------------------------+
|  user_id|stddev_SessionLengthSecs|
+---------+------------------------+
|512700240|      358.21209466167494|
|512907846|      146.57961989076196|
|512939102|       442.5742767899988|
|512975406|       34236.70295107879|
|513504459|       67.80265481527991|
+---------+------------------------+
only showing top 5 rows



In [22]:
full = full.join(std_session_length, full.user_id == std_session_length.user_id).drop(std_session_length.user_id)

#### UNFINISHED Distance from last interaction to end of month (seconds)


In [23]:
new_month = dt.datetime(2020,2,1,0,0).timestamp() # This is the epoch seconds of Feb 1, 2020 at midnight
new_month

1580533200.0

In [24]:
last_interaction = m1.groupBy('user_id').agg(max('event_time'))
last_interaction.show()

+---------+-------------------+
|  user_id|    max(event_time)|
+---------+-------------------+
|512372691|2020-01-24 08:37:05|
|512407417|2020-01-30 15:28:07|
|512423995|2020-01-31 18:55:41|
|512428523|2020-01-16 09:49:41|
|512445637|2020-01-28 12:42:03|
|512448189|2020-01-12 18:40:43|
|512460113|2020-01-29 19:02:16|
|512479812|2020-01-28 16:29:46|
|512510580|2020-01-31 07:35:47|
|512513760|2020-01-31 06:55:48|
|512517137|2020-01-23 14:33:44|
|512541716|2020-01-31 03:45:26|
|512547480|2020-01-18 05:01:55|
|512550513|2020-01-30 12:25:41|
|512552482|2020-01-31 18:57:30|
|512555062|2020-01-29 06:03:46|
|512562561|2020-01-31 12:13:52|
|512562754|2020-01-30 18:53:40|
|512583155|2020-01-26 23:43:46|
|512596570|2020-01-21 08:45:29|
+---------+-------------------+
only showing top 20 rows



In [25]:
## last_int_dist = last_interaction.withColumn('time_from_end_month', timestamp(1580533200) - col('max(event_time)'))

#### Average number of interactions per session (avg_interactions_per_session)

In [26]:
interactions_per_session = m1.groupBy('user_id', 'user_session').agg(count('event_type'))

In [27]:
interactions_per_session.show()

+---------+--------------------+-----------------+
|  user_id|        user_session|count(event_type)|
+---------+--------------------+-----------------+
|512459247|3465c2cc-9df4-4a5...|                4|
|512501209|78c8cf4a-26f7-491...|                5|
|513212967|6b8144b3-a1d0-471...|                8|
|513212967|bfb38911-7ef3-401...|                1|
|513235701|677abd5e-88a8-4ab...|                8|
|513575696|c2ed7438-99d0-45b...|                2|
|513873816|a7655679-3f21-481...|                4|
|514052791|31a79da3-d7ae-458...|               18|
|514423406|2de9cfba-6d1b-441...|                1|
|514423406|80e604da-56b2-4ef...|                2|
|514423406|60bea535-4244-434...|                1|
|514687260|6313d529-e697-457...|                4|
|515937747|f2ef682e-ac64-4dc...|                1|
|516024028|174ffd34-b3c9-4ef...|                3|
|516729801|2695d003-f162-4ee...|                6|
|517023442|39ee9231-2a38-40f...|                1|
|517037372|6d020962-1814-414...

In [28]:
avg_interactions_per_session = interactions_per_session.groupBy('user_id').avg('count(event_type)')

In [29]:
avg_interactions_per_session = avg_interactions_per_session.withColumnRenamed('avg(count(event_type))', "avg_interactions_per_session")

In [30]:
full = full.join(avg_interactions_per_session, full.user_id == avg_interactions_per_session.user_id).drop(avg_interactions_per_session.user_id)

#### Std Deviation of number of interactions per session per person (stddev_int_per_session)

In [31]:
std_interactions_per_session = interactions_per_session.groupBy('user_id') \
                                                       .agg(stddev('count(event_type)')) \
                                                       .withColumnRenamed("stddev_samp(count(event_type))", 'stddev_int_per_session')
std_interactions_per_session.show(5)

+---------+----------------------+
|  user_id|stddev_int_per_session|
+---------+----------------------+
|512700240|     2.138089935299395|
|512907846|     2.727851988394575|
|512939102|    10.862780491200215|
|512975406|     8.314790833588951|
|513504459|     3.898717737923586|
+---------+----------------------+
only showing top 5 rows



In [32]:
full = full.join(std_interactions_per_session, full.user_id == std_interactions_per_session.user_id).drop(std_interactions_per_session.user_id)

#### Max number of interactions within one session (max_interactions_one_session)

In [33]:
max_interactions_per_session = interactions_per_session.groupBy('user_id').max('count(event_type)')

In [34]:
max_interactions_per_session = max_interactions_per_session.withColumnRenamed('max(count(event_type))', "max_interactions_one_session")

In [35]:
max_interactions_per_session.show(1)

+---------+----------------------------+
|  user_id|max_interactions_one_session|
+---------+----------------------------+
|512700240|                           7|
+---------+----------------------------+
only showing top 1 row



In [36]:
full = full.join(max_interactions_per_session, full.user_id == max_interactions_per_session.user_id).drop(max_interactions_per_session.user_id)

#### Percent of total events that are x (Purchase, Cart, View) ('purchase_pct_of_total_events', 'cart_pct_of_total_events', 'view_pct_of_total_events')

In [37]:
event_counts = m1.groupBy('user_id', 'user_session').pivot('event_type').agg(count('event_type'))
# Here the three types of event count are pivoted out for later tabulation

In [38]:
event_counts = event_counts.fillna(0) #replace nulls with 0 for math
event_counts.show()

+---------+--------------------+----+--------+----+
|  user_id|        user_session|cart|purchase|view|
+---------+--------------------+----+--------+----+
|514303875|205fae11-e924-407...|   0|       0|   1|
|519241453|a9437a1d-41e5-4fa...|   0|       0|  14|
|523201266|9daa4af6-6dac-4d1...|   1|       1|   2|
|543762302|22e1020d-8b73-4fa...|   0|       0|   4|
|546604646|f5d96e2e-18d1-462...|   0|       0|   1|
|547438388|3dd53be2-45a1-4f0...|   2|       1|   7|
|550007461|11a6f31b-0692-482...|   2|       1|   1|
|552639168|d335b338-322b-4eb...|   1|       1|   2|
|563020567|b5758a16-dce0-40a...|   0|       0|   1|
|564814222|00ed7b07-a733-421...|   0|       0|   2|
|567513518|ce0e10cd-0852-409...|   1|       1|   1|
|568805468|6377f2a7-59e5-49a...|   0|       0|   1|
|569919554|d5598d78-5d83-412...|   1|       1|   1|
|585566367|b21c2b85-3a26-468...|   0|       0|   6|
|591485124|bc9ef30b-8fcf-493...|   0|       0|  23|
|596296622|6eda9fc5-f80a-44b...|   0|       0|   1|
|601896730|c

In [39]:
events_per_session = event_counts.withColumn('events_per_session_total', col('cart') + col('purchase') + col('view')) 
# Get total number of events per session

In [40]:
events_per_session.show()

+---------+--------------------+----+--------+----+------------------------+
|  user_id|        user_session|cart|purchase|view|events_per_session_total|
+---------+--------------------+----+--------+----+------------------------+
|514303875|205fae11-e924-407...|   0|       0|   1|                       1|
|519241453|a9437a1d-41e5-4fa...|   0|       0|  14|                      14|
|523201266|9daa4af6-6dac-4d1...|   1|       1|   2|                       4|
|543762302|22e1020d-8b73-4fa...|   0|       0|   4|                       4|
|546604646|f5d96e2e-18d1-462...|   0|       0|   1|                       1|
|547438388|3dd53be2-45a1-4f0...|   2|       1|   7|                      10|
|550007461|11a6f31b-0692-482...|   2|       1|   1|                       4|
|552639168|d335b338-322b-4eb...|   1|       1|   2|                       4|
|563020567|b5758a16-dce0-40a...|   0|       0|   1|                       1|
|564814222|00ed7b07-a733-421...|   0|       0|   2|                       2|

In [41]:
pct_events = events_per_session.groupBy('user_id').sum()

In [42]:
pct_totalevents = pct_events.withColumn('purchase_pct_of_total_events', col('sum(purchase)')/col('sum(events_per_session_total)')) \
                  .withColumn('view_pct_of_total_events', col('sum(view)')/col('sum(events_per_session_total)')) \
                  .withColumn('cart_pct_of_total_events', col('sum(cart)')/col('sum(events_per_session_total)'))

In [43]:
merge_me = pct_totalevents.select('user_id', 'purchase_pct_of_total_events', 'cart_pct_of_total_events', 'view_pct_of_total_events')

In [44]:
full = full.join(merge_me, full.user_id == merge_me.user_id).drop(merge_me.user_id)

#### Average number of purchases per session (avg_purchases_per_session)

In [45]:
avg_purchases_per_session = events_per_session.groupBy('user_id').avg('purchase').withColumnRenamed('avg(purchase)', "avg_purchases_per_session")

In [46]:
avg_purchases_per_session.show()

+---------+-------------------------+
|  user_id|avg_purchases_per_session|
+---------+-------------------------+
|519121308|      0.16666666666666666|
|588954881|      0.09090909090909091|
|568888698|       0.3076923076923077|
|512700240|       0.5714285714285714|
|512975406|      0.19230769230769232|
|539141084|                     0.04|
|512939102|     0.047619047619047616|
|568834070|     7.680491551459293E-4|
|597560455|                      0.2|
|574391586|       0.5454545454545454|
|603091367|                      1.0|
|516873641|     0.016666666666666666|
|595529721|                      1.0|
|524321301|      0.15384615384615385|
|531677136|     0.023809523809523808|
|513504459|                      0.8|
|598364094|       0.2702702702702703|
|592713655|       0.3333333333333333|
|562508584|                      0.2|
|519298781|      0.14285714285714285|
+---------+-------------------------+
only showing top 20 rows



In [47]:
full = full.join(avg_purchases_per_session, full.user_id == avg_purchases_per_session.user_id).drop(avg_purchases_per_session.user_id)

#### STD of number of purchases per session per person (std_purchases_per_session)

In [48]:
std_purchases_per_session = events_per_session.groupBy('user_id') \
                                              .agg(stddev('purchase')) \
                                              .withColumnRenamed('stddev_samp(purchase)', "std_purchases_per_session")
std_purchases_per_session.show(5)

+---------+-------------------------+
|  user_id|std_purchases_per_session|
+---------+-------------------------+
|519121308|        0.408248290463863|
|588954881|      0.30151134457776363|
|568888698|       0.4803844614152614|
|512700240|       0.5345224838248488|
|512975406|       0.5614568423503459|
+---------+-------------------------+
only showing top 5 rows



In [49]:
full = full.join(std_purchases_per_session, full.user_id == std_purchases_per_session.user_id).drop(std_purchases_per_session.user_id)

#### Total number of each type of event over whole month (monthlyCartTotal, monthlyPurchaseTotal, monthlyViewTotal)

In [50]:
event_counts_month = event_counts.groupBy('user_id').sum('cart', 'purchase', 'view')\
                     .withColumnRenamed('sum(cart)', 'monthlyCartTotal') \
                     .withColumnRenamed('sum(purchase)', 'monthlyPurchaseTotal') \
                     .withColumnRenamed('sum(view)', 'monthlyViewTotal')

In [51]:
event_counts_month.show()

+---------+----------------+--------------------+----------------+
|  user_id|monthlyCartTotal|monthlyPurchaseTotal|monthlyViewTotal|
+---------+----------------+--------------------+----------------+
|519121308|               6|                   1|              20|
|588954881|               1|                   1|              25|
|568888698|               8|                   4|              75|
|512700240|               6|                   4|              23|
|512975406|              26|                  10|             330|
|539141084|               2|                   1|             184|
|512939102|               7|                   1|             181|
|568834070|               3|                   1|            1592|
|597560455|               8|                   2|              25|
|574391586|               9|                   6|              28|
|603091367|               1|                   1|               2|
|516873641|              26|                   1|             

In [52]:
full = full.join(event_counts_month, full.user_id == event_counts_month.user_id).drop(event_counts_month.user_id)

#### Total number of sessions that contain event over whole month (NumSessWithPurchases, NumSessWithCart, NumSessWithView)

In [53]:
events_over_month = events_per_session.withColumn('purchaseInSession', when(col('purchase') == 0, 0).otherwise(1)) \
                                      .withColumn('cartInSession', when(col('cart')==0, 0).otherwise(1)) \
                                      .withColumn('viewInSession', when(col('view')==0, 0).otherwise(1))

In [54]:
num_sesh_containing_event = events_over_month.groupBy('user_id').sum('purchaseInSession', "cartInSession", "viewInSession") \
                            .withColumnRenamed("sum(purchaseInSession)", "NumSessWithPurchases") \
                            .withColumnRenamed("sum(cartInSession)", "NumSessWithCart") \
                            .withColumnRenamed("sum(viewInSession)", "NumSessWithView")

In [55]:
num_sesh_containing_event.show()

+---------+--------------------+---------------+---------------+
|  user_id|NumSessWithPurchases|NumSessWithCart|NumSessWithView|
+---------+--------------------+---------------+---------------+
|519121308|                   1|              2|              6|
|588954881|                   1|              1|             11|
|568888698|                   4|              7|             13|
|512700240|                   4|              4|              7|
|512975406|                   7|             19|             46|
|539141084|                   1|              1|             25|
|512939102|                   1|              5|             21|
|568834070|                   1|              3|           1302|
|597560455|                   2|              4|             10|
|574391586|                   6|              6|             11|
|603091367|                   1|              1|              1|
|516873641|                   1|              6|             59|
|595529721|              

In [56]:
full = full.join(num_sesh_containing_event, full.user_id == num_sesh_containing_event.user_id).drop(num_sesh_containing_event.user_id)

#### Percent of individual's sessions that end in cart/purchase (ses_end_purch, ses_end_cart)

In [57]:
session_ends2 = event_counts.withColumn('sess_end_purchase', \
                                when(col('purchase') != 0, 1) \
                                .otherwise(0)) \
                            .withColumn('sess_end_cart', \
                                when((col("purchase") == 0) & (col("cart") != 0), 1) \
                                .otherwise(0))
session_ends2.show(5)

+---------+--------------------+----+--------+----+-----------------+-------------+
|  user_id|        user_session|cart|purchase|view|sess_end_purchase|sess_end_cart|
+---------+--------------------+----+--------+----+-----------------+-------------+
|514303875|205fae11-e924-407...|   0|       0|   1|                0|            0|
|519241453|a9437a1d-41e5-4fa...|   0|       0|  14|                0|            0|
|523201266|9daa4af6-6dac-4d1...|   1|       1|   2|                1|            0|
|543762302|22e1020d-8b73-4fa...|   0|       0|   4|                0|            0|
|546604646|f5d96e2e-18d1-462...|   0|       0|   1|                0|            0|
+---------+--------------------+----+--------+----+-----------------+-------------+
only showing top 5 rows



In [58]:
session_sum = session_ends2.groupBy('user_id').agg(count('user_session'), sum('sess_end_purchase'), sum('sess_end_cart'))
session_sum.show(5)

+---------+-------------------+----------------------+------------------+
|  user_id|count(user_session)|sum(sess_end_purchase)|sum(sess_end_cart)|
+---------+-------------------+----------------------+------------------+
|519121308|                  6|                     1|                 1|
|588954881|                 11|                     1|                 0|
|568888698|                 13|                     4|                 3|
|512700240|                  7|                     4|                 0|
|512975406|                 52|                     7|                12|
+---------+-------------------+----------------------+------------------+
only showing top 5 rows



In [59]:
session_sum = session_sum.withColumn('ses_end_purch', col('sum(sess_end_purchase)')/col('count(user_session)')) \
                         .withColumn('ses_end_cart', col('sum(sess_end_cart)')/col('count(user_session)'))
session_sum.show(5)

+---------+-------------------+----------------------+------------------+-------------------+-------------------+
|  user_id|count(user_session)|sum(sess_end_purchase)|sum(sess_end_cart)|      ses_end_purch|       ses_end_cart|
+---------+-------------------+----------------------+------------------+-------------------+-------------------+
|519121308|                  6|                     1|                 1|0.16666666666666666|0.16666666666666666|
|588954881|                 11|                     1|                 0|0.09090909090909091|                0.0|
|568888698|                 13|                     4|                 3| 0.3076923076923077|0.23076923076923078|
|512700240|                  7|                     4|                 0| 0.5714285714285714|                0.0|
|512975406|                 52|                     7|                12| 0.1346153846153846|0.23076923076923078|
+---------+-------------------+----------------------+------------------+---------------

In [60]:
temp = session_sum.select('user_id', "ses_end_purch", "ses_end_cart")
temp.show()

+---------+--------------------+--------------------+
|  user_id|       ses_end_purch|        ses_end_cart|
+---------+--------------------+--------------------+
|519121308| 0.16666666666666666| 0.16666666666666666|
|588954881| 0.09090909090909091|                 0.0|
|568888698|  0.3076923076923077| 0.23076923076923078|
|512700240|  0.5714285714285714|                 0.0|
|512975406|  0.1346153846153846| 0.23076923076923078|
|539141084|                0.04|                 0.0|
|512939102|0.047619047619047616| 0.19047619047619047|
|568834070|7.680491551459293E-4|0.001536098310291...|
|597560455|                 0.2|                 0.2|
|574391586|  0.5454545454545454|                 0.0|
|603091367|                 1.0|                 0.0|
|516873641|0.016666666666666666| 0.08333333333333333|
|595529721|  0.3333333333333333|  0.6666666666666666|
|524321301| 0.15384615384615385| 0.07692307692307693|
|531677136|0.023809523809523808|                 0.0|
|513504459|                 

In [61]:
full = full.join(temp, full.user_id == temp.user_id).drop(temp.user_id)

### Preview full dataframe

In [62]:
full.schema

StructType(List(StructField(user_id,IntegerType,true),StructField(m2_total_spend,DoubleType,true),StructField(m1_total_spend,DoubleType,true),StructField(m1_total_events,LongType,true),StructField(m1_purchase_events,LongType,true),StructField(m1_user_sessions,LongType,true),StructField(num_sessions_month,LongType,false),StructField(AvgSessLen,DoubleType,true),StructField(stddev_SessionLengthSecs,DoubleType,true),StructField(avg_interactions_per_session,DoubleType,true),StructField(stddev_int_per_session,DoubleType,true),StructField(max_interactions_one_session,LongType,true),StructField(purchase_pct_of_total_events,DoubleType,true),StructField(cart_pct_of_total_events,DoubleType,true),StructField(view_pct_of_total_events,DoubleType,true),StructField(avg_purchases_per_session,DoubleType,true),StructField(std_purchases_per_session,DoubleType,true),StructField(monthlyCartTotal,LongType,true),StructField(monthlyPurchaseTotal,LongType,true),StructField(monthlyViewTotal,LongType,true),Struct

In [63]:
full.show(1)

+---------+------------------+------------------+---------------+------------------+----------------+------------------+-----------------+------------------------+----------------------------+----------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+-------------------------+----------------+--------------------+----------------+--------------------+---------------+---------------+-------------+------------+
|  user_id|    m2_total_spend|    m1_total_spend|m1_total_events|m1_purchase_events|m1_user_sessions|num_sessions_month|       AvgSessLen|stddev_SessionLengthSecs|avg_interactions_per_session|stddev_int_per_session|max_interactions_one_session|purchase_pct_of_total_events|cart_pct_of_total_events|view_pct_of_total_events|avg_purchases_per_session|std_purchases_per_session|monthlyCartTotal|monthlyPurchaseTotal|monthlyViewTotal|NumSessWithPurchases|NumSessWithCart|NumSessWithView|ses_en

In [66]:
full.count()

359105

#### Save as parquet. Make sure to change permissions in bash using chmod 777 filename

In [64]:
%%time
full.write.mode("overwrite").parquet("/project/ds5559/group12/processed_data/engineered_features.parquet")

CPU times: user 4.19 ms, sys: 4.48 ms, total: 8.67 ms
Wall time: 44.2 s
