In [1]:
# Importing the necessary libraries

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import max
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import *
from pyspark.sql.functions import broadcast
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Build (or reuse, if one already exists) the SparkSession that every later
# cell relies on. It runs against the "local" master.

spark = (
    SparkSession.builder
    .master("local")
    .appName("Case-Study_2")
    .getOrCreate()
)

In [3]:
# Load the two input CSV files, treating the first row of each as the header.
# No schema is supplied and inference is not enabled, so every column is read
# as a string.
# NOTE(review): both paths are absolute Windows paths; the notebook only runs
# on a machine with this exact directory layout.

fact = (
    spark.read
    .options(header="true")
    .format("csv")
    .load("C:\\BigData\\use-case-data-processing-main\\fact.csv")
)
lookup = (
    spark.read
    .options(header="true")
    .format("csv")
    .load("C:\\BigData\\use-case-data-processing-main\\lookup.csv")
)

In [4]:
# Inner-join the page-view facts with the page lookup on WEB_PAGEID.
# `lookup` is wrapped in broadcast() to hint a broadcast join (it is the small
# dataset, `fact` the big one).

join_df = (
    fact
    .join(broadcast(lookup), on="WEB_PAGEID", how="inner")
    .select("USER_ID", "WEB_PAGEID", "VIEW_TIME", "WEBPAGE_TYPE")
)

# Preview the first rows without truncating cell values
join_df.show(5, truncate=False)

# Show the column names and types of the joined frame
join_df.printSchema()

+---------+----------+----------------+------------+
|USER_ID  |WEB_PAGEID|VIEW_TIME       |WEBPAGE_TYPE|
+---------+----------+----------------+------------+
|149977241|3740865072|10/02/2016 18:54|news        |
|142413313|3621915402|21/01/2016 16:10|news        |
|142413313|3621915402|21/01/2016 16:20|news        |
|142413313|3621915402|21/01/2016 16:00|news        |
|112797679|2276268225|12/04/2016 13:05|news        |
+---------+----------+----------------+------------+
only showing top 5 rows

root
 |-- USER_ID: string (nullable = true)
 |-- WEB_PAGEID: string (nullable = true)
 |-- VIEW_TIME: string (nullable = true)
 |-- WEBPAGE_TYPE: string (nullable = true)



In [5]:
# 1. Parse VIEW_TIME (strings in 'dd/MM/yyyy HH:mm' form) into a DATE column.
# 2. Attach the fixed reference date 2019-10-12 as DATE_OF_REFERENCE.
# 3. Derive RecencyDays = number of days from VIEW_TIME to DATE_OF_REFERENCE.

df = (
    join_df
    .withColumn(
        "VIEW_TIME",
        F.to_date(
            F.unix_timestamp(F.col("VIEW_TIME"), "dd/MM/yyyy HH:mm").cast("timestamp")
        ),
    )
    .withColumn("DATE_OF_REFERENCE", F.to_date(F.lit("12-10-2019"), "dd-MM-yyyy"))
    .withColumn("RecencyDays", F.expr("datediff(DATE_OF_REFERENCE, VIEW_TIME)"))
)

df.show(5, truncate=False)
df.printSchema()

+---------+----------+----------+------------+-----------------+-----------+
|USER_ID  |WEB_PAGEID|VIEW_TIME |WEBPAGE_TYPE|DATE_OF_REFERENCE|RecencyDays|
+---------+----------+----------+------------+-----------------+-----------+
|149977241|3740865072|2016-02-10|news        |2019-10-12       |1340       |
|142413313|3621915402|2016-01-21|news        |2019-10-12       |1360       |
|142413313|3621915402|2016-01-21|news        |2019-10-12       |1360       |
|142413313|3621915402|2016-01-21|news        |2019-10-12       |1360       |
|112797679|2276268225|2016-04-12|news        |2019-10-12       |1278       |
+---------+----------+----------+------------+-----------------+-----------+
only showing top 5 rows

root
 |-- USER_ID: string (nullable = true)
 |-- WEB_PAGEID: string (nullable = true)
 |-- VIEW_TIME: date (nullable = true)
 |-- WEBPAGE_TYPE: string (nullable = true)
 |-- DATE_OF_REFERENCE: date (nullable = true)
 |-- RecencyDays: integer (nullable = true)



In [6]:

# Disabled draft kept as a bare string literal, so none of the code inside it
# runs. It spells out the same join + date parsing + RecencyDays steps as the
# active cell that builds `df`, but joins `lookup` without the broadcast() hint.
# Because the string is the last expression of the cell, the notebook echoes
# its repr as the cell output.
"""

df = fact.join(lookup,'WEB_PAGEID','inner')\
    .select(fact.USER_ID,fact.VIEW_TIME,fact.WEB_PAGEID,lookup.WEBPAGE_TYPE)\
    .withColumn("DATE_OF_REFERENCE", to_date(lit('12-10-2019'),'dd-MM-yyyy'))\
    .withColumn('VIEW_TIME', to_date(unix_timestamp(col('VIEW_TIME'), 'dd/MM/yyyy HH:mm').cast("timestamp")))\
    .withColumn("RecencyDays", expr("datediff(DATE_OF_REFERENCE, VIEW_TIME)"))

df.show(5)
df.printSchema()

"""

'\n\ndf = fact.join(lookup,\'WEB_PAGEID\',\'inner\')    .select(fact.USER_ID,fact.VIEW_TIME,fact.WEB_PAGEID,lookup.WEBPAGE_TYPE)    .withColumn("DATE_OF_REFERENCE", to_date(lit(\'12-10-2019\'),\'dd-MM-yyyy\'))    .withColumn(\'VIEW_TIME\', to_date(unix_timestamp(col(\'VIEW_TIME\'), \'dd/MM/yyyy HH:mm\').cast("timestamp")))    .withColumn("RecencyDays", expr("datediff(DATE_OF_REFERENCE, VIEW_TIME)"))\n\ndf.show(5)\ndf.printSchema()\n\n'

In [7]:
# Per-user Recency and Frequency over all page types:
#   Recency   = smallest RecencyDays of the user (their most recent view)
#   Frequency = count of the user's rows with a non-null WEB_PAGEID

rfm_table = (
    df
    .groupBy("USER_ID")
    .agg(
        F.min("RecencyDays").alias("Recency"),
        F.count("WEB_PAGEID").alias("Frequency"),
    )
)

rfm_table.show(n=5)

+---------+-------+---------+
|  USER_ID|Recency|Frequency|
+---------+-------+---------+
| 98343030|    932|        3|
| 59736608|    914|        2|
|115056661|   1336|        1|
| 88602347|   1259|        1|
|162690170|   1263|        1|
+---------+-------+---------+
only showing top 5 rows



In [8]:
# Create a TempView:
# Registers `df` under the name "records" so it can be queried through
# spark.sql(...). If a view with that name already exists it is replaced,
# which makes the cell safe to re-run.

df.createOrReplaceTempView("records")

# Calculating 'Frequency'

# NEWS PAGE_TYPE

In [9]:
# timewindow=365: per-user count of 'news' page views less than 365 days old.
# RecencyDays is an integer column, so the bound is a numeric literal instead
# of a quoted string that Spark would have to coerce implicitly.

fre_news_365 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_news_fre_365 "
    "from records "
    "where WEBPAGE_TYPE = 'news' and RecencyDays < 365 "
    "group by USER_ID"
)
fre_news_365.show(5)

+----------+---------------------+
|   USER_ID|pageview_news_fre_365|
+----------+---------------------+
| -69271739|                    1|
|-133621877|                    1|
| 157684212|                    1|
|-285299518|                    1|
|-277209141|                    3|
+----------+---------------------+
only showing top 5 rows



In [10]:
# timewindow=730: per-user count of 'news' page views aged 365 to 729 days.
# The window is half-open, [365, 730):
#   - 365 is included because the 365-day bucket stops at `RecencyDays < 365`;
#   - 730 is excluded because the 730-1460 bucket matches it, so an inclusive
#     BETWEEN here would count day-730 views in two buckets.
# Bounds are numeric literals because RecencyDays is an integer column.

fre_news_730 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_news_fre_730 "
    "from records "
    "where WEBPAGE_TYPE = 'news' and RecencyDays >= 365 and RecencyDays < 730 "
    "group by USER_ID"
)
fre_news_730.show(5)

+----------+---------------------+
|   USER_ID|pageview_news_fre_730|
+----------+---------------------+
|-247361411|                    3|
|-231838278|                    1|
|-252290668|                    2|
| 155111993|                    1|
|  24160260|                    2|
+----------+---------------------+
only showing top 5 rows



In [11]:
# Merge the 365-day and 730-day news counts into one row per user.
# The full outer join keeps users that appear in only one of the two windows;
# the nulls this produces in the count columns are replaced with 0.

fre_news_join_df1 = (
    fre_news_365
    .join(fre_news_730, on="USER_ID", how="outer")
    .fillna(0)
)
fre_news_join_df1.show(n=5)

+----------+---------------------+---------------------+
|   USER_ID|pageview_news_fre_365|pageview_news_fre_730|
+----------+---------------------+---------------------+
|-247361411|                    0|                    3|
|-231838278|                    0|                    1|
|-252290668|                    0|                    2|
| -69271739|                    1|                    0|
|-133621877|                    1|                    0|
+----------+---------------------+---------------------+
only showing top 5 rows



In [12]:
# timewindow=1460: per-user count of 'news' page views aged 730 to 1459 days.
# The window is half-open, [730, 1460): day 1460 is left to the 1460-2920
# bucket, so an inclusive BETWEEN here would count day-1460 views twice.
# Bounds are numeric literals because RecencyDays is an integer column.

fre_news_1460 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_news_fre_1460 "
    "from records "
    "where WEBPAGE_TYPE = 'news' and RecencyDays >= 730 and RecencyDays < 1460 "
    "group by USER_ID"
)
fre_news_1460.show(5)

+---------+----------------------+
|  USER_ID|pageview_news_fre_1460|
+---------+----------------------+
| 98343030|                     3|
| 59736608|                     2|
|115056661|                     1|
|162690170|                     1|
|169469168|                     1|
+---------+----------------------+
only showing top 5 rows



In [13]:
# timewindow=2920: per-user count of 'news' page views aged 1460 to 2920 days
# (both ends inclusive; this is the oldest bucket built in this notebook).
# Bounds are numeric literals because RecencyDays is an integer column, which
# avoids relying on Spark's implicit string-to-number coercion.

fre_news_2920 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_news_fre_2920 "
    "from records "
    "where WEBPAGE_TYPE = 'news' and RecencyDays between 1460 and 2920 "
    "group by USER_ID"
)
fre_news_2920.show(5)

+-------+----------------------+
|USER_ID|pageview_news_fre_2920|
+-------+----------------------+
+-------+----------------------+



In [14]:
# Merge the 1460-day and 2920-day news counts into one row per user.
# Users missing from one side of the full outer join get 0 for that count.

fre_news_join_df2 = (
    fre_news_1460
    .join(fre_news_2920, on="USER_ID", how="outer")
    .fillna(0)
)
fre_news_join_df2.show(5, truncate=False)

+---------+----------------------+----------------------+
|USER_ID  |pageview_news_fre_1460|pageview_news_fre_2920|
+---------+----------------------+----------------------+
|98343030 |3                     |0                     |
|115056661|1                     |0                     |
|59736608 |2                     |0                     |
|162690170|1                     |0                     |
|167725829|1                     |0                     |
+---------+----------------------+----------------------+
only showing top 5 rows



In [15]:
# Final news-frequency dataset: one row per user with all four window counts.
# Built by full-outer-joining the two intermediate news frames
# (fre_news_join_df1 and fre_news_join_df2) and zero-filling the gaps.

fre_news_df = (
    fre_news_join_df1
    .join(fre_news_join_df2, on="USER_ID", how="outer")
    .fillna(0)
)

fre_news_df.show(n=5)

+---------+---------------------+---------------------+----------------------+----------------------+
|  USER_ID|pageview_news_fre_365|pageview_news_fre_730|pageview_news_fre_1460|pageview_news_fre_2920|
+---------+---------------------+---------------------+----------------------+----------------------+
| 98343030|                    0|                    0|                     3|                     0|
|115056661|                    0|                    0|                     1|                     0|
| 59736608|                    0|                    0|                     2|                     0|
|162690170|                    0|                    0|                     1|                     0|
|167725829|                    0|                    0|                     1|                     0|
+---------+---------------------+---------------------+----------------------+----------------------+
only showing top 5 rows



# Movie PAGE_TYPE

In [16]:
# Movie_365: per-user count of 'movies' page views less than 365 days old.
# RecencyDays is an integer column, so the bound is a numeric literal instead
# of a quoted string that Spark would have to coerce implicitly.

fre_movies_365 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_movies_fre_365 "
    "from records "
    "where WEBPAGE_TYPE = 'movies' and RecencyDays < 365 "
    "group by USER_ID"
)
fre_movies_365.show(5)

+-------+-----------------------+
|USER_ID|pageview_movies_fre_365|
+-------+-----------------------+
+-------+-----------------------+



In [17]:
# Movie_730: per-user count of 'movies' page views aged 365 to 729 days.
# The window is half-open, [365, 730):
#   - 365 is included because the 365-day bucket stops at `RecencyDays < 365`;
#   - 730 is excluded because the 730-1460 bucket matches it, so an inclusive
#     BETWEEN here would count day-730 views in two buckets.
# Bounds are numeric literals because RecencyDays is an integer column.

fre_movies_730 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_movies_fre_730 "
    "from records "
    "where WEBPAGE_TYPE = 'movies' and RecencyDays >= 365 and RecencyDays < 730 "
    "group by USER_ID"
)
fre_movies_730.show(5)

+-------+-----------------------+
|USER_ID|pageview_movies_fre_730|
+-------+-----------------------+
+-------+-----------------------+



In [18]:
# Merge the 365-day and 730-day movie counts into one row per user.
# Users missing from one side of the full outer join get 0 for that count.

fre_movies_join_df1 = (
    fre_movies_365
    .join(fre_movies_730, on="USER_ID", how="outer")
    .fillna(0)
)
fre_movies_join_df1.show(n=5)

+-------+-----------------------+-----------------------+
|USER_ID|pageview_movies_fre_365|pageview_movies_fre_730|
+-------+-----------------------+-----------------------+
+-------+-----------------------+-----------------------+



In [19]:
# Movie_1460: per-user count of 'movies' page views aged 730 to 1459 days.
# The window is half-open, [730, 1460): day 1460 is left to the 1460-2920
# bucket, so an inclusive BETWEEN here would count day-1460 views twice.
# Bounds are numeric literals because RecencyDays is an integer column.

fre_movies_1460 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_movies_fre_1460 "
    "from records "
    "where WEBPAGE_TYPE = 'movies' and RecencyDays >= 730 and RecencyDays < 1460 "
    "group by USER_ID"
)
fre_movies_1460.show(5)

+---------+------------------------+
|  USER_ID|pageview_movies_fre_1460|
+---------+------------------------+
| 88602347|                       1|
| 78741118|                       2|
| 30317072|                       1|
|197002823|                       1|
|115544304|                       1|
+---------+------------------------+
only showing top 5 rows



In [20]:
# Movie_2920: per-user count of 'movies' page views aged 1460 to 2920 days
# (both ends inclusive; this is the oldest bucket built in this notebook).
# Bounds are numeric literals because RecencyDays is an integer column, which
# avoids relying on Spark's implicit string-to-number coercion.

fre_movies_2920 = spark.sql(
    "select USER_ID, count(WEB_PAGEID) as pageview_movies_fre_2920 "
    "from records "
    "where WEBPAGE_TYPE = 'movies' and RecencyDays between 1460 and 2920 "
    "group by USER_ID"
)
fre_movies_2920.show(5)

+-------+------------------------+
|USER_ID|pageview_movies_fre_2920|
+-------+------------------------+
+-------+------------------------+



In [21]:
# Merge the 1460-day and 2920-day movie counts into one row per user.
# Users missing from one side of the full outer join get 0 for that count.

fre_movies_join_df2 = (
    fre_movies_1460
    .join(fre_movies_2920, on="USER_ID", how="outer")
    .fillna(0)
)
fre_movies_join_df2.show(n=5)

+---------+------------------------+------------------------+
|  USER_ID|pageview_movies_fre_1460|pageview_movies_fre_2920|
+---------+------------------------+------------------------+
| 88602347|                       1|                       0|
| 78741118|                       2|                       0|
| 30317072|                       1|                       0|
|197002823|                       1|                       0|
|115544304|                       1|                       0|
+---------+------------------------+------------------------+
only showing top 5 rows



In [22]:
# Final movies-frequency dataset: one row per user with all four window counts.
# Built by full-outer-joining the two intermediate movie frames
# (fre_movies_join_df1 and fre_movies_join_df2) and zero-filling the gaps.

fre_movies_df = (
    fre_movies_join_df1
    .join(fre_movies_join_df2, on="USER_ID", how="outer")
    .fillna(0)
)
fre_movies_df.show(n=5)

+---------+-----------------------+-----------------------+------------------------+------------------------+
|  USER_ID|pageview_movies_fre_365|pageview_movies_fre_730|pageview_movies_fre_1460|pageview_movies_fre_2920|
+---------+-----------------------+-----------------------+------------------------+------------------------+
| 88602347|                      0|                      0|                       1|                       0|
| 78741118|                      0|                      0|                       2|                       0|
| 30317072|                      0|                      0|                       1|                       0|
|197002823|                      0|                      0|                       1|                       0|
|115544304|                      0|                      0|                       1|                       0|
+---------+-----------------------+-----------------------+------------------------+------------------------+
only showing top 5 rows

# Final 'FREQUENCY' Dataset

In [23]:
# Final "FREQUENCY" dataset: news and movie window counts side by side,
# one row per user. Users with views of only one page type get 0 for the
# other type's counts.

fre_df = (
    fre_news_df
    .join(fre_movies_df, on="USER_ID", how="outer")
    .fillna(0)
)

# Pull a 5-row sample to pandas for rich notebook display
fre_df.limit(5).toPandas().head()

Unnamed: 0,USER_ID,pageview_news_fre_365,pageview_news_fre_730,pageview_news_fre_1460,pageview_news_fre_2920,pageview_movies_fre_365,pageview_movies_fre_730,pageview_movies_fre_1460,pageview_movies_fre_2920
0,98343030,0,0,3,0,0,0,0,0
1,115056661,0,0,1,0,0,0,0,0
2,59736608,0,0,2,0,0,0,0,0
3,88602347,0,0,0,0,0,0,1,0
4,162690170,0,0,1,0,0,0,0,0


# Recency

In [24]:
# pageview_news_rec: for each user, the smallest RecencyDays among their
# 'news' page views, i.e. how many days before the reference date they last
# viewed a news page.

pageview_news_rec = spark.sql(
    "select USER_ID,min(RecencyDays) as pageview_news_rec "
    "from records "
    "where WEBPAGE_TYPE = 'news' "
    "group by USER_ID"
)
pageview_news_rec.show(n=5)

+---------+-----------------+
|  USER_ID|pageview_news_rec|
+---------+-----------------+
| 98343030|              932|
| 59736608|              914|
|115056661|             1336|
|162690170|             1263|
|169469168|             1263|
+---------+-----------------+
only showing top 5 rows



In [25]:
# pageview_movies_rec: for each user, the smallest RecencyDays among their
# 'movies' page views, i.e. how many days before the reference date they last
# viewed a movies page.

pageview_movies_rec = spark.sql(
    "select USER_ID,min(RecencyDays) as pageview_movies_rec "
    "from records "
    "where WEBPAGE_TYPE = 'movies' "
    "group by USER_ID"
)
pageview_movies_rec.show(n=5)

+---------+-------------------+
|  USER_ID|pageview_movies_rec|
+---------+-------------------+
| 88602347|               1259|
| 78741118|               1349|
| 30317072|               1361|
|197002823|               1181|
|115544304|               1331|
+---------+-------------------+
only showing top 5 rows



In [26]:
# Final Recency dataset: news and movie recency side by side, one row per user.
# NOTE(review): a user with no views of a page type ends up with recency 0
# after the fill, which is indistinguishable from "viewed on the reference
# date". Confirm that 0 is the intended placeholder for "never viewed".

rec_df = (
    pageview_news_rec
    .join(pageview_movies_rec, on="USER_ID", how="outer")
    .fillna(0)
)

rec_df.show(n=5)

+---------+-----------------+-------------------+
|  USER_ID|pageview_news_rec|pageview_movies_rec|
+---------+-----------------+-------------------+
| 98343030|              932|                  0|
|115056661|             1336|                  0|
| 59736608|              914|                  0|
| 88602347|                0|               1259|
|162690170|             1263|                  0|
+---------+-----------------+-------------------+
only showing top 5 rows



# 'Recency' & 'Frequency' Dataset

In [27]:
# Combine the per-window frequency features with the per-type recency
# features: one row per user, with remaining numeric nulls replaced by 0.
final_df = (
    fre_df
    .join(rec_df, on="USER_ID", how="outer")
    .fillna(0)
)

# Pull a 5-row sample to pandas for rich notebook display
final_df.limit(5).toPandas().head()

Unnamed: 0,USER_ID,pageview_news_fre_365,pageview_news_fre_730,pageview_news_fre_1460,pageview_news_fre_2920,pageview_movies_fre_365,pageview_movies_fre_730,pageview_movies_fre_1460,pageview_movies_fre_2920,pageview_news_rec,pageview_movies_rec
0,98343030,0,0,3,0,0,0,0,0,932,0
1,115056661,0,0,1,0,0,0,0,0,1336,0
2,59736608,0,0,2,0,0,0,0,0,914,0
3,88602347,0,0,0,0,0,0,1,0,0,1259
4,162690170,0,0,1,0,0,0,0,0,1263,0


# Final DataSet

In [28]:
# Final dataset: append the overall per-user Recency / Frequency columns from
# rfm_table to the per-type features, zero-filling any numeric nulls.
main_df = (
    final_df
    .join(rfm_table, on="USER_ID", how="outer")
    .fillna(0)
)

# Pull a 5-row sample to pandas for rich notebook display
main_df.limit(5).toPandas().head()

Unnamed: 0,USER_ID,pageview_news_fre_365,pageview_news_fre_730,pageview_news_fre_1460,pageview_news_fre_2920,pageview_movies_fre_365,pageview_movies_fre_730,pageview_movies_fre_1460,pageview_movies_fre_2920,pageview_news_rec,pageview_movies_rec,Recency,Frequency
0,98343030,0,0,3,0,0,0,0,0,932,0,932,3
1,115056661,0,0,1,0,0,0,0,0,1336,0,1336,1
2,59736608,0,0,2,0,0,0,0,0,914,0,914,2
3,88602347,0,0,0,0,0,0,1,0,0,1259,1259,1
4,162690170,0,0,1,0,0,0,0,0,1263,0,1263,1
