In [None]:
# Importing the necessary libraries

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import max
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import *
from pyspark.sql.functions import broadcast
from pyspark.sql.types import StructType, StructField, StringType

In [None]:
# Build (or reuse) the SparkSession shared by every cell below.
# It runs against the "local" master under the application name "Case-Study_2".

spark = (
    SparkSession.builder
    .master("local")
    .appName("Case-Study_2")
    .getOrCreate()
)

In [None]:
# Load the two raw CSV inputs. Both files carry a header row; no schema is
# supplied or inferred, so Spark reads every column as a string.
# DATA_DIR is the single place to edit when the data lives somewhere else.

DATA_DIR = "C:\\BigData\\use-case-data-processing-main"

fact = spark.read.csv(DATA_DIR + "\\fact.csv", header=True)
lookup = spark.read.csv(DATA_DIR + "\\lookup.csv", header=True)

In [None]:
# Broadcast join: `lookup` is treated as the small dataset and is broadcast,
# while `fact` is the large one. The two are matched on WEB_PAGEID (inner join).
# DATE_OF_REFERENCE is the fixed reference date 12-10-2019 (dd-MM-yyyy) that
# recency is later measured against.

join_df = (
    fact.join(broadcast(lookup), "WEB_PAGEID", "inner")
    .select("USER_ID", "WEB_PAGEID", "VIEW_TIME", "WEBPAGE_TYPE")
    .withColumn("DATE_OF_REFERENCE", F.to_date(F.lit("12-10-2019"), "dd-MM-yyyy"))
)

# Preview the joined rows (untruncated)
join_df.show(5, False)

# Print the schema
join_df.printSchema()

In [None]:
# Convert VIEW_TIME from a 'dd/MM/yyyy HH:mm' string to a date, then compute the
# number of days between DATE_OF_REFERENCE (2019-10-12) and VIEW_TIME.
#
# The gap is published as RecencyDays: that is the column name the RFM aggregation
# and every SQL query on the "records" view read. Diff_In_Days is kept as an alias
# of the same value so anything still using the old name keeps working.

df = (
    join_df
    .withColumn(
        "VIEW_TIME",
        F.to_date(
            F.unix_timestamp(F.col("VIEW_TIME"), "dd/MM/yyyy HH:mm").cast("timestamp")
        ),
    )
    .withColumn("DATE_OF_REFERENCE", F.to_date(F.lit("12-10-2019"), "dd-MM-yyyy"))
    .withColumn("RecencyDays", F.datediff(F.col("DATE_OF_REFERENCE"), F.col("VIEW_TIME")))
    .withColumn("Diff_In_Days", F.col("RecencyDays"))
)

df.show(5, False)
df.printSchema()

In [None]:
# Per-user Recency / Frequency over all page types:
#   Recency   = smallest RecencyDays for the user
#   Frequency = number of WEB_PAGEID values (page views) for the user

rfm_table = (
    df.groupBy("USER_ID")
    .agg(
        F.min("RecencyDays").alias("Recency"),
        F.count("WEB_PAGEID").alias("Frequency"),
    )
)

rfm_table.show(5)

In [None]:
# Register `df` as the temp view "records" so the cells below can query it with
# Spark SQL (an existing view with the same name is replaced).

df.createOrReplaceTempView(name="records")

# Calculating 'Frequency'

# NEWS PAGE_TYPE

In [None]:
# Time window 365: news page views per user with RecencyDays < 365.
# RecencyDays is an integer day count, so it is compared with a numeric literal
# instead of a quoted string that Spark would have to cast implicitly.

fre_news_365 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_news_fre_365
    FROM records
    WHERE WEBPAGE_TYPE = 'news' AND RecencyDays < 365
    GROUP BY USER_ID
""")
fre_news_365.show(5)

In [None]:
# Time window 730: news page views per user with 365 <= RecencyDays <= 730
# (BETWEEN is inclusive at both ends). Bounds are numeric literals because
# RecencyDays is an integer day count.

fre_news_730 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_news_fre_730
    FROM records
    WHERE WEBPAGE_TYPE = 'news' AND RecencyDays BETWEEN 365 AND 730
    GROUP BY USER_ID
""")
fre_news_730.show(5)

In [None]:
# Combine the 365 and 730 news counts per user.
# The full outer join keeps users that appear in only one of the two windows;
# the count they are missing is filled with 0.

fre_news_join_df1 = (
    fre_news_365
    .join(fre_news_730, on="USER_ID", how="outer")
    .fillna(0)
)
fre_news_join_df1.show(5)

In [None]:
# Time window 1460: news page views per user with 730 < RecencyDays <= 1460.
# The lower bound is exclusive: the 730 window already includes day 730, so an
# inclusive bound here would count a view exactly 730 days old in both windows.

fre_news_1460 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_news_fre_1460
    FROM records
    WHERE WEBPAGE_TYPE = 'news' AND RecencyDays > 730 AND RecencyDays <= 1460
    GROUP BY USER_ID
""")
fre_news_1460.show(5)

In [None]:
# Time window 2920: news page views per user with 1460 < RecencyDays <= 2920.
# The lower bound is exclusive so a view exactly 1460 days old is not counted
# here as well as in the 1460 window.
fre_news_2920 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_news_fre_2920
    FROM records
    WHERE WEBPAGE_TYPE = 'news' AND RecencyDays > 1460 AND RecencyDays <= 2920
    GROUP BY USER_ID
""")
fre_news_2920.show(5)

In [None]:
# Combine the 1460 and 2920 news counts per user (full outer join, missing counts -> 0).

fre_news_join_df2 = (
    fre_news_1460
    .join(fre_news_2920, on="USER_ID", how="outer")
    .fillna(0)
)
fre_news_join_df2.show(5, truncate=False)

In [None]:
# Final NEWS frequency dataset:
# merge the (365, 730) pair with the (1460, 2920) pair so each user has one row
# holding all four news counts; counts missing after the outer join become 0.

fre_news_df = (
    fre_news_join_df1
    .join(fre_news_join_df2, on="USER_ID", how="outer")
    .fillna(0)
)

fre_news_df.show(5)

# MOVIES PAGE_TYPE

In [None]:
# Time window 365: movies page views per user with RecencyDays < 365.
# RecencyDays is an integer day count, so it is compared with a numeric literal
# instead of a quoted string that Spark would have to cast implicitly.

fre_movies_365 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_movies_fre_365
    FROM records
    WHERE WEBPAGE_TYPE = 'movies' AND RecencyDays < 365
    GROUP BY USER_ID
""")
fre_movies_365.show(5)

In [None]:
# Time window 730: movies page views per user with 365 <= RecencyDays <= 730
# (BETWEEN is inclusive at both ends). Bounds are numeric literals because
# RecencyDays is an integer day count.

fre_movies_730 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_movies_fre_730
    FROM records
    WHERE WEBPAGE_TYPE = 'movies' AND RecencyDays BETWEEN 365 AND 730
    GROUP BY USER_ID
""")
fre_movies_730.show(5)

In [None]:
# Combine the 365 and 730 movies counts per user.
# The full outer join keeps users that appear in only one of the two windows;
# the count they are missing is filled with 0.

fre_movies_join_df1 = (
    fre_movies_365
    .join(fre_movies_730, on="USER_ID", how="outer")
    .fillna(0)
)
fre_movies_join_df1.show(5)

In [None]:
# Time window 1460: movies page views per user with 730 < RecencyDays <= 1460.
# The lower bound is exclusive: the 730 window already includes day 730, so an
# inclusive bound here would count a view exactly 730 days old in both windows.

fre_movies_1460 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_movies_fre_1460
    FROM records
    WHERE WEBPAGE_TYPE = 'movies' AND RecencyDays > 730 AND RecencyDays <= 1460
    GROUP BY USER_ID
""")
fre_movies_1460.show(5)

In [None]:
# Time window 2920: movies page views per user with 1460 < RecencyDays <= 2920.
# The lower bound is exclusive so a view exactly 1460 days old is not counted
# here as well as in the 1460 window.

fre_movies_2920 = spark.sql("""
    SELECT USER_ID, COUNT(WEB_PAGEID) AS pageview_movies_fre_2920
    FROM records
    WHERE WEBPAGE_TYPE = 'movies' AND RecencyDays > 1460 AND RecencyDays <= 2920
    GROUP BY USER_ID
""")
fre_movies_2920.show(5)

In [None]:
# Combine the 1460 and 2920 movies counts per user (full outer join, missing counts -> 0).

fre_movies_join_df2 = (
    fre_movies_1460
    .join(fre_movies_2920, on="USER_ID", how="outer")
    .fillna(0)
)
fre_movies_join_df2.show(5)

In [None]:
# Final MOVIES frequency dataset:
# merge the (365, 730) pair with the (1460, 2920) pair so each user has one row
# holding all four movies counts; counts missing after the outer join become 0.

fre_movies_df = (
    fre_movies_join_df1
    .join(fre_movies_join_df2, on="USER_ID", how="outer")
    .fillna(0)
)
fre_movies_df.show(5)

# Final 'FREQUENCY' Dataset

In [None]:
# Final FREQUENCY dataset: news and movies counts side by side, one row per user.
# Users seen for only one page type keep their row; the other type's counts become 0.

fre_df = (
    fre_news_df
    .join(fre_movies_df, on="USER_ID", how="outer")
    .fillna(0)
)
fre_df.limit(5).toPandas().head()

# Recency

In [None]:
# Recency for news pages: the smallest RecencyDays per user among 'news' views.

pageview_news_rec = spark.sql(
    "select USER_ID,min(RecencyDays) as pageview_news_rec "
    "from records "
    "where WEBPAGE_TYPE = 'news' "
    "group by USER_ID"
)
pageview_news_rec.show(5)

In [None]:
# Recency for movies pages: the smallest RecencyDays per user among 'movies' views.

pageview_movies_rec = spark.sql(
    "select USER_ID,min(RecencyDays) as pageview_movies_rec "
    "from records "
    "where WEBPAGE_TYPE = 'movies' "
    "group by USER_ID"
)
pageview_movies_rec.show(5)

In [None]:
# Final Recency dataset: news and movies recency side by side, one row per user.
# NOTE(review): after the outer join, a user with no views of one page type gets
# recency 0 for it, which reads the same as a view on the reference date itself.
# Confirm that 0 is the intended placeholder.

rec_df = (
    pageview_news_rec
    .join(pageview_movies_rec, on="USER_ID", how="outer")
    .fillna(0)
)

rec_df.show(5)

# 'Recency' & 'Frequency' Dataset

In [None]:
# Put the per-page-type frequency and recency features on one row per user
# (full outer join on USER_ID; numeric gaps are filled with 0).
final_df = (
    fre_df
    .join(rec_df, on="USER_ID", how="outer")
    .fillna(0)
)
final_df.limit(5).toPandas().head()

# Final DataSet

In [None]:
# Attach the overall per-user Recency / Frequency from rfm_table to the
# per-page-type features (full outer join on USER_ID; numeric gaps are filled with 0).
main_df = (
    final_df
    .join(rfm_table, on="USER_ID", how="outer")
    .fillna(0)
)
main_df.limit(5).toPandas().head()