In [1]:
# Importing the necessary libraries

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import max
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import *
from pyspark.sql.functions import broadcast
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import sum, desc, max, min

In [2]:
# Build the SparkSession used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .master("local")            # run Spark locally, inside this process
    .appName("Case-Study_2")
    .getOrCreate()
)

In [None]:
# Load the two CSV inputs. The first line of each file is treated as the
# header; no schema is inferred, so every column is read as a string.
# NOTE(review): these are absolute, machine-specific paths - consider moving
# them to a configurable data directory.
fact_path = "C:\\BigData\\use-case-data-processing-main\\fact.csv"
lookup_path = "C:\\BigData\\use-case-data-processing-main\\lookup.csv"

fact = spark.read.option("header", "true").csv(fact_path)
lookup = spark.read.option("header", "true").csv(lookup_path)

In [None]:
# Attach the page type to every page view and compute how many days before
# the reference date each view happened.

# Fixed reference date that all view ages are measured against.
reference_date = F.to_date(F.lit('12-10-2019'), 'dd-MM-yyyy')

# VIEW_TIME is read as a string; parse it and keep only the calendar date.
view_date = F.to_date(
    F.unix_timestamp(F.col('VIEW_TIME'), 'dd/MM/yyyy HH:mm').cast("timestamp")
)

# Broadcast hint on the lookup table so the join can avoid shuffling `fact`.
views_with_type = fact.join(F.broadcast(lookup), 'WEB_PAGEID', 'inner')

df = (
    views_with_type
    .select(fact['USER_ID'], fact['VIEW_TIME'], fact['WEB_PAGEID'], lookup['WEBPAGE_TYPE'])
    .withColumn("DATE_OF_REFERENCE", reference_date)
    .withColumn('VIEW_TIME', view_date)
    # diff = DATE_OF_REFERENCE - VIEW_TIME, in whole days.
    .withColumn("diff", F.datediff(F.col("DATE_OF_REFERENCE"), F.col("VIEW_TIME")))
)

df.show(5)
df.printSchema()

In [None]:
# Add one 0/1 indicator column per (page type, time window) pair, named
# pageview_<type>_fre_<upper bound>. A row gets 1 when its WEBPAGE_TYPE
# matches (case-insensitively) and its age in days (`diff`) falls in the window.
#
# Windows are half-open (lower, upper], so a view exactly 365, 730 or 1460
# days old lands in exactly one window. The previous inclusive `between`
# counted such a view in two adjacent windows.
# NOTE(review): the first window has no lower bound, so views dated after the
# reference date (negative diff) are counted in *_fre_365 - confirm intended.
page_types = ("news", "movies")
day_windows = ((None, 365), (365, 730), (730, 1460), (1460, 2920))

webpage_type = F.upper(F.col("WEBPAGE_TYPE"))
days_ago = F.col("diff")  # integer produced by datediff; compare with ints, not strings

for page_type in page_types:
    for lower, upper in day_windows:
        in_window = days_ago <= upper
        if lower is not None:
            in_window = in_window & (days_ago > lower)
        # withColumn keeps the cell idempotent: re-running replaces the column.
        df = df.withColumn(
            "pageview_{}_fre_{}".format(page_type, upper),
            # A null diff makes the condition null, which falls through to 0.
            F.when((webpage_type == page_type.upper()) & in_window, 1).otherwise(0),
        )

# Preview a few rows; limit() keeps the driver-side transfer small.
df.limit(4).toPandas()

In [None]:
# Per-user aggregation of the page-view indicators.
#
# The aggregate functions are referenced through the `F` namespace so the cell
# does not depend on `from pyspark.sql.functions import *` having shadowed the
# Python builtins `sum` / `max`.
frequency_columns = [
    "pageview_{}_fre_{}".format(page_type, window)
    for page_type in ("news", "movies")
    for window in (365, 730, 1460, 2920)
]

# Summing a 0/1 indicator gives the number of matching views per user.
exprs = [F.sum(name).alias(name) for name in frequency_columns]

# Recency = days since the user's MOST RECENT view, i.e. the smallest diff.
# (max(diff) would report the age of the oldest view instead.)
# NOTE(review): views dated after the reference date give a negative diff and
# would win here - confirm whether they should be filtered out upstream.
exprs.append(F.min('diff').alias('recency_view_time'))

# Number of views per user with a parseable VIEW_TIME (count skips nulls).
exprs.append(F.count('diff').alias('frequency_view_time'))

# limit() before toPandas() so only the previewed rows reach the driver.
df.groupBy("USER_ID").agg(*exprs).limit(4).toPandas()