In [1]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import max
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import *
from pyspark.sql.functions import broadcast
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Creating the SparkSession

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Case-Study-Data-Processing") \
    .getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
# Read the Csv file

fact_data = spark.read.format("csv").option("header", "true").load("C:\\BigData\\use-case-data-processing-main\\fact.csv")
lookup_data = spark.read.format("csv").option("header", "true").load("C:\\BigData\\use-case-data-processing-main\\lookup.csv")

In [4]:
# Doing BroadCast Join As One Big Data Set and One Small DatSet is present

join_data = fact_data.join(broadcast(lookup_data), fact_data.WEB_PAGEID == lookup_data.WEB_PAGEID, "inner").drop(lookup_data.WEB_PAGEID)
join_data.show(5,False)

+---------+----------------+----------+------------+
|USER_ID  |VIEW_TIME       |WEB_PAGEID|WEBPAGE_TYPE|
+---------+----------------+----------+------------+
|149977241|10/02/2016 18:54|3740865072|news        |
|142413313|21/01/2016 16:10|3621915402|news        |
|142413313|21/01/2016 16:20|3621915402|news        |
|142413313|21/01/2016 16:00|3621915402|news        |
|112797679|12/04/2016 13:05|2276268225|news        |
+---------+----------------+----------+------------+
only showing top 5 rows



In [5]:
# Convering View_TIME format to 'DD/MM/YYYY' in Column(Data_S)

new_data = join_data.withColumn('Date_S',from_unixtime(unix_timestamp(join_data["VIEW_TIME"],'dd/MM/yyyy mm:ss'),'dd/MM/yyyy'))
new_data.show(5,False)
new_data.printSchema()

+---------+----------------+----------+------------+----------+
|USER_ID  |VIEW_TIME       |WEB_PAGEID|WEBPAGE_TYPE|Date_S    |
+---------+----------------+----------+------------+----------+
|149977241|10/02/2016 18:54|3740865072|news        |10/02/2016|
|142413313|21/01/2016 16:10|3621915402|news        |21/01/2016|
|142413313|21/01/2016 16:20|3621915402|news        |21/01/2016|
|142413313|21/01/2016 16:00|3621915402|news        |21/01/2016|
|112797679|12/04/2016 13:05|2276268225|news        |12/04/2016|
+---------+----------------+----------+------------+----------+
only showing top 5 rows

root
 |-- USER_ID: string (nullable = true)
 |-- VIEW_TIME: string (nullable = true)
 |-- WEB_PAGEID: string (nullable = true)
 |-- WEBPAGE_TYPE: string (nullable = true)
 |-- Date_S: string (nullable = true)



In [6]:
# Convering Data_S from String datatype to Date format in Date field.

data = new_data.withColumn('Date_S',from_unixtime(unix_timestamp(new_data["VIEW_TIME"],'dd/MM/yyyy mm:ss'),'dd/MM/yyyy')) \
.select(col("USER_ID"),("VIEW_TIME"),("WEB_PAGEID"),("WEBPAGE_TYPE"),to_date(col("Date_S"),'dd/MM/yyyy').alias("Date")) 

data.show(5,False)
data.printSchema()

+---------+----------------+----------+------------+----------+
|USER_ID  |VIEW_TIME       |WEB_PAGEID|WEBPAGE_TYPE|Date      |
+---------+----------------+----------+------------+----------+
|149977241|10/02/2016 18:54|3740865072|news        |2016-02-10|
|142413313|21/01/2016 16:10|3621915402|news        |2016-01-21|
|142413313|21/01/2016 16:20|3621915402|news        |2016-01-21|
|142413313|21/01/2016 16:00|3621915402|news        |2016-01-21|
|112797679|12/04/2016 13:05|2276268225|news        |2016-04-12|
+---------+----------------+----------+------------+----------+
only showing top 5 rows

root
 |-- USER_ID: string (nullable = true)
 |-- VIEW_TIME: string (nullable = true)
 |-- WEB_PAGEID: string (nullable = true)
 |-- WEBPAGE_TYPE: string (nullable = true)
 |-- Date: date (nullable = true)



In [7]:
# Calculate difference in days between 2011-12-31 and the Invoice Date

data = data.withColumn("RecencyDays", expr("datediff('2019-10-12', Date)"))

data.show(5,False)

# data.select('USER_ID').distinct().count()

+---------+----------------+----------+------------+----------+-----------+
|USER_ID  |VIEW_TIME       |WEB_PAGEID|WEBPAGE_TYPE|Date      |RecencyDays|
+---------+----------------+----------+------------+----------+-----------+
|149977241|10/02/2016 18:54|3740865072|news        |2016-02-10|1340       |
|142413313|21/01/2016 16:10|3621915402|news        |2016-01-21|1360       |
|142413313|21/01/2016 16:20|3621915402|news        |2016-01-21|1360       |
|142413313|21/01/2016 16:00|3621915402|news        |2016-01-21|1360       |
|112797679|12/04/2016 13:05|2276268225|news        |2016-04-12|1278       |
+---------+----------------+----------+------------+----------+-----------+
only showing top 5 rows



In [8]:
rfm_table = data.groupBy("USER_ID")\
                        .agg(min("RecencyDays").alias("Recency"), \
                             count("WEB_PAGEID").alias("Frequency"))

rfm_table.show()

+----------+-------+---------+
|   USER_ID|Recency|Frequency|
+----------+-------+---------+
|  98343030|    932|        3|
|  59736608|    914|        2|
| 115056661|   1336|        1|
|  88602347|   1259|        1|
| 162690170|   1263|        1|
| 169469168|   1263|        1|
| 167725829|   1280|        1|
| 264289612|   1009|        2|
| 191712149|   1263|        1|
|-247361411|    705|        3|
|  66612057|   1263|        1|
|    788539|   1170|        1|
|   8856308|   1282|        1|
| 171211230|   1262|        1|
| 159147772|   1308|        2|
| 161303572|   1263|        1|
|  78741118|   1349|        2|
|  55620801|   1313|        1|
| 159348322|   1262|        1|
|  96024291|   1361|        1|
+----------+-------+---------+
only showing top 20 rows



In [9]:
# Create a TempView:

data.createOrReplaceTempView("records")

# NEWS PAGE_TYPE

In [10]:
# pageview_news_rec_365

pageview_news_rec_365 = spark.sql("select count(WEB_PAGEID) as pageview_news_rec_365 from records where WEBPAGE_TYPE = 'news' and RecencyDays < '365'")
pageview_news_rec_365.show()

+---------------------+
|pageview_news_rec_365|
+---------------------+
|                   65|
+---------------------+



In [11]:
# pageview_news_rec_365

news_365 = spark.sql("select USER_ID,count(*) as pageview_news_rec_365 from records where WEBPAGE_TYPE = 'news' and RecencyDays < '365' group by USER_ID")
news_365.show(11)
news_365.count()

+----------+---------------------+
|   USER_ID|pageview_news_rec_365|
+----------+---------------------+
| -69271739|                    1|
|-133621877|                    1|
| 157684212|                    1|
|-285299518|                    1|
|-277209141|                    3|
|-151394098|                    3|
| 250009214|                    1|
| -73385561|                    1|
|  78585170|                    1|
|-163187677|                    3|
|-167528374|                   49|
+----------+---------------------+



11

In [12]:
# pageview_news_rec_730

pageview_news_rec_365 = spark.sql("select count(WEB_PAGEID) as pageview_news_rec_730 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '365' and '730'")
pageview_news_rec_365.show()

+---------------------+
|pageview_news_rec_730|
+---------------------+
|                   35|
+---------------------+



In [13]:
news_730 = spark.sql("select USER_ID,count(*) as pageview_news_rec_730 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '365' and '730' group by USER_ID")
news_730.show(5)
news_730.count()

+----------+---------------------+
|   USER_ID|pageview_news_rec_730|
+----------+---------------------+
|-247361411|                    3|
|-231838278|                    1|
|-252290668|                    2|
| 155111993|                    1|
|  24160260|                    2|
+----------+---------------------+
only showing top 5 rows



24

In [14]:
# Join news_365 vs news_730
join_df1 = news_730.join(news_365, 'USER_ID', 'outer').na.fill(0)
join_df1.show(5,truncate=False)
join_df1.count()

+----------+---------------------+---------------------+
|USER_ID   |pageview_news_rec_730|pageview_news_rec_365|
+----------+---------------------+---------------------+
|-247361411|3                    |0                    |
|-231838278|1                    |0                    |
|-252290668|2                    |0                    |
|-69271739 |0                    |1                    |
|-133621877|0                    |1                    |
+----------+---------------------+---------------------+
only showing top 5 rows



35

In [15]:
# pageview_news_rec_1460

pageview_news_rec_1460 = spark.sql("select count(WEB_PAGEID) as pageview_news_rec_1460 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '730' and '1460'")
pageview_news_rec_1460.show()

+----------------------+
|pageview_news_rec_1460|
+----------------------+
|                   874|
+----------------------+



In [16]:
news_1460 = spark.sql("select USER_ID,count(*) as pageview_news_rec_1460 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '730' and '1460' group by USER_ID")
news_1460.show(5)
news_1460.count()

+---------+----------------------+
|  USER_ID|pageview_news_rec_1460|
+---------+----------------------+
| 98343030|                     3|
| 59736608|                     2|
|115056661|                     1|
|162690170|                     1|
|169469168|                     1|
+---------+----------------------+
only showing top 5 rows



746

In [17]:
# pageview_news_rec_2920

pageview_news_rec_2920 = spark.sql("select count(WEB_PAGEID) as pageview_news_rec_2920 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '1460' and '2920'")
pageview_news_rec_2920.show()

+----------------------+
|pageview_news_rec_2920|
+----------------------+
|                     0|
+----------------------+



In [18]:
news_2920 = spark.sql("select USER_ID,count(*) as pageview_news_rec_2920 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '1460' and '2920' group by USER_ID")
news_2920.show(5)
news_2920.count()

+-------+----------------------+
|USER_ID|pageview_news_rec_2920|
+-------+----------------------+
+-------+----------------------+



0

In [19]:
# Join news_1460 vs news_2920

join_df2 = news_2920.join(news_1460, 'USER_ID', 'outer').na.fill(0)
join_df2.show(5,truncate=False)
join_df2.count()

+---------+----------------------+----------------------+
|USER_ID  |pageview_news_rec_2920|pageview_news_rec_1460|
+---------+----------------------+----------------------+
|98343030 |0                     |3                     |
|115056661|0                     |1                     |
|59736608 |0                     |2                     |
|162690170|0                     |1                     |
|167725829|0                     |1                     |
+---------+----------------------+----------------------+
only showing top 5 rows



746

In [20]:
# Join join_df1 vs join_df2

news_df = join_df1.join(join_df2, 'USER_ID', 'outer').drop(join_df2.USER_ID).na.fill(0) \
.select("USER_ID","pageview_news_rec_365","pageview_news_rec_730","pageview_news_rec_1460","pageview_news_rec_2920")

news_df.show(5)

+---------+---------------------+---------------------+----------------------+----------------------+
|  USER_ID|pageview_news_rec_365|pageview_news_rec_730|pageview_news_rec_1460|pageview_news_rec_2920|
+---------+---------------------+---------------------+----------------------+----------------------+
| 98343030|                    0|                    0|                     3|                     0|
|115056661|                    0|                    0|                     1|                     0|
| 59736608|                    0|                    0|                     2|                     0|
|162690170|                    0|                    0|                     1|                     0|
|167725829|                    0|                    0|                     1|                     0|
+---------+---------------------+---------------------+----------------------+----------------------+
only showing top 5 rows



# MOVIES PAGE_TYPE

In [21]:
# pageview_movies_rec_365

# data.createOrReplaceTempView("records")

pageview_movies_rec_365 = spark.sql("select count(WEB_PAGEID) as pageview_movies_rec_365 from records where WEBPAGE_TYPE = 'movies' and RecencyDays < '365'")
pageview_movies_rec_365.show()

+-----------------------+
|pageview_movies_rec_365|
+-----------------------+
|                      0|
+-----------------------+



In [22]:
# pageview_movies_rec_365

movies_365 = spark.sql("select USER_ID,count(*) as pageview_movies_rec_365 from records where WEBPAGE_TYPE = 'movies' and RecencyDays < '365' group by USER_ID")
movies_365.show(11)
movies_365.count()

+-------+-----------------------+
|USER_ID|pageview_movies_rec_365|
+-------+-----------------------+
+-------+-----------------------+



0

In [23]:
# pageview_news_rec_730

pageview_movies_rec_730 = spark.sql("select count(WEB_PAGEID) as pageview_movies_rec_730 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '365' and '730'")
pageview_movies_rec_730.show()

+-----------------------+
|pageview_movies_rec_730|
+-----------------------+
|                      0|
+-----------------------+



In [24]:
movies_730 = spark.sql("select USER_ID,count(*) as pageview_movies_rec_730 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '365' and '730' group by USER_ID")
movies_730.show(5)
movies_730.count()

+-------+-----------------------+
|USER_ID|pageview_movies_rec_730|
+-------+-----------------------+
+-------+-----------------------+



0

In [25]:
# Join movies_365 vs movies_730

join_df3 = movies_730.join(movies_365, 'USER_ID', 'outer').na.fill(0)
join_df3.show(5,truncate=False)
join_df3.count()

+-------+-----------------------+-----------------------+
|USER_ID|pageview_movies_rec_730|pageview_movies_rec_365|
+-------+-----------------------+-----------------------+
+-------+-----------------------+-----------------------+



0

In [26]:
# pageview_movies_rec_1460

pageview_movies_rec_1460 = spark.sql("select count(WEB_PAGEID) as pageview_movies_rec_1460 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '730' and '1460'")
pageview_movies_rec_1460.show()

+------------------------+
|pageview_movies_rec_1460|
+------------------------+
|                      26|
+------------------------+



In [27]:
movies_1460 = spark.sql("select USER_ID,count(*) as pageview_movies_rec_1460 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '730' and '1460' group by USER_ID")
movies_1460.show(5)
movies_1460.count()

+---------+------------------------+
|  USER_ID|pageview_movies_rec_1460|
+---------+------------------------+
| 88602347|                       1|
| 78741118|                       2|
| 30317072|                       1|
|197002823|                       1|
|115544304|                       1|
+---------+------------------------+
only showing top 5 rows



18

In [28]:
# pageview_movies_rec_2920

pageview_movies_rec_2920 = spark.sql("select count(WEB_PAGEID) as pageview_movies_rec_2920 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '1460' and '2920'")
pageview_movies_rec_2920.show()

+------------------------+
|pageview_movies_rec_2920|
+------------------------+
|                       0|
+------------------------+



In [29]:
movies_2920 = spark.sql("select USER_ID,count(*) as pageview_movies_rec_2920 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '1460' and '2920' group by USER_ID")
movies_2920.show(5)
movies_2920.count()

+-------+------------------------+
|USER_ID|pageview_movies_rec_2920|
+-------+------------------------+
+-------+------------------------+



0

In [30]:
# Join movies_1460 vs movies_2920

join_df4 = movies_2920.join(movies_1460, 'USER_ID', "outer").na.fill(0)
join_df4.show(5,truncate=False)
join_df4.count()

+---------+------------------------+------------------------+
|USER_ID  |pageview_movies_rec_2920|pageview_movies_rec_1460|
+---------+------------------------+------------------------+
|88602347 |0                       |1                       |
|78741118 |0                       |2                       |
|30317072 |0                       |1                       |
|197002823|0                       |1                       |
|115544304|0                       |1                       |
+---------+------------------------+------------------------+
only showing top 5 rows



18

In [31]:
# Join join_df3 vs join_df4

movie_df = join_df3.join(join_df4, 'USER_ID', 'outer').na.fill(0) \
.select("USER_ID","pageview_movies_rec_365","pageview_movies_rec_730","pageview_movies_rec_1460","pageview_movies_rec_2920")

movie_df.show(5)

+---------+-----------------------+-----------------------+------------------------+------------------------+
|  USER_ID|pageview_movies_rec_365|pageview_movies_rec_730|pageview_movies_rec_1460|pageview_movies_rec_2920|
+---------+-----------------------+-----------------------+------------------------+------------------------+
| 88602347|                      0|                      0|                       1|                       0|
| 78741118|                      0|                      0|                       2|                       0|
| 30317072|                      0|                      0|                       1|                       0|
|197002823|                      0|                      0|                       1|                       0|
|115544304|                      0|                      0|                       1|                       0|
+---------+-----------------------+-----------------------+------------------------+------------------------+
only showi

# Final 'Dur' DataSet

In [32]:
# Joining Movie_df vs News_df

result_df = news_df.join(movie_df, 'USER_ID', 'outer').na.fill(0) \
# select("USER_ID","pageview_news_rec_365","pageview_news_rec_730","pageview_news_rec_1460","pageview_news_rec_2920","pageview_movies_rec_365","pageview_movies_rec_730","pageview_movies_rec_1460","pageview_movies_rec_2920")

result_df.limit(5).toPandas().head()

Unnamed: 0,USER_ID,pageview_news_rec_365,pageview_news_rec_730,pageview_news_rec_1460,pageview_news_rec_2920,pageview_movies_rec_365,pageview_movies_rec_730,pageview_movies_rec_1460,pageview_movies_rec_2920
0,98343030,0,0,3,0,0,0,0,0
1,115056661,0,0,1,0,0,0,0,0
2,59736608,0,0,2,0,0,0,0,0
3,88602347,0,0,0,0,0,0,1,0
4,162690170,0,0,1,0,0,0,0,0


# Calculating 'Fre'

# NEWS PAGE_TYPE

In [59]:
fre_news_365 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_news_fre_365 from records where WEBPAGE_TYPE = 'news' and RecencyDays < '365' group by USER_ID")
fre_news_365.show(5)
fre_news_365.count()

+----------+---------------------+
|   USER_ID|pageview_news_fre_365|
+----------+---------------------+
| -69271739|                    1|
|-133621877|                    1|
| 157684212|                    1|
|-285299518|                    1|
|-277209141|                    3|
+----------+---------------------+
only showing top 5 rows



11

In [48]:
fre_news_730 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_news_fre_730 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '365' and '730' group by USER_ID")
fre_news_730.show(5)
fre_news_730.count()

+----------+---------------------+
|   USER_ID|pageview_news_fre_730|
+----------+---------------------+
|-247361411|                    3|
|-231838278|                    1|
|-252290668|                    2|
| 155111993|                    1|
|  24160260|                    2|
+----------+---------------------+
only showing top 5 rows



24

In [69]:
# Join news_365 vs news_730

fre_join_df1 = fre_news_365.join(fre_news_730, 'USER_ID', 'outer').na.fill(0)
fre_join_df1.show(5)
fre_join_df1.count()

+----------+---------------------+---------------------+
|USER_ID   |pageview_news_fre_365|pageview_news_fre_730|
+----------+---------------------+---------------------+
|-247361411|0                    |3                    |
|-231838278|0                    |1                    |
|-252290668|0                    |2                    |
|-69271739 |1                    |0                    |
|-133621877|1                    |0                    |
+----------+---------------------+---------------------+
only showing top 5 rows



35

In [65]:
fre_news_1460 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_news_fre_1460 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '730' and '1460' group by USER_ID")
fre_news_1460.show(5)
fre_news_1460.count()

+---------+----------------------+
|  USER_ID|pageview_news_fre_1460|
+---------+----------------------+
| 98343030|                     3|
| 59736608|                     2|
|115056661|                     1|
|162690170|                     1|
|169469168|                     1|
+---------+----------------------+
only showing top 5 rows



746

In [66]:
fre_news_2920 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_news_fre_2920 from records where WEBPAGE_TYPE = 'news' and RecencyDays between '1460' and '2920' group by USER_ID")
fre_news_2920.show(5)
fre_news_2920.count()

+-------+----------------------+
|USER_ID|pageview_news_fre_2920|
+-------+----------------------+
+-------+----------------------+



0

In [67]:
# Join fre_news_1460 vs fre_news_2920

fre_join_df2 = fre_news_1460.join(fre_news_2920, 'USER_ID', 'outer').na.fill(0)
fre_join_df2.show(5,truncate=False)
fre_join_df2.count()

+---------+----------------------+----------------------+
|USER_ID  |pageview_news_fre_1460|pageview_news_fre_2920|
+---------+----------------------+----------------------+
|98343030 |3                     |0                     |
|115056661|1                     |0                     |
|59736608 |2                     |0                     |
|162690170|1                     |0                     |
|167725829|1                     |0                     |
+---------+----------------------+----------------------+
only showing top 5 rows



746

In [70]:
# Join fre_join_df1 vs fre_join_df2

fre_news_df = fre_join_df1.join(fre_join_df2, 'USER_ID', 'outer').drop(join_df2.USER_ID).na.fill(0) 

fre_news_df.show(5)

+---------+---------------------+---------------------+----------------------+----------------------+
|  USER_ID|pageview_news_fre_365|pageview_news_fre_730|pageview_news_fre_1460|pageview_news_fre_2920|
+---------+---------------------+---------------------+----------------------+----------------------+
| 98343030|                    0|                    0|                     3|                     0|
|115056661|                    0|                    0|                     1|                     0|
| 59736608|                    0|                    0|                     2|                     0|
|162690170|                    0|                    0|                     1|                     0|
|167725829|                    0|                    0|                     1|                     0|
+---------+---------------------+---------------------+----------------------+----------------------+
only showing top 5 rows



# Movie PAGE_TYPE

In [72]:
fre_movies_365 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_movies_fre_365 from records where WEBPAGE_TYPE = 'movies' and RecencyDays < '365' group by USER_ID")
fre_movies_365.show(5)
fre_movies_365.count()

+-------+-----------------------+
|USER_ID|pageview_movies_fre_365|
+-------+-----------------------+
+-------+-----------------------+



0

In [74]:
fre_movies_730 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_movies_fre_365 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '365' and '730' group by USER_ID")
fre_movies_730.show(5)
fre_movies_730.count()

+-------+-----------------------+
|USER_ID|pageview_movies_fre_365|
+-------+-----------------------+
+-------+-----------------------+



0

In [75]:
fre_movies_1460 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_movies_fre_365 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '730' and '1460' group by USER_ID")
fre_movies_1460.show(5)
fre_movies_1460.count()

+---------+-----------------------+
|  USER_ID|pageview_movies_fre_365|
+---------+-----------------------+
| 88602347|                      1|
| 78741118|                      2|
| 30317072|                      1|
|197002823|                      1|
|115544304|                      1|
+---------+-----------------------+
only showing top 5 rows



18

In [76]:
fre_movies_2920 = spark.sql("select USER_ID,count(WEB_PAGEID) as pageview_movies_fre_365 from records where WEBPAGE_TYPE = 'movies' and RecencyDays between '1460' and '2920' group by USER_ID")
fre_movies_2920.show(5)
fre_movies_2920.count()

+-------+-----------------------+
|USER_ID|pageview_movies_fre_365|
+-------+-----------------------+
+-------+-----------------------+



0

In [None]:
# Join movies_1460 vs movies_2920

join_df4 = movies_2920.join(movies_1460, 'USER_ID', "outer").na.fill(0)
join_df4.show(5,truncate=False)
join_df4.count()

# Final DataSet

In [33]:
final_df = rfm_table.join(result_df, 'USER_ID', 'outer').na.fill(0)
final_df.limit(5).toPandas().head()

Unnamed: 0,USER_ID,Recency,Frequency,pageview_news_rec_365,pageview_news_rec_730,pageview_news_rec_1460,pageview_news_rec_2920,pageview_movies_rec_365,pageview_movies_rec_730,pageview_movies_rec_1460,pageview_movies_rec_2920
0,98343030,932,3,0,0,3,0,0,0,0,0
1,115056661,1336,1,0,0,1,0,0,0,0,0
2,59736608,914,2,0,0,2,0,0,0,0,0
3,88602347,1259,1,0,0,0,0,0,0,1,0
4,162690170,1263,1,0,0,1,0,0,0,0,0
