In [1]:
import findspark

# Initialise findspark before the pyspark imports in the next cell are executed.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import numpy as np
from pyspark.sql.window import Window

In [3]:
# Notebook-wide Spark session (get-or-create); used below by read_data() and
# by the CSV read of the mapped data.
spark = SparkSession.builder.getOrCreate()

In [4]:
def read_data(base_dir='D:\\Dev\\DE\\Dataset\\log_search', num_days=14):
    """Load the June and July 2022 search logs as two Spark DataFrames.

    One parquet glob is built per day (days 1..num_days of each month), each
    pointing at the ``2022MMDD`` folder under ``base_dir``. All globs of a
    month are read together into a single DataFrame.

    Args:
        base_dir: Directory that holds the per-day ``2022MMDD`` folders.
            Defaults to the location originally hard-coded in this notebook.
        num_days: How many days to load, counted from the 1st of each month
            (default 14, i.e. the 1st through the 14th).

    Returns:
        tuple: ``(june_df, july_df)`` DataFrames, one per month.
    """
    def daily_globs(month):
        # {:02d} zero-pads single-digit months/days (e.g. 20220601), which
        # replaces the separate "< 10" branch that hand-wrote the leading zero.
        return [
            '{}\\2022{:02d}{:02d}\\*.parquet'.format(base_dir, month, day)
            for day in range(1, num_days + 1)
        ]

    june_df = spark.read.parquet(*daily_globs(6))
    july_df = spark.read.parquet(*daily_globs(7))

    return june_df, july_df

In [5]:
def _top_keyword_per_user(df, keyword_alias):
    """Return one row per user with that user's most-searched keyword as `keyword_alias`."""
    # Keep only the two relevant columns and drop rows missing either value
    searches = df.select('user_id', 'keyword').filter(
        col('user_id').isNotNull() & col('keyword').isNotNull()
    )

    # Count how many times each user searched each keyword (Times column)
    keyword_counts = searches.groupBy('user_id', 'keyword').agg(count('*').alias('Times'))

    # row_number() (instead of rank()) guarantees exactly one winner per user:
    # rank() gives every tied keyword rank 1, which duplicated users and then
    # multiplied rows in the month-to-month join. Ties on Times are broken by
    # keyword in ascending order so the choice is deterministic.
    by_popularity = Window.partitionBy('user_id').orderBy(
        col('Times').desc(), col('keyword').asc()
    )

    return (
        keyword_counts
        .withColumn('row_num', row_number().over(by_popularity))
        .filter(col('row_num') == 1)
        .select('user_id', col('keyword').alias(keyword_alias))
    )


def handle_most_search(june_df, july_df):
    """Find each user's most-searched keyword in June and in July.

    Args:
        june_df: June search log; must contain ``user_id`` and ``keyword`` columns.
        july_df: July search log; must contain ``user_id`` and ``keyword`` columns.

    Returns:
        DataFrame with columns ``user_id``, ``most_search_june`` and
        ``most_search_july``, one row per user. Only users that have searches
        in both months are kept (inner join on ``user_id``).
    """
    june_top = _top_keyword_per_user(june_df, 'most_search_june')
    july_top = _top_keyword_per_user(july_df, 'most_search_july')

    # Join the two months; users missing from either month are dropped
    result = june_top.join(july_top, 'user_id', 'inner')

    return result

**Main:**

In [6]:
# Load both months of raw logs, then reduce them to the top-ranked keyword(s)
# per user for each month, joined on user_id.
june_df, july_df = read_data()
df = handle_most_search(june_df, july_df)

In [39]:
# Save csv (disabled): one-off export of the joined result as a single CSV part file.
# NOTE(review): the next cell reads CSV files back from this same folder, and its
# output includes category_june/category_july columns that handle_most_search does
# not produce -- presumably the category mapping was done outside this notebook;
# confirm and document that step.

# df.repartition(1).write.options(header='True', encoding='UTF-8').csv("D:\\Sourcecode\\test\\", header=True)

In [7]:
# Read mapped data: CSV files with a header row, capped at 1000 rows.
# Note: this rebinds `df`, replacing the DataFrame returned by handle_most_search
# in the earlier cell.
df = spark.read.options(header='True').csv("D:\\Sourcecode\\test\\*.csv").limit(1000)

In [8]:
# Preview the mapped data that was just loaded.
df.show()

+-------+--------------------+---------------+--------------------+--------------+
|user_id|    most_search_june|  category_june|    most_search_july| category_july|
+-------+--------------------+---------------+--------------------+--------------+
|   1396|yêu trong đau thu...| Romantic movie|phim tươi cười ph...|        Comedy|
|   2032|    chú vẹt đuôi dài|Animation movie|      THIEN THAN BON|         Other|
|   2384|       ngoc lau xuan|        C-Drama|    vẻ đẹp đích thực|Romantic movie|
|   2816|định mệnh anh yêu em| Romantic movie|     vân tịch truyện|       C-Drama|
|   3361|         tây hành kỷ|   CN Animation|         lộc đỉnh ký|       C-Drama|
|   3361|           vuot nguc|   Action movie|         lộc đỉnh ký|       C-Drama|
|   3371|                 vtv|             TV|hoang anh gia lai...|         Sport|
|   3691| thế giới khủng long|   Action movie|tiếng gọi nơi hoa...|  Action movie|
|   3926|người yêu siêu cấ...| Romantic movie|học viện anh hùng...|         Anime|
|   

In [9]:
def handle_trending_type(df):
    """Add a ``trending_type`` column comparing the June and July categories.

    The new column is "Unchanged" when ``category_june`` equals
    ``category_july`` and "Changed" otherwise.

    Args:
        df: DataFrame with ``category_june`` and ``category_july`` columns.

    Returns:
        The input DataFrame with the extra ``trending_type`` column.
    """
    same_category = df['category_june'] == df['category_july']
    trend_label = when(same_category, "Unchanged").otherwise("Changed")
    return df.withColumn('trending_type', trend_label)

In [10]:
# Add the trending_type column ("Changed" / "Unchanged") to the mapped data.
df = handle_trending_type(df)

In [11]:
def handle_previous(df):
    """Add a ``previous`` column describing the category transition.

    Rows whose ``trending_type`` is "Unchanged" get the literal "Unchanged".
    Every other row gets the June and July categories joined with "; ".

    Args:
        df: DataFrame with ``trending_type``, ``category_june`` and
            ``category_july`` columns.

    Returns:
        The input DataFrame with the extra ``previous`` column.
    """
    category_pair = concat_ws('; ', 'category_june', 'category_july')
    is_unchanged = df['trending_type'] == "Unchanged"
    previous_value = when(is_unchanged, "Unchanged").otherwise(category_pair)
    return df.withColumn('previous', previous_value)

In [12]:
# Add the previous column (needs trending_type, added in the cell above).
df = handle_previous(df)

In [13]:
# Preview the enriched result without truncating long cell values.
df.show(truncate=False)

+-------+--------------------------------+---------------+----------------------------+--------------+-------------+-----------------------+
|user_id|most_search_june                |category_june  |most_search_july            |category_july |trending_type|previous               |
+-------+--------------------------------+---------------+----------------------------+--------------+-------------+-----------------------+
|1396   |yêu trong đau thuong thuýet minh|Romantic movie |phim tươi cười phấn khởi    |Comedy        |Changed      |Romantic movie; Comedy |
|2032   |chú vẹt đuôi dài                |Animation movie|THIEN THAN BON              |Other         |Changed      |Animation movie; Other |
|2384   |ngoc lau xuan                   |C-Drama        |vẻ đẹp đích thực            |Romantic movie|Changed      |C-Drama; Romantic movie|
|2816   |định mệnh anh yêu em            |Romantic movie |vân tịch truyện             |C-Drama       |Changed      |Romantic movie; C-Drama|
|3361   |tây 

In [14]:
# Row count check: cannot exceed 1000 because of the limit(1000) applied when
# the mapped data was loaded.
df.count()

1000

In [15]:
# Save data as a single CSV part file (repartition(1)) with a header row.
# mode('overwrite') keeps the notebook re-runnable: with the default save mode
# the write fails once ./Final_Data exists from a previous run.
df.repartition(1).write.mode('overwrite').csv('./Final_Data', header=True)