In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Build the notebook's Spark session (or reuse one that is already active),
# requesting 8g of memory for the driver and for each executor.
spark = (
    SparkSession.builder
    .appName("PatentKeyPlayersAnalytics")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)


24/12/01 18:26:32 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.33 instead (on interface en0)
24/12/01 18:26:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 18:26:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/01 18:26:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/01 18:26:32 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/12/01 18:26:32 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/12/01 18:26:32 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/12/01 18:26:32 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4

In [2]:
# Constants
#
# Each location can be overridden through an environment variable so the
# notebook runs on machines other than the one it was written on; when the
# variable is unset the original value is used unchanged.

# Input: parquet dataset of the filtered patents.
filtered_patents_input_path = os.environ.get(
    "FILTERED_PATENTS_INPUT_PATH", "../../data_source/filtered_patents"
)
# Input: parquet dataset holding the per-patent keywords.
keywords_data_path = os.environ.get(
    "KEYWORDS_DATA_PATH", "../../data_source/keywords"
)

# Output: parquet location the top-keywords result is written to.
top_keywords_path = os.environ.get(
    "TOP_KEYWORDS_PATH",
    "file:/Users/bhland/hive/warehouse/dashboard_analytics_results/top_keywords",
)

In [3]:
# Load the filtered patents parquet dataset and preview it with full-width
# (untruncated) column values.
patents_filtered_df = spark.read.format("parquet").load(filtered_patents_input_path)
patents_filtered_df.show(truncate=False)

+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|patent_type|patent_date|patent_title                                                                                                                   |num_claims|application_id|filing_date|series_code|patent_id|branch                            |code     |
+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|utility    |2018-06-19 |Filler neck closure assembly                                                                                                   |19        |15412444      |2017-01-23 |15         |10000117 |Software D

In [4]:
from nltk.corpus import stopwords
from pyspark.sql.types import StringType
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize


# English stop words from the NLTK corpus, held as a set.
# NOTE(review): not referenced by any of the visible cells - confirm whether
# stop-word filtering was intended.
stop_words = set(stopwords.words("english"))

# Single WordNet lemmatizer instance reused by lemmatize_keyword.
lemmatizer = WordNetLemmatizer()


def lemmatize_keyword(keyword):
    """Normalize a keyword or keyphrase to a lower-cased, lemmatized form.

    The input is lower-cased and split on whitespace; every token is then
    lemmatized with its *own* part of speech and the lemmas are re-joined with
    single spaces. Previously the POS of the first token was applied to the
    whole string, so multi-word keywords were passed to WordNet as one unknown
    word and came back unchanged, and differently-cased or padded spellings of
    the same keyword ended up in separate groups.

    Args:
        keyword: Raw keyword string; may be None, empty, or blank.

    Returns:
        The normalized keyword, or None when `keyword` is None, empty, or
        consists only of whitespace.
    """
    if not keyword:
        return None

    # Lower-case first: WordNet's lemma index is lower-case, so capitalized
    # forms would otherwise be returned as-is. split() also drops
    # leading/trailing whitespace and collapses internal runs.
    tokens = keyword.lower().split()
    if not tokens:
        # Whitespace-only input carries no keyword.
        return None

    lemmas = []
    for token, tag in pos_tag(tokens):
        # Map the tagger's tag prefix onto the WordNet POS codes used here:
        # verbs -> 'v', adjectives -> 'a', everything else is treated as a noun.
        if tag.startswith('V'):
            wordnet_pos = 'v'
        elif tag.startswith('J'):
            wordnet_pos = 'a'
        else:
            wordnet_pos = 'n'
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))

    return " ".join(lemmas)

# Expose the Python lemmatizer to Spark as a string-returning UDF.
lemmatize_keyword_udf = F.udf(lemmatize_keyword, returnType=StringType())

# Keywords dataset, read from parquet.
keywords_df = spark.read.parquet(keywords_data_path)

# Default (inner) join on patent_id: only keywords whose patent appears in the
# filtered patents frame are kept.
joined_df = keywords_df.join(patents_filtered_df, on="patent_id")

# One output row per element of the `keywords` column.
exploded_df = joined_df.withColumn("keyword_struct", F.explode("keywords"))

# Pull the two struct fields out into named columns.
keyword_split_df = (
    exploded_df
    .withColumn("keyword", F.col("keyword_struct").getField("_1"))
    .withColumn("weight", F.col("keyword_struct").getField("_2"))
)

# Lemmatized form of every keyword.
keyword_normalized_df = keyword_split_df.withColumn(
    "normalized_keyword", lemmatize_keyword_udf(F.col("keyword"))
)

# Discard rows whose normalized keyword is NULL, empty, or only spaces.
# A value that is non-empty after trimming is necessarily non-empty before it,
# so the trim check alone covers the plain empty-string case.
keyword_cleaned_df = keyword_normalized_df.where(
    F.col("normalized_keyword").isNotNull()
    & (F.trim(F.col("normalized_keyword")) != "")
)


# How many of the best-ranked keywords are kept for each branch.
TOP_KEYWORDS_PER_BRANCH = 50

# Per (branch, keyword): number of occurrences and the summed weight.
keyword_agg_df = keyword_cleaned_df.groupBy("branch", "normalized_keyword").agg(
    F.count("*").alias("keyword_count"),
    F.sum("weight").alias("total_weight"),
)

# Ranking score: occurrence count multiplied by the summed weight.
keyword_rank_df = keyword_agg_df.withColumn(
    "ranking_metric", F.col("keyword_count") * F.col("total_weight")
)

# Rank within each branch by descending score. The keyword itself is a
# secondary sort key: it is unique per branch after the groupBy above, so ties
# on ranking_metric always receive the same rank and the top-N cutoff is
# reproducible from run to run (row_number() over a non-unique ordering is
# otherwise non-deterministic).
window_spec = Window.partitionBy("branch").orderBy(
    F.desc("ranking_metric"), F.asc("normalized_keyword")
)

ranked_df = keyword_rank_df.withColumn(
    "rank", F.row_number().over(window_spec)
).filter(F.col("rank") <= TOP_KEYWORDS_PER_BRANCH)

top_keywords_df = ranked_df.select(
    "branch",
    "normalized_keyword",
    "keyword_count",
    "total_weight",
    "ranking_metric",
    "rank",
)

top_keywords_df.show(truncate=False)




+-------------------------------+------------------+-------------+------------------+------------------+----+
|branch                         |normalized_keyword|keyword_count|total_weight      |ranking_metric    |rank|
+-------------------------------+------------------+-------------+------------------+------------------+----+
|Advanced Computing Technologies|display           |29           |13.075399999999998|379.18659999999994|1   |
|Advanced Computing Technologies|device            |25           |10.477499999999997|261.93749999999994|2   |
|Advanced Computing Technologies|screen            |14           |6.041900000000001 |84.58660000000002 |3   |
|Advanced Computing Technologies|image             |13           |5.5571            |72.2423           |4   |
|Advanced Computing Technologies|view              |8            |3.409             |27.272            |5   |
|Advanced Computing Technologies|camera            |7            |3.2539999999999996|22.778            |6   |
|Advanced 

                                                                                

In [5]:
# Project the ranked keywords down to the exported columns with explicit
# types; `normalized_keyword` is published under the name `keyword`.
final_keywords_with_schema = top_keywords_df.select(
    col("branch").cast("string"),
    col("normalized_keyword").cast("string").alias("keyword"),
    col("keyword_count").cast("int"),
    col("rank").cast("int"),
)

final_keywords_with_schema.show()



+--------------------+-------------+-------------+----+
|              branch|      keyword|keyword_count|rank|
+--------------------+-------------+-------------+----+
|Advanced Computin...|      display|           29|   1|
|Advanced Computin...|       device|           25|   2|
|Advanced Computin...|       screen|           14|   3|
|Advanced Computin...|        image|           13|   4|
|Advanced Computin...|         view|            8|   5|
|Advanced Computin...|       camera|            7|   6|
|Advanced Computin...|        robot|            7|   7|
|Advanced Computin...|      service|            7|   8|
|Advanced Computin...|   projection|            5|   9|
|Advanced Computin...|       retail|            5|  10|
|Advanced Computin...|   processing|            5|  11|
|Advanced Computin...|      project|            5|  12|
|Advanced Computin...|        video|            5|  13|
|Advanced Computin...|  transaction|            5|  14|
|Advanced Computin...|authorization|            

                                                                                

In [6]:
# Persist the result as parquet at top_keywords_path, replacing any existing
# output at that location, then report completion.
final_keywords_with_schema.write.parquet(top_keywords_path, mode="overwrite")
print("The Top Keywords results have been successfully written to the parquet file in hive warehouse.")



The Top Keywords results have been successfully written to the parquet file in hive warehouse.


                                                                                