In [8]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Build (or reuse) the Spark session for this notebook, configuring 8g of
# memory for both the driver and the executors.
spark = (
    SparkSession.builder
    .appName("PatentKeyPlayersAnalytics")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)


In [9]:
# Constants
#
# Each path can be overridden with the environment variable named below, so the
# notebook can run on another machine without editing code. When a variable is
# not set, the original location is used unchanged.

# Input: parquet data set of the already-filtered patents.
filtered_patents_input_path = os.environ.get(
    "FILTERED_PATENTS_INPUT_PATH",
    "../../data_source/filtered_patents",
)

# Input: parquet data set with the applicant information.
applicant_data_path = os.environ.get(
    "APPLICANT_DATA_PATH",
    "../../data_source/preprocessed_data_input/applicant_info",
)

# Output: parquet location the top-applicants result is written to.
# NOTE(review): the default is an absolute path on one developer's machine;
# set TOP_APPLICANTS_PATH when running anywhere else.
top_applicants_path = os.environ.get(
    "TOP_APPLICANTS_PATH",
    "file:/Users/bhland/hive/warehouse/dashboard_analytics_results/top_applicants",
)

In [10]:
# Load the filtered patents from parquet and preview the first 20 rows with
# untruncated column values.
patents_filtered_df = spark.read.format("parquet").load(filtered_patents_input_path)
patents_filtered_df.show(n=20, truncate=False)

+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|patent_type|patent_date|patent_title                                                                                                                   |num_claims|application_id|filing_date|series_code|patent_id|branch                            |code     |
+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|utility    |2018-06-19 |Filler neck closure assembly                                                                                                   |19        |15412444      |2017-01-23 |15         |10000117 |Software D

In [11]:
# Load the applicant records from parquet.
applicant_df = spark.read.format("parquet").load(applicant_data_path)

# Inner join: keep only applicant rows whose patent_id also appears in the
# filtered patents. The join is expressed as an explicit column condition, so
# both patent_id columns survive it; the applicant-side copy is dropped
# afterwards, leaving the patents-side patent_id in the result.
same_patent = applicant_df["patent_id"] == patents_filtered_df["patent_id"]
applicant_filtered_df = (
    applicant_df
    .join(patents_filtered_df, on=same_patent, how="inner")
    .drop(applicant_df["patent_id"])
)
applicant_filtered_df.printSchema()

root
 |-- applicant_sequence: integer (nullable = true)
 |-- raw_applicant_name_first: string (nullable = true)
 |-- raw_applicant_name_last: string (nullable = true)
 |-- raw_applicant_organization: string (nullable = true)
 |-- applicant_type: string (nullable = true)
 |-- patent_type: string (nullable = true)
 |-- patent_date: date (nullable = true)
 |-- patent_title: string (nullable = true)
 |-- num_claims: integer (nullable = true)
 |-- application_id: string (nullable = true)
 |-- filing_date: date (nullable = true)
 |-- series_code: string (nullable = true)
 |-- patent_id: string (nullable = true)
 |-- branch: string (nullable = true)
 |-- code: string (nullable = true)



In [12]:
from pyspark.sql import Window
from pyspark.sql import functions as F


# Rank applicant organizations by patent count within each branch and keep the
# top TOP_N_APPLICANTS per branch.

# How many of the highest-ranked organizations to keep per branch.
TOP_N_APPLICANTS = 20

# Rows without an organization name cannot be attributed to an organization,
# so they are excluded before grouping.
applicant_cleaned_df = applicant_filtered_df.filter(
    F.col("raw_applicant_organization").isNotNull()
)

# Normalize letter case so spellings that differ only in capitalization are
# grouped as the same organization.
applicant_normalized_df = applicant_cleaned_df.withColumn(
    "normalized_organization",
    F.initcap(F.lower(F.col("raw_applicant_organization")))
)

# Count DISTINCT patents per (branch, organization). A plain row count would
# count a patent once per joined row.
# NOTE(review): the patents table has a `code` column, so a patent presumably
# appears on several rows within one branch - confirm against the upstream
# filtering step.
applicant_grouped_df = applicant_normalized_df.groupBy(
    "branch",
    "normalized_organization"
).agg(
    F.countDistinct("patent_id").alias("patent_count")
)

# Order by patent count (highest first) and break ties on the organization
# name. Without the secondary key, row_number() assigns tied organizations an
# arbitrary order, so the ranks - and which rows make the cut - could change
# from run to run.
applicant_window_spec = Window.partitionBy("branch").orderBy(
    F.desc("patent_count"),
    F.asc("normalized_organization")
)

applicant_ranked_df = applicant_grouped_df.withColumn(
    "rank",
    F.row_number().over(applicant_window_spec)
)

top_applicants = applicant_ranked_df.filter(F.col("rank") <= TOP_N_APPLICANTS)

# Final presentation: expose the normalized name as "organization" and sort
# the rows by branch, then by rank.
final_applicant_result = top_applicants.select(
    F.col("branch"),
    F.col("normalized_organization").alias("organization"),
    F.col("patent_count"),
    F.col("rank")
).orderBy("branch", "rank")

final_applicant_result.show(truncate=False)

+-------------------------------+----------------------------------------------------+------------+----+
|branch                         |organization                                        |patent_count|rank|
+-------------------------------+----------------------------------------------------+------------+----+
|Advanced Computing Technologies|Samsung Electronics Co., Ltd.                       |108         |1   |
|Advanced Computing Technologies|Sony Corporation                                    |65          |2   |
|Advanced Computing Technologies|Amazon Technologies, Inc.                           |48          |3   |
|Advanced Computing Technologies|Google Inc.                                         |43          |4   |
|Advanced Computing Technologies|International Business Machines Corporation         |41          |5   |
|Advanced Computing Technologies|Samsung Display Co., Ltd.                           |40          |6   |
|Advanced Computing Technologies|Ebay Inc.             

In [13]:
# Pin the output column types explicitly before writing: each listed column is
# selected in this order and cast to the paired type.
output_column_types = [
    ("branch", StringType()),
    ("organization", StringType()),
    ("patent_count", IntegerType()),
    ("rank", IntegerType()),
]
final_applicant_with_schema = final_applicant_result.select(
    *[col(column_name).cast(data_type) for column_name, data_type in output_column_types]
)

In [14]:
# Persist the typed result as parquet, replacing any output already present at
# the target path.
final_applicant_with_schema.write.mode("overwrite").parquet(top_applicants_path)

print("The Top Applicants results have been successfully written to the parquet file in hive warehouse.")

The Top Applicants results have been successfully written to the parquet file in hive warehouse.
