In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Build (or reuse) the Spark session for this notebook. The driver and the
# executors are each configured with 8 GB of memory.
session_builder = (
    SparkSession.builder
    .appName("PatentKeyPlayersAnalytics")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
)
spark = session_builder.getOrCreate()


24/12/01 18:13:22 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.33 instead (on interface en0)
24/12/01 18:13:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 18:13:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/01 18:13:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/01 18:13:23 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/12/01 18:13:23 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/12/01 18:13:23 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [2]:
# Constants

# Input datasets (relative paths).
filtered_patents_input_path = "../../data_source/filtered_patents"
inventor_data_path = "../../data_source/preprocessed_data_input/inventor_info"

# Output location for the top-inventors table. The default is the original
# machine-specific warehouse path; set the TOP_INVENTORS_PATH environment
# variable to write somewhere else without editing the notebook.
top_inventors_path = os.environ.get(
    "TOP_INVENTORS_PATH",
    "file:/Users/bhland/hive/warehouse/dashboard_analytics_results/top_inventors",
)

In [3]:
# Load the filtered patents dataset and preview the first 20 rows with
# untruncated cell values.
patents_filtered_df = spark.read.format("parquet").load(filtered_patents_input_path)
patents_filtered_df.show(n=20, truncate=False)

+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|patent_type|patent_date|patent_title                                                                                                                   |num_claims|application_id|filing_date|series_code|patent_id|branch                            |code     |
+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|utility    |2018-06-19 |Filler neck closure assembly                                                                                                   |19        |15412444      |2017-01-23 |15         |10000117 |Software D

In [4]:
inventor_df = spark.read.parquet(inventor_data_path)

# Inner join: keep only inventor rows whose patent_id also appears in the
# filtered patents. The join yields two patent_id columns, so the inventor-side
# copy is dropped and the one from patents_filtered_df is kept.
same_patent = inventor_df["patent_id"] == patents_filtered_df["patent_id"]
inventor_filtered_df = (
    inventor_df
    .join(patents_filtered_df, on=same_patent, how="inner")
    .drop(inventor_df["patent_id"])
)

In [5]:
# Rank inventors by patent count within each branch and keep the top TOP_N.
TOP_N = 20  # number of inventors kept per branch

# One output row per distinct combination of these columns, with the number
# of patents for that combination.
# NOTE(review): latitude/longitude/disambig_state are grouping keys, so an
# inventor recorded at several locations is counted separately per location.
# Confirm this is intended.
inventor_keys = [
    "branch",
    "inventor_id",
    "disambig_inventor_name_first",
    "disambig_inventor_name_last",
    "gender_code",
    "latitude",
    "longitude",
    "disambig_state",
]
grouped_df = inventor_filtered_df.groupBy(*inventor_keys).agg(
    F.count("patent_id").alias("patent_count")
)

# row_number() assigns distinct ranks even to equal patent counts, so with only
# patent_count in the ordering the choice among ties is arbitrary and can vary
# between runs. Secondary sort keys make the ranking (and therefore the top-N
# cut-off) repeatable.
window_spec = Window.partitionBy("branch").orderBy(
    F.desc("patent_count"),
    F.asc("inventor_id"),
    F.asc("disambig_state"),
    F.asc("latitude"),
    F.asc("longitude"),
)

ranked_df = grouped_df.withColumn("rank", F.row_number().over(window_spec))

top_inventors = ranked_df.filter(F.col("rank") <= TOP_N)

final_result = top_inventors.select(
    F.col("branch"),
    F.col("inventor_id"),
    # concat_ws skips NULL parts, whereas concat would turn the whole name
    # into NULL if either the first or the last name were missing.
    F.concat_ws(
        " ",
        F.col("disambig_inventor_name_first"),
        F.col("disambig_inventor_name_last"),
    ).alias("inventor_name"),
    F.col("gender_code").alias("gender"),
    F.col("disambig_state").alias("state"),
    F.col("latitude"),
    F.col("longitude"),
    F.col("patent_count"),
    F.col("rank")
).orderBy("branch", "rank")

final_result.show(truncate=False)

[Stage 6:>                                                          (0 + 8) / 9]

+-------------------------------+----------------------+--------------------+------+-----+---------+-----------+------------+----+
|branch                         |inventor_id           |inventor_name       |gender|state|latitude |longitude  |patent_count|rank|
+-------------------------------+----------------------+--------------------+------+-----+---------+-----------+------------+----+
|Advanced Computing Technologies|fl:ja_ln:walker-54    |Jay S. Walker       |M     |CT   |41.281483|-73.49818  |34          |1   |
|Advanced Computing Technologies|fl:ma_ln:stefik-1     |Mark J. Stefik      |M     |CA   |37.37363 |-122.21905 |26          |2   |
|Advanced Computing Technologies|fl:pe_ln:pirolli-1    |Peter L. T. Pirolli |M     |CA   |37.779026|-122.41991 |24          |3   |
|Advanced Computing Technologies|fl:sh_ln:hussain-12   |Shariq Hussain      |M     |CA   |33.200035|-117.24254 |20          |4   |
|Advanced Computing Technologies|fl:br_ln:sierer-1     |Brian Sierer        |M     

                                                                                

In [6]:
# Explicit output schema: (column name, Spark type) in the order the columns
# are written.
output_schema = [
    ("branch", StringType()),
    ("inventor_id", StringType()),
    ("inventor_name", StringType()),
    ("gender", StringType()),
    ("state", StringType()),
    ("latitude", DoubleType()),
    ("longitude", DoubleType()),
    ("patent_count", IntegerType()),
    ("rank", IntegerType()),
]

# Cast every column to its declared type so the persisted table has a fixed schema.
final_result_with_schema = final_result.select(
    *[col(column_name).cast(spark_type) for column_name, spark_type in output_schema]
)

final_result_with_schema.printSchema()

root
 |-- branch: string (nullable = true)
 |-- inventor_id: string (nullable = true)
 |-- inventor_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- patent_count: integer (nullable = false)
 |-- rank: integer (nullable = false)



In [7]:
# Persist the typed result as parquet, replacing any previous output at the
# target path.
final_result_with_schema.write.mode("overwrite").parquet(top_inventors_path)

print("The Top Inventors results have been successfully written to the parquet file in hive warehouse.")

                                                                                

The Top Inventors results have been successfully written to the parquet file in hive warehouse.
