In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create (or reuse, via getOrCreate) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("PatentBranchAnalytics")
    # Hudi integration: bundle resolved from Maven coordinates, Kryo
    # serialization, and the Hudi catalog registered as spark_catalog.
    .config("spark.jars.packages", "org.apache.hudi:hudi-spark3.5-bundle_2.12:0.15.0")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .config("spark.sql.hive.convertMetastoreParquet", "false")
    # Timestamp handling for Parquet output and Python <-> JVM conversion.
    .config("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
    .config("spark.sql.datetime.java8API.enabled", "true")
    # Memory requested for the driver and for each executor.
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

24/12/01 18:04:45 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.33 instead (on interface en0)
24/12/01 18:04:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/bhland/.ivy2/cache
The jars for the packages stored in: /Users/bhland/.ivy2/jars
org.apache.hudi#hudi-spark3.5-bundle_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-25373d1c-4ee2-4898-a822-bcc1443ff84f;1.0
	confs: [default]
	found org.apache.hudi#hudi-spark3.5-bundle_2.12;0.15.0 in central
	found org.apache.hive#hive-storage-api;2.8.1 in central
	found org.slf4j#slf4j-api;1.7.36 in local-m2-cache
:: resolution report :: resolve 70ms :: artifacts dl 3ms
	:: modules in use:
	org.apache.hive#hive-storage-api;2.8.1 from central in [default]
	org.apache.hudi#hudi-spark3.5-bundle_2.12;0.15.0 from central in [default]
	org.slf4j#slf4j-api;1.7.36 from local-m2-cache in [default]
	---

:: loading settings :: url = jar:file:/Users/bhland/miniforge3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


24/12/01 18:04:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 18:04:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/01 18:04:46 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/12/01 18:04:46 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
# Constants

# Relative paths to the input datasets read by the load cell.
filtered_patents_input_path = "../../data_source/filtered_patents"
inventor_data_path = "../../data_source/preprocessed_data_input/inventor_info"

# Destination of the geo_distribution Hudi table. The default is the original
# machine-specific warehouse location; set the GEO_DISTRIBUTION_TABLE_PATH
# environment variable to write somewhere else without editing this cell.
# An unset or empty variable falls back to the default.
geo_distribution_table_path = (
    os.environ.get("GEO_DISTRIBUTION_TABLE_PATH")
    or "file:/Users/bhland/hive/warehouse/dashboard_analytics_results/geo_distribution"
)

In [3]:
# Load the filtered patents and print a preview of the rows.
patents_filtered_df = spark.read.parquet(filtered_patents_input_path)
patents_filtered_df.show(truncate=False)

# Load the inventor records.
inventor_df = spark.read.parquet(inventor_data_path)

# Inner-join inventors to the filtered patents on patent_id, then drop the
# inventor-side copy of the key so only the patents-side patent_id remains.
inventor_filtered_df = inventor_df.join(
    patents_filtered_df,
    on=inventor_df["patent_id"] == patents_filtered_df["patent_id"],
    how="inner",
).drop(inventor_df["patent_id"])

+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|patent_type|patent_date|patent_title                                                                                                                   |num_claims|application_id|filing_date|series_code|patent_id|branch                            |code     |
+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|utility    |2018-06-19 |Filler neck closure assembly                                                                                                   |19        |15412444      |2017-01-23 |15         |10000117 |Software D

In [4]:
from pyspark.sql import functions as F

# Split patent_date into calendar year and month so rows can be bucketed monthly.
inventor_df_with_year = (
    inventor_filtered_df
    .withColumn("year", F.year("patent_date"))
    .withColumn("month", F.month("patent_date"))
)

# Distinct inventors and distinct patents per (year, month, branch, state).
geo_data = (
    inventor_df_with_year
    .groupBy("year", "month", "branch", "disambig_state")
    .agg(
        F.countDistinct("inventor_id").alias("inventor_count"),
        F.countDistinct("patent_id").alias("patent_count"),
    )
)

# Represent each bucket by the first day of its month; the separate "month"
# column is not needed once the date has been built.
geo_data_df = geo_data.withColumn(
    "timestamp", F.make_date(F.col("year"), F.col("month"), F.lit(1))
).drop("month")

# Keep only the reporting columns and prefix the state code with "US-".
geo_distribution = geo_data_df.select(
    "timestamp",
    "branch",
    F.concat(F.lit("US-"), F.col("disambig_state")).alias("state"),
    "inventor_count",
    "patent_count",
)

# Preview the aggregated rows.
geo_distribution.show(truncate=False)

# Pin explicit column types and discard any row that contains a null.
# NOTE(review): despite its name, "year" is the uncast month-start date taken
# from "timestamp", not a year number -- confirm what consumers of the table
# expect before renaming or changing it.
geo_distribution_with_schema = geo_distribution.select(
    F.col("timestamp").cast(TimestampType()),
    F.col("timestamp").alias("year"),
    F.col("branch").cast(StringType()),
    F.col("state").cast(StringType()),
    F.col("inventor_count").cast(IntegerType()),
    F.col("patent_count").cast(IntegerType()),
).dropna(how="any")

geo_distribution_with_schema.printSchema()
geo_distribution_with_schema.show(truncate=False)

                                                                                

+----------+----------------------------------+-----+--------------+------------+
|timestamp |branch                            |state|inventor_count|patent_count|
+----------+----------------------------------+-----+--------------+------------+
|2019-09-01|Data Science and Analytics        |US-NE|1             |1           |
|2024-01-01|Networking and Distributed Systems|US-VA|95            |84          |
|2016-03-01|Software Development and Security |US-CO|10            |7           |
|2022-12-01|Networking and Distributed Systems|US-OR|45            |30          |
|1989-11-01|Data Science and Analytics        |US-WA|3             |1           |
|2013-09-01|Software Development and Security |US-FL|10            |8           |
|2015-11-01|Networking and Distributed Systems|US-NJ|100           |61          |
|2016-04-01|Artificial Intelligence           |US-OR|16            |13          |
|2011-05-01|Networking and Distributed Systems|US-TX|66            |33          |
|2022-04-01|Arti



+-------------------+----------+----------------------------------+-----+--------------+------------+
|timestamp          |year      |branch                            |state|inventor_count|patent_count|
+-------------------+----------+----------------------------------+-----+--------------+------------+
|2019-09-01 00:00:00|2019-09-01|Data Science and Analytics        |US-NE|1             |1           |
|2024-01-01 00:00:00|2024-01-01|Networking and Distributed Systems|US-VA|95            |84          |
|2016-03-01 00:00:00|2016-03-01|Software Development and Security |US-CO|10            |7           |
|2022-12-01 00:00:00|2022-12-01|Networking and Distributed Systems|US-OR|45            |30          |
|1989-11-01 00:00:00|1989-11-01|Data Science and Analytics        |US-WA|3             |1           |
|2013-09-01 00:00:00|2013-09-01|Software Development and Security |US-FL|10            |8           |
|2015-11-01 00:00:00|2015-11-01|Networking and Distributed Systems|US-NJ|100      

                                                                                

In [5]:
 # Hudi writer settings for the geo_distribution table.
 geo_distribution_hudi_options = {
     "hoodie.table.name": "geo_distribution",
     # Composite record key built from three columns of the result frame.
     "hoodie.datasource.write.recordkey.field": "timestamp,branch,state",
     # NOTE(review): the precombine field is a count column, not a
     # version/ordering column -- confirm this is intended.
     "hoodie.datasource.write.precombine.field": "inventor_count",
     "hoodie.datasource.write.table.name": "geo_distribution",
     "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
     "hoodie.datasource.write.operation": "insert",
     "hoodie.upsert.shuffle.parallelism": 2,
     "hoodie.insert.shuffle.parallelism": 2,
 }

 # Write the result frame in "overwrite" save mode to the configured table path.
 (
     geo_distribution_with_schema.write
     .format("org.apache.hudi")
     .options(**geo_distribution_hudi_options)
     .mode("overwrite")
     .save(geo_distribution_table_path)
 )

 print("The Geo Distribution results have been successfully written to the hudi table in hive warehouse.")

24/12/01 18:05:09 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf
24/12/01 18:05:09 WARN DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
24/12/01 18:05:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-hbase.properties,hadoop-metrics2.properties
                                                                                

The Geo Distribution results have been successfully written to the hudi table in hive warehouse.


24/12/01 18:05:22 WARN HoodieSparkSqlWriterInternal: Closing write client
