In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Settings applied to the session builder: the Hudi Spark bundle package, the
# Kryo serializer, the Hudi catalog, Parquet/timestamp handling, and the
# driver/executor memory sizes.
_spark_settings = {
    "spark.jars.packages": "org.apache.hudi:hudi-spark3.5-bundle_2.12:0.15.0",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
    "spark.sql.hive.convertMetastoreParquet": "false",
    "spark.sql.parquet.outputTimestampType": "TIMESTAMP_MICROS",
    "spark.sql.datetime.java8API.enabled": "true",
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
}

_session_builder = SparkSession.builder.appName("PatentBranchAnalytics")
for _setting_name, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_name, _setting_value)

# Reuse an already-running session if one exists, otherwise start a new one.
spark = _session_builder.getOrCreate()

24/12/01 18:02:07 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.33 instead (on interface en0)
24/12/01 18:02:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/bhland/.ivy2/cache
The jars for the packages stored in: /Users/bhland/.ivy2/jars
org.apache.hudi#hudi-spark3.5-bundle_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1dc583ca-812c-42ac-aa2b-97c572d6378d;1.0
	confs: [default]
	found org.apache.hudi#hudi-spark3.5-bundle_2.12;0.15.0 in central
	found org.apache.hive#hive-storage-api;2.8.1 in central
	found org.slf4j#slf4j-api;1.7.36 in local-m2-cache
:: resolution report :: resolve 70ms :: artifacts dl 2ms
	:: modules in use:
	org.apache.hive#hive-storage-api;2.8.1 from central in [default]
	org.apache.hudi#hudi-spark3.5-bundle_2.12;0.15.0 from central in [default]
	org.slf4j#slf4j-api;1.7.36 from local-m2-cache in [default]
	---

:: loading settings :: url = jar:file:/Users/bhland/miniforge3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


:: retrieving :: org.apache.spark#spark-submit-parent-1dc583ca-812c-42ac-aa2b-97c572d6378d
	confs: [default]
	0 artifacts copied, 3 already retrieved (0kB/2ms)
24/12/01 18:02:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 18:02:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/01 18:02:08 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Constants

# Parquet inputs read by the cells below (relative paths).
filtered_patents_input_path = "../../data_source/filtered_patents"
inventor_data_path = "../../data_source/preprocessed_data_input/inventor_info"

# Destination of the gender_trends Hudi table. The default is the original
# machine-specific warehouse location; set the GENDER_TRENDS_TABLE_PATH
# environment variable to write somewhere else without editing the notebook.
gender_trends_table_path = os.environ.get(
    "GENDER_TRENDS_TABLE_PATH",
    "file:/Users/bhland/hive/warehouse/dashboard_analytics_results/gender_trends",
)

In [3]:
# Load the filtered patents dataset and preview the first 20 rows with
# untruncated cell values.
patents_filtered_df = spark.read.format("parquet").load(filtered_patents_input_path)
patents_filtered_df.show(n=20, truncate=False)

+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|patent_type|patent_date|patent_title                                                                                                                   |num_claims|application_id|filing_date|series_code|patent_id|branch                            |code     |
+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|utility    |2018-06-19 |Filler neck closure assembly                                                                                                   |19        |15412444      |2017-01-23 |15         |10000117 |Software D

In [4]:
# Inventor records, one parquet dataset under inventor_data_path.
inventor_df = spark.read.format("parquet").load(inventor_data_path)

# Keep only inventors whose patent appears in the filtered patents set, then
# drop the inventor-side patent_id so a single patent_id column remains.
inventor_filtered_df = (
    inventor_df
    .join(
        patents_filtered_df,
        on=inventor_df["patent_id"] == patents_filtered_df["patent_id"],
        how="inner",
    )
    .drop(inventor_df["patent_id"])
)

# Derive the calendar year and month of patent_date for the monthly grouping.
inventor_df_with_year = (
    inventor_filtered_df
    .withColumn("year", F.year("patent_date"))
    .withColumn("month", F.month("patent_date"))
)


In [5]:
# Substitute "U" wherever gender_code is null.
inventer_with_gender = inventor_df_with_year.withColumn(
    "gender_code", F.coalesce(F.col("gender_code"), F.lit("U"))
)

# Count rows with gender_code "M" per (year, month, branch), then overwrite the
# integer "year" column with the first day of that month as a date.
male_count_df = (
    inventer_with_gender
    .where(F.col("gender_code") == "M")
    .groupBy("year", "month", "branch")
    .agg(F.count("patent_id").alias("male_count"))
    .withColumn("year", F.make_date(F.col("year"), F.col("month"), F.lit(1)))
)

# The month is now encoded in the "year" date, so the separate column is dropped.
male_df = male_count_df.drop("month")

male_df.show(truncate=False)



+----------+----------------------------------+----------+
|year      |branch                            |male_count|
+----------+----------------------------------+----------+
|2023-10-01|Data Science and Analytics        |473       |
|1979-05-01|Artificial Intelligence           |7         |
|1981-12-01|Artificial Intelligence           |17        |
|1985-12-01|Artificial Intelligence           |28        |
|1986-05-01|Networking and Distributed Systems|10        |
|1988-08-01|Artificial Intelligence           |33        |
|1995-01-01|Networking and Distributed Systems|80        |
|2001-08-01|Artificial Intelligence           |288       |
|2004-09-01|Artificial Intelligence           |367       |
|2006-08-01|Artificial Intelligence           |634       |
|2006-09-01|Advanced Computing Technologies   |30        |
|2010-02-01|Networking and Distributed Systems|929       |
|2013-04-01|Artificial Intelligence           |1221      |
|2022-04-01|Artificial Intelligence           |5246     

                                                                                

In [6]:
# Rows with gender_code "F".
female_df = inventer_with_gender.filter(F.col("gender_code") == "F")

# Count those rows per (year, month, branch).
female_count_df = female_df.groupBy("year", "month", "branch").agg(
    count("patent_id").alias("female_count")
)

# Overwrite the integer "year" with the first day of the month, the same key
# shape male_df uses, so the two frames can be joined on ["year", "branch"].
female_count_df = female_count_df.withColumn("year", make_date(col("year"), col("month"), lit(1)))

# Full outer join: a left join from male_df would silently discard every
# (month, branch) that has female inventors but no male ones. Missing counts on
# either side come back as null and are expected to be filled downstream.
female_combined_df = male_df.join(
    female_count_df,
    on=["year", "branch"],
    how="full"
).orderBy(desc("year"))

# Only female_count_df still carries "month"; it is redundant with the date key.
female_combined_df = female_combined_df.drop("month")

female_combined_df.show(truncate=False)



+----------+----------------------------------+----------+------------+
|year      |branch                            |male_count|female_count|
+----------+----------------------------------+----------+------------+
|2024-07-01|Data Science and Analytics        |371       |61          |
|2024-07-01|Advanced Computing Technologies   |32        |4           |
|2024-07-01|Software Development and Security |477       |87          |
|2024-07-01|Artificial Intelligence           |7339      |1352        |
|2024-07-01|Networking and Distributed Systems|5225      |924         |
|2024-06-01|Data Science and Analytics        |356       |61          |
|2024-06-01|Software Development and Security |374       |55          |
|2024-06-01|Artificial Intelligence           |5701      |1027        |
|2024-06-01|Advanced Computing Technologies   |22        |8           |
|2024-06-01|Networking and Distributed Systems|4287      |809         |
|2024-05-01|Artificial Intelligence           |5497      |1009  

                                                                                

In [7]:
# Keep the join keys and replace null counts with 0, listing female_count
# before male_count.
final_combined_result = female_combined_df.select(
    F.col("year"),
    F.col("branch"),
    *[
        F.coalesce(F.col(count_column), F.lit(0)).alias(count_column)
        for count_column in ("female_count", "male_count")
    ],
)

final_combined_result.show(truncate=False)



+----------+----------------------------------+------------+----------+
|year      |branch                            |female_count|male_count|
+----------+----------------------------------+------------+----------+
|2024-07-01|Data Science and Analytics        |61          |371       |
|2024-07-01|Advanced Computing Technologies   |4           |32        |
|2024-07-01|Software Development and Security |87          |477       |
|2024-07-01|Artificial Intelligence           |1352        |7339      |
|2024-07-01|Networking and Distributed Systems|924         |5225      |
|2024-06-01|Data Science and Analytics        |61          |356       |
|2024-06-01|Software Development and Security |55          |374       |
|2024-06-01|Artificial Intelligence           |1027        |5701      |
|2024-06-01|Advanced Computing Technologies   |8           |22        |
|2024-06-01|Networking and Distributed Systems|809         |4287      |
|2024-05-01|Artificial Intelligence           |1009        |5497

                                                                                

In [8]:
# Pin the output column types. "timestamp" is a timestamp-typed copy of the
# "year" date; the remaining columns keep their names and are cast in place.
gender_result_with_schema = final_combined_result.select(
    F.col("year").cast("timestamp").alias("timestamp"),
    F.col("year").cast("date"),
    F.col("branch").cast("string"),
    F.col("male_count").cast("int"),
    F.col("female_count").cast("int"),
)

gender_result_with_schema.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- year: date (nullable = true)
 |-- branch: string (nullable = true)
 |-- male_count: integer (nullable = false)
 |-- female_count: integer (nullable = false)



In [10]:
# Hudi writer settings for the gender_trends table: a copy-on-write table,
# written with the "insert" operation and keyed on (year, branch).
# NOTE(review): the precombine field is male_count, a measure rather than an
# ordering/version column - confirm this is intended.
gender_trends_hudi_options = {
    'hoodie.table.name': 'gender_trends',
    'hoodie.datasource.write.table.name': 'gender_trends',
    'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.operation': 'insert',
    'hoodie.datasource.write.recordkey.field': 'year,branch',
    'hoodie.datasource.write.precombine.field': "male_count",
    'hoodie.insert.shuffle.parallelism': 2,
    'hoodie.upsert.shuffle.parallelism': 2,
}

# "overwrite" mode replaces whatever already exists at the table path.
(
    gender_result_with_schema.write
    .format("org.apache.hudi")
    .mode("overwrite")
    .options(**gender_trends_hudi_options)
    .save(gender_trends_table_path)
)

print("The Gender Trends results have been successfully written to the hudi table in hive warehouse.")

24/12/01 18:02:58 WARN HoodieSparkSqlWriterInternal: hoodie table at file:/Users/bhland/hive/warehouse/dashboard_analytics_results/gender_trends already exists. Deleting existing data & overwriting with new data.
                                                                                

The Gender Trends results have been successfully written to the hudi table in hive warehouse.


24/12/01 18:03:04 WARN HoodieSparkSqlWriterInternal: Closing write client
