In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Build (or reuse) the Spark session for this notebook, requesting 8g of
# memory for the driver and for each executor.
spark = (
    SparkSession.builder
    .appName("PatentBranchAnalytics")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)


In [3]:
# Each entry pairs a technology branch name with the classification codes
# assigned to it. The codes are used verbatim as join keys later on, so they
# must be spelled exactly as they appear in the classification data.
branches_data = [
    (
        "Artificial Intelligence",
        ["G06N", "G06F19/00", "G06F17/27", "G06F17/28", "G10L15/22", "G06T", "G06K9/00"],
    ),
    (
        "Data Science and Analytics",
        [
            "G06F16/00", "G06F17/00", "H04L67/00", "H04L12/28", "G06F3/06",
            "G06F12/02", "G06F16/27", "H04L29/06", "G06Q50/00", "B82Y30/00",
        ],
    ),
    (
        "Networking and Distributed Systems",
        [
            "H04L29/00", "H04L12/00", "H04W", "H04L12/24", "H04L67/10", "G06F9/50",
            "H04L67/22", "H04L29/08", "G06F9/46", "G06F15/173", "G06F15/16",
        ],
    ),
    (
        "Software Development and Security",
        [
            "G06F8/00", "G06F9/00", "G06F9/44", "G06F9/451", "H04L67/02", "G06F3/048",
            "G06Q10/00", "G06F3/00", "H04L9/00", "G06F21/00", "G06Q20/40",
        ],
    ),
    (
        "Advanced Computing Technologies",
        ["B25J9/00", "G05B19/00", "G06F15/18", "H03K17/00", "G09G5/00", "G06Q30/00"],
    ),
]

# One row per branch, with the branch's codes held in an array column.
branches_df = spark.createDataFrame(branches_data, schema=["branch", "codes"])

# Flatten to one row per (branch, code) pair. The codes are carried over
# unchanged; no trimming or normalisation is applied here.
branches_df_exploded = branches_df.select(
    "branch",
    explode("codes").alias("code"),
)

branches_df_exploded.printSchema()
branches_df_exploded.show(truncate=False)

root
 |-- branch: string (nullable = true)
 |-- code: string (nullable = true)



                                                                                

+----------------------------------+---------+
|branch                            |code     |
+----------------------------------+---------+
|Artificial Intelligence           |G06N     |
|Artificial Intelligence           |G06F19/00|
|Artificial Intelligence           |G06F17/27|
|Artificial Intelligence           |G06F17/28|
|Artificial Intelligence           |G10L15/22|
|Artificial Intelligence           |G06T     |
|Artificial Intelligence           |G06K9/00 |
|Data Science and Analytics        |G06F16/00|
|Data Science and Analytics        |G06F17/00|
|Data Science and Analytics        |H04L67/00|
|Data Science and Analytics        |H04L12/28|
|Data Science and Analytics        |G06F3/06 |
|Data Science and Analytics        |G06F12/02|
|Data Science and Analytics        |G06F16/27|
|Data Science and Analytics        |H04L29/06|
|Data Science and Analytics        |G06Q50/00|
|Data Science and Analytics        |B82Y30/00|
|Networking and Distributed Systems|H04L29/00|
|Networking a

In [4]:
# Classification table; the columns used below are patent_id, cpc_group and
# cpc_subclass.
cpc_df = spark.read.parquet("input_parquet/cpc_info")

# A classification row is mapped to a branch when the branch code equals
# either its cpc_group or its cpc_subclass value.
code_matches = (
    (cpc_df["cpc_group"] == branches_df_exploded["code"])
    | (cpc_df["cpc_subclass"] == branches_df_exploded["code"])
)

# The inner join discards classification rows that match no branch code. A
# patent can appear on several rows here (one per matching classification).
filtered_patents = (
    cpc_df
    .join(branches_df_exploded, on=code_matches, how="inner")
    .select("patent_id", "branch", "code")
)

filtered_patents.show(truncate=False)

filtered_patents.printSchema()


                                                                                

+---------+-----------------------+----+
|patent_id|branch                 |code|
+---------+-----------------------+----+
|4575628  |Artificial Intelligence|G06T|
|4575628  |Artificial Intelligence|G06T|
|4575628  |Artificial Intelligence|G06T|
|4575751  |Artificial Intelligence|G06T|
|4349739  |Artificial Intelligence|G06T|
|4349739  |Artificial Intelligence|G06T|
|4349739  |Artificial Intelligence|G06T|
|4533957  |Artificial Intelligence|G06T|
|4533959  |Artificial Intelligence|G06T|
|4670781  |Artificial Intelligence|G06T|
|4670788  |Artificial Intelligence|G06T|
|4670788  |Artificial Intelligence|G06T|
|4670788  |Artificial Intelligence|G06T|
|4670788  |Artificial Intelligence|G06T|
|4670793  |Artificial Intelligence|G06T|
|4670793  |Artificial Intelligence|G06T|
|4486784  |Artificial Intelligence|G06T|
|4670840  |Artificial Intelligence|G06T|
|4670840  |Artificial Intelligence|G06T|
|4670848  |Artificial Intelligence|G06N|
+---------+-----------------------+----+
only showing top

                                                                                

In [5]:
# Reduce filtered_patents to exactly one (branch, code) row per patent_id.
#
# dropDuplicates(["patent_id"]) keeps an arbitrary row from each group, so a
# patent that matches several branches could be assigned to a different branch
# on every run. Ranking the rows of each patent by (branch, code) and keeping
# the first one makes the assignment reproducible.
# NOTE(review): the alphabetical tie-break is deterministic but otherwise
# arbitrary - change the ordering if some branch should take precedence.
first_match_per_patent = Window.partitionBy("patent_id").orderBy("branch", "code")

unique_patents_df = (
    filtered_patents
    .withColumn("_match_rank", row_number().over(first_match_per_patent))
    .filter(col("_match_rank") == 1)
    .drop("_match_rank")  # helper column only; output keeps patent_id, branch, code
)

unique_patents_df.show(truncate=False)



+---------+----------------------------------+---------+
|patent_id|branch                            |code     |
+---------+----------------------------------+---------+
|10000036 |Data Science and Analytics        |B82Y30/00|
|10001293 |Networking and Distributed Systems|H04L67/10|
|10001380 |Networking and Distributed Systems|H04W     |
|10001910 |Software Development and Security |G06F3/048|
|10002005 |Software Development and Security |G06F9/451|
|10002009 |Software Development and Security |G06F21/00|
|10002036 |Networking and Distributed Systems|H04L67/10|
|10002095 |Artificial Intelligence           |G06N     |
|10002107 |Artificial Intelligence           |G06N     |
|10002157 |Data Science and Analytics        |G06F16/27|
|10002302 |Networking and Distributed Systems|H04W     |
|10002323 |Artificial Intelligence           |G06N     |
|10002325 |Artificial Intelligence           |G06N     |
|10002327 |Artificial Intelligence           |G06N     |
|10002337 |Artificial Intellige

                                                                                

In [6]:
patent_df = spark.read.parquet("input_parquet/patent_info")

# Attach branch/code to the patent metadata. The inner join keeps only the
# patents present in unique_patents_df. Both inputs carry a patent_id column,
# so the copy coming from patent_df is dropped and the one from
# unique_patents_df is kept.
same_patent = patent_df["patent_id"] == unique_patents_df["patent_id"]

patents_filtered_df = (
    patent_df
    .join(unique_patents_df, on=same_patent, how="inner")
    .drop(patent_df["patent_id"])
)

patents_filtered_df.show(truncate=False)





+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|patent_type|patent_date|patent_title                                                                                                                                           |num_claims|application_id|filing_date|series_code|patent_id|branch                            |code     |
+-----------+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------+-----------+-----------+---------+----------------------------------+---------+
|utility    |2018-06-19 |High kinetic energy penetrator shielding and high wear resistance materials fabricated with boron nitride nanotubes (BNNTS) an

                                                                                

In [7]:
# Reduce patent_date and filing_date to their calendar year, stored as a date
# pinned to January 1st of that year (e.g. 2018-06-19 -> 2018-01-01).
grant_year_start = make_date(year(col("patent_date")), lit(1), lit(1))
filed_year_start = make_date(year(col("filing_date")), lit(1), lit(1))

patent_df_with_years = (
    patents_filtered_df
    .withColumn("grant_year", grant_year_start)
    .withColumn("filed_year", filed_year_start)
)


In [8]:
# Number of patents per (grant year, branch) ...
grant_counts = (
    patent_df_with_years
    .groupBy("grant_year", "branch")
    .agg(count(col("patent_id")).alias("patent_grant_count"))
)

# ... and per (filing year, branch).
filed_counts = (
    patent_df_with_years
    .groupBy("filed_year", "branch")
    .agg(count(col("patent_id")).alias("patent_filed_count"))
)


In [9]:
# Give the branch column a side-specific name so that every column of the
# joined frame below has a unique name.
grant_counts = grant_counts.withColumnRenamed("branch", "grant_branch")
filed_counts = filed_counts.withColumnRenamed("branch", "filed_branch")

# Line up grant and filing counts on (year, branch). The full outer join keeps
# pairs that exist on only one side, e.g. a year with filings but no grants.
same_year = grant_counts["grant_year"] == filed_counts["filed_year"]
same_branch = grant_counts["grant_branch"] == filed_counts["filed_branch"]

result = grant_counts.join(filed_counts, on=same_year & same_branch, how="full_outer")

# A pair missing from one side has a null count on that side; report it as 0.
result_filled = result.na.fill({"patent_grant_count": 0, "patent_filed_count": 0})


In [10]:
from pyspark.sql import functions as F


# After the full outer join, the key columns of the non-matching side are
# null, so take whichever side is populated to get a single branch/year key.
branch_key = F.coalesce(grant_counts["grant_branch"], filed_counts["filed_branch"])
year_key = F.coalesce(grant_counts["grant_year"], filed_counts["filed_year"])

final_result = result_filled.select(
    branch_key.alias("branch"),
    year_key.alias("year"),
    F.coalesce(result_filled["patent_grant_count"], F.lit(0)).alias("grant_count"),
    F.coalesce(result_filled["patent_filed_count"], F.lit(0)).alias("filed_count"),
)

# Preview the combined grant/filing counts.
final_result.show(truncate=False)




+----------------------------------+----------+-----------+-----------+
|branch                            |year      |grant_count|filed_count|
+----------------------------------+----------+-----------+-----------+
|Software Development and Security |1959-01-01|0          |1          |
|Networking and Distributed Systems|1965-01-01|0          |1          |
|Artificial Intelligence           |1969-01-01|0          |1          |
|Software Development and Security |1969-01-01|0          |2          |
|Networking and Distributed Systems|1971-01-01|0          |1          |
|Artificial Intelligence           |1972-01-01|0          |1          |
|Artificial Intelligence           |1973-01-01|0          |8          |
|Data Science and Analytics        |1973-01-01|0          |2          |
|Networking and Distributed Systems|1973-01-01|0          |4          |
|Software Development and Security |1973-01-01|0          |1          |
|Artificial Intelligence           |1974-01-01|0          |27   

                                                                                

In [11]:
# Year is taken from patent_date here, i.e. the same date that grant_year is
# derived from.
df_with_year = patents_filtered_df.withColumn("year", F.year("patent_date"))

# Mean num_claims per (year, branch).
avg_claims = (
    df_with_year
    .groupBy("year", "branch")
    .agg(F.avg("num_claims").alias("avg_claims"))
    # Re-encode the integer year as a Jan-1 date so it matches final_result.year.
    .withColumn("year", make_date(col("year"), lit(1), lit(1)))
)

# Left join: every (year, branch) row of final_result is kept; rows with no
# patents dated in that year get a null avg_claims.
combined_df = final_result.join(avg_claims, on=["year", "branch"], how="left")

combined_df.show(truncate=False)

[Stage 38:>                 (0 + 8) / 9][Stage 43:>                 (0 + 0) / 1]

+----------+----------------------------------+-----------+-----------+------------------+
|year      |branch                            |grant_count|filed_count|avg_claims        |
+----------+----------------------------------+-----------+-----------+------------------+
|1965-01-01|Networking and Distributed Systems|0          |1          |NULL              |
|1975-01-01|Advanced Computing Technologies   |0          |1          |NULL              |
|1974-01-01|Software Development and Security |0          |2          |NULL              |
|1975-01-01|Artificial Intelligence           |0          |60         |NULL              |
|1973-01-01|Artificial Intelligence           |0          |8          |NULL              |
|1975-01-01|Software Development and Security |0          |2          |NULL              |
|1973-01-01|Networking and Distributed Systems|0          |4          |NULL              |
|1974-01-01|Data Science and Analytics        |0          |10         |NULL              |

                                                                                

In [12]:
# Location of the inventor data.
inventor_data_path = "input_parquet/inventor_info"
inventor_df = spark.read.parquet(inventor_data_path)

# Keep only inventor rows whose patent is in patents_filtered_df, bringing in
# the patent's columns (branch, patent_date, ...). Both inputs have a
# patent_id column; the inventor-side copy is dropped.
same_patent_id = inventor_df["patent_id"] == patents_filtered_df["patent_id"]

inventor_filtered_df = (
    inventor_df
    .join(patents_filtered_df, on=same_patent_id, how="inner")
    .drop(inventor_df["patent_id"])
)

# Integer year extracted from patent_date.
inventor_df_with_year = inventor_filtered_df.withColumn("year", F.year("patent_date"))


In [13]:
# Inventor rows with no gender_code are labelled "U" so that they can be
# selected with an ordinary equality filter.
gender_or_unknown = F.coalesce(F.col("gender_code"), F.lit("U"))
inventer_with_gender = inventor_df_with_year.withColumn("gender_code", gender_or_unknown)

# Rows whose gender_code is "M".
male_df = inventer_with_gender.where(F.col("gender_code") == "M")

# Count of those rows per (year, branch). This counts inventor rows, so a
# patent with several matching inventors contributes once per inventor row.
male_count_df = (
    male_df
    .groupBy("year", "branch")
    .agg(count("patent_id").alias("male_count"))
    # Jan-1 date encoding, matching combined_df.year.
    .withColumn("year", make_date(col("year"), lit(1), lit(1)))
)

# Left join keeps every (year, branch) of combined_df; pairs without any "M"
# rows get a null male_count.
male_combined_df = (
    combined_df
    .join(male_count_df, on=["year", "branch"], how="left")
    .orderBy(F.col("year").desc())
)

male_combined_df.show(truncate=False)

                                                                                

+----------+----------------------------------+-----------+-----------+------------------+----------+
|year      |branch                            |grant_count|filed_count|avg_claims        |male_count|
+----------+----------------------------------+-----------+-----------+------------------+----------+
|2024-01-01|Networking and Distributed Systems|17502      |56         |18.759227516855216|41876     |
|2024-01-01|Data Science and Analytics        |1265       |3          |17.583399209486167|3208      |
|2024-01-01|Software Development and Security |1524       |3          |19.211286089238847|3956      |
|2024-01-01|Advanced Computing Technologies   |123        |0          |15.959349593495935|268       |
|2024-01-01|Artificial Intelligence           |22116      |104        |18.173132573702297|56927     |
|2023-01-01|Software Development and Security |2266       |365        |19.127537511032656|5831      |
|2023-01-01|Artificial Intelligence           |29707      |3831       |18.43097586

In [14]:
# Rows whose gender_code is "F".
female_df = inventer_with_gender.where(F.col("gender_code") == "F")

# Count of those rows per (year, branch), with year re-encoded as a Jan-1 date
# so it lines up with male_combined_df.year.
female_count_df = (
    female_df
    .groupBy("year", "branch")
    .agg(count("patent_id").alias("female_count"))
    .withColumn("year", make_date(col("year"), lit(1), lit(1)))
)

# Left join keeps every row of male_combined_df; pairs without any "F" rows
# get a null female_count.
female_combined_df = (
    male_combined_df
    .join(female_count_df, on=["year", "branch"], how="left")
    .orderBy(F.col("year").desc())
)

female_combined_df.show(truncate=False)

                                                                                

+----------+----------------------------------+-----------+-----------+------------------+----------+------------+
|year      |branch                            |grant_count|filed_count|avg_claims        |male_count|female_count|
+----------+----------------------------------+-----------+-----------+------------------+----------+------------+
|2024-01-01|Networking and Distributed Systems|17502      |56         |18.759227516855216|41876     |7586        |
|2024-01-01|Data Science and Analytics        |1265       |3          |17.583399209486167|3208      |607         |
|2024-01-01|Software Development and Security |1524       |3          |19.211286089238847|3956      |680         |
|2024-01-01|Advanced Computing Technologies   |123        |0          |15.959349593495935|268       |40          |
|2024-01-01|Artificial Intelligence           |22116      |104        |18.173132573702297|56927     |10100       |
|2023-01-01|Software Development and Security |2266       |365        |19.127537

In [15]:
# Rows whose gender_code is "U", the label given to rows that had no
# gender_code.
unknown_df = inventer_with_gender.filter(F.col("gender_code") == "U")

# Count of those rows per (year, branch).
unknown_count_df = unknown_df.groupBy("year", "branch").agg(
    count("patent_id").alias("unknown_count")
)

# Re-encode the integer year as a Jan-1 date so it matches female_combined_df.year.
unknown_count_df = unknown_count_df.withColumn("year", make_date(col("year"), lit(1), lit(1)))

# The former `unknown_count_df_filled` step was removed: it was never used
# (the join below has always read unknown_count_df), and it applied
# F.nanvl - a floating-point NaN replacement - to an integer count, which is
# never NaN. Missing counts only appear as nulls after the left join below and
# are replaced with 0 by the coalesce in the final select.
unknown_combined_df = female_combined_df.join(
    unknown_count_df,
    on=["year", "branch"],
    how="left"
)




In [16]:
# Columns filled in by left joins; they are null for (year, branch) pairs that
# had no matching rows, and are reported as 0 instead.
# NOTE(review): this also turns a missing avg_claims into 0.0, which reads as
# "average of zero claims" rather than "no data" - confirm that is intended.
zero_filled_columns = ["unknown_count", "female_count", "male_count", "avg_claims"]

final_combined_result = unknown_combined_df.select(
    "year",
    "branch",
    "grant_count",
    "filed_count",
    *[
        F.coalesce(F.col(name), F.lit(0)).alias(name)
        for name in zero_filled_columns
    ],
)

final_combined_result.show(truncate=False)

[Stage 155:>  (0 + 8) / 9][Stage 160:>  (0 + 0) / 1][Stage 163:>  (0 + 0) / 1]0]

+----------+----------------------------------+-----------+-----------+-------------+------------+----------+------------------+
|year      |branch                            |grant_count|filed_count|unknown_count|female_count|male_count|avg_claims        |
+----------+----------------------------------+-----------+-----------+-------------+------------+----------+------------------+
|1965-01-01|Networking and Distributed Systems|0          |1          |0            |0           |0         |0.0               |
|1975-01-01|Advanced Computing Technologies   |0          |1          |0            |0           |0         |0.0               |
|1974-01-01|Software Development and Security |0          |2          |0            |0           |0         |0.0               |
|1975-01-01|Artificial Intelligence           |0          |60         |0            |0           |0         |0.0               |
|1973-01-01|Artificial Intelligence           |0          |8          |0            |0           

                                                                                

In [17]:
# Output column order and the type each column is cast to before writing.
output_column_types = {
    "year": "date",
    "branch": "string",
    "grant_count": "int",
    "filed_count": "int",
    "male_count": "int",
    "female_count": "int",
    "unknown_count": "int",
    "avg_claims": "float",
}

result_with_schema = final_combined_result.select(
    *[
        col(name).cast(type_name)
        for name, type_name in output_column_types.items()
    ]
)

result_with_schema.printSchema()

root
 |-- year: date (nullable = true)
 |-- branch: string (nullable = true)
 |-- grant_count: integer (nullable = false)
 |-- filed_count: integer (nullable = false)
 |-- male_count: integer (nullable = false)
 |-- female_count: integer (nullable = false)
 |-- unknown_count: integer (nullable = false)
 |-- avg_claims: float (nullable = false)



In [18]:
# Persist the final table as Parquet, replacing any previous output at this path.
result_with_schema.write.mode("overwrite").parquet("analytics_output_parquet/trend_analytics")

                                                                                