In [2]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import re

# Build (or fetch the already-running) Spark session for this notebook,
# requesting 8g for both the driver and the executors.
spark = (
    SparkSession.builder
    .appName("Preprocessing")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

file_path = "raw_data/g_application.tsv"

# Explicit schema for the application table; every field is nullable.
application_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("application_id", StringType()),
        ("patent_id", StringType()),
        ("patent_application_type", StringType()),
        ("filing_date", DateType()),
        ("series_code", StringType()),
        ("rule_47_flag", IntegerType()),
    ]
])

# Tab-delimited file whose first line is a header row.
application_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(application_schema)
    .csv(file_path)
)

application_df.printSchema()

record_count = application_df.count()
print(f"Total number of records: {record_count}")

application_df.show(10)

root
 |-- application_id: string (nullable = true)
 |-- patent_id: string (nullable = true)
 |-- patent_application_type: string (nullable = true)
 |-- filing_date: date (nullable = true)
 |-- series_code: string (nullable = true)
 |-- rule_47_flag: integer (nullable = true)



                                                                                

Total number of records: 8977871
+--------------+---------+-----------------------+-----------+-----------+------------+
|application_id|patent_id|patent_application_type|filing_date|series_code|rule_47_flag|
+--------------+---------+-----------------------+-----------+-----------+------------+
|      05497504|  3963197|                     05| 1074-08-14|         05|           0|
|      05508062|  3933359|                     05| 1074-09-23|         05|           0|
|      05518254|  3941467|                     05| 1074-10-29|         05|           0|
|      05518570|  3936670|                     05| 1074-10-29|         05|           0|
|      05555245|  4003574|                     05| 1075-03-04|         05|           0|
|      05563957|  3937110|                     05| 1075-04-01|         05|           0|
|      05564147|  3943740|                     05| 1075-04-01|         05|           0|
|      05571931|  3967129|                     05| 1075-04-28|         05|           0|

In [3]:
def filter_valid_dates(df: DataFrame, date_column: str):
    """Keep only the rows whose date looks like a 19xx/20xx ``YYYY-MM-DD`` value.

    The check is purely textual: the column's string form must match
    ``^(19|20)\\d{2}-[0-1]\\d-[0-3]\\d$``. It constrains the century and the
    digit ranges of month/day, not full calendar validity.

    Side effects: prints the number of rejected rows and shows the first
    10 accepted rows.

    Args:
        df: DataFrame to filter.
        date_column: Name of the column holding the date to check.

    Returns:
        A DataFrame containing only the rows whose date matches the pattern.
    """
    valid_date_pattern = r"^(19|20)\d{2}-[0-1]\d-[0-3]\d$"

    # rlike() on a NULL value yields NULL, and negating NULL is still NULL, so
    # a plain `~rlike(...)` filter silently skips NULL dates and they are never
    # counted as invalid. Guarding with isNotNull() makes the predicate strictly
    # True/False (False AND NULL is False), so its negation also covers NULLs.
    date_text = col(date_column).cast("string")
    is_valid_date = col(date_column).isNotNull() & date_text.rlike(valid_date_pattern)

    valid_date_df = df.filter(is_valid_date)
    invalid_date_count = df.filter(~is_valid_date).count()

    print(f"Number of records with incorrect dates: {invalid_date_count}")
    print("Top 10 records with valid dates:")
    valid_date_df.show(10)

    return valid_date_df

In [4]:
# Keep only applications whose filing date passes the date-format check.
application_valid_date_df = filter_valid_dates(
    df=application_df,
    date_column="filing_date",
)



Number of records with incorrect dates: 48
Top 10 records with valid dates:
+--------------+---------+-----------------------+-----------+-----------+------------+
|application_id|patent_id|patent_application_type|filing_date|series_code|rule_47_flag|
+--------------+---------+-----------------------+-----------+-----------+------------+
|      06185782|  D268871|                     06| 1900-09-18|         06|           0|
|      07469540|  5180907|                     07| 1901-06-03|         07|           0|
|      07720223|  5340165|                     07| 1901-06-21|         07|           0|
|      07918107|  5243083|                     07| 1902-07-24|         07|           0|
|      07960398|  5298141|                     07| 1903-01-15|         07|           0|
|      08012055|  5479042|                     08| 1903-02-01|         08|           0|
|      08068932|  5381357|                     08| 1903-05-28|         08|           0|
|      08078125|  5484917|                  

                                                                                

In [5]:
def drop_columns(df: DataFrame, columns_to_drop: list) -> DataFrame:
    """Return ``df`` without the given columns and preview the result.

    Side effect: shows the first 10 rows of the resulting DataFrame.

    Args:
        df: DataFrame to remove columns from.
        columns_to_drop: Column names to remove. A single column name given
            as a plain string is also accepted.

    Returns:
        The DataFrame produced by ``df.drop(...)``.
    """
    # A bare string would otherwise be star-unpacked into its individual
    # characters, so "filename" would try to drop columns "f", "i", "l", ...
    if isinstance(columns_to_drop, str):
        columns_to_drop = [columns_to_drop]

    dropped_df = df.drop(*columns_to_drop)
    dropped_df.show(10)
    return dropped_df

In [6]:
# Columns removed from the date-filtered application table.
columns_to_drop = ["rule_47_flag", "patent_application_type"]
application_df_dropped = drop_columns(
    df=application_valid_date_df,
    columns_to_drop=columns_to_drop,
)

+--------------+---------+-----------+-----------+
|application_id|patent_id|filing_date|series_code|
+--------------+---------+-----------+-----------+
|      06185782|  D268871| 1900-09-18|         06|
|      07469540|  5180907| 1901-06-03|         07|
|      07720223|  5340165| 1901-06-21|         07|
|      07918107|  5243083| 1902-07-24|         07|
|      07960398|  5298141| 1903-01-15|         07|
|      08012055|  5479042| 1903-02-01|         08|
|      08068932|  5381357| 1903-05-28|         08|
|      08078125|  5484917| 1903-06-16|         08|
|       D008593|  D356729| 1903-05-19|          D|
|      06670322|  4875871| 1904-11-09|         06|
+--------------+---------+-----------+-----------+
only showing top 10 rows



In [7]:
file_path = "raw_data/g_patent.tsv"

# Explicit schema for the patent table; every field is nullable.
patent_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("patent_id", StringType()),
        ("patent_type", StringType()),
        ("patent_date", DateType()),
        ("patent_title", StringType()),
        ("wipo_kind", StringType()),
        ("num_claims", IntegerType()),
        ("withdrawn", IntegerType()),
        ("filename", StringType()),
    ]
])

# Tab-delimited file whose first line is a header row.
patent_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(patent_schema)
    .csv(file_path)
)
patent_df.printSchema()

record_count = patent_df.count()
print(f"Total number of records: {record_count}")

patent_df.show(10)

root
 |-- patent_id: string (nullable = true)
 |-- patent_type: string (nullable = true)
 |-- patent_date: date (nullable = true)
 |-- patent_title: string (nullable = true)
 |-- wipo_kind: string (nullable = true)
 |-- num_claims: integer (nullable = true)
 |-- withdrawn: integer (nullable = true)
 |-- filename: string (nullable = true)



[Stage 9:>                                                          (0 + 8) / 8]

Total number of records: 8980130
+---------+-----------+-----------+--------------------+---------+----------+---------+-------------+
|patent_id|patent_type|patent_date|        patent_title|wipo_kind|num_claims|withdrawn|     filename|
+---------+-----------+-----------+--------------------+---------+----------+---------+-------------+
| 10000000|    utility| 2018-06-19|Coherent LADAR us...|       B2|        20|        0|ipg180619.xml|
| 10000001|    utility| 2018-06-19|Injection molding...|       B2|        12|        0|ipg180619.xml|
| 10000002|    utility| 2018-06-19|Method for manufa...|       B2|         9|        0|ipg180619.xml|
| 10000003|    utility| 2018-06-19|Method for produc...|       B2|        18|        0|ipg180619.xml|
| 10000004|    utility| 2018-06-19|Process of obtain...|       B2|         6|        0|ipg180619.xml|
| 10000005|    utility| 2018-06-19|Article vacuum fo...|       B2|         4|        0|ipg180619.xml|
| 10000006|    utility| 2018-06-19|Thermoforming 

                                                                                

In [8]:
# Keep only patents whose grant date passes the date-format check.
patent_valid_date_df = filter_valid_dates(
    df=patent_df,
    date_column="patent_date",
)

[Stage 13:>                                                         (0 + 8) / 8]

Number of records with incorrect dates: 0
Top 10 records with valid dates:
+---------+-----------+-----------+--------------------+---------+----------+---------+-------------+
|patent_id|patent_type|patent_date|        patent_title|wipo_kind|num_claims|withdrawn|     filename|
+---------+-----------+-----------+--------------------+---------+----------+---------+-------------+
| 10000000|    utility| 2018-06-19|Coherent LADAR us...|       B2|        20|        0|ipg180619.xml|
| 10000001|    utility| 2018-06-19|Injection molding...|       B2|        12|        0|ipg180619.xml|
| 10000002|    utility| 2018-06-19|Method for manufa...|       B2|         9|        0|ipg180619.xml|
| 10000003|    utility| 2018-06-19|Method for produc...|       B2|        18|        0|ipg180619.xml|
| 10000004|    utility| 2018-06-19|Process of obtain...|       B2|         6|        0|ipg180619.xml|
| 10000005|    utility| 2018-06-19|Article vacuum fo...|       B2|         4|        0|ipg180619.xml|
| 10000

                                                                                

In [9]:
# Columns removed from the date-filtered patent table.
columns_to_drop = ["wipo_kind", "withdrawn", "filename"]
patent_df_dropped = drop_columns(
    df=patent_valid_date_df,
    columns_to_drop=columns_to_drop,
)

+---------+-----------+-----------+--------------------+----------+
|patent_id|patent_type|patent_date|        patent_title|num_claims|
+---------+-----------+-----------+--------------------+----------+
| 10000000|    utility| 2018-06-19|Coherent LADAR us...|        20|
| 10000001|    utility| 2018-06-19|Injection molding...|        12|
| 10000002|    utility| 2018-06-19|Method for manufa...|         9|
| 10000003|    utility| 2018-06-19|Method for produc...|        18|
| 10000004|    utility| 2018-06-19|Process of obtain...|         6|
| 10000005|    utility| 2018-06-19|Article vacuum fo...|         4|
| 10000006|    utility| 2018-06-19|Thermoforming mol...|         8|
| 10000007|    utility| 2018-06-19|  PEX expanding tool|        24|
| 10000008|    utility| 2018-06-19|Bracelet mold and...|        11|
| 10000009|    utility| 2018-06-19|Sterile environme...|        21|
+---------+-----------+-----------+--------------------+----------+
only showing top 10 rows



In [10]:
key_column = "patent_id"

# Inner-join patents to their applications on the shared key, then remove the
# application-side copy of that key so the result has a single patent_id column.
patent_info_joined_df = (
    patent_df_dropped
    .join(
        application_df_dropped,
        on=patent_df_dropped[key_column] == application_df_dropped[key_column],
        how="inner",
    )
    .drop(application_df_dropped[key_column])
)

record_count = patent_info_joined_df.count()
print(f"Total number of records: {record_count}")
patent_info_joined_df.printSchema()
patent_info_joined_df.show(10)

                                                                                

Total number of records: 8977823
root
 |-- patent_id: string (nullable = true)
 |-- patent_type: string (nullable = true)
 |-- patent_date: date (nullable = true)
 |-- patent_title: string (nullable = true)
 |-- num_claims: integer (nullable = true)
 |-- application_id: string (nullable = true)
 |-- filing_date: date (nullable = true)
 |-- series_code: string (nullable = true)





+---------+-----------+-----------+--------------------+----------+--------------+-----------+-----------+
|patent_id|patent_type|patent_date|        patent_title|num_claims|application_id|filing_date|series_code|
+---------+-----------+-----------+--------------------+----------+--------------+-----------+-----------+
| 10000007|    utility| 2018-06-19|  PEX expanding tool|        24|      15178786| 2016-06-10|         15|
| 10000016|    utility| 2018-06-19|Film edge sealing...|        22|      15294450| 2016-10-14|         15|
| 10000018|    utility| 2018-06-19|Pull tab design f...|        13|      15148543| 2016-05-06|         15|
| 10000021|    utility| 2018-06-19|Method for manufa...|         4|      13378475| 2010-06-23|         13|
| 10000024|    utility| 2018-06-19|Apparatus and met...|        25|      14588197| 2014-12-31|         14|
| 10000036|    utility| 2018-06-19|High kinetic ener...|        20|      14753848| 2015-06-29|         14|
| 10000043|    utility| 2018-06-19|Mu

                                                                                

In [11]:
file_path = "raw_data/g_cpc_current.tsv"

# Explicit schema for the current CPC assignments; every field is nullable.
cpc_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("patent_id", StringType()),
        ("cpc_sequence", IntegerType()),
        ("cpc_section", StringType()),
        ("cpc_class", StringType()),
        ("cpc_subclass", StringType()),
        ("cpc_group", StringType()),
        ("cpc_type", StringType()),
    ]
])

# Tab-delimited file whose first line is a header row.
cpc_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(cpc_schema)
    .csv(file_path)
)
cpc_df.printSchema()

record_count = cpc_df.count()
print(f"Total number of records: {record_count}")

cpc_df.show(10)

root
 |-- patent_id: string (nullable = true)
 |-- cpc_sequence: integer (nullable = true)
 |-- cpc_section: string (nullable = true)
 |-- cpc_class: string (nullable = true)
 |-- cpc_subclass: string (nullable = true)
 |-- cpc_group: string (nullable = true)
 |-- cpc_type: string (nullable = true)





Total number of records: 54914750
+---------+------------+-----------+---------+------------+-----------+-----------+
|patent_id|cpc_sequence|cpc_section|cpc_class|cpc_subclass|  cpc_group|   cpc_type|
+---------+------------+-----------+---------+------------+-----------+-----------+
|  3950000|           0|          A|      A63|        A63C|  A63C9/001|inventional|
|  3950000|           1|          A|      A63|        A63C|   A63C9/00|inventional|
|  3950000|           2|          A|      A63|        A63C|  A63C9/002|inventional|
|  3950000|           3|          A|      A63|        A63C|  A63C9/081|inventional|
|  3950001|           0|          A|      A63|        A63C|  A63C9/086|inventional|
|  3950001|           1|          A|      A63|        A63C|  A63C9/005|inventional|
|  3950001|           2|          A|      A63|        A63C|  A63C9/003| additional|
|  3950002|           0|          A|      A63|        A63C|  A63C9/001|inventional|
|  3950002|           1|          A|      

                                                                                

In [12]:
# NOTE(review): this input is read from "data/" while the other TSV inputs in
# this notebook are read from "raw_data/" — confirm the path is intentional.
file_path = "data/g_cpc_title.tsv"

# Explicit schema for the CPC title lookup; every field is nullable.
cpc_title_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in [
        "cpc_subclass",
        "cpc_subclass_title",
        "cpc_group",
        "cpc_group_title",
        "cpc_class",
        "cpc_class_title",
    ]
])

# Tab-delimited file whose first line is a header row.
cpc_title_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(cpc_title_schema)
    .csv(file_path)
)
cpc_title_df.printSchema()

record_count = cpc_title_df.count()
print(f"Total number of records: {record_count}")

cpc_title_df.show(10)

root
 |-- cpc_subclass: string (nullable = true)
 |-- cpc_subclass_title: string (nullable = true)
 |-- cpc_group: string (nullable = true)
 |-- cpc_group_title: string (nullable = true)
 |-- cpc_class: string (nullable = true)
 |-- cpc_class_title: string (nullable = true)

Total number of records: 267350
+------------+--------------------+---------+--------------------+---------+--------------------+
|cpc_subclass|  cpc_subclass_title|cpc_group|     cpc_group_title|cpc_class|     cpc_class_title|
+------------+--------------------+---------+--------------------+---------+--------------------+
|        A01B|SOIL WORKING IN A...| A01B1/00|         Hand tools |      A01|AGRICULTURE; FORE...|
|        A01B|SOIL WORKING IN A...| A01B1/02|Hand tools -Spade...|      A01|AGRICULTURE; FORE...|
|        A01B|SOIL WORKING IN A...|A01B1/022|Hand tools -Spade...|      A01|AGRICULTURE; FORE...|
|        A01B|SOIL WORKING IN A...|A01B1/024|Hand tools -Spade...|      A01|AGRICULTURE; FORE...|
|     

In [13]:
from pyspark.sql.types import StructType, StructField, StringType

# Two nullable string columns: the one-letter section code and its description.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("cpc_section", "section_description")
])

# Hand-written lookup of CPC section codes to their descriptions.
data = [
    ("A", "Human Necessities"),
    ("B", "Performing Operations; Transporting"),
    ("C", "Chemistry; Metallurgy"),
    ("D", "Textiles; Paper"),
    ("E", "Fixed Constructions"),
    ("F", "Mechanical Engineering; Lighting; Heating; Weapons; Blasting Engines or Pumps"),
    ("G", "Physics"),
    ("H", "Electricity"),
    ("Y", "General Tagging of New Technological Developments"),
]

cpc_section_df = spark.createDataFrame(data, schema)

# Show full descriptions instead of truncating long strings.
cpc_section_df.show(truncate=False)

+-----------+-----------------------------------------------------------------------------+
|cpc_section|section_description                                                          |
+-----------+-----------------------------------------------------------------------------+
|A          |Human Necessities                                                            |
|B          |Performing Operations; Transporting                                          |
|C          |Chemistry; Metallurgy                                                        |
|D          |Textiles; Paper                                                              |
|E          |Fixed Constructions                                                          |
|F          |Mechanical Engineering; Lighting; Heating; Weapons; Blasting Engines or Pumps|
|G          |Physics                                                                      |
|H          |Electricity                                                        

In [14]:
# Split the CPC title table into three de-duplicated (code, title) lookups,
# one per hierarchy level.
cpc_group_df = cpc_title_df.select(["cpc_group", "cpc_group_title"]).distinct()
cpc_subclass_df = cpc_title_df.select(["cpc_subclass", "cpc_subclass_title"]).distinct()
cpc_class_df = cpc_title_df.select(["cpc_class", "cpc_class_title"]).distinct()

# Preview each lookup in turn.
print("Unique CPC Groups:")
cpc_group_df.show(10)

print("Unique CPC Subclasses:")
cpc_subclass_df.show(10)

print("Unique CPC Classes:")
cpc_class_df.show(10)

Unique CPC Groups:


                                                                                

+-------------+--------------------+
|    cpc_group|     cpc_group_title|
+-------------+--------------------+
|    A01B33/14|Tilling implement...|
|   A01C23/007|Distributing devi...|
|    A01D11/02|Other hand implem...|
|   A01D34/015|Mowers ; Mowing a...|
|    A01D43/07|Mowers combined w...|
|    A01D46/26|Picking of fruits...|
|    A01D65/02|Grain-crop lifter...|
|   A01D75/246|Accessories for h...|
|A01F2015/0891|Baling presses fo...|
|    A01G17/02|Cultivation of ho...|
+-------------+--------------------+
only showing top 10 rows

Unique CPC Subclasses:
+------------+--------------------+
|cpc_subclass|  cpc_subclass_title|
+------------+--------------------+
|        A01J|MANUFACTURE OF DA...|
|        A23F|COFFEE; TEA; THEI...|
|        A61C|DENTISTRY; APPARA...|
|        A61K|PREPARATIONS FOR ...|
|        B01D|         SEPARATION |
|        A44B|BUTTONS, PINS, BU...|
|        A63J|DEVICES FOR THEAT...|
|        A22C|PROCESSING MEAT, ...|
|        A01F|PROCESSING OF HAR...|
|

In [15]:
file_path = "raw_data/g_inventor_disambiguated.tsv"

# Explicit schema for the disambiguated inventor table; every field is nullable.
inventor_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("patent_id", StringType()),
        ("inventor_sequence", IntegerType()),
        ("inventor_id", StringType()),
        ("disambig_inventor_name_first", StringType()),
        ("disambig_inventor_name_last", StringType()),
        ("gender_code", StringType()),
        ("location_id", StringType()),
    ]
])

# Tab-delimited file whose first line is a header row.
inventor_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(inventor_schema)
    .csv(file_path)
)
inventor_df.printSchema()

record_count = inventor_df.count()
print(f"Total number of records: {record_count}")

inventor_df.show(10)

root
 |-- patent_id: string (nullable = true)
 |-- inventor_sequence: integer (nullable = true)
 |-- inventor_id: string (nullable = true)
 |-- disambig_inventor_name_first: string (nullable = true)
 |-- disambig_inventor_name_last: string (nullable = true)
 |-- gender_code: string (nullable = true)
 |-- location_id: string (nullable = true)





Total number of records: 22595784
+---------+-----------------+--------------------+----------------------------+---------------------------+-----------+--------------------+
|patent_id|inventor_sequence|         inventor_id|disambig_inventor_name_first|disambig_inventor_name_last|gender_code|         location_id|
+---------+-----------------+--------------------+----------------------------+---------------------------+-----------+--------------------+
| D1006496|                0|  fl:we_ln:jiang-128|                     Wenjing|                      Jiang|          F|9d072d42-49af-11e...|
| 12029253|                4|  fl:ei_ln:baumker-1|                        Eiko|                    BÄUMKER|          M|67149e17-49af-11e...|
|  6584128|                0|  fl:ri_ln:kroeger-1|                     Richard|                    Kroeger|          M|                NULL|
|  4789863|                0|     fl:th_ln:bush-1|                   Thomas A.|                       Bush|          M| 

                                                                                

In [16]:
file_path = "raw_data/g_location_disambiguated.tsv"

# Explicit schema for the disambiguated location table; every field is nullable.
location_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("location_id", StringType()),
        ("disambig_city", StringType()),
        ("disambig_state", StringType()),
        ("disambig_country", StringType()),
        ("latitude", FloatType()),
        ("longitude", FloatType()),
        ("county", StringType()),
        ("state_fips", StringType()),
        ("county_fips", StringType()),
    ]
])

# Tab-delimited file whose first line is a header row.
location_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(location_schema)
    .csv(file_path)
)
location_df.printSchema()

record_count = location_df.count()
print(f"Total number of records: {record_count}")

location_df.show(10)

root
 |-- location_id: string (nullable = true)
 |-- disambig_city: string (nullable = true)
 |-- disambig_state: string (nullable = true)
 |-- disambig_country: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- county: string (nullable = true)
 |-- state_fips: string (nullable = true)
 |-- county_fips: string (nullable = true)

Total number of records: 96039
+--------------------+---------------+--------------+----------------+---------+----------+-------------------+----------+-----------+
|         location_id|  disambig_city|disambig_state|disambig_country| latitude| longitude|             county|state_fips|county_fips|
+--------------------+---------------+--------------+----------------+---------+----------+-------------------+----------+-----------+
|00235947-16c8-11e...|      Westfield|            PA|              US|41.919235| -77.53887|              Tioga|        42|        117|
|00236a27-16c8-11e...|    Helfenstein| 

In [17]:
# Columns removed from the location table.
columns_to_drop = ["state_fips", "county_fips"]
location_df_dropped = drop_columns(
    df=location_df,
    columns_to_drop=columns_to_drop,
)

+--------------------+---------------+--------------+----------------+---------+----------+-------------------+
|         location_id|  disambig_city|disambig_state|disambig_country| latitude| longitude|             county|
+--------------------+---------------+--------------+----------------+---------+----------+-------------------+
|00235947-16c8-11e...|      Westfield|            PA|              US|41.919235| -77.53887|              Tioga|
|00236a27-16c8-11e...|    Helfenstein|            PA|              US|  40.7505|-76.447334|  Schuylkill County|
|00236f47-16c8-11e...|     Pine Forge|            PA|              US| 40.28192| -75.69224|       Berks County|
|00237418-16c8-11e...|        Partlow|            VA|              US| 38.03875| -77.63888|Spotsylvania County|
|002378d7-16c8-11e...|   Stumpy Point|            NC|              US|35.698505|-75.740456|               Dare|
|00238cb7-16c8-11e...|        Millers|            MD|              US|39.671215| -76.85109|     Carroll 

In [18]:
key_column = "location_id"

# Left-join so every inventor row is kept even without a matching location,
# then remove the location-side copy of the key column.
joined_df_inventor = (
    inventor_df
    .join(
        location_df_dropped,
        on=inventor_df[key_column] == location_df_dropped[key_column],
        how="left",
    )
    .drop(location_df_dropped[key_column])
)

joined_df_inventor.printSchema()
record_count = joined_df_inventor.count()
print(f"Total number of records: {record_count}")
joined_df_inventor.show(10)

root
 |-- patent_id: string (nullable = true)
 |-- inventor_sequence: integer (nullable = true)
 |-- inventor_id: string (nullable = true)
 |-- disambig_inventor_name_first: string (nullable = true)
 |-- disambig_inventor_name_last: string (nullable = true)
 |-- gender_code: string (nullable = true)
 |-- location_id: string (nullable = true)
 |-- disambig_city: string (nullable = true)
 |-- disambig_state: string (nullable = true)
 |-- disambig_country: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- county: string (nullable = true)



                                                                                

Total number of records: 22595784
+---------+-----------------+--------------------+----------------------------+---------------------------+-----------+--------------------+---------------+--------------+----------------+---------+----------+---------+
|patent_id|inventor_sequence|         inventor_id|disambig_inventor_name_first|disambig_inventor_name_last|gender_code|         location_id|  disambig_city|disambig_state|disambig_country| latitude| longitude|   county|
+---------+-----------------+--------------------+----------------------------+---------------------------+-----------+--------------------+---------------+--------------+----------------+---------+----------+---------+
| D1006496|                0|  fl:we_ln:jiang-128|                     Wenjing|                      Jiang|          F|9d072d42-49af-11e...|        Guizhou|          NULL|              CN|30.987291|110.713524|     NULL|
| 12029253|                4|  fl:ei_ln:baumker-1|                        Eiko|       

In [20]:
file_path = "raw_data/g_applicant_not_disambiguated.tsv"

# Explicit schema for the raw applicant table; every field is nullable.
applicant_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("patent_id", StringType()),
        ("applicant_sequence", IntegerType()),
        ("raw_applicant_name_first", StringType()),
        ("raw_applicant_name_last", StringType()),
        ("raw_applicant_organization", StringType()),
        ("applicant_type", StringType()),
        ("applicant_designation", StringType()),
        ("applicant_authority", StringType()),
        ("rawlocation_id", StringType()),
    ]
])

# Tab-delimited file whose first line is a header row.
applicant_df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(applicant_schema)
    .csv(file_path)
)
applicant_df.printSchema()

record_count = applicant_df.count()
print(f"Total number of records: {record_count}")

applicant_df.show(10)

root
 |-- patent_id: string (nullable = true)
 |-- applicant_sequence: integer (nullable = true)
 |-- raw_applicant_name_first: string (nullable = true)
 |-- raw_applicant_name_last: string (nullable = true)
 |-- raw_applicant_organization: string (nullable = true)
 |-- applicant_type: string (nullable = true)
 |-- applicant_designation: string (nullable = true)
 |-- applicant_authority: string (nullable = true)
 |-- rawlocation_id: string (nullable = true)

Total number of records: 5908224
+---------+------------------+------------------------+-----------------------+--------------------------+--------------+---------------------+-------------------+--------------------+
|patent_id|applicant_sequence|raw_applicant_name_first|raw_applicant_name_last|raw_applicant_organization|applicant_type|applicant_designation|applicant_authority|      rawlocation_id|
+---------+------------------+------------------------+-----------------------+--------------------------+--------------+-------------

                                                                                

In [21]:
# Columns removed from the applicant table.
columns_to_drop = ["applicant_authority", "rawlocation_id", "applicant_designation"]
applicant_df_dropped = drop_columns(
    df=applicant_df,
    columns_to_drop=columns_to_drop,
)

+---------+------------------+------------------------+-----------------------+--------------------------+--------------+
|patent_id|applicant_sequence|raw_applicant_name_first|raw_applicant_name_last|raw_applicant_organization|applicant_type|
+---------+------------------+------------------------+-----------------------+--------------------------+--------------+
|  9069405|                 3|                   David|                 Bordui|                      NULL|     applicant|
|  9117193|                 6|                James W.|                 Seaman|                      NULL|     applicant|
|  9764256|                 2|          Terence Arthur|                 Devlin|                      NULL|     applicant|
| 10947428|                 1|                    NULL|                   NULL|      PPG Industries Oh...|     applicant|
| 11212562|                 1|                    NULL|                   NULL|      Amazon Technologi...|     applicant|
| 10188493|             

In [23]:
def save_dfs_as_parquet_with_names(dfs, directory_path, df_names):
    """Write each DataFrame to ``<directory_path>/<name>`` in Parquet format.

    DataFrames and names are paired by position. Existing output at a target
    path is overwritten. Saving is best-effort per DataFrame: a failure is
    printed and the remaining DataFrames are still attempted.

    Args:
        dfs: DataFrames to save.
        directory_path: Directory under which one output per DataFrame is written.
        df_names: Output name for each DataFrame, in the same order as ``dfs``.

    Raises:
        ValueError: If ``dfs`` and ``df_names`` do not have the same length.
    """
    dfs = list(dfs)
    df_names = list(df_names)

    # zip() stops at the shorter input, so a length mismatch would otherwise
    # leave some DataFrames unsaved without any message.
    if len(dfs) != len(df_names):
        raise ValueError(
            f"Got {len(dfs)} DataFrames but {len(df_names)} names; "
            "each DataFrame needs exactly one name."
        )

    for df, df_name in zip(dfs, df_names):
        file_path = f"{directory_path}/{df_name}"

        try:
            df.write.parquet(file_path, mode="overwrite")
            print(f"DataFrame '{df_name}' saved successfully to {file_path}")
        except Exception as e:
            # Deliberately keep going so one bad write does not block the rest.
            print(f"Error saving DataFrame '{df_name}' to {file_path}: {e}")


# Persist every preprocessed table under "preprocessed_data"; the two lists
# are paired by position (DataFrame -> output name).
save_dfs_as_parquet_with_names(
    dfs=[
        patent_info_joined_df,
        cpc_df,
        cpc_section_df,
        cpc_group_df,
        cpc_subclass_df,
        cpc_class_df,
        joined_df_inventor,
        applicant_df_dropped,
    ],
    directory_path="preprocessed_data",
    df_names=[
        "patent_info",
        "cpc_info",
        "cpc_section",
        "cpc_group",
        "cpc_subclass",
        "cpc_class",
        "inventor_info",
        "applicant_info",
    ],
)

                                                                                

DataFrame 'patent_info' saved successfully to output_parquet_schema/patent_info


                                                                                

DataFrame 'cpc_info' saved successfully to output_parquet_schema/cpc_info
DataFrame 'cpc_section' saved successfully to output_parquet_schema/cpc_section
DataFrame 'cpc_group' saved successfully to output_parquet_schema/cpc_group
DataFrame 'cpc_subclass' saved successfully to output_parquet_schema/cpc_subclass
DataFrame 'cpc_class' saved successfully to output_parquet_schema/cpc_class


                                                                                

DataFrame 'inventor_info' saved successfully to output_parquet_schema/inventor_info


[Stage 110:>                                                        (0 + 8) / 8]

DataFrame 'applicant_info' saved successfully to output_parquet_schema/applicant_info


24/11/23 11:07:13 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1006990 ms exceeds timeout 120000 ms
24/11/23 11:07:13 WARN SparkContext: Killing executors is not supported by current scheduler.
24/11/23 11:15:55 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$