In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create the notebook-wide Spark session, or reuse the one already running
# in this process. Driver and executor memory are both requested at 8g.
spark = (
    SparkSession.builder
    .appName("KeyBERT")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

24/11/27 19:37:37 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.33 instead (on interface en0)
24/11/27 19:37:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/27 19:37:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
input_path = "data/g_brf_sum_text_2024.tsv"
output_path = "data/summary_partitioned"

# Number of output files (partitions)
num_partitions = 10

# Read the large text file, one RDD element per line
text_rdd = spark.sparkContext.textFile(input_path)

# Key every line by its position in the file so the original line order
# can be restored after the shuffle
text_rdd_with_index = text_rdd.zipWithIndex().map(lambda x: (x[1], x[0]))

# sortByKey range-partitions by line index, so asking it for `num_partitions`
# partitions yields that many ordered output files with a single shuffle.
# (A separate repartition() before the sort only adds a second, discarded
# shuffle and leaves the final partition count to sortByKey's default.)
partitioned_rdd = text_rdd_with_index.sortByKey(numPartitions=num_partitions).values()

# Save partitioned files to the output directory.
# NOTE: saveAsTextFile is expected to fail if `output_path` already exists;
# remove the directory first when re-running this cell.
partitioned_rdd.saveAsTextFile(output_path)

In [2]:
filtered_patents_id_path = "data/patent_ids"

# Load the parquet dataset holding the patent IDs of interest. The path
# variable defined above is the one that must be read: the previously used
# name `input_parquet_path` is not defined anywhere in this notebook and
# raises NameError on a fresh kernel.
patent_ids_df = spark.read.parquet(filtered_patents_id_path)

patent_ids_df.show(10)
patent_ids_count = patent_ids_df.count()

print(f"Total number of patent IDs: {patent_ids_count}")

+---------+
|patent_id|
+---------+
| 11966422|
| 11966824|
| 11880729|
| 12061966|
| 11928212|
| 11928737|
| 11886956|
| 11887494|
| 11887599|
| 12067651|
+---------+
only showing top 10 rows

Total number of patent IDs: 42530


In [3]:
# Bring the single `patent_id` column to the driver as a plain Python list;
# it is used later as the membership list for an `isin` filter.
patent_ids = [
    id_row["patent_id"]
    for id_row in patent_ids_df.select("patent_id").collect()
]

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from keybert import KeyBERT



# Loaded KeyBERT models, keyed by model name. The extractor is called once per
# input file, so caching avoids reloading the same model on every call.
_KEYBERT_MODELS = {}


def extract_keywords_from_dataframe(spark_df: DataFrame, 
                                    text_col: str, 
                                    id_col: str, 
                                    num_keywords: int = 5, 
                                    model_name: str = 'all-MiniLM-L6-v2') -> DataFrame:
    """Extract single-word keywords for every row of a Spark DataFrame with KeyBERT.

    The selected columns are collected to the driver and processed row by row
    in this Python process, so the input must fit in driver memory.

    Args:
        spark_df: Input DataFrame containing ``id_col`` and ``text_col``.
        text_col: Name of the column holding the text to analyse.
        id_col: Name of the column identifying each row; copied to the result.
        num_keywords: Maximum number of keywords requested per row (``top_n``).
        model_name: Model name passed to ``KeyBERT(model=...)``.

    Returns:
        A DataFrame with two columns: ``id_col`` (same type as in ``spark_df``)
        and ``keywords``, an array of ``(_1: keyword, _2: score)`` structs.
        Rows whose text is missing or blank get an empty ``keywords`` array.
        An empty input yields an empty DataFrame with the same schema.
    """
    # Local import keeps this cell runnable without relying on a star import
    # made in another cell.
    from pyspark.sql.types import (
        ArrayType, DoubleType, StringType, StructField, StructType,
    )

    # Initialize the KeyBERT model once per model name and reuse it afterwards
    kw_model = _KEYBERT_MODELS.get(model_name)
    if kw_model is None:
        kw_model = KeyBERT(model=model_name)
        _KEYBERT_MODELS[model_name] = kw_model

    # Collect data from the Spark DataFrame
    data_collected = spark_df.select(id_col, text_col).collect()

    # Prepare results
    results = []
    for row in data_collected:
        patent_id = row[id_col]
        text = row[text_col]

        if text and text.strip():
            # Extract keywords (unigrams only)
            keywords = kw_model.extract_keywords(
                text,
                keyphrase_ngram_range=(1, 1),
                top_n=num_keywords
            )
        else:
            # Nothing to analyse; do not hand None/blank text to the model
            keywords = []

        # Append results as plain (str, float) pairs so they match the schema below
        results.append((patent_id, [(str(kw[0]), float(kw[1])) for kw in keywords]))

    # An explicit schema is required: Spark cannot infer one from an empty
    # list. The struct field names _1/_2 are the names Spark gives to tuple
    # elements when inferring, so the output layout stays the same as before.
    keyword_struct = StructType([
        StructField("_1", StringType(), True),
        StructField("_2", DoubleType(), True),
    ])
    result_schema = StructType([
        StructField(id_col, spark_df.schema[id_col].dataType, True),
        StructField("keywords", ArrayType(keyword_struct, True), True),
    ])

    # Convert results back to Spark DataFrame
    result_df = spark.createDataFrame(results, result_schema)
    return result_df

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import udf, array_sort, slice, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
import os
import gc

def list_files_in_directory(directory_path, extension):
    """Return the names of the non-empty, visible regular files in a directory.

    Args:
        directory_path: Directory to scan (not recursive).
        extension: Required file-name suffix. An empty string (or None)
            disables the suffix filter.

    Returns:
        File names (not full paths) in sorted order, so callers process them
        in a reproducible sequence. Hidden files (leading '.'), '.crc' files,
        zero-byte files and sub-directories are excluded. An empty list is
        returned, after printing a message, when the directory is missing,
        is not a directory, or cannot be read.
    """
    try:
        files = []
        # os.listdir returns entries in arbitrary order; sort for determinism
        for name in sorted(os.listdir(directory_path)):
            if name.startswith('.') or name.endswith('.crc'):
                continue  # Exclude hidden files and .crc files
            if extension and not name.endswith(extension):
                continue
            full_path = os.path.join(directory_path, name)
            if not os.path.isfile(full_path):
                continue
            if os.path.getsize(full_path) <= 0:
                continue  # Exclude empty files
            files.append(name)
        return files
    except FileNotFoundError:
        print(f"The directory {directory_path} was not found.")
        return []
    except NotADirectoryError:
        print(f"The path {directory_path} is not a directory.")
        return []
    except PermissionError:
        print(f"Permission denied to access the directory {directory_path}.")
        return []

def _is_header_line(line):
    """Return True when `line` looks like the TSV header row.

    Assumes the header's first column is named ``patent_id`` (optionally
    double-quoted) -- TODO confirm against the raw TSV.
    """
    return line.lstrip('"').startswith("patent_id")


def _parse_summary_records(lines):
    """Group raw text lines into ``(patent_id, summary_text)`` tuples.

    A line starting with a double quote opens a new record: the id is the text
    between the first two quotes and the summary starts with whatever follows
    the second quote. Every other line is a continuation of the current
    record's summary. Continuation lines seen before any record has been
    opened are dropped.
    """
    records = []
    current_id = None
    current_summary = []

    for line in lines:
        if not line.startswith('"'):
            # Add subsequent lines to the summary of the open record, if any
            if current_id is not None:
                current_summary.append(line.strip())
            continue

        # A new record starts here: save the one collected so far
        if current_id is not None:
            records.append((current_id, " ".join(current_summary).strip()))

        parts = line.split('"', 2)
        if len(parts) < 3:
            # Opening quote without a closing one: no id can be extracted.
            # Close the record so the following continuation lines are not
            # attached to the wrong patent.
            current_id = None
            current_summary = []
            continue

        current_id = parts[1]
        current_summary = [parts[2].strip()]

    # Append the last record
    if current_id is not None:
        records.append((current_id, " ".join(current_summary).strip()))

    return records


def extract_keywords_save_to_file(input_file, output_path="data/keywords"):
    """Extract keywords for the wanted patents in one text file and append them to parquet.

    The file is read line by line, regrouped into (patent_id, summary_text)
    records, restricted to the ids listed in the notebook-level ``patent_ids``,
    and passed to ``extract_keywords_from_dataframe``. The result is shown and
    appended to ``output_path``.

    Args:
        input_file: Path of the text file to process.
        output_path: Parquet directory to append to. Because the write mode is
            "append", processing the same file twice stores its rows twice.

    Returns:
        None. If the file contains no matching patents nothing is written.
    """
    print(input_file)

    lines_df = spark.read.text(input_file)
    lines = [row["value"] for row in lines_df.collect()]

    # Only drop the first line when it really is the header. The input is one
    # of several partition files and the others begin with a data line, which
    # an unconditional skip would lose.
    if lines and _is_header_line(lines[0]):
        lines = lines[1:]

    # Extract patent_id and summary_text
    data = _parse_summary_records(lines)

    # Explicit schema: Spark cannot infer column types from an empty list
    df = spark.createDataFrame(data, "patent_id string, summary_text string")

    df = df.filter(col("patent_id") != '')

    filtered_df = df.filter(df["patent_id"].isin(patent_ids))
    matching_count = filtered_df.count()
    print(matching_count)

    if matching_count == 0:
        # Nothing to extract; avoid loading the model and writing an empty part
        print(f"No matching patents in {input_file}; nothing written to {output_path}")
        return

    keywords_df = extract_keywords_from_dataframe(
        spark_df=filtered_df,
        text_col="summary_text",
        id_col="patent_id",
        num_keywords=5
    )

    keywords_df.show(truncate=False)

    keywords_df.write.mode("append").parquet(output_path)

    print(f"DataFrame {input_file} saved to {output_path}")


# Run the keyword extraction over every usable file of the partitioned
# summary directory, one file at a time.
directory_path = 'data/summary_partitioned'
extension = ''  # empty string: accept any file name suffix
files = list_files_in_directory(directory_path, extension)
for file in files:
    partition_file_path = os.path.join(directory_path, file)
    extract_keywords_save_to_file(partition_file_path)


output_summary_partitioned/part-00001


24/11/27 19:38:09 WARN TaskSetManager: Stage 7 contains a task of very large size (41666 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

2944
here


24/11/27 19:38:12 WARN TaskSetManager: Stage 10 contains a task of very large size (41666 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+---------+-----------------------------------------------------------------------------------------------------------------+
|patent_id|keywords                                                                                                         |
+---------+-----------------------------------------------------------------------------------------------------------------+
|11881627 |[{satellites, 0.5401}, {satellite, 0.4685}, {antennas, 0.4384}, {antenna, 0.4092}, {radiated, 0.3852}]           |
|11881629 |[{antenna, 0.4533}, {transmitting, 0.4325}, {wireless, 0.4203}, {polarization, 0.3885}, {transmits, 0.3674}]     |
|11881714 |[{lte, 0.5121}, {telecommunication, 0.5106}, {communications, 0.4669}, {wireless, 0.4654}, {transmit, 0.4562}]   |
|11881727 |[{wireless, 0.4695}, {charging, 0.4662}, {devices, 0.4488}, {telephone, 0.4463}, {telephones, 0.4413}]           |
|11881758 |[{transformer, 0.5703}, {display, 0.3971}, {converter, 0.3824}, {circuits, 0.3052}, {voltages, 0.302}]     

24/11/27 19:46:20 WARN TaskSetManager: Stage 14 contains a task of very large size (81563 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

5247
here


24/11/27 19:46:24 WARN TaskSetManager: Stage 17 contains a task of very large size (81563 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+---------+----------------------------------------------------------------------------------------------------------------------------+
|patent_id|keywords                                                                                                                    |
+---------+----------------------------------------------------------------------------------------------------------------------------+
|11856881 |[{patent, 0.434}, {detection, 0.4196}, {recognizing, 0.4027}, {classifying, 0.4011}, {photos, 0.3493}]                      |
|11856883 |[{probes, 0.467}, {watering, 0.3555}, {vegetation, 0.3425}, {moisture, 0.3305}, {soil, 0.3174}]                             |
|11856937 |[{herbicides, 0.3918}, {weeds, 0.3794}, {herbicide, 0.3749}, {applications, 0.3725}, {weed, 0.3632}]                        |
|11857063 |[{wireless, 0.5168}, {wirelessly, 0.5131}, {gps, 0.4116}, {devices, 0.3897}, {antenna, 0.3857}]                             |
|11857151 |[{endoscope, 0.5687}, {optics,