In [2]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("PartitionTextFile") \
    .getOrCreate()

# Path to the large text file
input_path = "g_brf_sum_text_2024.tsv"
output_path = "./output_summary_partitioned"

# Read the large text file
text_rdd = spark.sparkContext.textFile(input_path)

# Optional: Maintain order using zipWithIndex and repartition based on keys
text_rdd_with_index = text_rdd.zipWithIndex().map(lambda x: (x[1], x[0]))

# Number of output files (partitions)
num_partitions = 10

# Repartition the RDD to control the number of output files
partitioned_rdd = text_rdd_with_index.repartition(num_partitions).sortByKey().values()

# Save partitioned files to the output directory
partitioned_rdd.saveAsTextFile(output_path)

# Stop the Spark session
spark.stop()


                                                                                

In [5]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import udf, array_sort, slice, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
import os
import gc

def list_files_in_directory(directory_path, extension):
    try:
        files = [
            f for f in os.listdir(directory_path)
            if os.path.isfile(os.path.join(directory_path, f))  # Ensure it's a file
            and (extension == '' or f.endswith(extension))  # Filter by extension if provided
            and not f.startswith('.')  # Exclude hidden files
            and not f.endswith('.crc')  # Exclude .crc files
            and os.path.getsize(os.path.join(directory_path, f)) > 0  # Exclude empty files
        ]
        return files
    except FileNotFoundError:
        print(f"The directory {directory_path} was not found.")
        return []
    except PermissionError:
        print(f"Permission denied to access the directory {directory_path}.")
        return []

def extract_keywords_save_to_file(input_file):
    # Initialize a Spark session
    spark = SparkSession.builder \
    .appName("Key words extraction") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()
    
    lines_df = spark.read.text(input_file)
    
    # Step 2: Skip the header and process the lines
    lines = [row["value"] for row in lines_df.collect()][1:]  # Skip the header line
    
    # Step 3: Extract patent_id and summary_text
    data = []
    current_id = None
    current_summary = []
    
    for line in lines:
        try:
            if line.startswith('"'):
                if current_id is not None and current_summary:
                    # Save the current record
                    data.append((current_id, " ".join(current_summary).strip()))
                # Extract the new patent_id
                current_id = line.split('"')[1]  # First text block inside quotes
                # Extract the start of the summary text
                current_summary = [line.split('"', 2)[2].strip()] if '"' in line else []
            else:
                # Add subsequent lines to the summary
                current_summary.append(line.strip())
        except:
            pass
    
    # Append the last record
    if current_id is not None and current_summary:
        data.append((current_id, " ".join(current_summary).strip()))
    
    # Step 4: Create DataFrame
    df = spark.createDataFrame(data, ["patent_id", "summary_text"])
    
    df = df.filter(col("patent_id") != '')
    
    tokenizer = Tokenizer(inputCol="summary_text", outputCol="words")
    df_tokens = tokenizer.transform(df)
    
    # Remove stopwords
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    df_filtered = stopwords_remover.transform(df_tokens)
    
    # Compute TF (Term Frequency)
    hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10000)
    df_tf = hashing_tf.transform(df_filtered)
    
    # Compute IDF (Inverse Document Frequency)
    idf = IDF(inputCol="raw_features", outputCol="features")
    idf_model = idf.fit(df_tf)
    df_tfidf = idf_model.transform(df_tf)
    
    # Extract Keywords
    def extract_top_keywords(features, words, n=20):
        # Map TF-IDF scores to words
        tfidf_scores = list(zip(words, features.toArray()))
        # Sort by score in descending order and take top n
        sorted_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
        return [keyword for keyword, score in sorted_keywords[:n]]
    
    # Register UDF for extracting top keywords
    extract_keywords_udf = udf(lambda features, words: extract_top_keywords(features, words), ArrayType(StringType()))
    
    # Apply the UDF to extract top keywords
    df_keywords = df_tfidf.withColumn(
        "keywords",
        extract_keywords_udf(col("features"), col("filtered_words"))
    )
    
    # Select relevant columns and show results
    df_keywords.select("patent_id", "keywords").show(10)
    
    from pyspark.sql.functions import concat_ws
    
    # Convert the 'keywords' column (array) into a single string
    df_keywords_csv = df_keywords.withColumn("keywords", concat_ws(", ", "keywords"))
    
    # Write the DataFrame to a CSV file
    output_path = "output_keywords"
    df_keywords_csv.select("patent_id", "keywords").write.csv(output_path, header=True, mode="append")
    
    print(f"DataFrame {input_file} saved to {output_path}")

    del df
    del df_tokens
    del df_filtered
    del df_tfidf
    del df_tf
    del df_keywords
    del df_keywords_csv
    spark.catalog.clearCache()
    spark.stop()
    gc.collect()

directory_path = 'output_summary_partitioned'
extension = ''
files = list_files_in_directory(directory_path, extension)
for file in files:
    extract_keywords_save_to_file(os.path.join(directory_path, file))


24/11/21 18:28:59 WARN TaskSetManager: Stage 1 contains a task of very large size (49659 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:29:05 WARN TaskSetManager: Stage 3 contains a task of very large size (49659 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:29:09 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
                                                                                

+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 11990388|[cooling, supplie...|
| 11990389|[device, interpos...|
| 11990390|["technical, fiel...|
| 11990391|[field, "technica...|
| 11990392|[inductance, pate...|
| 11990393|[first, "technica...|
| 11990394|[mentioned, reduc...|
| 11990395|[example,, soluti...|
| 11990396|[assembly., metal...|
| 11990397|[issue, conductiv...|
+---------+--------------------+
only showing top 10 rows



24/11/21 18:29:09 WARN TaskSetManager: Stage 4 contains a task of very large size (49659 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

DataFrame output_summary_partitioned/part-00005 saved to output_keywords


24/11/21 18:29:25 WARN TaskSetManager: Stage 1 contains a task of very large size (59063 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:29:32 WARN TaskSetManager: Stage 3 contains a task of very large size (59063 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:29:36 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:29:36 WARN TaskSetManager: Stage 4 contains a task of very large size (59063 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 11906438|["background, , h...|
| 11906439|["cross-reference...|
| 11906440|["cross-reference...|
| 11906441|[execute, geometr...|
| 11906442|[difficult, displ...|
| 11906443|[accurately, refl...|
| 11906444|[result., necessa...|
| 11906445|[data, location, ...|
| 11906446|[view, inspection...|
| 11906447|[imaging, least, ...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00002 saved to output_keywords


24/11/21 18:29:55 WARN TaskSetManager: Stage 1 contains a task of very large size (78920 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:30:01 WARN TaskSetManager: Stage 3 contains a task of very large size (78920 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:30:05 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:30:06 WARN TaskSetManager: Stage 4 contains a task of very large size (78920 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 11937697|["background, , 1...|
| 11937698|[vibrations, matt...|
| 11937699|[unit, frame., ma...|
| 11937700|[body., painful.,...|
| 11937701|[(e.g.,, pressure...|
| 11937702|[escaping, mattre...|
| 11937703|[wall, pressure.,...|
| 11937704|[air, mattresses,...|
| 11937705|[fast, signal, in...|
| 11937706|[mattress, patien...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00003 saved to output_keywords


24/11/21 18:30:23 WARN TaskSetManager: Stage 1 contains a task of very large size (54333 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:30:27 WARN TaskSetManager: Stage 3 contains a task of very large size (54333 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:30:31 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:30:31 WARN TaskSetManager: Stage 4 contains a task of very large size (54333 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 11968511|[openings, circui...|
| 11968512|["background, , i...|
| 11968513|[mobile, facing, ...|
| 11968514|[mobile, two, mag...|
| 11968515|[hole;, deformati...|
| 11968516|[sound, , sound, ...|
| 11968517|[first, method, ,...|
| 11968518|[monopole, depend...|
| 11968519|[executed, device...|
| 11968520|[element, indicat...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00004 saved to output_keywords


24/11/21 18:30:45 WARN TaskSetManager: Stage 1 contains a task of very large size (43004 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:30:49 WARN TaskSetManager: Stage 3 contains a task of very large size (43004 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:30:53 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:30:53 WARN TaskSetManager: Stage 4 contains a task of very large size (43004 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 12053941|[locking, plurali...|
| 12053942|[manufacturing, l...|
| 12053943|[create, area, th...|
| 12053944|[development, lig...|
| 12053945|[weight, thermofo...|
| 12053946|[mm., paraffin,, ...|
| 12053947|[“weld”., second,...|
| 12053948|[sockliners,, thr...|
| 12053949|[installation,, i...|
| 12053950|[items, digital, ...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00008 saved to output_keywords


24/11/21 18:31:06 WARN TaskSetManager: Stage 1 contains a task of very large size (41666 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:31:09 WARN TaskSetManager: Stage 3 contains a task of very large size (41666 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:31:13 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:31:13 WARN TaskSetManager: Stage 4 contains a task of very large size (41666 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 11881523|[subcollector,, d...|
| 11881524|[electrode, chip,...|
| 11881525|[semiconductor, b...|
| 11881526|[surface, type, "...|
| 11881527|[region, guard, r...|
| 11881528|[in,, apparent, d...|
| 11881529|[semiconductor, s...|
| 11881530|["background, , a...|
| 11881531|[array, , dispose...|
| 11881532|[terminal, solar,...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00001 saved to output_keywords


24/11/21 18:31:25 WARN TaskSetManager: Stage 1 contains a task of very large size (48957 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:31:29 WARN TaskSetManager: Stage 3 contains a task of very large size (48957 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:31:33 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:31:33 WARN TaskSetManager: Stage 4 contains a task of very large size (48957 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 12010587|[walls,, signal, ...|
| 12010588|[aspect,, state,,...|
| 12010589|["field, disclosu...|
| 12010590|[station, referen...|
| 12010591|[(mbms), setup”,,...|
| 12010592|[module, system, ...|
| 12010593|[interest, switch...|
| 12010594|[aspect, receivin...|
| 12010595|[associated, inst...|
| 12010596|[safety-related, ...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00006 saved to output_keywords


24/11/21 18:31:46 WARN TaskSetManager: Stage 1 contains a task of very large size (65497 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:31:50 WARN TaskSetManager: Stage 3 contains a task of very large size (65497 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:31:54 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:31:55 WARN TaskSetManager: Stage 4 contains a task of very large size (65497 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 12029594|[selected, probe,...|
| 12029595|[directed, substr...|
| 12029596|[axis, e.g., two,...|
| 12029597|[mri, receiver, c...|
| 12029599|[distribution, as...|
| 12029600|[comprise, values...|
| 12029601|[sentinel, second...|
| 12029602|[breast, image;, ...|
| 12029603|[radiographic, re...|
| 12029604|[target, detectio...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00007 saved to output_keywords


24/11/21 18:32:10 WARN TaskSetManager: Stage 1 contains a task of very large size (81563 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:32:16 WARN TaskSetManager: Stage 3 contains a task of very large size (81563 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:32:20 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:32:20 WARN TaskSetManager: Stage 4 contains a task of very large size (81563 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
|  PP35566|[consisting, plan...|
|  PP35567|[, "genus, specie...|
|  PP35568|[variety, , genus...|
|  RE49771|[thread, , screw,...|
|  RE49772|[locomotion, pola...|
|  RE49773|[two, hence, engi...|
|  RE49774|[hapten-antibody,...|
|  RE49775|[engines,, repeat...|
|  RE49776|[axis, lamp, pate...|
|  RE49777|[state, case, cal...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00000 saved to output_keywords


24/11/21 18:32:40 WARN TaskSetManager: Stage 1 contains a task of very large size (65076 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:32:46 WARN TaskSetManager: Stage 3 contains a task of very large size (65076 KiB). The maximum recommended task size is 1000 KiB.
24/11/21 18:32:50 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 18): Attempting to kill Python Worker
24/11/21 18:32:50 WARN TaskSetManager: Stage 4 contains a task of very large size (65076 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+
|patent_id|            keywords|
+---------+--------------------+
| 12068339|[development, cir...|
| 12068340|[front, image, ar...|
| 12068341|[silicon, present...|
| 12068342|[however,, predet...|
| 12068343|[isolation, recen...|
| 12068344|[shielding, proce...|
| 12068345|[surface, various...|
| 12068346|[moisture, coveri...|
| 12068347|[light,, light, w...|
| 12068348|[mold, includes, ...|
+---------+--------------------+
only showing top 10 rows



                                                                                

DataFrame output_summary_partitioned/part-00009 saved to output_keywords


In [6]:
from pyspark.sql import SparkSession

# Step 1: Initialize Spark Session
spark = SparkSession.builder \
    .appName("Concatenate CSV Files") \
    .getOrCreate()

# Step 2: Load all CSV files from the directory
input_directory = "./output_keywords"
all_csv_files = spark.read.csv(input_directory, header=True, inferSchema=True)

# Optional: Preview the data
print("Schema of concatenated data:")
all_csv_files.printSchema()

# Step 3: Write the combined data into smaller files
output_directory = "./output_keywords_smaller_files"
number_of_partitions = 8  # Adjust the number of partitions as needed
all_csv_files.repartition(number_of_partitions).write.csv(output_directory, header=True)

# Stop the Spark Session
spark.stop()

Schema of concatenated data:
root
 |-- patent_id: string (nullable = true)
 |-- keywords: string (nullable = true)



In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, trim

# Step 1: Initialize Spark Session
spark = SparkSession.builder \
    .appName("Clean Keywords in CSV Files") \
    .getOrCreate()

# Step 2: Load all CSV files from the directory
input_directory = "./output_keywords_smaller_files"
df = spark.read.csv(input_directory, header=True, inferSchema=True)

# Step 3: Clean the 'keywords' column
# Remove unwanted characters like quotes and ensure comma-separated keywords
df_cleaned = df.withColumn(
    "keywords",
    # Remove quotes, brackets, and dots; fix double commas
    regexp_replace(
        col("keywords"),
        r"[\"\'\[\]\{\}\(\)\.]", ""  # Remove unwanted characters
    )
).withColumn(
    "keywords",
    regexp_replace(col("keywords"), r",,+", ",")  # Replace double commas with a single comma
).withColumn(
    "keywords",
    trim(regexp_replace(col("keywords"), r"(^,|,$)", ""))  # Remove leading/trailing commas
)

# Optional: Preview the cleaned data
print("Schema and cleaned data:")
df_cleaned.printSchema()
df_cleaned.show(10, truncate=False)

# Step 4: Write the cleaned data into smaller files
output_directory = "./output_keywords_cleaned_files"
number_of_partitions = 8  # Adjust the number of partitions as needed
df_cleaned.repartition(number_of_partitions).write.csv(output_directory, header=True)

# Stop the Spark Session
spark.stop()

24/11/21 18:52:09 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.33 instead (on interface en0)
24/11/21 18:52:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/21 18:52:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/21 18:52:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Schema and cleaned data:
root
 |-- patent_id: string (nullable = true)
 |-- keywords: string (nullable = true)

+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|patent_id|keywords                                                                                                                                                                                             |
+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|11919419 |cell, hev, abnormal, cell, distinct, chiller, battery, module, , includes, battery, groups, battery, includes, cell, battery, abnormal, distinct, abnormal, embodiments                              |
|11887314 |features, device, device, frame-level

                                                                                