# Patent Breakthrough walkthrough

This notebook illustrates the complete analysis process of breakthrough patents, from preparing input files
to calculating impact and novelty scores.

## 1. Preparing input files

There are three input files: a file with patent texts, a patent/year index, and a list of patent/CPC codes.

### Patents
In its raw format, the input file contains the text of one patent file per line.
Each line starts with a path pointing to that patent's original text 
file (`/Volumes/External/txt/0000000-0100000/US1009.txt`), followed by the patent text. Example file: `./data/raw_input.txt`. 


### Patent/Year-index
Contains the year of publication of each patent. Example file: `./data/year.csv`. 


### CPC-file
The CPC-file (Cooperative Patent Classification) contains the patent classification code for each patent. These codes are used to calculate benchmark similarities. Example file: `./data/GPCPCs.txt`

Note: the included data files only contain a small subset of the original data, for example purposes.

#### Other files
The three other files in the data folder - `greek.txt`, `stopwords.txt`, and `symbols.txt` - are required by the `OldPreprocessor`-class.

In [8]:
from pathlib import Path

# Root locations on the analysis machine; adjust these to your own environment.
home_path = "/home/smildinerm"
storage_path = "/home/smildinerm/data/volume_2"
data_path = "/home/smildinerm/data/volume_2/data/USPTO"

# Input locations: directory with raw patent files, year index and CPC codes.
input_file = Path(data_path, "brief_summary")
year_file = Path(data_path, "input_files", "year.csv")
cpc_fp = Path(data_path, "input_files", "GPCPCs.txt")

# Locations for the compressed patents, the embeddings and the final results.
patent_dir = Path(storage_path, "patents")
output_folder = Path(home_path, "output")
output_fp = output_folder / "patents.h5"
results_fp = Path(home_path, "results")

# Both directories are created in place; their parent directories must exist.
output_folder.mkdir(exist_ok=True)
patent_dir.mkdir(exist_ok=True)

### 1.1. Compressing

The compressor function transforms the patents to a more manageable format, sorts and saves them by year of publication, and compresses the resulting files.

In [9]:
import json
import lzma
import re
from typing import List, Union, Dict
from pathlib import Path
from collections import Counter, defaultdict
from tqdm import tqdm
import os
import csv
import glob
import multiprocessing

def read_xz(compressed_fp: Union[Path, str]) -> List[Dict]:
    """Load the patents stored in an .xz-compressed JSON file.

    Arguments
    ---------
    compressed_fp:
        Path of the .xz file to read.

    Returns
    -------
    patents: List[Dict]
        The decoded JSON content of the file.
    """
    with lzma.open(compressed_fp, mode="rb") as xz_file:
        raw_bytes = xz_file.read()
    # The payload is UTF-8 encoded JSON text.
    return json.loads(raw_bytes.decode(encoding="utf-8"))

def write_xz(compressed_fp: Union[Path, str], patents: List[Dict]) -> None:
    """Store a list of patents as UTF-8 JSON in an .xz-compressed file.

    Arguments
    ---------
    compressed_fp:
        Path of the .xz file to write; an existing file is overwritten.
    patents:
        Patents to store; must be JSON-serialisable.
    """
    # preset=9 selects the strongest (and slowest) LZMA compression level.
    with lzma.open(compressed_fp, mode="wb", preset=9) as xz_file:
        serialized = json.dumps(patents)
        xz_file.write(serialized.encode("utf-8"))

def parse_patent_file(patent_input_fp: str, year_lookup: str) -> List[Dict[str, str]]:
    """
    Parse one raw patent .tsv file into a sorted list of patent records.

    Parameters:
    -----------
    patent_input_fp : str
        Path to the tab-separated input file with columns 'patent_id' and
        'summary_text', one patent per row. The publication year is derived
        from this path: the text after the last '_' up to the next '.'
        (e.g. 'g_brf_sum_text_1996.tsv' -> 1996).

    year_lookup : str
        Path to a tab-separated file with a 'pat' (patent ID) column. It is
        used only as a filter: rows whose 'patent_id' does not occur in this
        file are skipped.

    Returns:
    --------
    List[Dict[str, str]]
        A list of dictionaries sorted by patent ID, one per patent:
        {
            'patent': patent_id (int),
            'file': path of the input file,
            'contents':  summary_text,
            'year': year taken from the file name (int)
        }

    Raises:
    -------
    ValueError
        If no integer year can be extracted from `patent_input_fp`.
    """

    print(f"Starting to parse file {patent_input_fp}.\n")

    # Summary texts can exceed the csv module's default field size limit.
    csv.field_size_limit(10**8)

    # Only membership is tested, so a set of IDs is enough. Files are read as
    # UTF-8 explicitly, matching the encoding used for the .xz output.
    with open(year_lookup, 'r', newline='', encoding='utf-8') as lookup_handle:
        known_ids = {row['pat'] for row in csv.DictReader(lookup_handle, delimiter='\t')}

    # All patents in one input file share the year encoded in its name;
    # convert it once instead of once per row.
    source_name = str(patent_input_fp)
    year = int(source_name.split('_')[-1].split('.')[0])

    patents = []
    with open(patent_input_fp, 'r', newline='', encoding='utf-8') as patent_file:
        for row in tqdm(csv.DictReader(patent_file, delimiter='\t')):
            patent_id = row['patent_id']

            # Skip non-numeric IDs and IDs that are absent from the lookup file.
            if not patent_id.isdigit() or patent_id not in known_ids:
                continue

            patents.append({
                'patent': int(patent_id),
                'file': source_name,
                'contents': row['summary_text'],
                'year': year,
            })

    patents.sort(key=lambda item: item["patent"])

    return patents

def compress_raw_dir(patent_input_dir: Union[Path, str], year_fp: Union[Path, str], output_dir: Union[Path, str]) -> None:
    """Compress all raw patent files in a directory, one file after another.

    Every file matching 'g_brf_sum_text_*.tsv' is handed to
    `compress_and_write`, which derives the year from the file name, skips
    the file when '<year>.xz' already exists in the output directory, and
    otherwise parses and compresses it. To force a re-run for a year, delete
    its .xz file.

    Arguments
    ---------
    patent_input_dir:
        Directory containing all raw files with patents.
    year_fp:
        File with the patent IDs to keep (see `parse_patent_file`).
    output_dir:
        Directory to write the compressed files to.
    """
    patent_input_dir = Path(patent_input_dir)
    year_fp = Path(year_fp)
    output_dir = Path(output_dir)

    # Sorted so that files are processed in a reproducible order.
    for patent_input_fp in sorted(patent_input_dir.glob("g_brf_sum_text_*.tsv")):
        compress_and_write(patent_input_fp, year_fp, output_dir)


def compress_and_write(patent_input_fp: Path, year_fp: Path, output_dir: Path) -> None:
    """Parse a single raw patent file and store it as '<year>.xz'.

    The year is taken from the input file name (text after the last '_' up to
    the next '.'). If '<year>.xz' already exists in `output_dir`, nothing is
    done. The compressed data is first written to '<year>.xz.tmp' and then
    renamed, so an interrupted run (e.g. a full disk) cannot leave behind a
    truncated '<year>.xz' that later runs would skip as already done.

    Arguments:
    ---------
    patent_input_fp (Path):
        Path to the input patent file.
    year_fp (Path):
        File with the patent IDs to keep (see `parse_patent_file`).
    output_dir (Path):
        Directory to write the compressed files to.
    """
    year = str(patent_input_fp).split('_')[-1].split('.')[0]
    compressed_fp = Path(output_dir) / f"{year}.xz"

    # Check if the output file already exists
    if compressed_fp.exists():
        print(f"Output file {compressed_fp} already exists. Skipping to the next input file.\n")
        return

    # Parse patents for a year in the right format
    patents = parse_patent_file(patent_input_fp, year_fp)

    # Write to a temporary name first and rename only after a complete write.
    tmp_fp = compressed_fp.with_name(compressed_fp.name + ".tmp")
    try:
        write_xz(tmp_fp, patents)
        os.replace(tmp_fp, compressed_fp)
    finally:
        # After a successful rename the temp file no longer exists; on any
        # failure remove the partial file so it does not linger.
        if tmp_fp.exists():
            tmp_fp.unlink()


def compress_raw_dir_parallel(patent_input_dir: Union[Path, str], year_fp: Union[Path, str], output_dir: Union[Path, str], num_cores: int) -> None:
    """Compress all raw patent files in a directory using a process pool.

    Each file matching 'g_brf_sum_text_*.tsv' becomes one `compress_and_write`
    task; the tasks are distributed over `num_cores` worker processes.

    Arguments:
    ---------
    patent_input_dir (Union[Path, str]):
        Directory containing all raw files with patents.
    year_fp (Union[Path, str]):
        CSV file with publication year for each patent.
    output_dir (Union[Path, str]):
        Directory to write the compressed files to.
    num_cores (int):
        Number of worker processes in the pool.
    """
    source_dir, year_path, target_dir = Path(patent_input_dir), Path(year_fp), Path(output_dir)

    # One argument tuple per input file.
    tasks = [
        (tsv_path, year_path, target_dir)
        for tsv_path in source_dir.glob("g_brf_sum_text_*.tsv")
    ]

    with multiprocessing.Pool(processes=num_cores) as workers:
        workers.starmap(compress_and_write, tasks)



In [3]:
# Example usage: compress the raw patent files, one .xz file per year.
home_path = "/home/smildinerm"
storage_path = "/home/smildinerm/data/volume_2"
data_path = "/home/smildinerm/data/volume_2/data/USPTO"

# Input locations (swap in the "mock" directory for a small test run).
input_file = Path(data_path, "brief_summary")
# input_file = Path(data_path, "mock")
year_file = Path(data_path, "input_files", "year.csv")
cpc_fp = Path(data_path, "input_files", "GPCPCs.txt")

# Output locations, all on the storage volume.
patent_dir = Path(storage_path, "patents")
output_folder = Path(storage_path, "output")
output_fp = output_folder / "patents.h5"
results_fp = Path(storage_path, "results")

# Parse and compress the files using 20 worker processes.
compress_raw_dir_parallel(input_file, year_file, patent_dir, 20)


Output file /home/smildinerm/data/volume_2/patents/1977.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1976.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1978.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1979.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1981.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1983.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1984.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1986.xz already exists. Skipping to the next input file.
Output file /home/smildinerm/data/volume_2/patents/1988.xz already exists. Skipping to the next input file.

Output file /home/smildiner

You now have XZ-compressed files containing patents per year. Each file contains a list of JSON-objects, each JSON-object has the following key/values:

- `patent`: patent's ID
- `file`: path of original text file (not actually used)
- `contents`: patent text
- `year`: year of publication

## 2. Calculating embeddings

We calculate embeddings and scores with four different models: Countvec, Tf-Idf, Doc2Vec, and BERT ([PatentSBERTa](https://github.com/AI-Growth-Lab/PatentSBERTa)).


### 2.1. Preprocessors & parameters
Each model has its own preprocessor with various parameters. Most models also have configurable hyperparameters. The values for these parameters have been optimised using the original dataset, resulting in the values used in the `compute_embeddings()`-function below.

To recalibrate preprocessor and model parameters, run each model's hyperopt-script. See the [readme](https://github.com/UtrechtUniversity/patent-breakthrough/blob/main/docs/hyperparameter.md) and [hyperopt-notebooks](hyperopt/) for more details.


### 2.2. Calculating embeddings
Next, we calculate the embeddings.

In [1]:
from pathlib import Path

# Root locations on the analysis machine; adjust these to your own environment.
home_path = "/home/smildinerm"
storage_path = "/home/smildinerm/data/volume_2"
data_path = "/home/smildinerm/data/volume_2/data/USPTO"

# Input locations: directory with raw patent files, year index and CPC codes.
input_file = Path(data_path, "brief_summary")
year_file = Path(data_path, "input_files", "year.csv")
cpc_fp = Path(data_path, "input_files", "GPCPCs.txt")

# Locations for the compressed patents, the embeddings and the final results.
patent_dir = Path(storage_path, "patents")
output_folder = Path(storage_path, "output")
output_fp = output_folder / "patents.h5"
results_fp = Path(storage_path, "results")

# Both directories are created in place; their parent directories must exist.
output_folder.mkdir(exist_ok=True)
patent_dir.mkdir(exist_ok=True)

In [2]:
import json
import lzma
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import List, Union, Dict

from docembedder.models import TfidfEmbedder
from docembedder.preprocessor.preprocessor import Preprocessor
# from docembedder.preprocessor.oldprep import OldPreprocessor
from docembedder.models.doc2vec import D2VEmbedder
from docembedder.models import CountVecEmbedder
from docembedder.models import BERTEmbedder

from docembedder.utils import run_models
from docembedder.pretrained_run import pretrained_run_models
import datetime

def check_files(sim_spec):
    """Verify that a compressed patent file exists for every simulated year.

    Looks in the module-level `patent_dir` for '<year>.xz' for each year in
    `range(sim_spec.year_start, sim_spec.year_end)` (end year excluded).

    Raises
    ------
    ValueError
        For the first year whose file is missing.
    """
    for year in range(sim_spec.year_start, sim_spec.year_end):
        if not (patent_dir / f"{year}.xz").is_file():
            # Trailing space keeps the two message parts from running together.
            raise ValueError(f"Please download patent file {year}.xz and put it in "
                             f"the right directory ({patent_dir})")

# def compute_embeddings_cv(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):

#     model_cv = {
#         "countvec": CountVecEmbedder(method='sigmoid')
#     }
#     prep_cv = {
#         "prep-countvec": OldPreprocessor(list_path=data_path)
#     }

#     check_files(sim_spec)
#     run_models(prep_cv, model_cv, sim_spec, patent_dir, output_fp, cpc_fp, n_jobs=n_jobs)
#     print('Calculated countvec emdeddings')

    
def compute_embeddings_tfidf(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):
    """Run the Tf-Idf model over the configured windows via `run_models`.

    Arguments
    ---------
    patent_dir:
        Directory with the per-year .xz patent files.
    output_fp:
        Output file passed to `run_models`.
    cpc_fp:
        CPC classification file passed to `run_models`.
    sim_spec:
        Simulation specification defining years and windows.
    n_jobs:
        Number of concurrent jobs passed to `run_models`.

    Raises
    ------
    ValueError
        If `check_files` finds a year without a patent file.
    """
    # Validate the inputs before constructing the model and preprocessor.
    check_files(sim_spec)

    model_tfidf = {
        "tfidf": TfidfEmbedder(
            ngram_max=1, stop_words='english', stem=False, norm='l1',
            sublinear_tf=True, min_df=6, max_df=0.665461),
    }
    prep_tfidf = {
        "prep-tfidf": Preprocessor(keep_caps=True, keep_start_section=True, remove_non_alpha=True),
    }

    run_models(prep_tfidf, model_tfidf, sim_spec, patent_dir, output_fp, cpc_fp, n_jobs=n_jobs)
    print('Calculated tfidf embeddings')

def compute_embeddings_doc2vec(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):
    """Run the Doc2Vec model over the configured windows via `run_models`.

    Arguments
    ---------
    patent_dir:
        Directory with the per-year .xz patent files.
    output_fp:
        Output file passed to `run_models`.
    cpc_fp:
        CPC classification file passed to `run_models`.
    sim_spec:
        Simulation specification defining years and windows.
    n_jobs:
        Number of concurrent jobs passed to `run_models`.

    Raises
    ------
    ValueError
        If `check_files` finds a year without a patent file.
    """
    # Validate the inputs before constructing the model and preprocessor.
    check_files(sim_spec)

    model_doc2vec = {
        "doc2vec": D2VEmbedder(epoch=8, min_count=13, vector_size=100),
    }
    prep_doc2vec = {
        "prep-doc2vec": Preprocessor(keep_caps=False, keep_start_section=True, remove_non_alpha=False),
    }

    run_models(prep_doc2vec, model_doc2vec, sim_spec, patent_dir, output_fp, cpc_fp, n_jobs=n_jobs)
    print('Calculated doc2vec embeddings')

def compute_embeddings_bert(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):
    """Run the pretrained PatentSBERTa model via `pretrained_run_models`.

    Arguments
    ---------
    patent_dir:
        Directory with the per-year .xz patent files.
    output_fp:
        Output file passed to `pretrained_run_models`.
    cpc_fp:
        CPC classification file passed to `pretrained_run_models`.
    sim_spec:
        Simulation specification defining years and windows.
    n_jobs:
        Accepted so all compute_embeddings_* functions share one signature;
        it is not forwarded to `pretrained_run_models`.

    Raises
    ------
    ValueError
        If `check_files` finds a year without a patent file.
    """
    # Validate the inputs before constructing the model and preprocessor.
    check_files(sim_spec)

    model_bert = {
        "bert": BERTEmbedder(pretrained_model='AI-Growth-Lab/PatentSBERTa'),
    }
    prep_bert = {
        "prep-bert": Preprocessor(keep_caps=True, keep_start_section=True, remove_non_alpha=True),
    }

    pretrained_run_models(prep_bert, model_bert, sim_spec, patent_dir, output_fp, cpc_fp)
    print('Calculated BERT embeddings')

  from .autonotebook import tqdm as notebook_tqdm


#### Defining the calculation window

Embeddings are calculated within a time window, which shifts over the dataset; the embeddings are recalculated for each position of the window.
This procedure is configured with the `SimulationSpecification()`, which has the following attributes:
    
- `year_start`: start year of the entire (sub)set of data to calculate embeddings for.
- `year_end`: id. end year (the end year itself is not included).
- `window_size`: width of the window (in years) to compute embeddings for.
- `window_shift`: number of years between subsequent windows.
- `debug_max_patents`: restrict the number of patents per year (optional; for testing purposes).
    
With the `n_jobs`-parameter you can set the number of concurrent jobs to run. A higher number means faster processing, but be aware that each job utilises one CPU core.

In [12]:
from docembedder.simspec import SimulationSpecification

# Window configuration used by the embedding runs in the cells below.
sim_spec = SimulationSpecification(
    year_start=1996,  # first year of data to embed
    # year_start=2005,
    year_end=2023,  # end year is exclusive, so 2022 is the last year included
    window_size=11,  # width of each window, in years
    window_shift=1,  # number of years between subsequent windows
    # cpc_samples_per_patent = 10000,
    # debug_max_patents = 1000,
    # n_patents_per_window = 10000,
)

# Number of concurrent jobs passed to the compute_embeddings_* functions.
n_jobs=5

#### Computing embeddings

Now that we've defined the window, we can calculate embeddings, using each of the four models.
    
Be aware: depending on the number of patents and the window size, this will take quite some time,
and can require a (_very_) large amount of memory. Warnings from the Countvec calculations can be ignored.

All output is stored in a HDF5 file, which contains embeddings for all patents in all windows.

In [13]:
# Tf-Idf: embeddings for 1996-2022 with an 11-year window.
output_fp = Path(storage_path, "output", "patents_1996-2022_11yrw_tfidf.h5")
# output_fp = Path(storage_path, "output", "patents_2005-2022_11yrw_tfidf.h5")
args = {
    'patent_dir': patent_dir,
    'output_fp': output_fp,
    'cpc_fp': cpc_fp,
    'sim_spec': sim_spec,
    'n_jobs': n_jobs,
}

compute_embeddings_tfidf(**args)

  0%|          | 0/8 [5:05:06<?, ?it/s]


OSError: [Errno 28] Can't synchronously write data (file write failed: time = Thu Feb 29 02:39:02 2024
, filename = '/home/smildinerm/data/volume_2/output/temp_2005-2015.h5', file descriptor = 5, errno = 28, error message = 'No space left on device', buf = 0x7f9e9b4e5fc8, total write size = 4337667848, bytes this sub-write = 4337667848, bytes actually written = 18446744073709551615, offset = 0)

In [None]:
# Doc2Vec: embeddings for 1996-2022 with an 11-year window.
output_fp = Path(storage_path, "output", "patents_1996-2022_11yrw_doc2vec.h5")
args = {
    'patent_dir': patent_dir,
    'output_fp': output_fp,
    'cpc_fp': cpc_fp,
    'sim_spec': sim_spec,
    'n_jobs': n_jobs,
}

compute_embeddings_doc2vec(**args)

In [None]:
# BERT: embeddings for 1996-2022 with an 11-year window.
output_fp = Path(storage_path, "output", "patents_1996-2022_11yrw_bert.h5")
args = {
    'patent_dir': patent_dir,
    'output_fp': output_fp,
    'cpc_fp': cpc_fp,
    'sim_spec': sim_spec,
    'n_jobs': n_jobs,
}

compute_embeddings_bert(**args)

## 3. Impact and novelty scores

### 3.1. Calculating the scores

After we've computed and stored the embeddings, we compute novelty and impact scores. The result is a dictionary per model, each containing the novelties and impacts for each patent.


_Note on exponents_

The exponents (for example `[1.0, 2.0, 3.0]`; the values actually used are set by the `exponents` list in `compute_impacts()`) are used in the calculations to reward patents that are more similar to the patent under consideration. The backward and forward similarities for each patent are calculated based on the mean of all cosine similarities with the preceding and following patents in the window, using the formula `(x1**a + x2**a + ...)**(1/a)`, with `a` being the exponent. An `a` larger than 1 increases the weight of similarities closer to 1, i.e. of embeddings that are more similar to the one under consideration. The output includes the result for each exponent.

In [3]:
from docembedder.analysis import DocAnalysis
from docembedder.datamodel import DataModel
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

def compute_impacts(embedding_fp, output_dir, progr_dir, n_jobs):
    """Compute impact and novelty scores and write them to CSV files.

    For every (window, model) pair in the embedding file the scores are
    computed for each exponent. The result of each window/model/exponent
    combination is saved immediately to `progr_dir`; after all windows are
    done, one 'impact-<name>.csv' per model is written to `output_dir`,
    sorted by patent ID.

    Arguments
    ---------
    embedding_fp:
        File with the embeddings, opened with `DataModel`.
    output_dir:
        Directory for the final per-model CSV files (created if needed).
    progr_dir:
        Directory for the intermediate CSV files (created if needed).
    n_jobs:
        Number of jobs passed to `impact_novelty_results`.
    """
    exponents = [1.0, 5.0, 10.0]
    first_exponent = exponents[0]

    # model name -> column name -> accumulated values over all windows
    scores_by_model = defaultdict(lambda: defaultdict(list))

    progr_dir.mkdir(exist_ok=True, parents=True)
    output_dir.mkdir(exist_ok=True, parents=True)

    with DataModel(embedding_fp, read_only=False) as data:
        analysis = DocAnalysis(data)

        # Materialise the pairs so tqdm can show the total number of steps.
        window_model_pairs = list(data.iterate_window_models())
        for window, model in tqdm(window_model_pairs, desc="Processing windows"):
            results = analysis.impact_novelty_results(
                window, model, exponents, cache=False, n_jobs=n_jobs)

            for expon, res in results.items():
                model_scores = scores_by_model[model]
                # The patent IDs are collected once per window, not per exponent.
                if expon == first_exponent:
                    model_scores["patent_ids"].extend(res["patent_ids"])
                model_scores[f"impact-{expon}"].extend(res["impact"])
                model_scores[f"novelty-{expon}"].extend(res["novelty"])

                # Save intermediate results as .csv
                partial_fp = Path(progr_dir, f"results-{model}-{window}-{expon}.csv")
                pd.DataFrame(res).to_csv(partial_fp, index=False)

    for model, model_scores in scores_by_model.items():
        # The file is named after the last '-'-separated part of the model name.
        classifier_name = model.split("-")[-1]
        scores_frame = pd.DataFrame(model_scores).sort_values("patent_ids")
        scores_frame.to_csv(Path(output_dir, f"impact-{classifier_name}.csv"), index=False)


In [None]:
# Impact/novelty scores for the Tf-Idf embeddings, using 20 jobs.
progr_fp = Path(storage_path, "results", "intermediate")
output_fp = Path(storage_path, "output", "patents_1996-2022_11yrw_tfidf.h5")

compute_impacts(
    embedding_fp=output_fp,
    output_dir=results_fp,
    progr_dir=progr_fp,
    n_jobs=20,
)

Processing windows:  12%|█▏        | 2/17 [23:50:23<179:13:17, 43013.19s/it]

In [6]:
import h5py

# Inspect the top-level layout of the Tf-Idf embeddings file.
output_fp = Path(f"{storage_path}/output", "patents_1996-2022_11yrw_tfidf.h5")

# First pass: list the names of the top-level entries.
with h5py.File(output_fp, 'r') as file:
    print("Keys in the HDF5 file:")
    print(list(file.keys()))

# Second pass: open the file in read mode and take a handle to each entry.
with h5py.File(output_fp, 'r') as file:
    # Access the top-level entries within the file
    # NOTE(review): these handles are bound inside the `with` block and are
    # not used afterwards; h5py objects presumably become unusable once the
    # file is closed - confirm before relying on them in later cells.
    cpc = file['cpc']
    embeddings = file['embeddings']
    impact_novelty = file['impact_novelty']
    models = file['models']
    preprocessors = file['preprocessors']
    windows = file['windows']

    # # Read the data from the dataset(s)
    # cpc = cpc[:]
    # embeddings = embeddings[:]
    # impact_novelty = impact_novelty[:]
    # models = models[:]
    # preprocessors = preprocessors[:]
    # windows = windows[:]


Keys in the HDF5 file:
['cpc', 'embeddings', 'impact_novelty', 'models', 'preprocessors', 'windows']


In [7]:
# List the entries stored under 'windows' in the file at output_fp.
with h5py.File(output_fp, 'r') as file:
    windows_group = file['windows']
    print("Keys in the 'windows' group:")
    print(list(windows_group.keys()))

Keys in the 'windows' group:
['1996-2006', '1997-2007', '1998-2008', '1999-2009', '2000-2010', '2001-2011', '2002-2012', '2003-2013', '2004-2014', '2005-2015', '2006-2016', '2007-2017', '2008-2018', '2009-2019', '2010-2020', '2011-2021', '2012-2022']


In [8]:
import h5py

def print_hdf5_structure(group, indent=0):
    """Print the nested layout of an HDF5 group, one line per member.

    Each member is printed as 'Group: <name>' or 'Dataset: <name>', indented
    by two spaces per nesting level; groups are descended into recursively.
    """
    prefix = "  " * indent
    for key in group.keys():
        member = group[key]
        is_group = isinstance(member, h5py.Group)
        label = 'Group' if is_group else 'Dataset'
        print(prefix + f"{label}: {key}")
        if is_group:
            print_hdf5_structure(member, indent + 1)

# Open the HDF5 file at output_fp in read mode and dump its full hierarchy.
with h5py.File(output_fp, 'r') as file:
    # Print the file name followed by the recursive group/dataset listing
    print(f"File: {file.filename}")
    print_hdf5_structure(file)


File: /home/smildinerm/data/volume_2/output/patents_1996-2022_11yrw_tfidf.h5
Group: cpc
  Group: 1996-2006
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 1997-2007
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 1998-2008
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 1999-2009
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 2000-2010
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 2001-2011
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 2002-2012
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 2003-2013
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 2004-2014
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  Group: 2005-2015
    Dataset: correlations
    Dataset: i_patents
    Dataset: j_patents
  

In [69]:
# Inspect the embeddings of the first window in the 5-year-window Tf-Idf file.
output_fp = Path(f"{storage_path}/output", "patents_tfidf_1976-2022_5yrw.h5")

with h5py.File(output_fp, 'r') as file:
    group = file['embeddings']
    # NOTE(review): the label below says 'windows', but the keys printed are
    # those of the 'embeddings' group.
    print("Keys in the 'windows' group:")
    print(list(group.keys()))
    # One sub-group per preprocessor/model combination
    subgroup = group['prep-tfidf-tfidf']
    print(list(subgroup.keys()))
    # One sub-group per window
    subsubgroup = subgroup['1976-1980']
    print(list(subsubgroup.keys()))
    # Read the whole 'data' dataset of this window into memory
    data = subsubgroup['data'][:]


Keys in the 'windows' group:
['prep-tfidf-tfidf']
['1976-1980', '1977-1981', '1978-1982', '1979-1983', '1980-1984', '1981-1985', '1982-1986', '1983-1987', '1984-1988', '1985-1989', '1986-1990', '1987-1991', '1988-1992', '1989-1993', '1990-1994', '1991-1995', '1992-1996', '1993-1997', '1994-1998', '1995-1999', '1996-2000', '1997-2001', '1998-2002', '1999-2003', '2000-2004', '2001-2005', '2002-2006', '2003-2007', '2004-2008', '2005-2009', '2006-2010', '2007-2011', '2008-2012', '2009-2013', '2010-2014', '2011-2015', '2012-2016', '2013-2017', '2014-2018', '2015-2019', '2016-2020', '2017-2021']
['data', 'indices', 'indptr']


71460467

In [7]:
# Impact/novelty scores for the Doc2Vec embeddings, using 25 jobs.
progr_fp = Path(storage_path, "results", "intermediate")
output_fp = Path(storage_path, "output", "patents_1996-2022_11yrw_doc2vec.h5")

compute_impacts(
    embedding_fp=output_fp,
    output_dir=results_fp,
    progr_dir=progr_fp,
    n_jobs=25,
)

analysis: <docembedder.analysis.DocAnalysis object at 0x7fa77087b4f0>
window: 1996-2006
model: prep-doc2vec-doc2vec


IndexError: index 9970 is out of bounds for axis 0 with size 9969

### 3.2. Output

After the computations are done, novelty and impact scores are written to CSV files in the results folder. There is one file per model, with novelty and impact scores for each exponent. The key column refers back to the patent IDs from the original data.

Below is a list of the resulting files.

In [4]:
# Absolute paths of everything currently in the results folder.
[str(path.absolute()) for path in results_fp.iterdir()]

[]

In [9]:
# Load the Tf-Idf impact/novelty scores written by compute_impacts().
impact_tfidf = pd.read_csv(results_fp / "impact-tfidf.csv")
# impact_doc2vec = pd.read_csv(results_fp / "impact-doc2vec.csv")

In [10]:
# Preview the first rows of the Tf-Idf scores.
impact_tfidf.head()

Unnamed: 0,patent_ids,impact-1.0,novelty-1.0,impact-2.0,novelty-2.0,impact-3.0,novelty-3.0
0,4065812,1.001182,0.477062,1.001191,0.4769,1.001199,0.476735
1,4065813,1.00058,0.488503,1.000586,0.488458,1.000593,0.488412
2,4065814,1.000898,0.491731,1.000902,0.491686,1.000907,0.491641
3,4065815,1.000458,0.491266,1.000462,0.491225,1.000467,0.491182
4,4065816,1.000488,0.481946,1.000486,0.481885,1.000483,0.481823


In [13]:
# Summary statistics for every column of the Tf-Idf scores.
impact_tfidf.describe()

Unnamed: 0,patent_ids,impact-1.0,novelty-1.0,impact-2.0,novelty-2.0,impact-3.0,novelty-3.0
count,6409309.0,6409309.0,6409309.0,6409309.0,6409309.0,6409309.0,6409309.0
mean,7292881.0,0.9999473,0.4810577,0.9999486,0.4809243,0.99995,0.4807864
std,1863551.0,0.001038546,0.004979888,0.001054899,0.005022688,0.001072521,0.005067377
min,4065812.0,0.9929912,0.4507686,0.9926143,0.4505437,0.9922072,0.4503165
25%,5679864.0,0.999301,0.477742,0.9992952,0.4775796,0.9992892,0.477411
50%,7292353.0,0.9999189,0.4811134,0.9999192,0.4809825,0.9999195,0.4808478
75%,8905024.0,1.000541,0.4844514,1.000548,0.4843498,1.000556,0.4842445
max,10524400.0,1.006098,0.5,1.006192,0.5,1.006302,0.5


In [1]:
import matplotlib.pyplot as plt
# Disabled comparisons between models and between exponents; they need
# impact_doc2vec, whose loading is commented out in an earlier cell.
# plt.subplot()
# plt.scatter(x=impact_tfidf["impact-1.0"],y=impact_doc2vec["impact-1.0"])
# plt.show()

# plt.scatter(x=impact_doc2vec["impact-1.0"],y=impact_doc2vec["impact-3.0"])
# plt.show()

# Tf-Idf impact versus novelty for exponent 1.0. impact_tfidf must have been
# loaded in the current kernel session before running this cell.
plt.scatter(x=impact_tfidf["impact-1.0"],y=impact_tfidf["novelty-1.0"])
plt.show()

NameError: name 'impact_tfidf' is not defined

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])