# Imports

In [None]:
import pathlib
import sys
sys.path.insert(0, 'C:/thesis/transfer-learning-with-feature-selection/')
sys.path.insert(0, 'C:/thesis/transfer-learning-with-feature-selection/lib/')
sys.path.insert(0, 'C:/thesis/transfer-learning-with-feature-selection/utils/')
import lib as lib
import json
import os
import torch
from random import sample
from sklearn.utils import resample
import numpy as np
import pandas as pd
from lib.similarity import CentroidSimilarity
from sklearn.svm import SVC,LinearSVC
from lib.feature_selection import FeatureSelectionDiversityPursuitAnova,FeatureSelectionOneVsAllAnova,FeatureSelectionDiversityPursuitKruskal, FeatureSelectionOneVsAllKS
from lib.classifier_with_feature_selection import ClassifierFeatureSelection
from sklearn.metrics import balanced_accuracy_score
from utils.experiment_utils import multiple_classifiers_fit_predict
from utils.experiment_utils import get_feature_extractor, extract_features, preprocessing_model, identity_model, classifiers_hyper_tune
from utils.experiment_utils import scan_experiment
from utils.experiment_utils import get_images_from_supervised_set
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from utils.visualization import bar_plot_scores
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import plotly.graph_objects as go
import plotly
import plotly.express as px
from pathlib import Path
import mlflow
import time
import tiktoken
import pickle
from plotly.subplots import make_subplots
from scipy.spatial.distance import cdist
from utils.experiment_utils import compute_inter_class_distances
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
%load_ext autoreload
%autoreload 2

In [None]:
# directory passed as cache_dir when the pre-trained transformer models are loaded
models_dir = "c:/transformer_models/"
# makedirs(exist_ok=True) also creates missing parent directories and is a no-op when the directory exists
os.makedirs(models_dir, exist_ok=True)

In [None]:
datasets_dir = "c:/datasets/"

# Functions

In [None]:
def parse_code_to_levels(code):
    """
    splits a patent classification code into its five hierarchy levels
    Args:
        code: a whitespace separated code string, e.g. 'H04N 21/4312'
    Returns:
        a tuple of five strings, e.g. ('H', '04', 'N', '21', '43')
    """
    tokens = code.split()
    # the first token is letter, two digits, letter (e.g. 'H04N') - these are levels 1 to 3
    head = tokens[0]
    top_levels = (head[0], head[1:3], head[3])
    # the second token looks like [d]/[d]: level 4 is the part before the '/',
    # level 5 is the first two characters after it
    group_parts = tokens[1].split('/')
    return top_levels + (group_parts[0], group_parts[1][:2])

In [None]:
from typing import Union


def get_code_data(tree: dict, code: list, tree_depth: int = 5):
    """
    collects the ids of all patents stored in the tree under the given code
    Args:
        tree: nested dictionaries whose keys are the classification levels and whose
        leaves are lists of patents
        code: a sequence of keys, one per level, leading to the requested subtree
        tree_depth : int. maximal tree depth (not used by the current implementation)
    Returns:
        patents - a list with every patent found below the given code
    """
    # walk down the tree, one level per entry of the code
    subtree = tree[code[0]]
    for level_key in code[1:]:
        subtree = subtree[level_key]
    # gather the leaves of the reached subtree into a fresh list
    return get_tree_leaves(subtree, [])


def get_tree_leaves(tree: Union[dict,list], leaves: list) -> list:
    """
    a recursive function for extracting tree leaves
    Args:
        tree: a dict (or dict subclass) when it is a tree with further nodes, or a list when it is a leaf
        leaves: a list with currently found leaves. it is extended in place

    Returns:
        the same ``leaves`` list, extended with the leaves found in the input tree
        (depth first, in key insertion order)
    """
    # isinstance (rather than an exact type check) makes dict subclasses such as
    # OrderedDict/defaultdict be descended into instead of being treated as a leaf
    if isinstance(tree, dict):
        for subtree in tree.values():
            leaves = get_tree_leaves(subtree, leaves)
    else:
        leaves.extend(tree)
    return leaves



In [None]:
def get_unique_patents_per_code(codes_to_extract: list, codes_to_patents_mapping: dict):
    """
    finds, for each code, the patents that are classified under it and under none of the other codes
    Args:
        codes_to_extract: the codes to compare. each code must be hashable since it is used as a key of the result
        codes_to_patents_mapping: the nested codes tree passed on to get_code_data
    Returns:
        a dict mapping each code to a list of its unique patents (list order is not defined,
        it comes from a set)
    """
    # traverse the tree once per code instead of once per (code, other code) pair
    patents_per_code = [set(get_code_data(codes_to_patents_mapping, code)) for code in codes_to_extract]

    code_unique_patents = {}
    for idx, code in enumerate(codes_to_extract):
        # unify the patents of all the other codes (by position, so a duplicated code
        # still counts as "other" for its twin)
        other_patents = set().union(*(patents for j, patents in enumerate(patents_per_code) if j != idx))
        code_unique_patents[code] = list(patents_per_code[idx].difference(other_patents))

    return code_unique_patents


def _embed_batch_with_cohere(batch_patents: list, patents_dict: dict, cohere_client, cohere_model: str):
    """
    embeds the abstracts of one batch of patents with a single call to the cohere API
    Args:
        batch_patents: ids of the patents to embed
        patents_dict: maps a patent id to its record, read through get_abstract
        cohere_client: client object exposing ``embed``
        cohere_model: name of the cohere embedding model
    Returns:
        a dict mapping each patent id of the batch to {'emb': embedding}, or None when the API call failed
    """
    texts = [get_abstract(patent_id=p, patents_dict=patents_dict) for p in batch_patents]
    try:
        res = cohere_client.embed(texts=texts, model=cohere_model, input_type="classification", embedding_types=["float"])
    except RemoteProtocolError:
        # NOTE(review): RemoteProtocolError is not imported in the visible part of this file
        # (presumably httpx.RemoteProtocolError) - confirm it is in scope, otherwise a failing
        # request surfaces as a NameError instead of reaching this handler
        print('cohere API error')
        return None
    return {p: {'emb': res.embeddings.float_[i]} for i, p in enumerate(batch_patents)}


def get_embeddings(patents_list: list,
                   patents_dict: dict,
                   embedding_dict: dict,
                   embedding_model: str = 'gpt2',
                   cohere_client = None,
                   cohere_model: str = None,
                   batch_size: int = 96) -> "tuple[np.ndarray, bool]":
    """
    returns the embeddings of the given patents, computing and caching the missing ones
    Args:
        patents_list: ids of the patents to embed
        patents_dict: maps a patent id to its record, read through get_abstract
        embedding_dict: cache mapping a patent id to {'emb': embedding}. updated in place
        embedding_model: 'gpt2' (mean of the last hidden state over the tokens, using the module level
        gpt2_tokenizer / gpt2_model) or 'cohere' (cohere embed API)
        cohere_client: cohere client, required when embedding_model is 'cohere'
        cohere_model: cohere model name, required when embedding_model is 'cohere'
        batch_size: number of abstracts sent in one cohere API call
    Returns:
        (embeddings, valid) - embeddings are stacked in the order of patents_list and valid is True.
        when a cohere call fails, (np.empty((1,)), False) is returned and embedding_dict is left untouched
    Raises:
        ValueError: for an unsupported embedding_model, or (from np.stack) for an empty patents_list
    """
    if embedding_model not in ('gpt2', 'cohere'):
        raise ValueError(f"get_embeddings: unsupported embedding_model '{embedding_model}', expected 'gpt2' or 'cohere'")

    embeddings = []
    if embedding_model == 'gpt2':
        for p in patents_list:
            if p not in embedding_dict:
                text = get_abstract(patent_id=p, patents_dict=patents_dict)
                encoded = gpt2_tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
                with torch.no_grad():
                    output = gpt2_model(**encoded)
                    # average the hidden states over the token axis to get one vector per abstract
                    embedding_dict[p] = {'emb' : output.last_hidden_state.numpy().mean(axis=1).squeeze()}
            # update the embeddings
            embeddings.append(embedding_dict[p]['emb'])
    else:
        assert isinstance(cohere_client, (cohere.client.Client, cohere.client_v2.ClientV2)), "undefined cohere client"
        assert cohere_model is not None, "undefined cohere model"
        # new embeddings are staged here so that a failed API call leaves embedding_dict unchanged
        temp_embedding_dict = {p : {} for p in patents_list}
        batch_patents = []
        for p in patents_list:
            if p in embedding_dict:
                # we have an embedding for this patent in the embedding dict
                temp_embedding_dict[p] = {'emb' : embedding_dict[p]['emb']}
                continue
            batch_patents.append(p)
            if len(batch_patents) == batch_size:
                batch_embeddings = _embed_batch_with_cohere(batch_patents, patents_dict, cohere_client, cohere_model)
                if batch_embeddings is None:
                    return np.empty((1,)), False
                temp_embedding_dict.update(batch_embeddings)
                # restart the batch_patents list for the next batch
                batch_patents = []

        # the loop may end with a partially filled batch that still has to be embedded
        if len(batch_patents) > 0:
            batch_embeddings = _embed_batch_with_cohere(batch_patents, patents_dict, cohere_client, cohere_model)
            if batch_embeddings is None:
                return np.empty((1,)), False
            temp_embedding_dict.update(batch_embeddings)

        # at this point every input patent must have an embedding. copy the new ones from the
        # temp_embedding_dict to the caller's embedding_dict and collect the embeddings
        assert all('emb' in temp_embedding_dict[p] for p in patents_list), "get_embeddings: temp_embedding_dict must have all input patents"
        for p in patents_list:
            if p not in embedding_dict:
                embedding_dict[p] = {'emb' : temp_embedding_dict[p]['emb']}

            embeddings.append(embedding_dict[p]['emb'])

    return np.stack(embeddings, axis=0), True

def get_abstract(patent_id: str, patents_dict: dict) -> str:
    """
    looks up the abstract text of a single patent
    Args:
        patent_id: key of the patent in patents_dict
        patents_dict: maps a patent id to a record holding an 'abstract' entry
    Returns:
        the value stored under 'abstract' for that patent
    """
    return patents_dict[patent_id]['abstract']

In [None]:
# building a tree dictionary - from codes to a list of all patents classified under that code (each patent may be assigned to a few codes)
# the dictionary is built hierarchically - the first level of keys is the first letter of the code, the second level of keys is the next two digits, etc.
# the dict has 5 levels
def get_codes_to_patent_mapping(patents_dict: dict):
    """
    builds the nested codes tree from the patents records
    Args:
        patents_dict: maps a patent id to a record whose 'cpcs' entry lists the codes of that patent
    Returns:
        nested dictionaries keyed by the levels returned from parse_code_to_levels. the innermost
        value is the list of patent ids assigned to that code
    """
    codes_to_patent_mapping = {}
    for patent_id, record in patents_dict.items():
        for cpc_code in record['cpcs']:
            *upper_levels, last_level = parse_code_to_levels(cpc_code)
            # descend through the upper levels, creating the missing nodes on the way
            node = codes_to_patent_mapping
            for level_key in upper_levels:
                node = node.setdefault(level_key, {})
            # the last level holds the list of patents
            node.setdefault(last_level, []).append(patent_id)

    return codes_to_patent_mapping


In [None]:
def num_tokens_from_string(string: str, encoding) -> int:
    """Counts the tokens that ``encoding.encode`` produces for the given text."""
    return len(encoding.encode(string))


# Data Load

## Bert Embeddings

In [None]:
embedding_file_path = os.path.join(datasets_dir,"patent_bert_emb/PatentBert_emb_top100_cpc_class.json")
patents_dict = {}

In [None]:
# the context manager closes the file handle as soon as the JSON is parsed
with open(embedding_file_path) as f:
    embedding_dict = json.load(f)

In [None]:
print(f"there are {len(embedding_dict)} patents in the dictionary")
print(embedding_dict[next(iter(embedding_dict))].keys())

In [None]:
# looking at an example codes (labels) of a given entry
print(embedding_dict[next(iter(embedding_dict))]['cpcs'])

## Text (abstracts)

In [None]:
patents_file_path = os.path.join(datasets_dir,"patents_text/Patents/top100_cpc_class_abstract.json")
embedding_dict = {}

In [None]:
# the abstracts file is read line by line, each line being parsed as one JSON document
patents_dict = []
with open(patents_file_path, 'r') as file:
    for line in file:
        patents_dict.append(json.loads(line))

In [None]:
# turn the list of parsed lines into one dict. only the first key/value pair of each line is kept
# (presumably every line is a single-entry {patent_id: record} dict - confirm)
patents_dict = {list(p.keys())[0] : list(p.values())[0] for p in patents_dict}

In [None]:
print(f"there are {len(patents_dict)} patents in the dictionary")
print(patents_dict[next(iter(patents_dict))].keys())

In [None]:
all_abstracts = [patents_dict[k]['abstract'] for k in patents_dict.keys()]
print(f"there are {len(all_abstracts)} abstracts")

In [None]:
# get the distribution of sentences length
# the length of an abstract is its number of whitespace separated words
abstracts_length = {i : len(all_abstracts[i].split()) for i in range(len(all_abstracts))}
length_dist, bins = np.histogram(list(abstracts_length.values()),bins=50)
# np.histogram returns the bin edges. shift the left edges by half a bin width to plot at the bin centers
bins = bins[:-1] + (bins[1] - bins[0])/2
seq_len_fig = px.bar(x=bins, y=length_dist)
seq_len_fig.update_layout(title='abstract length histogram', width=800)
seq_len_fig.update_xaxes(title="length")
seq_len_fig.update_yaxes(title="count")
seq_len_fig.show()

In [None]:
# keep only the patents whose abstract has more than min_len whitespace separated words
min_len = 32
patents_dict = {k : v for k,v in patents_dict.items() if len(v['abstract'].split()) > min_len}

In [None]:
print(f"there are {len(patents_dict)} patents in the dictionary")
print(patents_dict[next(iter(patents_dict))].keys())

In [None]:
# replace every line break inside the abstracts with a space, in place
for k in patents_dict.keys():
    patents_dict[k]['abstract']= patents_dict[k]['abstract'].replace("\n", " ")
    

In [None]:
all_abstracts = [patents_dict[k]['abstract'] for k in patents_dict.keys()]
print(f"there are {len(all_abstracts)} abstracts")

In [None]:
# looking at an example codes (labels) of a given entry
print(patents_dict[next(iter(patents_dict))]['cpcs'])

# Get OpenAI embeddings

## Create embeddings

In [None]:
from openai import OpenAI
key = ## YOUR KEY

In [None]:
encoding = tiktoken.get_encoding("cl100k_base")
openai_tokens = [num_tokens_from_string(abstract,encoding) for abstract in all_abstracts]

In [None]:
k_tokens_price = 0.00002
num_tokens = np.sum(openai_tokens)
print(f"num_tokens = {num_tokens}, num K tokens = {num_tokens/1000}, price = {(num_tokens/1000)*k_tokens_price}")

In [None]:
print(f"max number of tokens = {np.max(openai_tokens)}")

In [None]:
patent_id_abstract_tuples = [(k,patents_dict[k]['abstract']) for k in patents_dict.keys()]

In [None]:
current_batch = 0


In [None]:
len(patent_id_abstract_tuples)

In [None]:
embedding_model = "text-embedding-3-small"
batch_size = 1500
tokens_per_minute_limit = 1000000
client = OpenAI(api_key=key)
# make sure the output directory exists before paying for any API call
embeddings_out_dir = os.path.join(datasets_dir,"patents_text/openai_small_embeddings/")
os.makedirs(embeddings_out_dir, exist_ok=True)
minute_tic = time.perf_counter()
tokens_last_minute = 0
# NOTE(review): the batch range is hard-coded - presumably 10 resumes an interrupted run and 145 covers
# all abstracts. confirm before re-running
for current_batch in range(10,145):
    batch_start = current_batch*batch_size
    # slicing handles the last (shorter) batch and yields an empty list past the end of the data
    batch = patent_id_abstract_tuples[batch_start:batch_start + batch_size]
    if not batch:
        # an empty request would only fail at the API, so stop here
        print(f"batch {current_batch}: no abstracts left, stopping")
        break
    print(f"getting {len(batch)} embeddings")
    ids = [patent_id for patent_id, _ in batch]
    text = [abstract for _, abstract in batch]

    batch_tokens = np.sum([num_tokens_from_string(s,encoding) for s in text])
    # wait for a new window when sending this batch would exceed the tokens per minute limit
    if tokens_last_minute + batch_tokens > tokens_per_minute_limit:
        print(f"sleeping for {60} sec")
        time.sleep(60)
        minute_tic = time.perf_counter()
        tokens_last_minute = 0
    tokens_last_minute += batch_tokens
    tic = time.perf_counter()
    response = client.embeddings.create(input=text, model=embedding_model)
    assert len(response.data) == len(ids), f"batch {current_batch} response length must match input length"
    current_dict = {patent_id : item.embedding for patent_id, item in zip(ids, response.data)}
    # save every batch to its own file so an interrupted run does not lose the finished batches
    file_name = os.path.join(embeddings_out_dir,f"embedding_batch_{current_batch}.pkl")
    with open(file_name,'wb') as f:
        pickle.dump(current_dict, f)
    for patent_id in ids:
        patents_dict[patent_id]['emb'] = current_dict[patent_id]
    toc = time.perf_counter()
    minute_timer = toc - minute_tic
    # a minute has passed since the window started - start a new window with a fresh token budget
    if minute_timer >= 60:
        minute_tic = time.perf_counter()
        tokens_last_minute = 0
    print(f"batch {current_batch}: {toc - tic:0.4f} seconds passed \n\n")

In [None]:
file_name = os.path.join(datasets_dir,"patents_text/openai_small_embeddings/",f"all_embeddings.pkl")
with open(file_name,'wb') as f:
    pickle.dump(patents_dict, f)

## Load ready embeddings

In [None]:
# in case we already have embeddings in a file
# NOTE(review): this reads from "openai_large_embeddings" while the cell that saves the embeddings
# writes to "openai_small_embeddings" - confirm which directory is intended
# pickle.load can execute arbitrary code, so only load files that were produced by this notebook
file_name = os.path.join(datasets_dir,"patents_text/openai_large_embeddings/",f"all_embeddings.pkl")
with open(file_name,'rb') as f:
    patents_dict = pickle.load(f)

# Cohere Embeddings

In [None]:
import cohere
from cohere import Client
cohere_api_key = ## YOUR KEY
cohere_trial_key = ## YOUR KEY

In [None]:
cohere_trial_client = cohere.Client(cohere_trial_key)

In [None]:
type(cohere_trial_client) == cohere.client.Client

In [None]:
def cohere_num_tokens_from_string(string: str, cohere_tokenizer) -> int:
    """Counts the tokens, special tokens included, that the tokenizer produces for the given text."""
    return len(cohere_tokenizer.encode(sequence=string, add_special_tokens=True).tokens)
    

In [None]:
from tokenizers import Tokenizer  
import requests

In [None]:
cohere_prod_client = cohere.ClientV2(cohere_api_key)
response = cohere_prod_client.models.list()
response_dict = json.loads(response.json())


In [None]:
type(cohere_prod_client)

In [None]:
# look up the tokenizer url of the embedding model in the models list
for model in response_dict['models']:
    if model['name'] == 'embed-english-v3.0':
        tokenizer_url = model['tokenizer_url']
        print(f"tokenizer_url = {tokenizer_url}")
        break
else:
    # without this guard a missing model leaves tokenizer_url undefined (or stale from a previous run)
    # and the failure only shows up in a later cell
    raise LookupError("model 'embed-english-v3.0' was not found in the cohere models list")
        

In [None]:
# get the tokenizer locally
# a timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(tokenizer_url, timeout=60)
# fail here on an HTTP error instead of feeding an error page to the tokenizer parser
response.raise_for_status()
cohere_tokenizer = Tokenizer.from_str(response.text)


In [None]:
cohere_tokens = [cohere_num_tokens_from_string(abstract, cohere_tokenizer) for abstract in all_abstracts]

In [None]:
M_tokens_price = 0.1
num_tokens = np.sum(cohere_tokens)
print(f"num_tokens = {num_tokens}, num 1M tokens = {num_tokens/1_000_000}, price = {(num_tokens/1_000_000)*M_tokens_price}")

In [None]:
# example embedding

model = "embed-english-v2.0"
input_type = "classification"
t1 = time.perf_counter()
res = cohere_trial_client.embed(texts=all_abstracts[:96],
    model=model,
    input_type=input_type,
    embedding_types=["float"],
)
t2 = time.perf_counter()

In [None]:
cohere_model = "embed-english-v2.0"
cohere_embeddings_dict = {}

In [None]:
len(res.embeddings.float_[90])

# Pre-process

## TF-IDF extraction

In [None]:
abstracts = [patents_dict[k]['abstract'] for k in patents_dict.keys()]
print(f"there are {len(abstracts)} abstracts")

In [None]:
# represent each sentence as TF-IDF vector
from sklearn.feature_extraction.text import TfidfVectorizer
min_df = 50
# if a term appears in more than 0.5 of documents, omit it
max_df = 0.5
tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, use_idf=True)
abstract_vectorizer = tfidf_vectorizer.fit(abstracts)


In [None]:
print(f"number of stop words = {len(abstract_vectorizer.stop_words_)}")
print(f"vocab size = {len(abstract_vectorizer.vocabulary_)}")

## Pre-trained language models definition

In [None]:
max_length = 256

### XLnet

In [None]:
from transformers import XLNetModel,AutoTokenizer

xlnet_tokenizer = AutoTokenizer.from_pretrained("xlnet-large-cased", cache_dir=models_dir)
xlnet_model = XLNetModel.from_pretrained("xlnet-large-cased", cache_dir=models_dir)

In [None]:
embed_dim = xlnet_model.config.hidden_size
print(f"embedding dimension = {embed_dim}")

### GPT-2

In [None]:
from transformers import AutoTokenizer, GPT2Model
model_name = 'gpt2'
gpt2_tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=models_dir)
gpt2_model = GPT2Model.from_pretrained(model_name, cache_dir=models_dir)

In [None]:
gpt2_model.embed_dim

In [None]:
embed_data = True
idx = 1
# NOTE(review): experiments_codes and codes_to_patent_mapping are created in later cells of this
# notebook, so those cells have to be executed first
codes = experiments_codes['level_4'][idx]
exp_patents = get_unique_patents_per_code(codes, codes_to_patent_mapping)
min_num_samples = min([len(exp_patents[p]) for p in exp_patents.keys()])
max_num_samples = 100
# every class is down sampled to the size of the smallest class, capped at max_num_samples
num_samples_per_class = min(min_num_samples, max_num_samples)
# collect the abstracts (texts) of the sampled patents in X
X = []
y = []
j = 0
for k in exp_patents.keys():
    if embed_data:
        # get_abstracts is not defined in the visible part of this file, so the texts are
        # collected with get_abstract, one patent at a time
        curr_patents = [get_abstract(patent_id=p, patents_dict=patents_dict) for p in exp_patents[k]]
        curr_patents = resample(curr_patents, replace=False, n_samples=num_samples_per_class)
        X.extend(curr_patents)

In [None]:
input = xlnet_tokenizer(X[:1], truncation=True, padding=True, max_length=max_length,return_tensors="pt")

In [None]:
input['input_ids'].shape

In [None]:
tic = time.perf_counter()
# inference only - no_grad skips the autograd bookkeeping, like the GPT-2 timing cell does
with torch.no_grad():
    output = xlnet_model(**input)
toc = time.perf_counter()
print(f"so far, {toc - tic:0.4f} seconds passed \n\n")

In [None]:
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
tic = time.perf_counter()
batch_size = 1
input = gpt2_tokenizer(X[6:6+batch_size], truncation=True, padding=True, max_length=max_length,return_tensors="pt")
with torch.no_grad():
    output = gpt2_model(**input)
    # with GPT2 embeddings, we take the mean of the hidden states of all tokens
    #gpt2_embeddings[i,:] = output.last_hidden_state[:,:,:].numpy().mean(axis=1)
    #gpt2_embeddings[i,:] = output.last_hidden_state[:,-1,:].numpy()
toc = time.perf_counter()
inference_time = toc - tic
print(f"so far, {inference_time:0.4f} seconds passed \n\n")
print(f"inference time per sentence =  {inference_time/batch_size:0.4f} \n\n")

In [None]:
output.last_hidden_state.shape

## Extract codes for classification

In [None]:
def validate_min_samples(codes, required_samples_num, codes_list, codes_to_patent_mapping):
    """
    appends ``codes`` to ``codes_list`` when each of the codes has enough unique patents
    Args:
        codes: the candidate codes of one experiment
        required_samples_num: the smallest class must have more unique patents than this number
        codes_list: the accepted experiments so far. extended in place
        codes_to_patent_mapping: the nested codes tree
    Returns:
        codes_list, with ``codes`` appended if they passed the check
    """
    unique_patents = get_unique_patents_per_code(codes, codes_to_patent_mapping)
    # the class with the fewest unique patents decides whether the codes are usable
    smallest_class_size = min(len(patents) for patents in unique_patents.values())
    if smallest_class_size > required_samples_num:
        codes_list.append(codes)
    return codes_list


In [None]:
codes_to_patent_mapping = get_codes_to_patent_mapping(patents_dict=patents_dict)

In [None]:
num_experiments = 50
num_classes = 3
required_samples_num = 100
# upper bound on the sampling attempts per level, so that a level without enough valid
# code sets raises an error instead of looping forever
max_attempts_per_level = 100_000
experiments_codes = {f'level_{i}' : [] for i in range(1,6)}

all_first_level_codes = list(codes_to_patent_mapping.keys())
assert num_classes <= len(all_first_level_codes), "number off classes cannot exceed number of possible codes"


def _sample_experiment_codes(tree, first_level_codes, level, num_classes, min_ancestor_pool):
    """
    randomly picks ``num_classes`` sibling codes at the given level of the codes tree
    Args:
        tree: the nested codes tree
        first_level_codes: the keys of the first tree level
        level: depth of the sampled codes (1 is the first level)
        num_classes: number of codes to sample
        min_ancestor_pool: minimal number of keys every pool above the sampled level must have
    Returns:
        a list of codes - plain first level keys for level 1, tuples of keys for deeper levels -
        or None when a pool on the sampled path is too small
    """
    if level == 1:
        return sample(first_level_codes, num_classes)

    # pick a random path of single keys down to the parent of the sampled level
    prefix = (sample(first_level_codes, 1)[0],)
    node = tree[prefix[0]]
    for _ in range(level - 2):
        pool = list(node.keys())
        if len(pool) < min_ancestor_pool:
            return None
        chosen = sample(pool, 1)[0]
        prefix += (chosen,)
        node = node[chosen]

    # sample the sibling codes under the chosen parent
    pool = list(node.keys())
    if len(pool) < num_classes:
        return None
    return [prefix + (c,) for c in sample(pool, num_classes)]


# levels 2-4 require num_classes keys in every pool on the way down, level 5 only requires
# a non empty pool above the sampled level
min_ancestor_pool_per_level = {1: num_classes, 2: num_classes, 3: num_classes, 4: num_classes, 5: 1}
for depth in range(1, 6):
    level_name = f'level_{depth}'
    attempts = 0
    while len(experiments_codes[level_name]) < num_experiments:
        attempts += 1
        if attempts > max_attempts_per_level:
            raise RuntimeError(f"could not generate {num_experiments} experiments for {level_name} in {max_attempts_per_level} attempts")
        # generate codes for current experiment
        codes = _sample_experiment_codes(codes_to_patent_mapping, all_first_level_codes, depth, num_classes, min_ancestor_pool_per_level[depth])
        if codes is None:
            continue
        experiments_codes[level_name] = validate_min_samples(codes, required_samples_num, experiments_codes[level_name], codes_to_patent_mapping)

In [None]:
# sanity check that all levels have codes
# report how many code sets were generated for each level
for level, level_codes in experiments_codes.items():
    print(f"{level} has {len(level_codes)} codes")

# classification experiment

In [None]:
from sklearn.linear_model import LogisticRegression

num_spc = 30 # number of train samples per class
num_experiments_in_code = 3 # for each set of codes (dataset with labels) repeat the experiment with different train/test splits
feature_dim = 4096 #1536


beta = 0.3
gamma = feature_dim**(-beta)
print(f"gamma = {gamma}, max num features = {int(feature_dim*gamma)}")
verbosity = False
hc_stbl = False
hc_method = 'jin'
use_emp_cdf_in_hc_obj = False
override_inf_nan_stat = False
# all the feature selectors share one configuration. building it once keeps them in sync and
# makes the use_emp_cdf_in_hc_obj setting above actually reach the selectors
feature_selection_kwargs = dict(hc_gamma=gamma,
                                hc_stbl=hc_stbl,
                                hc_method=hc_method,
                                use_emp_cdf_in_hc_obj=use_emp_cdf_in_hc_obj,
                                override_inf_nan_stat=override_inf_nan_stat,
                                transformer=None, verbosity=verbosity)

dp_fs_anova = FeatureSelectionDiversityPursuitAnova(**feature_selection_kwargs)

dp_fs_k_anova = FeatureSelectionDiversityPursuitKruskal(**feature_selection_kwargs)


ova_fs_ks = FeatureSelectionOneVsAllKS(**feature_selection_kwargs)
ova_fs_t_test = FeatureSelectionOneVsAllAnova(**feature_selection_kwargs)


# hyper parameter grids shared by the pipelines below
c_grid = [0.001,0.01,0.1,1,10,100]
svm_kernel_grid = ['linear', 'rbf', 'poly']
max_iter = [700]
svm_tol = [5e-3]
# linear SVM with an l1 penalty. the 'scaling' step is searched between no scaling ('passthrough')
# and a StandardScaler
svm_with_feat_sel_pipe = Pipeline([('scaling', StandardScaler()), ('clf', LinearSVC())])
svm_with_feat_sel_pipe_parameters = dict(scaling=['passthrough', StandardScaler()],
                                         clf__C=c_grid,
                                         clf__penalty=['l1'],
                                         clf__dual=['auto'],
                                         #clf__kernel=svm_kernel_grid,
                                         clf__max_iter=max_iter,
                                         clf__tol=svm_tol)

# kernel SVM searched over C and the kernel type
svm_without_feat_sel_pipe = Pipeline([('scaling', StandardScaler()), ('clf', SVC())])
svm_without_feat_sel_pipe_parameters = dict(scaling=['passthrough', StandardScaler()],
                                         clf__C=c_grid,
                                         clf__kernel=svm_kernel_grid,
                                         clf__max_iter=max_iter,
                                         clf__tol=svm_tol)

# logistic regression with an l1 penalty (liblinear solver)
log_reg_pipe = Pipeline([('scaling', StandardScaler()), ('clf', LogisticRegression())])
log_reg_pipe_parameters = {'scaling' : ['passthrough', StandardScaler()],
                           'clf__C' : c_grid,
                           'clf__penalty' : ['l1'],
                           'clf__solver' : ['liblinear'],
                           'clf__max_iter' : max_iter,
                           'clf__tol' : [5e-4]}


# logistic regression without a penalty, so C is not part of its grid
log_reg_fs_pipe = Pipeline([('scaling', StandardScaler()), ('clf', LogisticRegression())])
log_reg_fs_pipe_parameters = {'scaling' : ['passthrough', StandardScaler()],
                           'clf__penalty' : [None],
                           'clf__max_iter' : max_iter,
                           'clf__tol' : [5e-4]}


# NOTE(review): this pipeline and its grid are identical to svm_without_feat_sel_pipe above
svm_fs_pipe = Pipeline([('scaling', StandardScaler()), ('clf', SVC())])
svm_fs_pipe_parameters = dict(scaling=['passthrough', StandardScaler()],
                                         clf__C=c_grid,
                                         clf__kernel=svm_kernel_grid,
                                         clf__max_iter=max_iter,
                                         clf__tol=svm_tol)

# grids for the tree ensembles. gradient boosting additionally searches the learning rate
n_estimators_grid = [10,20,50,100]
max_depth_grid = [3,5,7]
learning_rate_grid = [0.05, 0.1, 0.3]
gb_cls_parameters = {'learning_rate' : learning_rate_grid, 'n_estimators' : n_estimators_grid, 'max_depth' : max_depth_grid}
gb_cls = GradientBoostingClassifier()
rf_cls_parameters = {'n_estimators' : n_estimators_grid, 'max_depth' : max_depth_grid}
rf_cls = RandomForestClassifier()


In [None]:
# defining classifiers
# base centroid similarity classifier, followed by the same classifier wrapped with each feature selector.
# every wrapper gets its own CentroidSimilarity instance
use_euclidian_distance = False
cs_cls  = CentroidSimilarity(use_euclidian_distance=use_euclidian_distance)
cs_fs_dp = ClassifierFeatureSelection(CentroidSimilarity(use_euclidian_distance=use_euclidian_distance), dp_fs_anova)
cs_fs_ova = ClassifierFeatureSelection(CentroidSimilarity(use_euclidian_distance=use_euclidian_distance), ova_fs_t_test)
cs_fs_ova_ks = ClassifierFeatureSelection(CentroidSimilarity(use_euclidian_distance=use_euclidian_distance), ova_fs_ks)
# NOTE(review): four classifiers are defined in this cell while the count below is 5 - presumably
# one more classifier is added elsewhere, confirm
num_classifiers = 5 # including base classifiers


In [None]:
# Run the classification experiments for every code set of the selected hierarchy levels.
# For each code set: collect the patent embeddings of every class into (X, y), add small
# Gaussian noise, then repeat `num_experiments_in_code` random stratified train/test
# splits. On each split the SVM and logistic-regression pipelines are grid-searched on the
# training part and all classifiers are scored on the test part.
# Outputs (one entry per successfully processed code set):
#   code_acc      - arrays of shape (num_classifiers, num_experiments_in_code) with accuracies
#   code_num_feat - arrays of the same shape with the number of features used
#   exp_codes     - the code set (as a tuple) each entry belongs to
#max_num_samples = 3*required_samples_num
noise_std = 0.01
levels_to_visualize = ['level_3', 'level_4', 'level_5']
exp_codes = []
code_num_feat = []
code_acc = []
for level in levels_to_visualize:    #experiments_codes.keys():
    print(f"classification in {level}")
    print("---------------------------------------------------------------------------------------------------------")

    exp_cnt = 0
    for codes in experiments_codes[level]:
        exp_patents = get_unique_patents_per_code(codes, codes_to_patent_mapping)
        exp_cnt += 1
        print(f"running exp {exp_cnt}")
        print(f"selected codes = {codes}")
        # informational only: the smallest class size is printed but not used for sampling
        min_num_samples = min([len(exp_patents[p]) for p in exp_patents.keys()])
        print(f"min num samples = {min_num_samples}")
        # get the patents embeddings and arrange in an array
        X = []
        y = []
        # j is the integer class label assigned to the current key of exp_patents
        j = 0
        for k in exp_patents.keys():
            #curr_patents = resample(curr_patents, replace=False, n_samples=min_num_samples)
            # cap each class at required_samples_num patents (sampled without replacement)
            if len(exp_patents[k]) > required_samples_num:
                curr_patents = resample(exp_patents[k], replace=False, n_samples=required_samples_num)
            else:
                curr_patents = exp_patents[k].copy()
                
            curr_embeddings, valid_embeddings = get_embeddings(patents_list=curr_patents, patents_dict=patents_dict, embedding_model='cohere', embedding_dict=cohere_embeddings_dict, cohere_client=cohere_prod_client, cohere_model=cohere_model)
            if not valid_embeddings:
                break
            X.append(curr_embeddings)
            y.append(j * np.ones((curr_embeddings.shape[0],)))
            j += 1
        # NOTE(review): valid_embeddings is only assigned inside the loop above; if
        # exp_patents were empty this would read a stale value from a previous code set
        # (or raise NameError on the very first one) - confirm exp_patents is never empty.
        if not valid_embeddings:
            print('skipping experiment - failed to obtain embeddings')
            continue
        X = np.concatenate(X, axis=0)
        y = np.concatenate(y, axis=0)
        print(f"X shape = {X.shape}")
        print(f"y shape = {y.shape}")
        # add random noise
        X = X + noise_std * np.random.randn(*(X.shape))
        # one row per classifier, one column per repeated split
        # NOTE(review): num_classifiers must equal len(classifiers) built below.
        accuracies = np.empty((num_classifiers, num_experiments_in_code))
        num_features = np.empty_like(accuracies)
        for i in range(num_experiments_in_code):
            # split to train and test
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=num_spc*num_classes, stratify=y)
            # tune the SVM classifier using the train data
            # the number of CV folds is capped at 5 and never exceeds num_spc
            num_splits = min(num_spc,5)
            skf = StratifiedKFold(n_splits=num_splits)
            
            #svm_with_feat_sel_search = GridSearchCV(svm_with_feat_sel_pipe, svm_with_feat_sel_pipe_parameters, cv=skf, scoring='accuracy', n_jobs=4)
            #svm_with_feat_sel_search.fit(X_train, y_train)
            #svm_with_feat_sel_best = svm_with_feat_sel_search.best_estimator_
            #print(f"best SVM with feat sel grid search score = {svm_with_feat_sel_search.best_score_}")

            svm_without_feat_sel_search = GridSearchCV(svm_without_feat_sel_pipe, svm_without_feat_sel_pipe_parameters, cv=skf, scoring='accuracy', n_jobs=4)
            svm_without_feat_sel_search.fit(X_train, y_train)
            svm_without_feat_sel_best = svm_without_feat_sel_search.best_estimator_
            print(f"best SVM without feat sel grid search score = {svm_without_feat_sel_search.best_score_}")

            log_reg_search = GridSearchCV(log_reg_pipe, log_reg_pipe_parameters, cv=skf, scoring='accuracy', n_jobs=4)
            log_reg_search.fit(X_train, y_train)
            log_reg_best = log_reg_search.best_estimator_
            print(f"best LASSO grid search score = {log_reg_search.best_score_}")

            
            #X_train_fs = dp_fs_k_anova.fit_transform(X_train, y_train)
            # SVM with HC-ANOVA feature selection grid search
            #svm_fs_search = GridSearchCV(svm_fs_pipe, svm_fs_pipe_parameters, cv=skf, scoring='accuracy', n_jobs=4)
            #svm_fs_search.fit(X_train_fs, y_train)
            #svm_fs_best = Pipeline([('feat_sel', dp_fs_k_anova), ('clf', svm_fs_search.best_estimator_)])
            #print(f"best SVM with HC-ANOVA feature selection grid search score = {svm_fs_search.best_score_}")

            # Log Reg with HC-ANOVA feature selection grid search
            #log_reg_fs_search = GridSearchCV(log_reg_fs_pipe, log_reg_fs_pipe_parameters, cv=skf, scoring='accuracy', n_jobs=4)
            #log_reg_fs_search.fit(X_train_fs, y_train)
            #log_reg_fs_best = Pipeline([('feat_sel', dp_fs_k_anova), ('clf', log_reg_fs_search.best_estimator_)])
            #print(f"best Logistic regression with HC-ANOVA feature selection grid search score = {log_reg_fs_search.best_score_}")

            #gb_search = GridSearchCV(gb_cls, gb_cls_parameters, cv=skf, scoring='accuracy', n_jobs=4)
            #gb_search.fit(X_train, y_train)
            #gb_best = gb_search.best_estimator_
            #print(f"best Grad boost grid search score = {gb_search.best_score_}")

            #rf_search = GridSearchCV(rf_cls, rf_cls_parameters, cv=skf, scoring='accuracy', n_jobs=4)
            #rf_search.fit(X_train, y_train)
            #rf_best = rf_search.best_estimator_
            #print(f"best random forest grid search score = {rf_search.best_score_}")

            # classifiers and cls_names are parallel lists: cls_names[n] labels classifiers[n]
            # NOTE(review): both are (re)assigned inside this loop but cls_names is read
            # after it; with num_experiments_in_code == 0 it would be undefined.
            classifiers = [cs_cls,  cs_fs_dp,   cs_fs_ova,   svm_without_feat_sel_best,  log_reg_best]
            cls_names = ['cs_all', 'cs_fs_dp', 'cs_fs_ova',  'svm_without_feat_sel'    , 'log_reg']

            #classifiers = [svm_without_feat_sel_best, log_reg_best, svm_fs_best, log_reg_fs_best]
            #cls_names = ['SVM', 'log reg Lasso', 'SVM HC ANOVA', 'Log Reg HC ANOVA']


            # column i receives the scores / feature counts of all classifiers for this split
            accuracies[:, i], num_features[:, i] = multiple_classifiers_fit_predict(classifiers=classifiers,
                                                                                    X_train=X_train,
                                                                                    y_train=y_train,
                                                                                    X_test=X_test,
                                                                                    y_test=y_test,
                                                                                    score_func=accuracy_score,
                                                                                    preprocess_func=None,
                                                                                    get_features=False,
                                                                                    scaler_func=None)
            

        
        # get the average accuracy for the current code
        mean_acc = np.squeeze(np.mean(accuracies,axis=1))
        print(f"mean accuracies: \n  {[f'{k}:{v}' for (k,v) in dict(zip(cls_names,list(mean_acc))).items()]}")
        mean_num_features = np.squeeze(np.mean(num_features,axis=1))
        print(f"mean num features: \n  {[f'{k}:{v}' for (k,v) in dict(zip(cls_names,list(mean_num_features))).items()]}")
        # store the results
        code_acc.append(accuracies)
        code_num_feat.append(num_features)
        exp_codes.append(tuple(codes))


### Analysis

In [None]:
# Stack the per-code-set result arrays into shape
# (n_code_sets, n_classifiers, n_repeats) and average over the repeats axis.
# np.mean(axis=2) already removes the reduced axis, so no np.squeeze is applied:
# squeezing would additionally collapse a length-1 code-set (or classifier) axis
# and break the `[:, i]` column indexing used to build the result tables.
code_acc_all_exp = np.mean(np.stack(code_acc, axis=0), axis=2)
code_num_feat_all_exp = np.mean(np.stack(code_num_feat, axis=0), axis=2)
code_acc_all_exp.shape

In [None]:
# Turn the averaged accuracy array into a table with one column per classifier.
# NOTE: this rebinds code_acc_all_exp from an array to a DataFrame.
code_acc_all_exp = pd.DataFrame(
    {name: code_acc_all_exp[:, col] for col, name in enumerate(cls_names)}
)
code_acc_all_exp['codes'] = exp_codes
# exp_level is the length of the first code in every code set
# (presumably the hierarchy depth of the experiment - confirm).
exp_level = [len(code_set[0]) for code_set in exp_codes]
code_acc_all_exp['exp_level'] = exp_level
code_acc_all_exp

In [None]:
# Table of the averaged number of features, one column per classifier,
# annotated with the code set and its level.
code_num_features_all_exp = pd.DataFrame(
    {name: code_num_feat_all_exp[:, col] for col, name in enumerate(cls_names)}
)
code_num_features_all_exp['codes'] = exp_codes
code_num_features_all_exp['exp_level'] = exp_level
code_num_features_all_exp

In [None]:
# Collects the plotly figures appended by the plotting cells; the list is later
# passed to gen_html_report to build the HTML artifact.
figures = []

In [None]:
# Mean accuracy of every classifier per experiment level, drawn as grouped bars.
gb = code_acc_all_exp.drop(columns='codes').groupby('exp_level').mean()
cols = gb.columns.tolist()
# expose the group key as a regular column so it can be used as the x axis
gb['exp_level'] = gb.index
acc_fig = px.bar(gb, x='exp_level', y=cols)
acc_fig.update_layout(
    barmode='group',
    title='classifiers accuracy over code levels',
).update_yaxes({'title': 'accuracy'})
acc_fig.show()
figures.append(acc_fig)

In [None]:
# Mean number of features of every classifier per experiment level, as grouped bars.
gb = code_num_features_all_exp.drop(columns='codes').groupby('exp_level').mean()
cols = gb.columns.tolist()
# expose the group key as a regular column so it can be used as the x axis
gb['exp_level'] = gb.index
fig = px.bar(gb, x='exp_level', y=cols)
fig.update_layout(
    barmode='group',
    title='classifiers mean num features over code levels',
).update_yaxes({'title': 'num features'})
fig.show()
figures.append(fig)

In [None]:
# Dump the in-memory embeddings dictionary to a pickle file under datasets_dir.
print(f"total patents embedded = {len(cohere_embeddings_dict)}")
embeddings_dir = os.path.join(datasets_dir, f"patents_text/cohere_{cohere_model}_embeddings/")
if not os.path.isdir(embeddings_dir):
    # makedirs (unlike mkdir) also creates the intermediate "patents_text"
    # directory when it is missing instead of raising FileNotFoundError.
    os.makedirs(embeddings_dir, exist_ok=True)
    print("making dir for embeddings")
file_name = os.path.join(embeddings_dir, "all_embeddings.pkl")
with open(file_name, 'wb') as f:
    pickle.dump(cohere_embeddings_dict, f)

In [None]:
def get_available_embeddings(curr_patents: list, embedding_dict: dict) -> np.ndarray:
    """Stack the embeddings of the patents that have an entry in ``embedding_dict``.

    Patents without an entry are skipped; the order of ``curr_patents`` is kept.

    Args:
        curr_patents: patent identifiers to look up.
        embedding_dict: mapping from patent identifier to a record that holds
            the embedding vector under the ``'emb'`` key.

    Returns:
        The found embeddings stacked along a new first axis.

    Raises:
        ValueError: if none of the requested patents has an entry, since there
            is nothing to stack in that case.
    """
    embeddings = [embedding_dict[p]['emb'] for p in curr_patents if p in embedding_dict]
    if not embeddings:
        raise ValueError(
            "none of the requested patents has an entry in embedding_dict"
        )
    return np.stack(embeddings, axis=0)
 
# For every code set of each level, compute the mean embedding of every class and
# collect the pairwise Euclidean distances between those class means.
levels = ['level_3', 'level_4', 'level_5']
level_distances = {name: [] for name in levels}
for level in levels:
    for codes in experiments_codes[level]:
        exp_patents = get_unique_patents_per_code(codes, codes_to_patent_mapping)
        # one mean vector per class (code) of the current code set
        means = []
        for class_patents in exp_patents.values():
            # use at most 1000 patents per class, sampled without replacement
            if len(class_patents) > 1000:
                curr_patents = resample(class_patents, replace=False, n_samples=1000)
            else:
                curr_patents = class_patents.copy()
            curr_embeddings = get_available_embeddings(
                curr_patents=curr_patents,
                embedding_dict=cohere_embeddings_dict,
            )
            means.append(curr_embeddings.mean(axis=0))
        means = np.vstack(means)
        distances = cdist(means, means, metric='euclidean')
        # the strict upper triangle holds every unordered class pair exactly once
        pair_rows, pair_cols = np.triu_indices(len(means), k=1)
        level_distances[level].extend(distances[pair_rows, pair_cols])
            

In [None]:
# 10-bin histogram (counts, bin edges) of the class-mean distances of every level.
level_distances_hist = {
    level_key: np.histogram(level_values, bins=10)
    for level_key, level_values in level_distances.items()
}
level_names_dict = dict(
    level_3='sub-class level',
    level_4='group level',
    level_5='sub-group level',
)
# One bar trace per level; every bar is placed at the left edge of its bin.
hist_traces = [
    go.Bar(x=bin_edges[:-1], y=counts, name=level_names_dict[level_key], opacity=0.75)
    for level_key, (counts, bin_edges) in level_distances_hist.items()
]
fig = go.Figure(data=hist_traces)
fig.update_layout(
    #legend = dict(font=dict(size=20)),
    #legend_font_size = 20,
    #xaxis_title_font_size = 20,
    font=dict(size=20),
    xaxis_title='Distance',
    yaxis_title='Count',
    barmode='overlay',  # Overlay histograms to compare
    template='plotly',
    bargap=0.2,
)
fig.show()
#figures.append(fig)

### Store results

In [None]:
import mlflow

In [None]:
def add_local_dicts_to_run_params(run_params: dict, local_dict: dict, local_dict_name: str):
    """Copy every entry of ``local_dict`` into ``run_params`` under a prefixed key.

    An entry with key ``k`` is stored as ``f'{local_dict_name}__{k}'``.
    ``run_params`` is modified in place and nothing is returned.
    """
    run_params.update(
        (f'{local_dict_name}__{key}', value) for key, value in local_dict.items()
    )

In [None]:
# Prepare an empty "run_artifacts" folder under the current working directory:
# create it when missing, otherwise delete the entries left from a previous run.
cwd = os.getcwd()
artifacts_dir = os.path.join(cwd, 'run_artifacts')
print(f"artifacts dir = {artifacts_dir}")
if os.path.isdir(artifacts_dir):
    stale_names = os.listdir(artifacts_dir)
    if stale_names:
        # remove all files
        print("cleaning artifacts dir")
        for stale_name in stale_names:
            os.remove(os.path.join(artifacts_dir, stale_name))
else:
    os.mkdir(artifacts_dir)


In [None]:
# Select the MLflow experiment named "Patent embedding classification" and keep
# the returned experiment object; its id is read in the following cell.
experiment = mlflow.set_experiment(experiment_name="Patent embedding classification")

In [None]:
# Id of the selected experiment; passed to mlflow.start_run when the run is logged.
experiment_id = experiment.experiment_id
print(f"experiment_id = {experiment_id}")

In [None]:
# Name under which this run is recorded (a fixed string, no placeholders).
run_name = "run_3_levels_cohere_large_embed"
print(f"run_name = {run_name}")

In [None]:
# Write both result tables as CSV files into the artifacts folder.
for _frame, _csv_name in (
    (code_num_features_all_exp, 'num_features_all_exp.csv'),
    (code_acc_all_exp, 'acc_all_exp.csv'),
):
    _frame.to_csv(os.path.join(artifacts_dir, _csv_name))

In [None]:
from utils.experiment_utils import gen_html_report

# Export the collected figures to a single HTML file in the artifacts folder.
figures_html_path = os.path.join(artifacts_dir, 'figures.html')
gen_html_report(html_path=figures_html_path, items=figures)

In [None]:
# Free-text description stored with the MLflow run; adjacent f-strings are
# concatenated into one string.
description = (
    f'perform classification over 3 levels of codes. '
    f'from each level, sample {num_experiments} sets of {num_classes} codes. '
    f'the problem is a multi-class classification of {num_classes} classes. '
    f'for each set, perform {num_experiments_in_code} experiments. '
    f'in each experiment, perform a grid search to find the best tuning for the base classifiers in the experiment. '
)

description

In [None]:
# Scalar settings of the run, logged as MLflow parameters.
run_params = dict(
    num_classes=num_classes,
    num_spc=num_spc,
    required_samples_num=required_samples_num,
    hc_stbl=hc_stbl,
    hc_method=hc_method,
    hc_gamma=gamma,
    num_experiments=num_experiments,
    num_experiments_in_code=num_experiments_in_code,
)

# Append every hyper-parameter grid, each key prefixed with the name of its grid.
for _grid_name, _grid in (
    ('svm_without_feat_sel_pipe_parameters', svm_without_feat_sel_pipe_parameters),
    ('svm_with_feat_sel_pipe_parameters', svm_with_feat_sel_pipe_parameters),
    ('log_reg_pipe_parameters', log_reg_pipe_parameters),
):
    add_local_dicts_to_run_params(run_params, _grid, _grid_name)
print(run_params)

In [None]:
# Record the parameters and the artifacts folder in a new MLflow run.
# The run is used as a context manager so that it is always ended, even when
# logging raises (for example on a rejected parameter value); otherwise the run
# would stay active and the next mlflow.start_run call would fail.
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, description=description) as curr_run:
    mlflow.log_params(run_params)
    mlflow.log_artifact(local_path=artifacts_dir)


### Observe P-values

In [None]:
# Fit a feature selector on the current X_train / y_train, plot the histogram of
# its p-values and show the feature correlation matrix.
#fs_scaler = StandardScaler(with_mean=False)
#fs_scaler = PowerTransformer(standardize=False)
fs_scaler = None
dp_fs_anova = FeatureSelectionDiversityPursuitAnova(
    hc_gamma=0.8,
    hc_stbl=False,
    hc_method='jin',
    use_emp_cdf_in_hc_obj=use_emp_cdf_in_hc_obj,
    transformer=fs_scaler,
    verbosity=True,
)
dp_fs_anova.fit(X_train, y_train)
print(f"feature selector hct = {dp_fs_anova.hct}")
print(f"num selected features = {dp_fs_anova.get_num_selected_features()}")

mi_hist, bins = np.histogram(dp_fs_anova.pvals, bins=100)
# shift the left bin edges by half a bin width to obtain the bin centres
bins = (bins[1] - bins[0]) / 2 + bins[:-1]
fig = px.bar(x=bins, y=mi_hist)
fig.update_layout(title='pval histogram', width=800).update_xaxes(title="p-val").update_yaxes(title="count")
fig.show()

# correlation coefficients between the columns (features) of X_train
X_cov = np.corrcoef(X_train, rowvar=False)
print(f"X_cov.shape = {X_cov.shape}")
fig = px.imshow(X_cov)
fig.show()

# EDA

In [None]:
from sklearn.manifold import TSNE
from random import sample
from sklearn.decomposition import PCA


In [None]:
# Display the keys of experiments_codes (the level names it is indexed by elsewhere).
experiments_codes.keys()

In [None]:
def _empty_codes_for_vis():
    """Return a fresh mapping 'level_1'..'level_5' -> empty list."""
    return {f'level_{i}': [] for i in range(1, 6)}


codes_for_vis = _empty_codes_for_vis()

# Keep sampling a random path through the code hierarchy until levels 3, 4 and 5
# each hold at least one accepted code set. Whenever a pool on the path is too
# small, everything collected so far is discarded and sampling restarts.
while any(len(codes_for_vis[name]) < 1 for name in ('level_5', 'level_4', 'level_3')):
    # generate codes for current experiment
    first_level_code = sample(all_first_level_codes, 1)[0]
    codes_for_vis['level_1'] = first_level_code
    first_level_tree = codes_to_patent_mapping[first_level_code]
    second_level_pool = list(first_level_tree.keys())
    if not second_level_pool:
        codes_for_vis = _empty_codes_for_vis()
        continue

    second_level_code = sample(second_level_pool, 1)[0]
    codes_for_vis['level_2'] = second_level_code
    second_level_tree = first_level_tree[second_level_code]
    third_level_pool = list(second_level_tree.keys())
    if len(third_level_pool) < num_classes:
        codes_for_vis = _empty_codes_for_vis()
        continue

    third_level_codes = sample(third_level_pool, num_classes)
    codes = [(first_level_code, second_level_code, c) for c in third_level_codes]
    codes_for_vis['level_3'] = validate_min_samples(
        codes, required_samples_num, codes_for_vis['level_3'], codes_to_patent_mapping
    )

    # descend below the first sampled third-level code
    third_level_code = third_level_codes[0]
    third_level_tree = second_level_tree[third_level_code]
    four_level_pool = list(third_level_tree.keys())
    if len(four_level_pool) < num_classes:
        codes_for_vis = _empty_codes_for_vis()
        continue

    four_level_codes = sample(four_level_pool, num_classes)
    # descend below the first sampled fourth-level code
    four_level_code = four_level_codes[0]
    codes = [(first_level_code, second_level_code, third_level_code, c) for c in four_level_codes]
    codes_for_vis['level_4'] = validate_min_samples(
        codes, required_samples_num, codes_for_vis['level_4'], codes_to_patent_mapping
    )

    fifth_level_pool = list(third_level_tree[four_level_code].keys())
    if len(fifth_level_pool) < num_classes:
        codes_for_vis = _empty_codes_for_vis()
        continue

    fifth_level_codes = sample(fifth_level_pool, num_classes)
    codes = [
        (first_level_code, second_level_code, third_level_code, four_level_code, c)
        for c in fifth_level_codes
    ]
    codes_for_vis['level_5'] = validate_min_samples(
        codes, required_samples_num, codes_for_vis['level_5'], codes_to_patent_mapping
    )
    #experiments_codes['level_5'] = validate_min_samples(codes, required_samples_num, experiments_codes['level_5'], codes_to_patent_mapping)

codes_for_vis

In [None]:
# For one randomly chosen code set of every level, project the patent embeddings
# to 2-D with t-SNE (one subplot row per level, points coloured by class) and
# compute the inter-class distances of the same samples.
required_samples_num = 300
levels_to_visualize = ['level_3', 'level_4', 'level_5']
tsne_fig = make_subplots(rows=len(levels_to_visualize), cols=1, shared_xaxes=True)

p = 5  # t-SNE perplexity
distances = {}
for i, level in enumerate(levels_to_visualize):
    codes = sample(codes_for_vis[level], 1)[0]
    print(f"selected codes = {codes}")
    exp_patents = get_unique_patents_per_code(codes, codes_to_patent_mapping)
    # informational only: the smallest class size is printed but not used for sampling
    min_num_samples = min(len(class_patents) for class_patents in exp_patents.values())
    print(f"min num samples = {min_num_samples}")
    # get the patents embeddings and arrange in an array
    X = []
    y = []
    valid_embeddings = True
    # j is the integer class label of the current key of exp_patents
    for j, k in enumerate(exp_patents.keys()):
        #curr_patents = resample(curr_patents, replace=False, n_samples=min_num_samples)
        # cap each class at required_samples_num patents (sampled without replacement)
        if len(exp_patents[k]) > required_samples_num:
            curr_patents = resample(exp_patents[k], replace=False, n_samples=required_samples_num)
        else:
            curr_patents = exp_patents[k].copy()
        # Same call as in the classification experiment loop: get_embeddings returns
        # an (embeddings, valid) pair and reads the cohere embeddings cache. The
        # previous call here bound the whole pair to curr_embeddings (so the
        # `.shape` access below failed) and passed patents_dict as the cache.
        curr_embeddings, valid_embeddings = get_embeddings(patents_list=curr_patents, patents_dict=patents_dict, embedding_model='cohere', embedding_dict=cohere_embeddings_dict, cohere_client=cohere_prod_client, cohere_model=cohere_model)
        if not valid_embeddings:
            break
        X.append(curr_embeddings)
        y.append(j * np.ones((curr_embeddings.shape[0],)))
    if not valid_embeddings:
        # leave this subplot row empty rather than plotting a partial class set
        print('skipping level - failed to obtain embeddings')
        continue
    X = np.concatenate(X, axis=0)
    y = np.concatenate(y, axis=0)
    print(f"X shape = {X.shape}")
    print(f"y shape = {y.shape}")

    tsne_transformer = TSNE(n_components=2, learning_rate='auto', perplexity=p)
    X_embedded = tsne_transformer.fit_transform(X)
    print(X_embedded.shape)
    print(f"perplexity = {p}, KL = {tsne_transformer.kl_divergence_}")
    tsne_fig.add_trace(go.Scatter(x=X_embedded[:,0], y=X_embedded[:,1], mode='markers', marker_color=y), row=i+1, col=1)
    # NOTE(review): classes with fewer than required_samples_num patents contribute
    # fewer rows to X - confirm compute_inter_class_distances tolerates that.
    distances[level] = compute_inter_class_distances(X, num_samples_per_class=required_samples_num)
tsne_fig.update_layout(height=1200, width=600, title_text="patents projection")
tsne_fig.show()

In [None]:
# 20-bin histogram (counts, bin edges) of the inter-class distances of every level.
[np.histogram(level_values, bins=20) for level_values in distances.values()]