In [24]:
import pandas as pd
import numpy as np
import gensim

from itertools import combinations
from tqdm.notebook import tqdm

import random
import time

from multiprocessing import Process, Queue

import umap.umap_ as umap
from numba import set_num_threads

import plotly.express as px

from sklearn.metrics import pairwise_distances


In [2]:
family = pd.read_csv("patstat_family.csv")
family

Unnamed: 0,inpadoc_family_id,appln_filing_year,patent_count,citation
0,1,2000,6,0
1,3,2000,4,0
2,4,2000,7,0
3,5,2000,4,0
4,6,2000,6,0
...,...,...,...,...
32951666,47445802,1993,1,0
32951667,47445803,1983,1,0
32951668,47445804,1991,1,0
32951669,47445805,1983,1,0


In [6]:
class_list = pd.read_csv("patstat_class_list.csv")
class_list = pd.merge(class_list, family[["inpadoc_family_id", "appln_filing_year"]])
class_list = class_list[(class_list.appln_filing_year > 1990) & (class_list.appln_filing_year <= 2000)]
class_list

Unnamed: 0,inpadoc_family_id,ipc_class,appln_filing_year
0,1,G06K,2000
1,1,H01R,2000
2,1,H04M,2000
3,1,H04Q,2000
4,1,H04W,2000
...,...,...,...
48150917,47445797,A01N,1991
48150919,47445799,C06B,1994
48150923,47445802,H02K,1993
48150926,47445804,D05B,1991


In [8]:
data = {"w1": [], "w2": [], "year": []}

# patent_subclass = class_list[["inpadoc_family_id", "ipc_class", "appln_filing_year"]]

for g in tqdm(class_list.groupby("inpadoc_family_id")):
    for c in combinations(list(set(g[1].ipc_class)), 2):
        data["w1"].append(min(c))
        data["w2"].append(max(c))
        data["year"].append(g[1].appln_filing_year.values[0])

pair_vintage = pd.DataFrame(data)
pair_vintage = pair_vintage.groupby(pair_vintage.columns.tolist(),as_index=False).size()
pair_vintage = pair_vintage.rename(columns={"size": "pair_count"})

pair_vintage.to_csv("patstat_pair_count.csv", index=False)

pair_vintage

  0%|          | 0/5477659 [00:01<?, ?it/s]

Unnamed: 0,w1,w2,year,pair_count
0,A01B,A01C,1991,143
1,A01B,A01C,1992,155
2,A01B,A01C,1993,190
3,A01B,A01C,1994,177
4,A01B,A01C,1995,202
...,...,...,...,...
385053,H05H,H05K,1996,7
385054,H05H,H05K,1997,9
385055,H05H,H05K,1998,3
385056,H05H,H05K,1999,4


In [10]:
year_vintage = [1996 + i for i in range(5)]
year_window = 5

target_pairs = pd.DataFrame([(min(c), max(c))for c in combinations(list(set(class_list.ipc_class)), 2)], columns=["w1", "w2"])
target_pairs

Unnamed: 0,w1,w2
0,A43C,C13F
1,B29C,C13F
2,B66D,C13F
3,B81B,C13F
4,C10C,C13F
...,...,...
204475,C01B,G01V
204476,C01B,G07G
204477,A01H,G01V
204478,A01H,G07G


In [41]:
year_vintage = [1996 + i for i in range(5)]
year_window = 5

target_pairs = pd.DataFrame([(min(c), max(c))for c in combinations(list(set(class_list.ipc_class)), 2)], columns=["w1", "w2"])
target_pairs

patent_pair = pd.DataFrame()

for year in tqdm(year_vintage):
    # Get sentences
    df = class_list[(class_list.appln_filing_year > year - year_window) & (class_list.appln_filing_year <= year)][["inpadoc_family_id", "ipc_class"]]

    sentences = []#df.groupby('appln_id').aggregate(lambda x: tuple(x))["subclass"].apply(lambda x: list(x)).values
    
    for g in df.groupby("inpadoc_family_id"):
        if len(g[1]) > 1:
            sentences.append(g[1].ipc_class.values.tolist())
            
    pair_data = {"w1": [], "w2": [], "similarity": []}
    
    for i in tqdm(range(10), desc="{}".format(year)):
        
        model = gensim.models.Word2Vec(min_count=1, vector_size=32, window=10, sg = 1, workers=1)
        model.build_vocab(sentences)

        for epoch in range(5):
            [random.shuffle(s) for s in sentences]
            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        
        target_pair_year = target_pairs[(target_pairs.w1.isin(model.wv.index_to_key)) & (target_pairs.w2.isin(model.wv.index_to_key))]

        for r in target_pair_year.itertuples():
            sim = model.wv.similarity(r.w1, r.w2)
            if not np.isnan(sim):
                pair_data["w1"].append(r.w1)
                pair_data["w2"].append(r.w2)
                pair_data["similarity"].append(sim)
                
    df_pair = pd.DataFrame(pair_data)
    df_pair_average = df_pair[["w1", "w2", "similarity"]].groupby(["w1", "w2"]).mean()
    df_pair_average = pd.merge(df_pair_average, df_pair[["w1", "w2", "similarity"]].groupby(["w1", "w2"]).std().rename(columns={"similarity": "similarity_std"}), left_index=True, right_index=True)
    df_pair_average = df_pair_average.reset_index()
    df_pair_average["year"] = year
    df_pair_average
    
    patent_pair = pd.concat([patent_pair, df_pair_average])
    patent_pair.to_csv("patstat_pair_similarity.csv", index=False)
    
patent_pair

  0%|          | 0/5 [00:00<?, ?it/s]

1996:   0%|          | 0/10 [00:00<?, ?it/s]

1997:   0%|          | 0/10 [00:00<?, ?it/s]

1998:   0%|          | 0/10 [00:00<?, ?it/s]

1999:   0%|          | 0/10 [00:00<?, ?it/s]

2000:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,w1,w2,similarity,similarity_std,year
0,A01B,A01C,0.817786,0.021898,1996
1,A01B,A01D,0.817335,0.022531,1996
2,A01B,A01F,0.512621,0.031572,1996
3,A01B,A01G,0.608446,0.047579,1996
4,A01B,A01H,0.527471,0.034622,1996
...,...,...,...,...,...
201290,H05F,H05H,0.390195,0.056334,2000
201291,H05F,H05K,0.478308,0.060720,2000
201292,H05G,H05H,0.590601,0.023147,2000
201293,H05G,H05K,0.337112,0.045256,2000
