In [48]:
import numpy as np 
import pandas as pd 
import time 
import datetime 
from tqdm import tqdm 
import re 
import math 
import os 
import json 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
import nltk

# Read Data

In [87]:
positives = os.listdir('./positive_samples/')
cnt = 0
positive_data = [] 
for i in tqdm(range(len(positives)), position=0, leave=True): 
    filename = './positive_samples/' + positives[i] 
    try:  
        with open(filename) as f: 
            data = json.load(f)
            positive_data.append(data)
    except Exception as e: 
        cnt += 1

print("could not process {} / {} positive sample files".format(cnt, len(positives))) 

100%|██████████| 260534/260534 [00:27<00:00, 9317.30it/s] 

could not process 57 / 260534 positive sample files





In [77]:
negatives = os.listdir('./negative_samples/') 
cnt = 0 
negative_data = [] 
for i in tqdm(range(len(negatives)), position=0, leave=True): 
    filename = './negative_samples/' + negatives[i] 
    try: 
        with open(filename) as f: 
            data = json.load(f) 
            negative_data.append(data)
    except Exception as e: 
        cnt += 1 

print("could not process {} / {} negative sample files".format(cnt, len(negatives))) 

100%|██████████| 260411/260411 [00:35<00:00, 7367.45it/s]

could not process 0 / 260411 negative sample files





In [82]:
'''
negative_ids map contains the index in the negative_data array that matches the id of the positive sample 
'''

negative_ids = {} 

print("Creating ID-INDEX dictionary ...")

for i in tqdm(range(len(negative_data)), position=0, leave=True): 
    cur_id = negative_data[i]['patent1id']
    negative_ids[cur_id] = [] 

print("Filling up ID-INDEX dictionary ...")
for i in tqdm(range(len(negative_data)), position=0, leave=True): 
    cur_id = negative_data[i]['patent1id'] 
    negative_ids[cur_id].append(i) 
    

Creating ID-INDEX dictionary ...


100%|██████████| 260411/260411 [00:00<00:00, 952586.98it/s] 


Filling up ID-INDEX dictionary ...


100%|██████████| 260411/260411 [00:00<00:00, 969025.49it/s]


In [86]:
occurrence = [] 

for k,v in zip(negative_ids.keys(), negative_ids.values()): 
    occurrence.append(len(v)) 
    
print("max occurrence = {}".format(np.max(occurrence)))

max occurrence = 15


# Collect Triplets

In [95]:
def preprocess(s): 
    s = ' '.join(s)
    cleaned = re.sub('[^A-Za-z]+', ' ', s) 
    return cleaned

In [99]:
q_list, p_list, n_list = [], [], [] # queries, positives, negatives  

for i in tqdm(range(len(positive_data)), position=0, leave=True): 
    cur_id = positive_data[i]['patent1id'] 
    if cur_id not in negative_ids.keys(): 
        continue 
    matching_idxs = negative_ids[cur_id]  
    for j in matching_idxs: 
        q_list.append(preprocess(positive_data[i]['patent1claims']))  
        p_list.append(preprocess(positive_data[i]['patent2claims']))  
        n_list.append(preprocess(negative_data[j]['patent2claims']))  

100%|██████████| 260477/260477 [10:05<00:00, 430.47it/s]


In [109]:
outputStr = "queries" + "\t" + "positive" + "\t" + "negative" + "\n" 

for i in tqdm(range(len(q_list)), position=0, leave=True): 
    outputStr += str(q_list[i]) + "\t" + str(p_list[i]) + "\t" + str(n_list[i]) + "\n" 


100%|██████████| 390584/390584 [03:17<00:00, 1974.74it/s] 


In [110]:
print("Creating .tsv file ...")
with open('train.tsv', 'w') as outputter: 
    outputter.write(outputStr)

Creating .tsv file ...


In [112]:
train = pd.read_csv("train.tsv", sep='\t') 

train.head(10)

Unnamed: 0,queries,positive,negative
0,A contact data sharing method comprising runn...,A method comprising maintaining a passive per...,A method for wireless collaboration the metho...
1,A video recording apparatus for a vehicle com...,A monitoring device that outputs a movement o...,A modular self supporting mounting system for...
2,An implantable medical device comprising at l...,A composition comprising a plurality of heter...,A digital IF demodulator including an analog ...
3,A multi mode surgical tool comprising an elec...,A self regulating heating device having a fer...,An apparatus for restraining the movement of ...
4,A multi mode surgical tool comprising an elec...,A self regulating heating device having a fer...,A system for enriching a document comprising ...
5,A semiconductor structure with a high voltage...,A semiconductor device comprising a substrate...,A flat panel display device comprising a film...
6,Deployment system for an implantable medical ...,A deployment system for a self expanding sten...,A method implemented in a wireless transmit r...
7,Deployment system for an implantable medical ...,A deployment system for a self expanding sten...,A method of distributing light in more than o...
8,A method comprising obtaining motive outputs ...,A method for powering a powered system having...,A driving method of a semiconductor device th...
9,A near eye device comprising a spatial light ...,Holographic display device comprising a first...,A sealed compressor assembly comprising a com...


# Train/Val/Test Split

In [116]:
train_df, test_df = train_test_split(train, test_size = 0.2, random_state = 777) 

In [118]:
val_df, test_df = train_test_split(test_df, test_size = 0.5, random_state = 888)

In [123]:
assert train_df.shape[0] + val_df.shape[0] + test_df.shape[0] == train.shape[0]

train_df.shape, val_df.shape, test_df.shape 

((312467, 3), (39058, 3), (39059, 3))

In [126]:
# drop NaN rows 
train_df = train_df.dropna() 
val_df = val_df.dropna() 
test_df = test_df.dropna() 


train_df.shape, val_df.shape, test_df.shape

((311459, 3), (38940, 3), (38943, 3))

In [137]:
train_df.to_csv("train_df.tsv", sep="\t", index=False) 

In [138]:
val_df.to_csv("val_df.tsv", sep="\t", index=False) 

In [139]:
test_df.to_csv("test_df.tsv", sep="\t", index=False) 