# Setup Environment

In [11]:
import pandas as pd
import os
import pickle
import json

# Load Data

In [2]:
def load_BART_Results(filename):
    """Read one pickled BART results file and return the object stored in it.

    filename: path of the pickle file; it is opened in binary read mode.

    Returns whatever object ``pickle.load`` reconstructs from the file.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files that come from a trusted source.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [3]:
def create_filename(fileidx):
    """Build the name of a results chunk file, e.g. 3 -> 'BART_results_03.pkl'.

    fileidx: chunk index; converted with ``str`` before use.

    Returns 'BART_results_<idx>.pkl', where a one-character index gets a
    leading "0" and any longer index is used unchanged.
    """
    idx_text = str(fileidx)
    # Only single-character indexes are padded; everything else passes through.
    padded = idx_text if len(idx_text) != 1 else "0" + idx_text
    return ''.join(['BART_results_', padded, '.pkl'])

In [4]:
# Gather every chunk file that exists for indexes 1..99 into one list, then
# cache the combined list on disk.
bart_results = []
skipped_files = []  # chunk files that exist but could not be read

for i in range(1, 100):
    filename = create_filename(i)
    try:
        temp_results = load_BART_Results(filename)
    except FileNotFoundError:
        # Not every index has a chunk on disk; a missing file is expected.
        continue
    except (pickle.UnpicklingError, EOFError, OSError) as err:
        # Stay best-effort, but never drop a damaged chunk silently.
        skipped_files.append(filename)
        print('Skipping unreadable results file', filename, repr(err))
        continue
    bart_results.extend(temp_results)

with open('BART_results_all.pkl', 'wb') as f:
    pickle.dump(bart_results, f)

In [27]:
# Scratch check: wrap `a` in a Series, pick entries 0 and 1 with a list
# indexer, and show them as a plain Python list.
# NOTE(review): `a` is not assigned in any cell above this one (it is only
# assigned in a later cell), so this line presumably fails on a fresh
# top-to-bottom run -- confirm and consider removing the cell.
pd.Series(a)[[0,1]].tolist()

[1, 2]

In [5]:
def keep_scores_above_threshold(bart_results, threshold=0.25):
    """Trim each BART result to its leading run of scores >= ``threshold``.

    Parameters
    ----------
    bart_results : iterable of dict
        Each dict must provide 'sequence', 'labels' and 'scores'. 'labels'
        and 'scores' are parallel sequences, and the scores are assumed to be
        sorted in descending order (the scan stops at the first score that
        falls below the threshold).
    threshold : float, default 0.25
        Minimum score (inclusive) for a label to be kept.

    Returns
    -------
    list of dict
        One dict per input result, in input order, with keys 'sequence',
        'labels' and 'scores'. 'labels' and 'scores' are lists holding the
        kept prefix, or both ``None`` when no leading score reaches the
        threshold.

    Raises
    ------
    ValueError
        If a result has fewer labels than kept scores.
    """
    def leading_scores_above_threshold(scores):
        # Collect scores until the first one that is not >= threshold.
        kept = []
        for score in scores:
            if score >= threshold:
                kept.append(score)
            else:
                break  # Scores are sorted, so nothing later can qualify
        return kept

    bart_results_t = []  # BART results above threshold
    for result in bart_results:
        scores_list = leading_scores_above_threshold(result['scores'])
        if scores_list:
            # The kept indexes are always 0..k-1, so a plain slice selects
            # the matching labels without building a pandas Series per record.
            labels = list(result['labels'][:len(scores_list)])
            if len(labels) != len(scores_list):
                raise ValueError(
                    'result has fewer labels than scores above threshold')
        else:
            labels, scores_list = None, None
        bart_results_t.append({'sequence': result['sequence'],
                               'labels': labels,
                               'scores': scores_list})
    return bart_results_t

In [6]:
# Apply the 0.25 score cut-off to every combined result and cache the
# thresholded list on disk.
bart_results_thresholded = keep_scores_above_threshold(bart_results, 0.25)
with open('BART_results_all_thresholded.pkl', 'wb') as thresholded_file:
    pickle.dump(bart_results_thresholded, thresholded_file)

In [7]:
# Fraction of missing values per column of the thresholded results
# (labels/scores are missing where no score reached the threshold).
df = pd.DataFrame(bart_results_thresholded)
df.isna().sum(axis=0).div(df.shape[0])

sequence    0.000000
labels      0.115869
scores      0.115869
dtype: float64

# Combine BART Results with Original Data

In [40]:
# Scratch check: two Series holding the same values compare equal
# element-wise, so reducing the comparison with .all() gives True.
a = pd.Series([1, 2, 3])
b = pd.Series([1, 2, 3])

a.eq(b).all()

True

In [8]:
# Load the processed patent table; header=0 takes the column names from the
# first row of the CSV.
orig_data = pd.read_csv('./PatentView_Data/2020_h2_patents_processed.csv', header=0)

def combine_results_orig_data(orig_data, results):
    """Attach the BART labels/scores to the patent rows they were computed from.

    The two inputs must be row-aligned: row i of ``orig_data`` holds the text
    that produced ``results[i]``. Alignment is verified and the tables are
    then combined by position, not by joining on the text, so duplicate
    patent texts cannot multiply rows.

    Parameters
    ----------
    orig_data : pandas.DataFrame
        Table with exactly three columns, which are renamed (on a copy) to
        'patent_number', 'patent_firstnamed_assignee_id' and 'patent_text'.
        The caller's frame is left untouched.
    results : list of dict
        Records accepted by ``pd.DataFrame``; each must contain a 'sequence'
        entry equal to the matching row's patent text.

    Returns
    -------
    pandas.DataFrame
        The three renamed columns followed by every column of ``results``
        except 'sequence', with a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If the inputs differ in length or the texts do not match row by row.
    """
    orig_col_names = ['patent_number',
                      'patent_firstnamed_assignee_id',
                      'patent_text']
    # Work on a copy so the caller's DataFrame keeps its own column names.
    orig_data = orig_data.copy()
    orig_data.columns = orig_col_names
    # Give both tables the same 0..n-1 index so they line up by position.
    orig_data = orig_data.reset_index(drop=True)
    results = pd.DataFrame(results).reset_index(drop=True)
    assert orig_data.shape[0] == len(results), 'orig_data and results not same length'
    assert (orig_data['patent_text'] == results['sequence']).all(), 'orig_data and results not aligned'
    # Rows are verified as aligned, so a positional concat is the exact
    # combination; a join on the text would duplicate rows for repeated texts.
    merged_data = pd.concat([orig_data, results.drop('sequence', axis=1)], axis=1)
    return merged_data

In [9]:
# Trial run on the first rows only, then preview the combined table.
first_N = 4000
merged_results = combine_results_orig_data(
    orig_data.head(first_N),
    bart_results_thresholded[:first_N],
)
merged_results.head()

Unnamed: 0,patent_number,patent_firstnamed_assignee_id,patent_text,labels,scores
0,10701852,fd9a9bee-58a0-43e6-8dc5-6bf1331edeb3,Automatic target recognition and management sy...,"[Specialized Design Services, Business Support...","[0.3397502899169922, 0.3167252838611603, 0.312..."
1,10701853,6db6e876-5207-4c9a-9421-d3ce7fe9992e,"Agricultural trench depth systems, methods, an...","[Support Activities for Crop Production, Indus...","[0.44477155804634094, 0.3382902145385742, 0.30..."
2,10701854,7eeec45c-d120-45e3-86d5-f4ebac9912a8,Agricultural implement with row unit leveling ...,"[Traveler Accommodation, Travel Arrangement an...","[0.8478323221206665, 0.7196636199951172, 0.712..."
3,10701855,f5866616-c4a1-4fad-81e6-0fb03a99c2c2,Folding implement with tractor assist A towed ...,"[Motor Vehicle Manufacturing, Greenhouse, Nurs...","[0.4398001730442047, 0.4355512261390686, 0.387..."
4,10701856,0f802b46-f7dd-48bc-a77e-6654f2e0e5f8,Agricultural implements for soil and vegetatio...,[Motor Vehicle Manufacturing],[0.29373806715011597]


In [10]:
# Full run: combine every patent row with its thresholded BART result,
# replacing the partial `merged_results` built from the first rows only.
merged_results = combine_results_orig_data(orig_data, bart_results_thresholded)

# Add Assignee Information to DataFrame

In [12]:
# Parse the patent JSON file into `patent_data`.
json_fn = './PatentView_Data/2020_h2_patents.json'
with open(json_fn, 'rb') as json_file:
    raw_json = json_file.read()
    patent_data = json.loads(raw_json)

In [19]:
# Peek at what is stored under key '100' of the 'assignees' mapping to see
# the shape of one entry before flattening it into a table.
patent_data['assignees']['100']

[{'assignee_city': 'McLean',
  'assignee_country': 'US',
  'assignee_county': 'Fairfax',
  'assignee_county_fips': '51059',
  'assignee_first_name': None,
  'assignee_first_seen_date': '1976-03-16',
  'assignee_id': '6952680f-0895-493e-9003-7e31a5931d10',
  'assignee_last_name': None,
  'assignee_latitude': '38.9343',
  'assignee_location_id': '38.9342888|-77.1776327',
  'assignee_longitude': '-77.1776',
  'assignee_organization': 'Mars, Incorporated',
  'assignee_sequence': '0',
  'assignee_state': 'VA',
  'assignee_state_fips': '51',
  'assignee_total_num_inventors': '1095',
  'assignee_total_num_patents': '1106',
  'assignee_type': '2',
  'assignee_key_id': '634'}]

In [20]:
# Fields to keep from each raw assignee record.
assignee_cols = ['assignee_id',
                 'assignee_organization',
                 'assignee_latitude',
                 'assignee_longitude',
                 'assignee_city',
                 'assignee_state',
                 'assignee_country',
                 'assignee_type'
                ]

# Flatten the per-key lists of assignee records into one list of records.
assignees = patent_data['assignees']
assignee_list = []
for assignee_idx, assignee_temp_list in assignees.items():
    assignee_list.extend(assignee_temp_list)

# One row per assignee_id (first occurrence wins); rows in which every kept
# field is missing are discarded.
assignee_df = (
    pd.DataFrame(assignee_list, columns=assignee_cols)
    .drop_duplicates(subset='assignee_id')
    .dropna(how='all')
)
assignee_df.head()

Unnamed: 0,assignee_id,assignee_organization,assignee_latitude,assignee_longitude,assignee_city,assignee_state,assignee_country,assignee_type
0,fd9a9bee-58a0-43e6-8dc5-6bf1331edeb3,The University of Sydney,-33.3843,148.01,Forbes,,AU,3
1,6db6e876-5207-4c9a-9421-d3ce7fe9992e,Precision Planting LLC,40.5275,-89.4926,Tremont,IL,US,2
2,7eeec45c-d120-45e3-86d5-f4ebac9912a8,CNH Industrial America LLC,40.1018,-76.0852,New Holland,PA,US,2
3,f5866616-c4a1-4fad-81e6-0fb03a99c2c2,"CNH Industrial Canada, Ltd.",52.1318,-106.661,Saskatoon,SK,CA,3
4,0f802b46-f7dd-48bc-a77e-6654f2e0e5f8,CLIMATE LLC,37.779,-122.42,San Francisco,CA,US,2


In [22]:
# Look up each patent's first-named assignee. A left join keeps every patent
# row even when no assignee record matches; the join key from the assignee
# side is then dropped because it duplicates patent_firstnamed_assignee_id.
merged_data_w_assignees = merged_results.merge(
    assignee_df,
    how='left',
    left_on='patent_firstnamed_assignee_id',
    right_on='assignee_id',
).drop('assignee_id', axis=1)
merged_data_w_assignees.head()

Unnamed: 0,patent_number,patent_firstnamed_assignee_id,patent_text,labels,scores,assignee_organization,assignee_latitude,assignee_longitude,assignee_city,assignee_state,assignee_country,assignee_type
0,10701852,fd9a9bee-58a0-43e6-8dc5-6bf1331edeb3,Automatic target recognition and management sy...,"[Specialized Design Services, Business Support...","[0.3397502899169922, 0.3167252838611603, 0.312...",The University of Sydney,-33.3843,148.01,Forbes,,AU,3
1,10701853,6db6e876-5207-4c9a-9421-d3ce7fe9992e,"Agricultural trench depth systems, methods, an...","[Support Activities for Crop Production, Indus...","[0.44477155804634094, 0.3382902145385742, 0.30...",Precision Planting LLC,40.5275,-89.4926,Tremont,IL,US,2
2,10701854,7eeec45c-d120-45e3-86d5-f4ebac9912a8,Agricultural implement with row unit leveling ...,"[Traveler Accommodation, Travel Arrangement an...","[0.8478323221206665, 0.7196636199951172, 0.712...",CNH Industrial America LLC,40.1018,-76.0852,New Holland,PA,US,2
3,10701855,f5866616-c4a1-4fad-81e6-0fb03a99c2c2,Folding implement with tractor assist A towed ...,"[Motor Vehicle Manufacturing, Greenhouse, Nurs...","[0.4398001730442047, 0.4355512261390686, 0.387...","CNH Industrial Canada, Ltd.",52.1318,-106.661,Saskatoon,SK,CA,3
4,10701856,0f802b46-f7dd-48bc-a77e-6654f2e0e5f8,Agricultural implements for soil and vegetatio...,[Motor Vehicle Manufacturing],[0.29373806715011597],CLIMATE LLC,37.779,-122.42,San Francisco,CA,US,2


In [23]:
# Persist the final table in two formats: a pickle for reloading into pandas
# and a CSV for use outside Python.
# NOTE(review): the list-valued labels/scores columns are presumably written
# to the CSV as their string representation -- confirm before relying on the
# CSV for those columns.
merged_data_w_assignees.to_pickle('BART_predictions_w_text_and_assignee.pkl')
merged_data_w_assignees.to_csv('BART_predictions_w_text_and_assignee.csv')