# Similarity 
This notebook uses OpenAIs LlamaIndex to find similarity between patent and SBIR abstracts

In [None]:
#!pip install llama-index

In [3]:
#import llama_index
import pandas as pd
import os
from llama_index.evaluation import SemanticSimilarityEvaluator
import import_ipynb
import spacy as sp
import json
import requests
import re
import time
from itertools import product
import io

## Load data

In [4]:
%%time
# read SBIR award data directly from web URL
url="https://data.www.sbir.gov/awarddatapublic/award_data.csv"
s=requests.get(url).content
sbir_df=pd.read_csv(io.StringIO(s.decode('utf-8')))




CPU times: user 10.1 s, sys: 7.15 s, total: 17.3 s
Wall time: 31 s


In [6]:
#sbir_df = sbir_df.sample(1000)
sbir_df = sbir_df[~sbir_df['Abstract'].isna()]
sbir_df['Abstract'] = sbir_df['Abstract'].astype(str)

In [7]:
# get patent data
patent_df = pd.read_json('preprocessed_files/patents.json')
patent_df = patent_df[~patent_df['abstract'].isna()]
patent_df['abstract'] = patent_df['abstract'].apply(lambda x:\
                                                    re.sub(r'\n','',x))
patent_df = patent_df.set_index('doc-number')
patent_df['abstract'] = patent_df['abstract'].astype(str)

## Setup Llamaindex similarity evaulator

In [8]:
os.environ["OPENAI_API_KEY"] = YOUR OPEN AI KEY"

In [9]:
# Will use defaults. Default similarity threshold is 0.8
evaluator = SemanticSimilarityEvaluator()

In [10]:
# create a tuple of indicies that can be walked through
sbir_patent_tuples = list(set(list(product(sbir_df.index,patent_df.index))))
matches = [] # initialize list that contains list of sbir and patent matches 

In [11]:
len(sbir_patent_tuples), sbir_patent_tuples[:10]

(755061,
 [(20148, 20230225709),
  (98267, 20230225591),
  (140175, 20230225416),
  (78831, 20230225407),
  (193550, 20230225333),
  (48154, 20230225609),
  (117183, 20230225924),
  (187914, 20230225494),
  (123587, 20230225342),
  (130402, 20230225792)])

In [1]:
# Following code is stopped after excuting 24+ hours (133k tuples processed) hence will see errors

In [15]:
for tple in sbir_patent_tuples:
    ref = sbir_df['Abstract'].loc[tple[0]]
    res = patent_df['abstract'].loc[tple[1]]
    if not ref or not res:
        continue
        
    result = await evaluator.aevaluate(
                    response=res,
                    reference=ref
                 )
    if result.passing:
        print(tple[0],tple[1],result.score,result.passing)
        matches.append({'sbir_id':tple[0],'patent_id':tple[1],'score':result.score})
    time.sleep(.2)

66946 20230225609 0.8035576541916238 True
177945 20230225795 0.8072123176485497 True
54529 20230225600 0.8158844168103518 True
69226 20230225715 0.8082539626462197 True
160906 20230225609 0.8116168937755376 True
58313 20230225570 0.8187026644495451 True
123552 20230225644 0.8004786595568498 True
585 20230225653 0.8048579154474041 True
54529 20230225665 0.805937601090279 True
197666 20230225925 0.8061148332130356 True
87615 20230225610 0.8019542708408133 True
160785 20230225672 0.8076143016618101 True
117983 20230225328 0.8053217445187189 True
87615 20230225748 0.8229230086663284 True
127462 20230225636 0.8151068190836757 True
11762 20230225702 0.8020294070353361 True
121066 20230225331 0.8088271256418308 True
8108 20230225617 0.8209308134839316 True
5171 20230225873 0.8317202241336064 True
175362 20230225663 0.80231691239958 True
158100 20230225563 0.8064347595453933 True
126757 20230226119 0.8035884539902602 True
108075 20230225645 0.810159417154737 True
120745 20230225566 0.806093381

CancelledError: 

In [17]:
# to find number of tuples processed, check the index of the last tuple processed
sbir_patent_tuples.index((77053, 20230225294))

131306

In [18]:
len(sbir_patent_tuples)

755061

In [21]:
len(matches)

797

In [22]:
matches_df = pd.DataFrame(matches)

In [24]:
matches_df.to_csv('../preprocessed_files/llama_similarity.csv')

## Add winning companies to the csv manually

In [8]:
# Manually add list of companies found via Winning companies notebook to add to SBIR and patent data
winning_list = ['Beirobotics LLC','Ultra Safe Nuclear Corporation',
'Andluca Technologies Inc.',
'FURCIFER INC.',
'Kurt J. Lesker Company',
'Nanosys'
               ]

In [34]:
winning_df = sbir_df[sbir_df['Company'].isin(winning_list)]
winning_df

Unnamed: 0,Company,Award Title,Agency,Branch,Phase,Program,Agency Tracking Number,Contract,Proposal Award Date,Contract End Date,...,Contact Title,Contact Phone,Contact Email,PI Name,PI Title,PI Phone,PI Email,RI Name,RI POC Name,RI POC Phone
1482,Beirobotics LLC,SBIR Phase I:Unmanned Aerial Payload Systems f...,National Science Foundation,,Phase I,SBIR,2136680,2136680,09/15/2022,08/31/2023,...,,(804) 898-8134,michael.beiro@linebird.net,Michael Beiro,,(804) 898-8134,michael.beiro@linebird.net,,,
6726,Ultra Safe Nuclear Corporation,"Affordable, Reliable, High-Performance Ceramic...",Department of Energy,,Phase I,SBIR,0000266673,DE-SC0022735,06/27/2022,06/26/2023,...,,(346) 262-6222,g.gustavson@usnc.com,Charles Lewinsohn,,(385) 393-0660,c.lewinsohn@usnc.com,,,
7726,Andluca Technologies Inc.,Self Powered Smart Glass For Windows,Department of Defense,Air Force,Phase I,SBIR,FX211-CSO1-0562,FA8649-21-P-1247,04/13/2021,07/19/2021,...,,(609) 779-2828,nick@andluca.com,Nicholas Davy,,(609) 779-2828,nick@andluca.com,,,
7727,Andluca Technologies Inc.,"SBIR Phase II: Development of a transparent, n...",National Science Foundation,,Phase II,SBIR,2112279,2112279,08/15/2021,07/31/2023,...,,(832) 859-0382,nickcdavy@gmail.com,Nicholas Davy,,(832) 859-0382,nickcdavy@gmail.com,,,
9317,FURCIFER INC.,Dynamic window films for improved operational ...,Department of Defense,Air Force,Phase I,SBIR,FX211-CSO1-0507,FA8649-21-P-1142,04/14/2021,07/19/2021,...,,(415) 867-9065,ACUSG.Furcifer@gmail.com,Jian Wang,,(510) 516-6909,jianwang@furciferinc.com,,,
21889,Andluca Technologies Inc.,"STTR Phase I: Development of a Transparent, Ne...",National Science Foundation,,Phase I,STTR,1843743,1843743,02/01/2019,07/31/2019,...,,(832) 859-0382,nickcdavy@gmail.com,Nicholas Davy,,(832) 859-0382,nickcdavy@gmail.com,Princeton University,Yueh-Lin Loo,
27910,Ultra Safe Nuclear Corporation,Novel Technologies for Efficient NTP Reactor D...,National Aeronautics and Space Administration,,Phase II,SBIR,188423,80NSSC19C0202,08/14/2019,08/13/2021,...,,(228) 813-6209,james.s.hibbs@nasa.gov,Michael Eades,,(740) 262-2804,m.eades@usnc.com,,,
33678,Ultra Safe Nuclear Corporation,Novel Technologies for Efficient NTP Reactor D...,National Aeronautics and Space Administration,,Phase I,SBIR,188423,80NSSC18P2173,07/27/2018,02/15/2019,...,,(740) 262-2804,m.eades@usnc.com,Michael Eades,,(740) 262-2804,m.eades@usnc.com,,,
39675,Ultra Safe Nuclear Corporation,Accident Tolerant Reactor Shutdown for NTP Sys...,National Aeronautics and Space Administration,,Phase II,SBIR,155532,NNX17CS02C,04/10/2017,04/09/2019,...,Business Official,(858) 342-4837,p.venneri@usnc.com,Paolo Venneri,Principal Investigator,(858) 342-4837,p.venneri@usnc.com,,,
45077,Ultra Safe Nuclear Corporation,Passive Technology to Improve Criticality Cont...,National Aeronautics and Space Administration,,Phase II,SBIR,154873,NNX16CM01C,05/04/2016,05/03/2018,...,COO,(858) 750-8999,cjhamilton@ultrasafe-nuclear.com,Paolo Venneri,Principal Investigator,(858) 342-4837,pvenneri@ultrasafe-nuclear.com,,,


In [33]:
sbir_sim_df = pd.read_csv('./preprocessed_files/sbir_df_1k_similarity.csv')

In [35]:
sbir_sim_df = sbir_sim_df.set_index('id')

In [39]:
winning_df = winning_df.rename_axis(index='id')

In [41]:
final_df = pd.concat([sbir_sim_df, winning_df])

In [44]:
final_df['Company'].value_counts()

Company
Nanosys                           17
Ultra Safe Nuclear Corporation     7
CREARE LLC                         7
CFD RESEARCH CORPORATION           7
CHARLES RIVER ANALYTICS, INC.      6
                                  ..
Panorama Research Incorporated     1
Hx Technologies, Inc.              1
SNORKEL AI INC                     1
Detectnology, Inc.                 1
Focused Research, Inc.             1
Name: count, Length: 722, dtype: int64

In [45]:
final_df.to_csv('../preprocessed_files/sbir_1k_sample.csv')

## add abstract entities column

In [47]:
import joblib
model = joblib.load('../model/trained_tech_classifier_model.joblib')
nlp = sp.load('en_core_sci_lg')

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [50]:
import import_ipynb
import spacy_helper_methods as sph

importing Jupyter notebook from spacy_helper_methods.ipynb


In [52]:
final_df = final_df[~final_df['Abstract'].isna()]

In [53]:
%%time
final_df['abstract_entities'] = sph.extract_tech_entities(nlp, model, final_df['Abstract'])

CPU times: user 2min 40s, sys: 9.99 s, total: 2min 50s
Wall time: 3min 7s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [54]:
final_df.to_csv('../preprocessed_files/sbir_1k_sample.csv')