## Generate simularity scoring for technical entities in the SBIR and Patent data sets



In [7]:
#!pip install pyjedai -U
#!pip show pyjedai

Name: pyjedai
Version: 0.1.3
Summary: An open-source library that builds powerful end-to-end Entity Resolution workflows.
Home-page: 
Author: 
Author-email: Konstantinos Nikoletos <nikoletos.kon@gmail.com>, George Papadakis <gpapadis84@gmail.com>, Jakub Maciejewski <jacobb.maciejewski@gmail.com>, Manolis Koubarakis <koubarak@di.uoa.gr>
License: Apache Software License 2.0
Location: /home/laben/anaconda3/lib/python3.9/site-packages
Requires: faiss-cpu, gensim, matplotlib, matplotlib-inline, networkx, nltk, numpy, optuna, ordered-set, pandas, pandas-profiling, pandocfilters, plotly, py-stringmatching, PyYAML, rdflib, rdfpandas, regex, scipy, seaborn, sentence-transformers, strsim, strsimpy, tomli, tqdm, transformers, valentine
Required-by: 


In [18]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph

from pyjedai.utils import print_clusters, print_blocks, print_candidate_pairs
from pyjedai.evaluation import Evaluation

from pyjedai.datamodel import Data
from pyjedai.joins import EJoin, TopKJoin
from pyjedai.clustering import ConnectedComponentsClustering

In [5]:
sbir_df = pd.read_csv('../preprocessed_files/sbir_entities.csv')

In [6]:
sbir_df.head()

Unnamed: 0.1,Unnamed: 0,Company,Award Title,Agency,Branch,Phase,Program,Agency Tracking Number,Contract,Proposal Award Date,...,Contact Phone,Contact Email,PI Name,PI Title,PI Phone,PI Email,RI Name,RI POC Name,RI POC Phone,abstract_entities
0,145112,NEUROBEHAVIORAL SYSTEMS INC,,Department of Health and Human Services,National Institutes of Health,Phase I,SBIR,1R43NS040623-01,,,...,(510) 653-3461,DLWOODS@NEUROBEHAVIORALSYSTEMS.COM,PETER PEBLER,,() -,,,,,"['transponder', 'multiple frequency band', 'mi..."
1,106926,Applied EM Inc.,An Integrated Antenna Set for Software Radios,Department of Defense,Navy,Phase II,SBIR,N032-0588,N68335-05-C-0421,09/30/2005,...,(757) 224-2035,cjreddy@appliedem.com,C. Reddy,President & Chief Technic,(757) 224-2035,cjreddy@appliedem.com,,,,"['Tactical radio system', 'wideband antenna se..."
2,126831,"SIGMA SYSTEMS RESEARCH, INC.",Integrated Data Fusion and Decision Support fo...,Department of Defense,Missile Defense Agency,Phase II,SBIR,01-0494,F3361502C4028,,...,(703) 582-0638,sigma@sigma-sys.com,Alan Hadjarian,Senior Scientist,(703) 385-5677,ahadjarian@sigma-sys.com,,,,"['datum fusion', 'decision support', 'decision..."
3,25069,MIKEL INC,Non-collinear Wave-front Curvature Range Measu...,Department of Defense,Navy,Phase II,SBIR,N021-0353,N68335-19-C-0091,11/21/2018,...,(508) 523-6832,craig.cameron@mikelinc.com,"TOM NORTHARDT, Ph.D. TOM NORTHARDT, Ph.D.",PROJECT ENGINEER / PRINCIPAL INVESTIGATOR,(401) 846-1462,tom.northardt@mikelinc.com,,,,"['goal phase', 'information', 'submarine', 'ac..."
4,83500,"MOSAIC ATM, INC.","Autonomous Command, Control and Communication ...",Department of Defense,Air Force,Phase II,SBIR,F073-089-1634,FA8650-09-C-3910,03/19/2009,...,(800) 405-8576,stevenson@mosaicatm.com,Chris Brinton,President and Principal A,(703) 737-7637,brinton@mosaicatm.com,,,,"['current', 'paradigm management', 'traffic', ..."


In [16]:
attr = ['abstract_entities']
data = Data(
    dataset_1=sbir_df,
    id_column_name_1='Unnamed: 0',
    #ground_truth=gt,
    attributes_1=attr
)

In [19]:
join = EJoin(similarity_threshold = 0.5,
             metric = 'jaccard',
             tokenization = 'qgrams_multiset',
             qgrams = 2)

g = join.fit(data)

EJoin (jaccard):   0%|          | 0/871 [00:00<?, ?it/s]

In [28]:
ec = ConnectedComponentsClustering()
clusters = ec.process(g, data, similarity_threshold=0.5)

In [29]:
print(clusters)

[{0}, {1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 35, 36, 37, 38, 39, 41, 42, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 83, 84, 85, 86, 89, 90, 91, 92, 93, 95, 96, 98, 99, 100, 101, 102, 103, 106, 107, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 135, 138, 141, 143, 146, 147, 148, 150, 152, 153, 154, 155, 156, 157, 158, 160, 161, 163, 164, 165, 166, 167, 169, 171, 172, 173, 174, 175, 176, 177, 179, 180, 181, 182, 184, 185, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 203, 204, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 220, 221, 222, 223, 224, 225, 227, 228, 229, 230, 231, 232, 234, 235, 237, 238, 239, 240, 241, 242, 243, 244, 246, 247, 248, 249, 250, 251, 253, 254, 255, 256, 257, 258, 259, 260, 262, 263, 266, 267, 270, 271, 272, 273, 274, 277, 278, 280, 281, 283, 

In [23]:
#assign clusters to the data frame
#sbir_df.insert(1,'ClusterID', -1, True)

for i in range(0,len(clusters)):
    sbir_df.at[tuple(clusters[i]),'ClusterID'] = i

ValueError: Invalid call for scalar access (setting)!