# Extract Non-technical terms
Use a sample of 10k SBIR documents to extract non-technical terms. We basically do entity extraction from the 10k samples here. The SBIR document is in csv format and can be downlaoded from: https://www.sbir.gov/sbirsearch/award/all - We used "with abstract data" option to download. 

In [1]:
import pandas as pd
#import scispacy as scisp
from nltk.corpus import stopwords
from itertools import chain
import import_ipynb
import spacy as sp

In [2]:
# Import spacy helper functions from the helper notebook
import spacy_helper_methods as sph

importing Jupyter notebook from spacy_helper_methods.ipynb


In [3]:
sbir_df = pd.read_csv('award_data.csv',usecols=['Abstract'])

In [4]:
# Cleanup
sbir_df = sbir_df.dropna()
len(sbir_df)

173501

In [5]:
# get a sample of 10k abstracts to create non-tech words 
sbir_df = sbir_df.sample(10000)

In [6]:
sbir_df.head()

Unnamed: 0,Abstract
16366,Freedom Photonics proposes to develop a pressu...
50432,The broader impact/commercial potential of thi...
100717,The objective of this proposal is to improve t...
12093,Global economic impact of pathogenic plant vir...
113174,The versatility of military fixed and rotary w...


In [7]:
# Remove stopwords using nltk. Initial parse seems to be faster than spacy and 
# doesn't seem to remove all stopwords compared to spacy hence after inital pass
# will still filter on spacy's stopwords in lemmatize method
stop = stopwords.words('english')

In [8]:
%%time
sbir_df = sbir_df['Abstract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

CPU times: user 6.1 s, sys: 76.6 ms, total: 6.18 s
Wall time: 6.52 s


In [None]:
#!python -m spacy download en_core_web_lg

In [9]:
# load the scispacy large vocabulary - gives us lot more entities than spacy 
# note that sci spacy vocabulary was trained with spacy 3.6.1 hence may get warning
nlp = sp.load("en_core_web_lg")

In [16]:
%%time
# Load the scispacy large vocabulary. Lemmatization seems to be faster with this 
lemma_ds = sph.lemmatize(nlp, sbir_df)

CPU times: user 7min 8s, sys: 46.1 s, total: 7min 54s
Wall time: 9min 54s


In [17]:
%%time
ent_ds = sph.get_entities(nlp, lemma_ds)

CPU times: user 6min, sys: 36.7 s, total: 6min 36s
Wall time: 7min 38s


In [42]:
# collapse all tuples into a flat list
tuple_list = list(set(list(chain.from_iterable(ent_ds.values))))
tuple_list = [tuple for tuple in tuple_list if tuple[1] not in ['CARDINAL','PERCENT','TIME','QUANTITY','MONEY']]
entities = list(set(list([i[0] for i in tuple_list])))

In [43]:
# remove any entities that are in the technical word list
# load technical terms
with open('tech.txt','r') as f:
    text = f.read()
techwords = text.split('\n')
ts = pd.Series(techwords).str.lower()
ts.head()

#convert entity list generated above from SBIR abstract to lower
nts = pd.Series(entities).str.lower()

#filter all technical words from nts
nts = nts[~nts.isin(ts)]

In [44]:
entities = [entity for entity in entities if entity.lower() in list(nts)]

In [48]:
len(entities), entities[:5], tuple_list[:5]

(30377,
 ['allowteacher',
  'Eltron',
  'MAJOR MEDICAL SUPPLY COMPANY',
  'MORBIDITY MORTALITY',
  'Ocean Observing'],
 [('TUMORS 3', 'ORG'),
  ('tertiary', 'ORDINAL'),
  ('DETECTORS', 'PRODUCT'),
  ('ISAV', 'ORG'),
  ('ac / dc load bank', 'ORG')])

In [49]:
with open('non_tech.txt','w') as nf:
    nf.writelines('\n'.join(map(str, entities)))