# Extract Non-technical terms
Use a sample of 10k SBIR documents to extract non-technical terms. We basically do entity extraction from the 10k samples here. The SBIR document is in csv format and can be downlaoded from: https://www.sbir.gov/sbirsearch/award/all - We used "with abstract data" option to download. 

In [1]:
import pandas as pd
#import scispacy as scisp
from nltk.corpus import stopwords
from itertools import chain
import import_ipynb   #PS comment pip install import-ipynb 
import spacy as sp  #PS !pip install spacy
import nltk  # PS needs to be added
import spacy # PS needs to be added

In [2]:
# Import spacy helper functions from the helper notebook

#subdirectory = "/Users/prakhar/Documents/work/ucsd/dse203/SBIR_Patent_analysis"

# PS spacy_helper_methods notebook should be in same directory

import spacy_helper_methods as sph

importing Jupyter notebook from spacy_helper_methods.ipynb


In [3]:
sbir_df = pd.read_csv('../input_files/award_data.csv',usecols=['Abstract'])  # PS update the path location

In [4]:
# Cleanup
sbir_df = sbir_df.dropna()
len(sbir_df)

173500

In [5]:
# get a sample of 10k abstracts to create non-tech words 
sbir_df = sbir_df.sample(10000)

In [6]:
sbir_df.head()

Unnamed: 0,Abstract
71458,"Currently, the Military lacks the capability t..."
192680,THE CONSTRAINTS OF REAL-TIME APPLICATIONS CANN...
46593,The development of a low-cost permeable unidir...
11359,Nitricity is developing a technology that prod...
54781,Project Summary We aim to introduce to the hea...


In [7]:
# Remove stopwords using nltk. Initial parse seems to be faster than spacy and 
# doesn't seem to remove all stopwords compared to spacy hence after inital pass
# will still filter on spacy's stopwords in lemmatize method

nltk.download('stopwords')    # PS needs to be downloaded

stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prakhar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
%%time
sbir_df = sbir_df['Abstract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

CPU times: user 1.19 s, sys: 2.33 ms, total: 1.19 s
Wall time: 1.19 s


In [9]:
#!python -m spacy download en_core_web_lg

In [10]:
# load the scispacy large vocabulary - gives us lot more entities than spacy 
# note that sci spacy vocabulary was trained with spacy 3.6.1 hence may get warning

nlp = sp.load("en_core_web_lg")  #PS !python -m spacy download en_core_web_lg to be installed


In [11]:
%%time
# Load the scispacy large vocabulary. Lemmatization seems to be faster with this 
lemma_ds = sph.lemmatize(nlp, sbir_df)

CPU times: user 4min 24s, sys: 2.54 s, total: 4min 26s
Wall time: 4min 27s


In [12]:
%%time
ent_ds = sph.get_entities(nlp, lemma_ds)

CPU times: user 4min 4s, sys: 1.79 s, total: 4min 6s
Wall time: 4min 7s


In [13]:
# collapse all tuples into a flat list
tuple_list = list(set(list(chain.from_iterable(ent_ds.values))))
tuple_list = [tuple for tuple in tuple_list if tuple[1] not in ['CARDINAL','PERCENT','TIME','QUANTITY','MONEY']]
entities = list(set(list([i[0] for i in tuple_list])))

In [17]:
# remove any entities that are in the technical word list
# load technical terms
with open('../preprocessed_files/tech_terms.txt','r') as f:
    text = f.read()
techwords = text.split('\n')
ts = pd.Series(techwords).str.lower()
ts.head()

#convert entity list generated above from SBIR abstract to lower
nts = pd.Series(entities).str.lower()

#filter all technical words from nts
nts = nts[~nts.isin(ts)]

In [18]:
entities = [entity for entity in entities if entity.lower() in list(nts)]

In [19]:
len(entities), entities[:5], tuple_list[:5]

(29976,
 ['Power Energy , Inc.',
  'AL FEASIBILITY',
  'UEOS',
  'CalRAM Inc.',
  'Air Vehicles Directorate'],
 [('NanoSpray Combustion', 'ORG'),
  ('MAC', 'ORG'),
  ('Field Scanning Optical Microscopy ( NSOM', 'ORG'),
  ('commercializationpotential dod program', 'ORG'),
  ('Oregon Health Science Uni', 'ORG')])

In [20]:
with open('non_tech.txt','w') as nf:
    nf.writelines('\n'.join(map(str, entities)))