# Extract Non-technical terms
Use a sample of 10k SBIR documents to extract non-technical terms; in essence, we perform entity extraction on the 10k sampled abstracts. The SBIR data is in CSV format and can be downloaded from: https://www.sbir.gov/sbirsearch/award/all - we used the "with abstract data" option to download it.

In [1]:
#If these modules or packages are not installed, then uncomment the following lines and install them

#!pip install import-ipynb 
#!pip install spacy
#alternative installation method may work if the above version does not
#!pip install -U spacy

#The en_core_web_lg model also needs to be installed:
#!python -m spacy download en_core_web_lg
#If the above installation method does not work, the following command might work
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0.tar.gz

In [3]:
import pandas as pd
from nltk.corpus import stopwords
from itertools import chain
import import_ipynb
import spacy as sp
import nltk
import spacy
import requests
import io

In [4]:
# Import the spacy_helper_methods notebook (it must be in the same directory as this notebook)
import spacy_helper_methods as sph

importing Jupyter notebook from spacy_helper_methods.ipynb


## Load, Preprocess, cleanup and sample

In [5]:
%%time
# Read the SBIR award data directly from the web URL, keeping only the
# 'Abstract' column.
url = "https://data.www.sbir.gov/awarddatapublic/award_data.csv"
# The (connect, read) timeout stops the cell from hanging forever on a stalled
# connection; raise_for_status() surfaces HTTP errors here instead of letting
# pandas try to parse an error page as CSV.
response = requests.get(url, timeout=(10, 120))
response.raise_for_status()
s = response.content
sbir_df = pd.read_csv(io.StringIO(s.decode('utf-8')), usecols=['Abstract'])

CPU times: user 2.02 s, sys: 1.07 s, total: 3.09 s
Wall time: 8.92 s


In [6]:
# Cleanup: drop rows with missing values. Only the 'Abstract' column is loaded,
# so this removes the awards that have no abstract text.
sbir_df = sbir_df.dropna()
# Number of abstracts remaining after the cleanup
len(sbir_df)

173500

In [7]:
# Get a sample of (up to) 10k abstracts from which to build the non-tech words.
# min() keeps the cell from raising ValueError when fewer than 10k rows are
# available, and a fixed random_state makes the sample (and every artifact
# derived from it) reproducible across runs.
sbir_df = sbir_df.sample(n=min(10000, len(sbir_df)), random_state=42)

In [8]:
# Preview the first few sampled abstracts
sbir_df.head()

Unnamed: 0,Abstract
72746,DESCRIPTION (provided by applicant): Methadone...
58821,This proposed study intends to develop a model...
94023,The current and next generation military jet a...
52540,This Small Business Innovation Research (SBIR)...
66431,We aim to solve a technical problem that is hi...


In [9]:
# Remove stopwords using nltk. This initial pass seems to be faster than spaCy,
# but it doesn't seem to remove all the stopwords that spaCy does; hence, after
# this initial pass, we still filter on spaCy's stopwords in the lemmatize method.

# Download the stopwords (the download is skipped if they are already present)
nltk.download('stopwords')

stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/laben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
%%time
# Strip NLTK stopwords from every abstract. Tokens come from a plain
# whitespace split and the comparison is case-sensitive, so only exact
# matches are removed. The stopwords are put in a frozenset once so each
# membership test is O(1) instead of a scan of the whole stopword list.
# Note that `sbir_df` is rebound to the resulting Series of cleaned abstracts.
stop_words = frozenset(stop)
sbir_df = sbir_df['Abstract'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

CPU times: user 1.61 s, sys: 5.27 ms, total: 1.61 s
Wall time: 1.61 s


## Extract entities

In [11]:
# Load the large English pipeline "en_core_web_lg".
# NOTE(review): this comment previously described the model as the scispaCy
# large vocabulary (trained with spaCy 3.6.1, hence a possible version warning),
# but the name loaded below is the spaCy model - confirm which one is intended.

nlp = sp.load("en_core_web_lg")


In [12]:
%%time
# Lemmatize the stopword-filtered abstracts with the helper notebook's
# lemmatize(), passing in the already-loaded `nlp` pipeline (nothing is loaded
# in this cell). The original note here said lemmatization seems to be faster
# with this model.
lemma_ds = sph.lemmatize(nlp, sbir_df)

CPU times: user 4min 21s, sys: 431 ms, total: 4min 21s
Wall time: 4min 21s


In [13]:
%%time
# Extract entities from the lemmatized text using the helper notebook.
# Presumably yields, per document, a list of (entity text, entity label)
# tuples - verify against spacy_helper_methods.
ent_ds = sph.get_entities(nlp, lemma_ds)

CPU times: user 4min 22s, sys: 24.6 ms, total: 4min 22s
Wall time: 4min 22s


## Filter entities 
1. Remove entities such as time, money, and quantity
2. Remove any entities that match technical terms

In [14]:
# Collapse the per-document entity lists into one flat, de-duplicated list of
# (entity text, entity label) tuples.
tuple_list = list(set(chain.from_iterable(ent_ds.values)))
# Drop numeric / measurement entity types. The loop variable is no longer named
# `tuple`, which shadowed the builtin.
excluded_labels = {'CARDINAL', 'PERCENT', 'TIME', 'QUANTITY', 'MONEY'}
tuple_list = [ent for ent in tuple_list if ent[1] not in excluded_labels]
# Unique entity texts, regardless of the label they were tagged with.
entities = list({ent[0] for ent in tuple_list})

In [15]:
# Remove any entities that are in the technical word list.
# Load the technical terms (one term per line). The encoding is given
# explicitly so the result does not depend on the platform's default encoding.
with open('../preprocessed_files/tech_terms.txt', 'r', encoding='utf-8') as f:
    text = f.read()
techwords = text.split('\n')
ts = pd.Series(techwords).str.lower()

# Convert the entity list generated above from the SBIR abstracts to lower case
nts = pd.Series(entities).str.lower()

# Filter all technical words out of nts
nts = nts[~nts.isin(ts)]

In [16]:
# Keep only the entities whose lower-cased form survived the technical-term
# filter. The lookup set is built once; converting `nts` to a list inside the
# comprehension re-materialised and linearly scanned it for every entity.
non_tech_lower = set(nts)
entities = [entity for entity in entities if entity.lower() in non_tech_lower]

In [17]:
# Sanity check: number of remaining entities, plus a peek at both lists
len(entities), entities[:5], tuple_list[:5]

(29894,
 ['Ron Hatch', 'Nu - Trek', 'Innoveering', 'schlieren', 'ECL'],
 [('coatingswill', 'ORG'),
  ('Electron - Ion Colliders', 'ORG'),
  ('USMC', 'ORG'),
  ('TYPICAL', 'ORG'),
  ('Egret', 'ORG')])

In [18]:
# Write the non-technical entities to disk, one per line. The encoding is
# explicit because entity text taken from abstracts may contain non-ASCII
# characters; write() is used because the payload is a single joined string
# (writelines() on a string writes it one character at a time).
with open('../preprocessed_files/non_tech.txt', 'w', encoding='utf-8') as nf:
    nf.write('\n'.join(map(str, entities)))