# Extract Non-technical terms
Use a sample of 10k SBIR documents to extract non-technical terms; in essence, we run entity extraction over the 10k sampled abstracts. The SBIR award data is in CSV format and can be downloaded from: https://www.sbir.gov/sbirsearch/award/all - we used the "with abstract data" option to download.

In [1]:
#!pip install import-ipynb 
#!pip install spacy

In [2]:
import pandas as pd
from nltk.corpus import stopwords
from itertools import chain
import import_ipynb
import spacy as sp
import nltk
import spacy
import requests
import io

In [3]:
# Import spacy helper functions from the helper notebook

#subdirectory = "/Users/prakhar/Documents/work/ucsd/dse203/SBIR_Patent_analysis"

# PS spacy_helper_methods notebook should be in same directory

import spacy_helper_methods as sph

importing Jupyter notebook from spacy_helper_methods.ipynb


## Load, Preprocess, cleanup and sample

In [4]:
%%time
# Read the SBIR award data directly from the public download URL. Only the
# 'Abstract' column is kept because it is the only column used afterwards.
url="https://data.www.sbir.gov/awarddatapublic/award_data.csv"
# The timeout bounds connection/read stalls so the cell cannot hang forever.
response = requests.get(url, timeout=120)
# Fail fast on an HTTP error instead of handing an error page to the CSV
# parser, which would otherwise surface as a confusing `usecols` failure.
response.raise_for_status()
s = response.content
sbir_df = pd.read_csv(io.StringIO(s.decode('utf-8')), usecols=['Abstract'])

CPU times: user 8.66 s, sys: 10.3 s, total: 18.9 s
Wall time: 50.4 s


In [5]:
# Cleanup: discard rows with a missing abstract, then show how many rows remain
sbir_df = sbir_df.dropna(axis=0, how="any")
len(sbir_df.index)

173500

In [6]:
# Draw a sample of (up to) 10k abstracts from which the non-technical words
# are created. The fixed seed makes the sample, and everything derived from
# it, reproducible between runs; `min` avoids the error pandas raises when
# asked for more rows than are available.
sbir_df = sbir_df.sample(n=min(10000, len(sbir_df)), random_state=42)

In [7]:
# Preview the first five sampled abstracts
sbir_df.head(n=5)

Unnamed: 0,Abstract
90144,This Small Business Technology Transfer Resear...
65438,DESCRIPTION (provided by applicant): Seasonal ...
46240,Phase II of the New METSAT Display project wil...
184876,THE LONG-TERM GOAL OF THIS RESEARCH IS TO DEVE...
83843,DESCRIPTION (provided by applicant): Indolamin...


In [8]:
# Remove stopwords using nltk first. Per the original author's observation,
# this initial pass seems to be faster than spaCy but does not remove as many
# stopwords as spaCy does, so after the initial pass the lemmatize helper is
# still expected to filter on spaCy's stopwords.

nltk.download('stopwords')    # the stopwords corpus needs to be downloaded before it can be loaded

stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sagarjogadhenu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
%%time
# Strip the NLTK English stopwords from every abstract. Matching is
# case-sensitive and whitespace-token based (no lower-casing or punctuation
# handling is done here).
# NOTE: this rebinds `sbir_df` from the DataFrame to the result of applying
# the function to its 'Abstract' column, which is what the later cells consume.
stop_set = set(stop)  # built once: O(1) lookups instead of scanning the list for every word
sbir_df = sbir_df['Abstract'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)

CPU times: user 7.17 s, sys: 149 ms, total: 7.32 s
Wall time: 8.43 s


## Extract entities

In [10]:
#!python -m spacy download en_core_web_lg

In [11]:
# Load the large English pipeline used for lemmatization and entity extraction.
# It has to be installed beforehand with:
#   python -m spacy download en_core_web_lg
# NOTE(review): the original comment described this as the scispaCy large
# vocabulary (said to give many more entities than spaCy), but the name loaded
# here is `en_core_web_lg` - confirm which model was intended.
# Per the original note, the model was trained with spaCy 3.6.1, so a version
# warning may be shown on load.

nlp = spacy.load("en_core_web_lg")


In [12]:
%%time
# Lemmatize the stopword-filtered abstracts with the `lemmatize` helper from
# the spacy_helper_methods notebook, using the pipeline loaded above.
# The original author noted that lemmatization seems to be faster with the
# large vocabulary; the recorded run still took several minutes.
lemma_ds = sph.lemmatize(nlp, sbir_df)

CPU times: user 7min 5s, sys: 46.6 s, total: 7min 52s
Wall time: 9min 55s


In [13]:
%%time
# Extract entities from the lemmatized text with the `get_entities` helper.
# The result is consumed below through `.values` and flattened into tuples
# whose first item is the entity text and second item its label
# (presumably one collection of tuples per abstract - confirm in the helper).
ent_ds = sph.get_entities(nlp, lemma_ds)

CPU times: user 6min 10s, sys: 43.5 s, total: 6min 53s
Wall time: 10min 33s


## Filter entities 
1. Remove entities labelled as cardinal, percent, time, quantity or money
2. Remove any entities that match technical terms

In [14]:
# Collapse all per-row entity collections into one flat, de-duplicated list of
# (entity text, entity label) tuples.
tuple_list = list(set(chain.from_iterable(ent_ds.values)))

# Labels that denote numbers or measurements rather than named terms.
excluded_labels = {'CARDINAL', 'PERCENT', 'TIME', 'QUANTITY', 'MONEY'}
# `ent` rather than `tuple` as the loop name, so the builtin is not shadowed.
tuple_list = [ent for ent in tuple_list if ent[1] not in excluded_labels]

# Unique entity texts; the same text may occur under several labels.
entities = list({ent[0] for ent in tuple_list})

In [15]:
# Remove any entities that are in the technical word list.
# Load the technical terms (newline-separated). The encoding is explicit so
# the read does not depend on the platform's default encoding.
with open('../preprocessed_files/tech_terms.txt', 'r', encoding='utf-8') as f:
    text = f.read()
techwords = text.split('\n')
ts = pd.Series(techwords).str.lower()

# Convert the entity list generated above from the SBIR abstracts to lower
# case so the comparison with the technical terms is case-insensitive.
nts = pd.Series(entities).str.lower()

# Filter out every lower-cased entity that exactly matches a technical term.
nts = nts[~nts.isin(ts)]

In [16]:
# Keep the original-cased entities whose lower-cased form survived the
# technical-term filter. The lookup set is built once; rebuilding a list from
# `nts` for every entity made this step quadratic in the number of entities.
non_tech_lower = set(nts)
entities = [entity for entity in entities if entity.lower() in non_tech_lower]

In [17]:
# Sanity check: number of remaining entities, plus a peek at the first five
# entity strings and the first five (text, label) tuples.
(len(entities), entities[:5], tuple_list[:5])

(29826,
 ['CHOLECYSTOKININ',
  'lacritin efficacious',
  'c / c',
  'Robustness UAS Technology',
  'WisdomTools Project Lead Way'],
 [('projectis', 'NORP'),
  ('Omniox', 'ORG'),
  ('F - NVS', 'PRODUCT'),
  ('R5 X4', 'PRODUCT'),
  ('PDT DRUG', 'ORG')])

In [18]:
# Persist the non-technical entities, one per line (no trailing newline).
# The encoding is explicit because the entity text comes from free-form
# abstracts and the platform default encoding may not be able to represent it.
with open('../preprocessed_files/non_tech.txt', 'w', encoding='utf-8') as nf:
    # `write`, not `writelines`: the joined text is a single string, and
    # `writelines` would iterate over it character by character.
    nf.write('\n'.join(map(str, entities)))