# Extract technical terms from SBIR data
This notebook extracts entities from the SBIR dataset, which is a CSV file. We run spaCy methods to lemmatize and extract entities from the abstract field. We then filter those entities down to technical terms by passing them through a previously created binary classification model.

In [None]:
import import_ipynb
import spacy as sp
import json
import pandas as pd
import joblib
import requests
import io

In [None]:
import spacy_helper_methods as sph

## Load input data

In [None]:
%%time
# Read the SBIR award data directly from the public web URL.
url = "https://data.www.sbir.gov/awarddatapublic/award_data.csv"
# The timeout bounds the connect phase and each wait for data from the server
# (not the total download time), so a stalled connection cannot hang the cell.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
s = response.content
# low_memory=False makes pandas read the file in one pass rather than in
# internal chunks, which avoids mixed-dtype columns.
sbir_df = pd.read_csv(io.StringIO(s.decode('utf-8')), low_memory=False)

In [None]:
# Unpack the trained classifier next to its archive. The standard-library
# zipfile module is used instead of shelling out to `unzip`, so the cell does
# not depend on an external binary being installed on the host.
import zipfile

with zipfile.ZipFile('../model/trained_tech_classifier_model.joblib.zip') as archive:
    # Existing files are overwritten, matching the previous `unzip -o` behaviour.
    archive.extractall('../model/')

## Extract entities and classify

In [None]:
# Load the previously trained classifier from disk.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
classifier_path = '../model/trained_tech_classifier_model.joblib'
model = joblib.load(classifier_path)

# Load the spaCy pipeline by package name; it must already be installed.
pipeline_name = 'en_core_sci_lg'
nlp = sp.load(pipeline_name)

In [None]:
#sbir_df.info()

In [None]:
# Replace missing abstracts with an empty string so that entity extraction can
# run on every row without dropping any.
# Only the 'Abstract' column is modified: assigning through a boolean row mask
# on the whole frame (sbir_df[mask] = '') would blank out every column of the
# affected rows, destroying the rest of their award data.
# The result of astype is assigned back, because astype returns a new Series
# and does not convert the column in place.
sbir_df['Abstract'] = sbir_df['Abstract'].fillna('').astype('string')
sbir_df.info()

In [None]:
#sbir_df[sbir_df['Abstract'].isna()]

In [None]:
%%time
# Pass the spaCy pipeline, the classifier and the 'Abstract' column to the
# project helper and store whatever it returns in a new 'abstract_entities'
# column.
# NOTE(review): presumably the helper extracts entities with `nlp` and keeps
# only those `model` classifies as technical terms - confirm in
# spacy_helper_methods.
sbir_df['abstract_entities'] = sph.extract_tech_entities(nlp, model, sbir_df['Abstract'])

In [None]:
# Save the whole frame to a single CSV file.
# index=False is not passed, so to_csv's default applies and the row index is
# written as an extra first column (unlike the chunk files written later).
sbir_df.to_csv('../preprocessed_files/sbir_entities.csv')

In [None]:
#sbir_df = sbir_df.drop(['Abstract'],axis=1)

In [None]:
# Write the frame to a second CSV file (row index included, as above).
# NOTE(review): the column drop in the preceding cell is commented out, so when
# the cells run in order this file has the same content as sbir_entities.csv -
# confirm whether this second copy is still needed.
sbir_df.to_csv('../preprocessed_files/sbir_entities1.csv')

In [None]:
# Display the column labels of the frame as the cell output.
sbir_df.columns

## Create small output files
Since the DataFrame is large, we need to break it down into smaller chunks for upload to GitHub.

In [None]:
chunksize = 22000  # number of rows per chunk
# Ceiling division: the number of chunks needed to hold every row.
# The previous `len // chunksize + 1` produced one extra, empty chunk whenever
# the row count was an exact multiple of chunksize.
# max(1, ...) still yields one (header-only) chunk for an empty frame, as before.
num_chunks = max(1, (len(sbir_df) + chunksize - 1) // chunksize)

In [None]:
# Destination folder and filename stem shared by every chunk file.
output_directory = '../preprocessed_files/'
base_filename = "sbir_entities_"

# Split the frame into consecutive row slices and save each one to its own CSV.
# Files are numbered from 1: sbir_entities_1.csv, sbir_entities_2.csv, ...
for chunk_number in range(1, num_chunks + 1):
    first_row = (chunk_number - 1) * chunksize
    chunk_path = f"{output_directory}{base_filename}{chunk_number}.csv"
    # iloc slicing clips at the end of the frame, so the last slice may be
    # shorter than chunksize. The row index is not written.
    sbir_df.iloc[first_row:first_row + chunksize].to_csv(chunk_path, index=False)