# Technical term classifier
Model to identify whether a given word is technical or non-technical
Tried few different algorithms including scibert from Huggingface but found 
that RandomForrest method is faster to train and gives better results

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib

In [6]:
# load tech and non-tech terms
#PS update tech.txt filename & directory
with open('../preprocessed_files/tech_terms.txt','r') as f:
    text = f.read()
techwords = text.split('\n')
ts = pd.Series(techwords).str.lower()
ts[:5]

0         multiple signal classification
1                               function
2                   nonvolatile memories
3                                    dep
4    national security information (nsi)
dtype: object

In [7]:
# load terms extracted from sample of SBIR articles - We have two non_tech files, 
# extracted from two different samples 
#PS update library
with open('../preprocessed_files/non_tech2.txt','r') as f:
    text = f.read()
non_techwords = text.split('\n')
nts = pd.Series(non_techwords).str.lower()
nts[:5],len(nts)

(0                    hvt
 1    stanford university
 2                   faee
 3               pikewerk
 4                    mda
 dtype: object,
 17786)

In [8]:
train_data = [{"text":word,"label":1} for word in ts] 
train_data2 = [{"text":word, "label":0} for word in nts]
train_data[:5],train_data2[:5]

([{'text': 'multiple signal classification', 'label': 1},
  {'text': 'function', 'label': 1},
  {'text': 'nonvolatile memories', 'label': 1},
  {'text': 'dep', 'label': 1},
  {'text': 'national security information (nsi)', 'label': 1}],
 [{'text': 'hvt', 'label': 0},
  {'text': 'stanford university', 'label': 0},
  {'text': 'faee', 'label': 0},
  {'text': 'pikewerk', 'label': 0},
  {'text': 'mda', 'label': 0}])

In [9]:
train_data = train_data + train_data2

In [10]:
# convert to Pandas dataframe
#Update {} to []
tdf = pd.DataFrame(train_data,columns=['text','label'])
tdf['label'].value_counts()

label
1    21649
0    17786
Name: count, dtype: int64

In [11]:
%%time
# Sample data
texts = tdf['text']
labels = tdf['label']  # 1 for tech, 0 for non-tech

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, 
                                                    random_state=42)

CPU times: user 1.65 ms, sys: 1.21 ms, total: 2.87 ms
Wall time: 4.83 ms


In [12]:
%%time
# Define and train the model
model = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
model.fit(X_train, y_train)

CPU times: user 42.3 s, sys: 53.1 ms, total: 42.4 s
Wall time: 42.5 s


In [13]:
# Predict on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")

Accuracy: 0.79
Precision: 0.92
Recall: 0.68
F1 Score: 0.78


In [14]:
model.predict(pd.Series(['Cloud Computing','happy smiles', 'invisalign', 'one hot encoding']))

array([1, 0, 0, 1])

In [15]:
# Save model so in future can be loaded directly without retraining

joblib.dump(model, 'trained_tech_classifier_model.joblib')

['trained_tech_classifier_model.joblib']

In [16]:
# load model 
model = joblib.load('trained_tech_classifier_model.joblib')

In [17]:
model.predict(pd.Series(['Cloud Computing','happy smiles', 'invisalign', 'one hot encoding']))

array([1, 0, 0, 1])

In [18]:
# Process to extract terms from abstract and check for technical terms

abstract = """A travel line creation system for an agricultural machine, includes a position 
acquirer to acquire position measurement points for the agricultural machine, a display, a 
first generator to associate the position measurement points with a field displayed by the 
display and generate creation points on the field by shifting the position measurement points 
inward in an agricultural field, a second generator to create a travel line including a loop 
which passes through the creation points and calculate each of virtual lines by connecting 
together adjacent ones of the creation points, and a setter to extract a pair of virtual 
lines adjacent to each other and extending in different directions and set, as a work point 
regarding a breakpoint in work performed by a working device, one of the creation points 
shared by the extracted pair of virtual lines"""
ads = pd.Series([abstract])

In [2]:
import import_ipynb
import spacy_helper_methods as sph
import spacy as sp

importing Jupyter notebook from spacy_helper_methods.ipynb


In [3]:
# Sci-spacy gives more entities than regular spacy hence using sci-spacy
nlp = sp.load("en_core_sci_lg")  #PS en_core_sci_sm worked but not  en_core_sci_lg
lemma_ds = sph.lemmatize(nlp, ads)
ent_ds = sph.get_entities(nlp, lemma_ds)



OSError: [E050] Can't find model 'en_core_sci_lg'. It doesn't seem to be a Python package or a valid path to a data directory.

In [21]:
ent_ds

0    [(travel line creation system, ENTITY), (agric...
dtype: object

In [22]:
entities = [ent[0] for ent in ent_ds[0]]

In [23]:
y_pred = model.predict(entities)

In [24]:
ent_pred = [{entities[i]:y_pred[i]} for i in range(len(y_pred))]
ent_pred

[{'travel line creation system': 1},
 {'agricultural machine': 1},
 {'position': 0},
 {'acquirer acquire': 1},
 {'position measurement': 1},
 {'point agricultural machine': 1},
 {'display': 0},
 {'position measurement point field display': 1},
 {'creation': 0},
 {'point field shift position': 1},
 {'generator': 0},
 {'travel line': 1},
 {'loop': 0},
 {'pass': 0},
 {'creation': 0},
 {'virtual line': 1},
 {'adjacent': 0},
 {'creation': 0},
 {'setter extract pair virtual \n line': 1},
 {'direction set': 0},
 {'work device': 1},
 {'creation': 0},
 {'extract': 0}]