# Technical term classifier
Model to identify whether a given word is technical or non-technical.
Tried a few different algorithms, including SciBERT from Hugging Face, but found
that the Random Forest method is faster to train and gives better results.

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib

In [2]:
# Load the technical terms: tech.txt holds one term per line.
with open('tech.txt','r') as f:
    text = f.read()
# split('\n') yields an empty string after a trailing newline (and for blank lines);
# strip each line and drop the empties so that '' is never labelled as a technical term.
techwords = [line.strip() for line in text.split('\n') if line.strip()]
# dtype=object keeps the .str accessor usable even if the file turns out to be empty.
ts = pd.Series(techwords, dtype=object).str.lower()
ts[:5]

0                     global differential gps system
1    simple certificate enrollment protocol ( scep )
2                            nearfield communication
3                                                c&s
4                                               irtf
dtype: object

In [17]:
# Load the non-technical terms extracted from a sample of SBIR articles.
# There are two non_tech files, extracted from two different samples;
# only non_tech2.txt is read in this cell.
with open('non_tech2.txt','r') as f:
    text = f.read()
# split('\n') yields an empty string after a trailing newline (and for blank lines);
# strip each line and drop the empties so that '' is never labelled as a non-technical term.
non_techwords = [line.strip() for line in text.split('\n') if line.strip()]
# dtype=object keeps the .str accessor usable even if the file turns out to be empty.
nts = pd.Series(non_techwords, dtype=object).str.lower()
nts[:5],len(nts)

(0                    hvt
 1    stanford university
 2                   faee
 3               pikewerk
 4            rift valley
 dtype: object,
 16107)

In [18]:
def _label_terms(terms, label):
    """Return one {"text": term, "label": label} record for every term in `terms`."""
    return [{"text": term, "label": label} for term in terms]


# Technical terms get label 1, non-technical terms get label 0.
train_data = _label_terms(ts, 1)
train_data2 = _label_terms(nts, 0)
train_data[:5], train_data2[:5]

([{'text': 'global differential gps system', 'label': 1},
  {'text': 'simple certificate enrollment protocol ( scep )', 'label': 1},
  {'text': 'nearfield communication', 'label': 1},
  {'text': 'c&s', 'label': 1},
  {'text': 'irtf', 'label': 1}],
 [{'text': 'hvt', 'label': 0},
  {'text': 'stanford university', 'label': 0},
  {'text': 'faee', 'label': 0},
  {'text': 'pikewerk', 'label': 0},
  {'text': 'rift valley', 'label': 0}])

In [19]:
# Combine the technical (label 1) and non-technical (label 0) records into one list.
# The list is rebuilt from ts / nts instead of `train_data + train_data2`, so that
# re-running this cell does not append the non-technical records a second time.
train_data = (
    [{"text": word, "label": 1} for word in ts]
    + [{"text": word, "label": 0} for word in nts]
)

In [20]:
# Convert the list of {"text", "label"} records to a pandas DataFrame.
# Columns are passed as a list: a set is unordered, so the column order was not
# guaranteed, and newer pandas releases raise ValueError when given a set here.
tdf = pd.DataFrame(train_data, columns=['text', 'label'])
# Class balance: number of technical (1) vs non-technical (0) rows.
tdf['label'].value_counts()

1    21700
0    16107
Name: label, dtype: int64

In [24]:
%%time
# Sample data
texts = tdf['text']
labels = tdf['label']  # 1 for tech, 0 for non-tech

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, 
                                                    random_state=42)

CPU times: user 13.4 ms, sys: 18.1 ms, total: 31.4 ms
Wall time: 34.9 ms


In [25]:
%%time
# Define and train the model
model = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
model.fit(X_train, y_train)

CPU times: user 1min 38s, sys: 2.44 s, total: 1min 40s
Wall time: 3min 1s


In [26]:
# Score the held-out split with the trained pipeline
y_pred = model.predict(X_test)

# Apply each scorer to the same (truth, prediction) pair
scorers = (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
accuracy, precision, recall, f1_score = (score(y_test, y_pred) for score in scorers)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")

Accuracy: 0.80
Precision: 0.93
Recall: 0.71
F1 Score: 0.81


In [27]:
# Spot-check the classifier on a few hand-picked terms (1 = technical, 0 = non-technical).
model.predict(pd.Series(['Cloud Computing','happy smiles', 'invisalign', 'one hot encoding']))

array([1, 0, 0, 0])

In [29]:
# Persist the fitted pipeline (TF-IDF vectorizer + random forest) so that it can be
# loaded directly in future sessions without retraining.

joblib.dump(model, 'trained_tech_classifier_model.joblib')

In [30]:
# Reload the persisted pipeline from disk, replacing the in-memory `model`.
# NOTE: joblib files are pickle-based; only load model files from a trusted source.
model = joblib.load('trained_tech_classifier_model.joblib')

In [32]:
# Repeat the spot-check with the reloaded model; the predictions should match the
# ones produced before the model was saved.
model.predict(pd.Series(['Cloud Computing','happy smiles', 'invisalign', 'one hot encoding']))

array([1, 0, 0, 0])

In [41]:
# Process to extract terms from abstract and check for technical terms.
# The abstract below is the sample input for the entity-extraction and
# classification steps that follow.

abstract = """A travel line creation system for an agricultural machine, includes a position 
acquirer to acquire position measurement points for the agricultural machine, a display, a 
first generator to associate the position measurement points with a field displayed by the 
display and generate creation points on the field by shifting the position measurement points 
inward in an agricultural field, a second generator to create a travel line including a loop 
which passes through the creation points and calculate each of virtual lines by connecting 
together adjacent ones of the creation points, and a setter to extract a pair of virtual 
lines adjacent to each other and extending in different directions and set, as a work point 
regarding a breakpoint in work performed by a working device, one of the creation points 
shared by the extracted pair of virtual lines"""
# Wrap the single abstract in a one-element Series, the form passed to the sph helpers.
ads = pd.Series([abstract])

In [38]:
import import_ipynb
import spacy_helper_methods as sph
import spacy as sp

In [44]:
# Sci-spacy gives more entities than regular spacy hence using sci-spacy
nlp = sp.load("en_core_sci_lg")
# The helpers come from the local spacy_helper_methods notebook (imported as sph).
# NOTE(review): presumably lemmatize returns a Series of lemmatized text and
# get_entities a Series holding a list of (text, label) tuples per document —
# confirm against the helper notebook.
lemma_ds = sph.lemmatize(nlp, ads)
ent_ds = sph.get_entities(nlp, lemma_ds)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [45]:
# Inspect the entities extracted from the abstract.
ent_ds

0    [(travel, ENTITY), (agricultural machine, ENTI...
dtype: object

In [50]:
# Row 0 holds the entity tuples of the only abstract; keep just the first element
# (the entity text) of each tuple.
first_abstract_ents = ent_ds[0]
entities = [pair[0] for pair in first_abstract_ents]

In [53]:
# Classify every extracted entity string (1 = technical, 0 = non-technical).
# NOTE(review): this reuses the name y_pred and therefore overwrites the test-set
# predictions computed in the evaluation cell.
y_pred = model.predict(entities)

In [55]:
# Pair each entity with its predicted label, one single-entry dict per entity
ent_pred = [{term: label} for term, label in zip(entities, y_pred)]
ent_pred

[{'travel': 0},
 {'agricultural machine': 1},
 {'acquirer acquire position measurement': 1},
 {'point agricultural machine': 1},
 {'display': 1},
 {'point field': 1},
 {'creation': 0},
 {'measurement': 1},
 {'inward': 0},
 {'agricultural field': 1},
 {'generator': 1},
 {'loop': 1},
 {'virtual line': 1},
 {'adjacent': 0},
 {'creation point': 1},
 {'setter extract pair': 0},
 {'adjacent': 0},
 {'direction': 1},
 {'breakpoint work': 1},
 {'working device': 1},
 {'creation': 0},
 {'pair virtual line': 1}]