# Technical term classifier
Model to identify whether a given term is technical or non-technical.
Tried a few different algorithms, including SciBERT from Hugging Face, but found 
that the Random Forest method is faster to train and gives better results.

In [1]:
# To download the en_core_sci_lg language model used for the tests, uncomment and run the following line.
# (%pip, unlike !pip, installs into the environment of the running kernel.)
#%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib
import import_ipynb
import spacy_helper_methods as sph  # This is a notebook (made importable by import_ipynb above)
import spacy as sp

importing Jupyter notebook from spacy_helper_methods.ipynb


## Load data
After loading, convert all terms to lowercase for training.

In [3]:
# Load the technical terms (these become label 1), one term per line.
with open('../preprocessed_files/tech_terms.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# splitlines() plus the blank-line filter keeps a trailing newline or stray
# empty line from turning into an empty-string "term" in the training data;
# strip() removes incidental whitespace around each term.
techwords = [line.strip() for line in text.splitlines() if line.strip()]
ts = pd.Series(techwords).str.lower()
ts[:5]

0                                    masking threshold
1                                          restful api
2                                         quantum-dots
3    strength, weakness, opportunity, and threat an...
4                                                dance
dtype: object

In [4]:
# Load the non-technical terms (these become label 0), extracted from a sample
# of SBIR articles, one term per line.
# NOTE(review): this cell's comment previously said there are two non-tech files
# from two different samples, but only non_tech2.txt is read here - confirm
# whether the other file should be loaded as well.
with open('../preprocessed_files/non_tech2.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# splitlines() plus the blank-line filter keeps a trailing newline or stray
# empty line from turning into an empty-string "term" in the training data;
# strip() removes incidental whitespace around each term.
non_techwords = [line.strip() for line in text.splitlines() if line.strip()]
nts = pd.Series(non_techwords).str.lower()
nts[:5], len(nts)

(0                    hvt
 1    stanford university
 2                   faee
 3               pikewerk
 4                    mda
 dtype: object,
 17786)

## Label the data 
- 0 - non-technology term
- 1 - technology term found in IEEE and ACM vocabulary

In [5]:
# Tag every term with its class: 1 = technology term, 0 = non-technology term
train_data1 = [dict(text=term, label=1) for term in ts]
train_data2 = [dict(text=term, label=0) for term in nts]
(train_data1[:5], train_data2[:5])

([{'text': 'masking threshold', 'label': 1},
  {'text': 'restful api', 'label': 1},
  {'text': 'quantum-dots', 'label': 1},
  {'text': 'strength, weakness, opportunity, and threat analysis', 'label': 1},
  {'text': 'dance', 'label': 1}],
 [{'text': 'hvt', 'label': 0},
  {'text': 'stanford university', 'label': 0},
  {'text': 'faee', 'label': 0},
  {'text': 'pikewerk', 'label': 0},
  {'text': 'mda', 'label': 0}])

In [6]:
# Stack the tech records on top of the non-tech records in one labelled frame
labelled_records = [*train_data1, *train_data2]
tdf = pd.DataFrame(labelled_records, columns=['text', 'label'])

In [7]:
# Class balance of the full labelled set
label_counts = tdf['label'].value_counts()
label_counts

label
1    21649
0    17786
Name: count, dtype: int64

## Data for training
Combine the labeled tech and non-tech terms into a single DataFrame, then split it into training and test sets (75% / 25%, stratified by label).

In [8]:
%%time
# Sample data
#texts = tdf['text']
#labels = tdf['label']  # 1 for tech, 0 for non-tech

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tdf['text'], tdf['label'], test_size=0.25, stratify=tdf['label'],
                                                    shuffle=True, random_state=42)

CPU times: user 25.9 ms, sys: 4.5 ms, total: 30.4 ms
Wall time: 39.3 ms


In [9]:
# Class balance of the training split
train_label_counts = y_train.value_counts()
train_label_counts

label
1    16237
0    13339
Name: count, dtype: int64

## Train the model

In [10]:
%%time
# Define and train the model
model = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
model.fit(X_train, y_train)

CPU times: user 2min 5s, sys: 2.36 s, total: 2min 8s
Wall time: 5min 48s


In [11]:
%%time
# Predict on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")

Accuracy: 0.79
Precision: 0.92
Recall: 0.68
F1 Score: 0.78
CPU times: user 21.3 s, sys: 628 ms, total: 21.9 s
Wall time: 34 s


## Save the model for future use

In [12]:
# Persist the fitted pipeline so later sessions can load it instead of retraining
model_path = '../model/trained_tech_classifier_model.joblib'
joblib.dump(model, model_path)

['../model/trained_tech_classifier_model.joblib']

## Some tests

In [13]:
# Reload the persisted pipeline from disk
saved_model_path = '../model/trained_tech_classifier_model.joblib'
model = joblib.load(saved_model_path)

In [14]:
# Spot-check the classifier on a few hand-picked terms
sample_terms = pd.Series(['Cloud Computing', 'happy smiles', 'invisalign', 'one hot encoding'])
model.predict(sample_terms)

array([1, 0, 0, 1])

In [15]:
# Example input for the end-to-end check: terms are extracted from this abstract
# and then classified as technical / non-technical.
# The abstract is wrapped in a one-element Series (ads).
# NOTE(review): presumably because the sph helpers expect a Series of documents - confirm.

abstract = """A travel line creation system for an agricultural machine, includes a position 
acquirer to acquire position measurement points for the agricultural machine, a display, a 
first generator to associate the position measurement points with a field displayed by the 
display and generate creation points on the field by shifting the position measurement points 
inward in an agricultural field, a second generator to create a travel line including a loop 
which passes through the creation points and calculate each of virtual lines by connecting 
together adjacent ones of the creation points, and a setter to extract a pair of virtual 
lines adjacent to each other and extending in different directions and set, as a work point 
regarding a breakpoint in work performed by a working device, one of the creation points 
shared by the extracted pair of virtual lines"""
ads = pd.Series([abstract])

In [16]:
# scispaCy is used instead of regular spaCy because it yields more entities.
# NOTE(review): a previous comment here said en_core_sci_sm worked but
# en_core_sci_lg did not, yet the lg model is the one loaded - confirm which
# model is intended.
SCI_MODEL_NAME = "en_core_sci_lg"
nlp = sp.load(SCI_MODEL_NAME)

# Lemmatize the abstract first, then pull entities out of the lemmatized text
lemma_ds = sph.lemmatize(nlp, ads)
ent_ds = sph.get_entities(nlp, lemma_ds)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [17]:
# Entities found in the first (and only) abstract; keep just the first field of each
first_doc_entities = ent_ds[0]
entities = [entity[0] for entity in first_doc_entities]
entities

['travel',
 'agricultural machine',
 'acquirer acquire position measurement',
 'point agricultural machine',
 'display',
 'point field',
 'creation',
 'measurement',
 'inward',
 'agricultural field',
 'generator',
 'loop',
 'virtual line',
 'adjacent',
 'creation point',
 'setter extract pair',
 'adjacent',
 'direction',
 'breakpoint work',
 'working device',
 'creation',
 'pair virtual line']

In [18]:
# Classify whether the above entities are technical (1) or non-technical (0) terms
y_pred = model.predict(entities)

In [19]:
# Pair each entity with its predicted label, one single-key dict per entity
ent_pred = [{entity: label} for entity, label in zip(entities, y_pred)]
ent_pred

[{'travel': 0},
 {'agricultural machine': 1},
 {'acquirer acquire position measurement': 1},
 {'point agricultural machine': 1},
 {'display': 0},
 {'point field': 1},
 {'creation': 0},
 {'measurement': 1},
 {'inward': 0},
 {'agricultural field': 1},
 {'generator': 0},
 {'loop': 0},
 {'virtual line': 1},
 {'adjacent': 0},
 {'creation point': 1},
 {'setter extract pair': 0},
 {'adjacent': 0},
 {'direction': 0},
 {'breakpoint work': 1},
 {'working device': 1},
 {'creation': 0},
 {'pair virtual line': 1}]