In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [2]:
legal_text = pd.read_excel('data/tx_sales_tax_regulations.xlsx')
legal_text.regulation_text.str.split(".",expand=True).stack()
legal_text.head()

Unnamed: 0,regulation,service_type,urls,regulation_text
0,§3.280,Aircraft,https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...
1,§3.281,Records Required; Information Required,https://texreg.sos.state.tx.us/public/readtac$...,(a) Persons who must keep records. (1) Sellers...
2,§3.282,Auditing Taxpayer Records,https://texreg.sos.state.tx.us/public/readtac$...,"(a) The following words and terms, when used i..."
3,§3.283,Bartering Clubs and Exchanges,https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...
4,§3.284,"Drugs, Medicines, Medical Equipment, and Devic...",https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...


In [3]:
model = 'bert-base-nli-mean-tokens'
# model = 'roberta-large-nli-stsb-mean-tokens'
embedder = SentenceTransformer(model)
legal_text['sentence'] = legal_text.regulation_text.str.split(".",expand=False).values
legal_text = legal_text.explode('sentence')
legal_text['sentence_embedding'] = embedder.encode(legal_text.sentence).tolist()
legal_text.head()

Unnamed: 0,regulation,service_type,urls,regulation_text,sentence,sentence_embedding
0,§3.280,Aircraft,https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...,(a) Definitions,"[-0.08070746809244156, -0.1259254664182663, 1...."
0,§3.280,Aircraft,https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...,"The following words and terms, when used in ...","[0.23225294053554535, 0.34092891216278076, 1.7..."
0,§3.280,Aircraft,https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...,(1) Affiliate--A member of a group of entiti...,"[0.29893386363983154, 0.11982942372560501, 0.8..."
0,§3.280,Aircraft,https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...,(2) Agricultural aircraft operation--The ope...,"[0.30255287885665894, 0.36949747800827026, -0...."
0,§3.280,Aircraft,https://texreg.sos.state.tx.us/public/readtac$...,(a) Definitions. The following words and terms...,Agricultural aircraft operations include crop...,"[-0.18180502951145172, -0.15519556403160095, 0..."


In [4]:
pd.set_option('max_colwidth', None)

In [5]:
legal_corpus = legal_text.copy()
query = 'Repair of a vehicle.'
query_embedding = embedder.encode(query)
legal_corpus['query_similarity'] = np.round(cosine_similarity(query_embedding, list(legal_corpus.sentence_embedding))[0],3)
legal_corpus=legal_corpus.sort_values('query_similarity', ascending=False).drop_duplicates(['service_type'])
legal_corpus[['regulation','service_type','sentence','query_similarity']].head()

Unnamed: 0,regulation,service_type,sentence,query_similarity
10,§3.290,Motor Vehicle Repair and Maintenance; Accessories and Equipment Added to Motor Vehicles; Moveable Specialized Equipment,(d) Repair of motor vehicle components or accessories,0.894
43,§3.324,"Oil, Gas, and Related Well Service",An example of a repair or restoration of real Cont'd,0.772
17,§3.297,"Carriers, Commercial Vessels, Locomotives and Rolling Stock, and Motor Vehicles",290 of this title (relating to Motor Vehicle Repair and Maintenance; Accessories and Equipment Added to Motor Vehicles; Moveable Specialized Equipment,0.771
12,§3.292,"Repair, Remodeling, Maintenance, and Restoration of Tangible Personal Property",290 of this title (relating to Motor Vehicle Repair and Maintenance; Accessories and Equipment Added to Motor Vehicles; Moveable Specialized Equipment),0.77
79,§3.367,"Timber Items (Tax Code, §151.3162 and §151.317)","The term ""equipment"" includes repair, replacement parts, and accessories for equipment",0.724
