# Preprocessing

In [1]:
%%capture
!pip install skweak
!pip install spacy
!pip install python-docx
!python -m spacy download "en_core_web_sm"
!python -m spacy download "en_core_web_md"


In [2]:
!unzip /content/SampleDocs.zip
!mv /content/SampleDocs/Testing /content/Testing

Archive:  /content/SampleDocs.zip
   creating: SampleDocs/
  inflating: SampleDocs/20201023_GG_Loan Agreement.docx  
  inflating: SampleDocs/20201028_MyTTech_EmploymentAgreement.docx  
  inflating: SampleDocs/20201104_TUI_Investment_Commitment_Agreement-formatted.docx  
  inflating: SampleDocs/Consulting Agreement- Nikhil D.docx  
  inflating: SampleDocs/contract-for-the-sale-of-goods-seller-friendly-version.docx  
  inflating: SampleDocs/Demo Joint Venture Agreement .docx  
  inflating: SampleDocs/Draft SHA_Belita_11082015_Clean_Execution Version (1).docx  
  inflating: SampleDocs/Example-Shareholder-Agreement.docx  
  inflating: SampleDocs/FOUNDERS AGREEMENT-December 03 2017 (Final Version) (for compare).docx  
  inflating: SampleDocs/llc-operating-agreement.docx  
  inflating: SampleDocs/Offer Letter- AatmNirbhar Final.docx  
  inflating: SampleDocs/PHPL MRPL JDA 21032020.docx  
  inflating: SampleDocs/rental-agreement-plain-language-lease.docx  
  inflating: SampleDocs/Sale Agreeme

In [3]:
import skweak
import re
import docx
from docx import Document
import spacy
from spacy.tokens import DocBin
docbin = DocBin()

In [4]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])

In [6]:
nlp.meta['name']

'core_web_sm'

In [7]:
import os
path = '/content/SampleDocs'

for filename in os.listdir(path):
    if filename.endswith('.docx'):  # Adjust the file extension as per your documents
        file_path = os.path.join(path, filename)
        
        # Read the contents of the file using python-docx
        doc = Document(file_path)
        text = " ".join([paragraph.text for paragraph in doc.paragraphs])
        
        # Perform spaCy tasks on the text
        doc = nlp(text)
        
        # Add the Doc object to the DocBin
        docbin.add(doc)

# Output path for saving the DocBin
output_path = '/content/docbin.spacy'  # Replace with the desired output path

# Save the DocBin to disk
docbin.to_disk(output_path)
docs_copy = list(skweak.utils.docbin_reader("/content/docbin.spacy"))

In [21]:
import spacy
from spacy import displacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")


# Filter entities based on desired labels
desired_labels = ['MONEY', 'LOCATION', 'DATE', 'LAW', 'ORG']  # Example: Keep entities with labels 'ORG' and 'PERSON'

# Process each document and create new Doc objects with filtered entities
filtered_docs = []

for doc_text in docs_copy:
    doc = nlp(doc_text)
    filtered_entities = [ent for ent in doc.ents if ent.label_ in desired_labels]
    new_doc = spacy.tokens.Doc(doc.vocab, words=[token.text for token in doc])
    new_doc.ents = filtered_entities
    filtered_docs.append(new_doc)

# docbin = DocBin(docs=filtered_docs)
# docbin.to_disk("final_annotated_data.spacy")

In [22]:
skweak.utils.docbin_writer(filtered_docs ,'/content/docbin.spacy')

Write to /content/docbin.spacy...done


In [23]:
docs = list(skweak.utils.docbin_reader("/content/docbin.spacy"))

In [None]:
displacy.render(docs[13], style = 'ent', jupyter=True)

In [None]:
for doc in docs_copy:
  for ent in doc.ents:
    print(ent.label_)

In [66]:
annotator = skweak.spacy.ModelAnnotator("spacy_md", 'en_core_web_md' )

In [10]:
docs = []
for doc in docs_copy:
  docs.append(annotator(doc=doc))

In [None]:
skweak.utils.display_entities(docs[13], 'spicy_md')

In [39]:
text = "i have eighty thousand rupees in my bank account and $450 with ₹12312 "
test_docs = nlp(text)
skweak.utils.display_entities(annotator(test_docs), 'spicy_md')

In [None]:
['MONEY', 'LOCATION', 'DATE', 'LAW']

In [None]:
skweak.utils.display_entities(docs[13], 'spicy_md')

In [None]:
spacy.explain("WORK_OF_ART")

'Titles of books, songs, etc.'

# Annotate Now

## Organizations ✅

In [17]:
from spacy import displacy

In [60]:
semifinaldocs = list(skweak.utils.docbin_reader("/content/final_annotated_data.spacy"))


In [20]:
# company detector
def company_detector_fun(doc):
    for chunk in doc.noun_chunks:
        if chunk[0].lower_.rstrip(".") in {'corp', 'inc', 'ltd', 'llc', 'sa', 'ag', 'co', 'limited', 'private', "law firm", "government agency", "non-profit", "court", "corporation", "partnership"} or \
           chunk[-1].lower_.rstrip(".") in {'corp', 'inc', 'ltd', 'llc', 'sa', 'ag', 'co', 'limited', 'private', "law firm", "government agency", "non-profit", "court", "corporation", "partnership"}:
            yield chunk.start, chunk.end, "ORGS"

company_detector = skweak.heuristics.FunctionAnnotator("company_detector", company_detector_fun)

annotated_docs = []
for doc in docs:
   annotated_docs.append(company_detector(doc))

skweak.utils.display_entities(annotated_docs[13], "company_detector")


ValueError: ignored

In [None]:
%%capture
!git clone https://github.com/NorskRegnesentral/skweak.git

In [None]:
tries = skweak.gazetteers.extract_json_data("/content/skweak/data/crunchbase_companies.json.gz")
gazetteer = skweak.gazetteers.GazetteerAnnotator("gazetteer", tries)

Extracting data from /content/skweak/data/crunchbase_companies.json.gz
Populating trie for class COMPANY (number: 539174)


In [None]:
gazetted = []
for doc in docs_copy:
  gazetted.append(gazetteer(doc))

In [None]:
skweak.utils.display_entities(gazetted[13], "gazetteer")


In [None]:
new_docs = []
for doc in docs_copy:
    doc = annotator(company_detector(gazetteer(doc)))
    new_docs.append(doc)


15
15


In [None]:
hmm = skweak.generative.HMM("hmm", ["COMPANY"])

In [None]:
hmm.fit(new_docs)

Starting iteration 1
Finished E-step with 15 documents
Starting iteration 2


         1      -16934.0606             +nan


Finished E-step with 15 documents
Starting iteration 3


         2      -16849.7798         +84.2808


Finished E-step with 15 documents
Starting iteration 4


         3      -16840.3257          +9.4541


Finished E-step with 15 documents


         4      -16838.4702          +1.8555


In [None]:
hmm_doc = []
for doc in new_docs:
  hmm_doc.append(hmm(doc))

## Contract Clauses ✅

just use the law thing from spacy

In [35]:
def contract_clause_detector_fun(doc):
    
    for chunk in doc.noun_chunks:
        if chunk[0].lower_.rstrip(".") in {'clause', 'provision', 'section', 'article', 'subsection', 'subclause', 'paragraph', 'schedule', 'terms', 'conditions', 'obligations', 'rights', 'representations', 'warranties', 'signatories'} or \
           chunk[-1].lower_.rstrip(".") in {'clause', 'provision', 'section', 'article', 'subsection', 'subclause', 'paragraph', 'schedule', 'terms', 'conditions', 'obligations', 'rights', 'representations', 'warranties', 'signatories'}:
            yield chunk.start, chunk.end, "CONTRACT_CLAUSE"
            if chunk[-1].text.isdigit():
                yield chunk.start, chunk.end + len(chunk[-1]), "CONTRACT_CLAUSE_NUMBER"


contract_clause_detector = skweak.heuristics.FunctionAnnotator("contract_clause_detector", contract_clause_detector_fun)

new_docs = []
for doc in docs_copy:
  new_docs.append(contract_clause_detector(doc))

In [36]:
skweak.utils.display_entities(new_docs[13], "contract_clause_detector")


## Location ✅

In [20]:
import skweak.heuristics as skh
import json

with open("/content/Indian_Cities_In_States_JSON.txt", "r") as f:
    keyword_data = json.load(f)

locations = []
for state, city_list in keyword_data.items():
    locations.append(state.lower())
    locations.extend([city.lower() for city in city_list])

# # Read the JSON file
# with open('countries+states+cities.json', 'r', encoding = 'utf-8') as file:
#     data = json.load(file)

# # Loop through each item in the JSON data
# for item in data:
#     # Append country name to the combined data list
#     locations.append(item['name'])
    
#     # Loop through each state in the country
#     for state in item['states']:
#         # Append state name to the combined data list
#         locations.append(state['name'])
        
#         # Loop through each city in the state
#         for city in state['cities']:
#             # Append city name to the combined data list
#             locations.append(city['name'])
  
# Define the function to detect locations
def location_detector_fun(doc):
    for chunk in doc.noun_chunks:
        if chunk.text.lower() in locations:
            yield chunk.start, chunk.end, "LOCATION"

# Create the FunctionAnnotator for location detection
location_detector = skh.FunctionAnnotator("location_detector", location_detector_fun)



In [56]:
loc_docs = []
for doc in docs_copy:
  loc_docs.append(location_detector(doc))

In [50]:
text = '''BENGALURU: Siddaramaiah began his second innings as chief minister of Karnataka Saturday in the presence of more than a dozen top non-BJP politicians from across the country and a full house of Congress stalwarts.
His 61-year-old deputy and Congress's state unit chief DK Shivakumar was sworn in with eight ministers - all senior MLAs, including G Parameshwara, a former deputy CM and state Congress head in 2013 who recently offered to be CM.
'''
test_doc = nlp(text)
new = location_detector(test_doc)

In [57]:
skweak.utils.display_entities(loc_docs[8], "location_detector")


## Legislation ✅

In [21]:
legal_labels = {
    "statute",
    "law",
    "regulation",
    "act",
    "code",
    "rule",
    "ordinance",
    "enactment",
    "legislative",
    "jurisdiction",
    "mandate",
    "requirement",
    "compliance",
    "stipulation",
    "require",
    "prescribe",
    "authorize",
    "regulate",
    "govern",
    "dictate",
    "establish",
    "implement",
    "apply",
    "enforce",
    "comply",
    "violate",
    "amend",
    "repeal",
    "modify",
    "revise",
    "legally",
    "legitimate",
    "legal",
    "lawful",
    "valid",
    "in accordance with",
    "pursuant to",
    "under the provisions of",
    "in compliance with",
    "as required by",
    "as mandated by",
    "as stipulated by"
}

In [37]:
# legislation_detector
def legislation_detector_fun(doc):
    for chunk in doc.noun_chunks:
        if chunk[0].lower_.rstrip(".") in legal_labels or \
           chunk[-1].lower_.rstrip(".") in legal_labels:
            yield chunk.start, chunk.end, "LEGAL"

legislation_detector = skweak.heuristics.FunctionAnnotator("legislation_detector", legislation_detector_fun)

legal_docs = []
for doc in docs_copy:
   legal_docs.append(legislation_detector(doc))

skweak.utils.display_entities(legal_docs[5], "legislation_detector")


## Financial

In [None]:
import spacy

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Register the 'skweak_labels' attribute as a string extension
spacy.tokens.Token.set_extension("skweak_labels", default="", force=True)

text = "i wish i had 2000 dollars and forty thousand rupees or dollars"

# Process the text with spaCy
doc = nlp(text)
# Iterate over the entities in the document
for ent in doc.ents:
    if ent.label_ == "MONEY":
        # Label the numerical value as "MONEY"
        for token in ent:
            token._.skweak_labels = "MONEY"

# Print the document with the labeled entities
for token in doc:
    if token._.skweak_labels == "MONEY" or token.text in {"rupees", "dollars"}:
        print(token.text, "- MONEY")
    else:
        print(token.text)


i
wish
i
had
2000 - MONEY
dollars - MONEY
and
forty
thousand
rupees - MONEY
or
dollars - MONEY


In [None]:
displacy_render = displacy.render(doc, style="ent", jupyter = True)

In [None]:
def money_detector(doc):
   for tok in doc[1:]:
      if tok.text[0].isdigit() and tok.nbor(-1).is_currency:
          yield tok.i-1, tok.i+1, "MONEY"

financial_detector = skweak.heuristics.FunctionAnnotator("financial_detector", money_detector)


financial_docs = []
for doc in docs_copy:
    financial_docs.append(money_detector(doc))

skweak.utils.display_entities(financial_docs[5], "financial_detector")


In [None]:
doc = nlp("This sentence has $100 and €200")
label = "MONEY"
for start, end, label in money_detector(doc):
  print(f"Found money mention at {start}-{end} with label {label}")

Found money mention at 3-5 with label MONEY
Found money mention at 6-8 with label MONEY


In [None]:
def money_detector(doc):
        for tok in doc[1:]:
            if tok.text[0].isdigit() and tok.nbor(-1).is_currency:
                yield tok.i-1, tok.i+1, "MONEY"

doc = nlp("This sentence has $100 and €200 and there eighty thousand rupees in my bank account")
label = "MONEY"
for start, end, label in money_detector(doc):
  print(f"Found money mention at {start}-{end} with label {label}")

Found money mention at 3-5 with label MONEY
Found money mention at 6-8 with label MONEY


In [None]:
type(docs_copy[0])

spacy.tokens.doc.Doc

In [None]:
# Define the financial_detector_fun function
def financial_detector_fun(doc):
    for token in doc:
        if token.like_num or token.is_currency:
            yield token.idx, token.idx + len(token.text), "FINANCIAL"

# Create the FunctionAnnotator for financial detection
financial_detector = skweak.heuristics.FunctionAnnotator("financial_detector", financial_detector_fun)

# Annotate the documents with financial labels
financial_docs = []
for doc in docs_copy:
    financial_docs.append(financial_detector(doc))

# Display the entities with the financial_detector label
skweak.utils.display_entities(financial_docs[5], "financial_detector")


IndexError: ignored

## Date/Time

# Training

In [7]:
!pip install -q -U locale

NotImplementedError: ignored

In [6]:
import locale
locale.getpreferredencoding(False)
!apt-get install -y locales
!locale-gen en_US.UTF-8
!update-locale LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8


NotImplementedError: ignored

In [4]:
!spacy init config - --lang en --pipeline ner --optimize accuracy | \

NotImplementedError: ignored

In [None]:
!spacy train - --paths.train ./path/to/corpus.spacy  --paths.dev /path/to/corpus.spacy \
--initialize.vectors en_core_web_md --output /path/to/trained_model