In [3]:
import os
import pickle
from pprint import pprint
from os.path import join as JP

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml
from scripts.catalog import Catalog, load_catalog

config = parse_yaml('config.yaml')
paths = config['paths']

catalog = load_catalog(path=paths['catalog'],name='only_US')

# Filter down the catalog
filters = dict(
    topic = ['isocyanate'],
    country = ['US'],
    raw_text_len = 5000)

catalog = catalog.filter_catalog(filters)
print(len(catalog.documents))

43


In [4]:
docu1 = catalog.documents[0]
docu2 = catalog.documents[1]

In [5]:
raw_text = docu1.raw_text
text = docu1.clean_text

In [6]:
text[250:500]

'present invention relates to a method of producing a carbamate compound, comprising reacting a fluorine containing carbonic diester compound represented by formula (1) and a non aromatic diamine compound represented by formula (2) without using a cat'

# Spacy

In [7]:
import spacy
from spacy import displacy
from spacy.lang.en import English

In [8]:
nlp = spacy.load('en_core_web_sm') # Powerfull model with everytihing included
d = nlp(text)

In [9]:
displacy.render(d[:500],style='ent',jupyter=True)

In [10]:
def spacy_cleaning(
    document,
    tags_to_keep=['JJ', 'NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    entities_to_remove=['ORG,NORP,GPE,PERSON']):

    def pass_test(w, tags=tags_to_keep):
        if w.ent_type_ == 0:
                return w.tag_ in tags and not w.is_punct and not w.is_stop and w.ent_ not in entities_to_remove
        return w.tag_ in tags and not w.is_punct and not w.is_stop 

    words = [ word for word in document if pass_test(word)]
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in words ]
    return ' '.join(tokens)

#### Sample test

In [18]:
tokens = spacy_cleaning(d)
print(' '.join(tokens.split(' ')[:300]))

application publish asahi glas method produce carbamate compound carbamate compound method produce isocyanate compound present invention relate method produce carbamate compound comprise react fluorine contain carbonic diester compound represent formula aromatic diamine compound represent formula catalyst produce carbamate compound represent formula method produce isocyanate compound represent formula carbamate compound catalyst represent fluorine contain represent divalent divalent divalent aromatic hydrocarbon group cross related application application application file application file claim benefit file base claim benefit priority japanese file entire content incorporate reference entirety field invention relate method produce carbamate compound carbamate compound obtain method method produce isocyanate compound carbamate compound background invention compound material example urethane compound urea compound cure agent resin paint example follow method know method produce isocyanat

In [19]:
d[:300]

Application published. OKAZOE, Takashi;Nagasaki, Yuko;Okamoto, Hidekazu. Asahi Glass Company, Limited;ASAHI GLASS. ASAHI GLASS. METHOD FOR PRODUCING CARBAMATE COMPOUND, CARBAMATE COMPOUND, AND METHOD FOR PRODUCING ISOCYANATE COMPOUND USING SAME. The present invention relates to a method of producing a carbamate compound, comprising reacting a fluorine containing carbonic diester compound represented by formula (1) and a non aromatic diamine compound represented by formula (2) without using a catalyst, to thereby produce a carbamate compound represented by formula (3), and a method of producing an isocyanate compound represented by formula (20) from the carbamate compound without using a catalyst, wherein R represents a fluorine containing monovalent aliphatic hydrocarbon group, and A represents a divalent aliphatic hydrocarbon group, a divalent alicyclic hydrocarbon group or a divalent aromatic-aliphatic hydrocarbon group.. CROSS REFERENCE TO RELATED APPLICATIONS. This application is a

### Apply intense cleaning and save Catalog instance

In [21]:
for d,doc in enumerate(catalog.documents):
    print('[INFO]: Parsing doc ',d)
    catalog.documents[d].processed_text = spacy_cleaning(nlp(doc.clean_text))
catalog.save(path=paths['catalog'],name='spacy_pipeline_on_EN_corpus')

[INFO]: Parsing doc  0
[INFO]: Parsing doc  1
[INFO]: Parsing doc  2
[INFO]: Parsing doc  3
[INFO]: Parsing doc  4
[INFO]: Parsing doc  5
[INFO]: Parsing doc  6
[INFO]: Parsing doc  7
[INFO]: Parsing doc  8
[INFO]: Parsing doc  9
[INFO]: Parsing doc  10
[INFO]: Parsing doc  11
[INFO]: Parsing doc  12
[INFO]: Parsing doc  13
[INFO]: Parsing doc  14
[INFO]: Parsing doc  15
[INFO]: Parsing doc  16
[INFO]: Parsing doc  17
[INFO]: Parsing doc  18
[INFO]: Parsing doc  19
[INFO]: Parsing doc  20
[INFO]: Parsing doc  21
[INFO]: Parsing doc  22
[INFO]: Parsing doc  23
[INFO]: Parsing doc  24
[INFO]: Parsing doc  25
[INFO]: Parsing doc  26
[INFO]: Parsing doc  27
[INFO]: Parsing doc  28
[INFO]: Parsing doc  29
[INFO]: Parsing doc  30
[INFO]: Parsing doc  31
[INFO]: Parsing doc  32
[INFO]: Parsing doc  33
[INFO]: Parsing doc  34
[INFO]: Parsing doc  35
[INFO]: Parsing doc  36
[INFO]: Parsing doc  37
[INFO]: Parsing doc  38
[INFO]: Parsing doc  39
[INFO]: Parsing doc  40
[INFO]: Parsing doc  41
[I

In [22]:
catalog.documents[0].processed_text[:50]

'application publish asahi glas method produce carb'