In [9]:
import os
import pickle
from pprint import pprint
from os.path import join as JP

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml
from scripts.catalog import Catalog, load_catalog

config = parse_yaml('config.yaml')
paths = config['paths']

catalog = load_catalog(path=paths['catalog'],name='only_US')

# Filter down the catalog
filters = dict(
    topic = ['isocyanate'],
    country = ['US'],
    raw_text_len = 5000)

catalog = catalog.filter_catalog(filters)
print(len(catalog.documents))

43


In [10]:
docu1 = catalog.documents[0]
docu2 = catalog.documents[1]

In [11]:
raw_text = docu1.raw_text
text = docu1.clean_text

In [26]:
text[250:500]

'present invention relates to a method of producing a carbamate compound, comprising reacting a fluorine containing carbonic diester compound represented by formula (1) and a non aromatic diamine compound represented by formula (2) without using a cat'

# Spacy

In [1]:
import spacy
from spacy import displacy
from spacy.lang.en import English

In [2]:
nlp = spacy.load('en_core_web_sm') # Powerfull model with everytihing included

In [45]:
nlp = English()

### Tokenizer

In [29]:
from spacy.tokenizer import Tokenizer

In [46]:
# Construction 1
# Create a blank Tokenizer with just the English vocab
tokenizer1 = Tokenizer(nlp.vocab)

# Construction 2
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer2 = nlp.Defaults.create_tokenizer(nlp)

In [49]:
texts = ["One document.", "...", "Lots of documents"]
for doc in tokenizer1.pipe(texts, batch_size=50):
    pass

In [50]:
doc.text

'Lots of documents'

In [58]:
d1 = nlp(raw_text)
d1[250:400]

group or a divalent aromatic-aliphatic hydrocarbon group.
    
        
    
    
        
            
      CROSS REFERENCE TO RELATED APPLICATIONS
    
            This application is a Divisional of U.S. application Ser. No. 13/630,915, filed Sep. 28, 2012 which is a Continuation application filed under 35 U.S.C. 111(a) claiming the benefit under 35 U.S.C. §§120 and 365(c) of PCT International Application No. PCT/JP2011/056095 filed on Mar. 15, 2011, which is based upon and claims the benefit of priority of Japanese Application No. 2010-086126 filed on Apr. 2, 2010, the entire contents of which are hereby incorporated by reference in their entireties.
            
                TECHNICAL FIELD
    
            This invention relates to a method of producing a carbamate compound, a carbamate compound obtained by the method and a method of producing an isocyanate compound using the carbamate compound.

In [59]:
for i,sent in enumerate(doc.sents):
    if i > 10 and i < 20:
        print(sent.text)

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: nlp.add_pipe(nlp.create_pipe('sentencizer')) Alternatively, add the dependency parser, or set sentence boundaries by setting doc[i].is_sent_start.

In [60]:
displacy.render(d1,style='ent',jupyter=True)

In [42]:
d2 = nlp(text)
d2

Application published. OKAZOE, Takashi;Nagasaki, Yuko;Okamoto, Hidekazu. Asahi Glass Company, Limited;ASAHI GLASS. ASAHI GLASS. METHOD FOR PRODUCING CARBAMATE COMPOUND, CARBAMATE COMPOUND, AND METHOD FOR PRODUCING ISOCYANATE COMPOUND USING SAME. The present invention relates to a method of producing a carbamate compound, comprising reacting a fluorine containing carbonic diester compound represented by formula (1) and a non aromatic diamine compound represented by formula (2) without using a catalyst, to thereby produce a carbamate compound represented by formula (3), and a method of producing an isocyanate compound represented by formula (20) from the carbamate compound without using a catalyst, wherein R represents a fluorine containing monovalent aliphatic hydrocarbon group, and A represents a divalent aliphatic hydrocarbon group, a divalent alicyclic hydrocarbon group or a divalent aromatic-aliphatic hydrocarbon group.. CROSS REFERENCE TO RELATED APPLICATIONS. This application is a

In [43]:
displacy.render(d2,style='ent',jupyter=True)

## Lemmatizer

In [5]:
apple = nlp.vocab["apple"]
orange = nlp.vocab["orange"]

In [7]:
apple.vector

ValueError: [E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:
https://spacy.io/usage/models

In [8]:
nlp.vocab.vectors

<spacy.vectors.Vectors at 0x11f25a3d0>

## Depency Parser

In [18]:
nlp = spacy.load('en_core_web_sm')

In [19]:
parser = nlp.create_pipe("parser")

In [20]:
parser

<spacy.pipeline.pipes.DependencyParser at 0x1a3878b280>

In [21]:
from spacy.pipeline import DependencyParser
parser = DependencyParser(nlp.vocab)

In [22]:
parser

<spacy.pipeline.pipes.DependencyParser at 0x1a3878bfa0>

In [23]:
parser(nlp(docu1.clean_text))

ValueError: [E109] Model for component 'parser' not initialized. Did you forget to load a model, or forget to call begin_training()?

In [24]:
parser = DependencyParser(nlp.vocab)
doc = nlp("This is a sentence.")
# This usually happens under the hood
processed = parser(doc)

ValueError: [E109] Model for component 'parser' not initialized. Did you forget to load a model, or forget to call begin_training()?