## Data Preparation

In [8]:
import os
import pickle
from pprint import pprint
from os.path import join as JP

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml, ensure_directories

from scripts.catalog import (
    Catalog, Document, Corpus,
    load_catalog, load_corpus)

from utils.datahandling import (
    filter_dict_by_keys, 
    filter_dict_by_vals,
    filter_df_mean_thres)


# CONFIG 
# ------
# Load the project settings from 'config.yaml' and pass the 'paths' section
# to ensure_directories before any cell reads from or writes to those paths.
# NOTE(review): 'config.yaml' is a relative path, so this presumably depends on
# the directory the notebook is run from -- confirm against parse_yaml.
# NOTE(review): ensure_directories presumably creates any missing directories
# listed in `paths`; verify in utils.general.

config = parse_yaml('config.yaml')
paths = config['paths']
ensure_directories(paths)

In [26]:
# Build a fresh catalog from the stored 'sample_data' corpus.
catalog = Catalog()
corpus = load_corpus(path=paths['catalog'], name='sample_data')
catalog.load_corpus(corpus=corpus)
# catalog.save(path=paths['catalog'],name='test1_clean')

# Country codes to keep when filtering.
OF_INTEREST = ['AU', 'CA','US']

# Filter down the catalog
# NOTE(review): how filter_catalog interprets each entry (e.g. whether
# raw_text_len is a minimum or a maximum length) is defined in
# scripts.catalog -- confirm there before changing the values.
filters = dict(
    topic = ['isocyanate'],
    country = OF_INTEREST,
    raw_text_len = 5000)

catalog_sample = catalog.filter_catalog(filters)

# Report how many documents survived the filter (message typo fixed:
# 'recuded' -> 'reduced').
print('Catalog reduced from {} to {}'.format(
    len(catalog.documents), len(catalog_sample.documents)))

Catalog recuded from 3013 to 730


In [27]:
# SPLIT CATALOG INTO THE TWO CATEGORIES
# -------------------------------------
# Derive one sub-catalog per `label` value from the filtered sample, then
# print both sizes as a quick look at the class balance.

filters = {'label': 'relevant'}
pos_catalog = catalog_sample.filter_catalog(filters)

filters = {'label': 'irrelevant'}
neg_catalog = catalog_sample.filter_catalog(filters)

print('Positive documents: ', len(pos_catalog.documents))
print('Negative documents: ', len(neg_catalog.documents))

Positive documents:  53
Negative documents:  677


In [28]:
# From here on, work only with the 'relevant' documents.
# NOTE: this rebinds `catalog`, so the full catalog built in the loading cell
# is no longer reachable under that name; re-run that cell to get it back.
catalog = pos_catalog

## spaCy

In [29]:
import spacy
# Import the English stop-word set explicitly. `spacy.lang.en.stop_words` is a
# submodule, and reaching it through attribute access on `spacy` only works if
# some other import has already loaded it as a side effect.
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords

# Load the small English pipeline; the model package must be installed.
nlp = spacy.load('en_core_web_sm')

In [30]:
# Sanity check: show the first 50 characters of the first document's cleaned text.
document = catalog.documents[0]
document.clean_text[:50]

'Application published. OKAZOE, Takashi;Nagasaki, Y'

In [34]:
# Run the spaCy pipeline (`nlp`) on the cleaned text of the first three
# documents only and attach each result to its document as `sp_doc`.
# Documents beyond index 2 do not get an `sp_doc` from this cell.
# `catalog` here is the 'relevant'-only catalog bound in the previous section.
for d,doc in enumerate(catalog.documents[:3]):
    print('[INFO]: Parsing doc ',d)
    catalog.documents[d].sp_doc = nlp(doc.clean_text)
# Save the catalog under the name 'sample_data_spacy' at paths['catalog'].
# NOTE(review): the on-disk format is decided by Catalog.save, which is not
# visible here -- confirm that the attached spaCy objects are stored as expected.
catalog.save(path=paths['catalog'],name='sample_data_spacy')

[INFO]: Parsing doc  0
[INFO]: Parsing doc  1
[INFO]: Parsing doc  2


In [39]:
# List the noun chunks spaCy found in the second document (index 1), which was
# parsed by the loop in the previous cell.
# Kept for reference -- parses document 0 on its own:
# catalog.documents[0].sp_doc = nlp(catalog.documents[0].clean_text)
list(catalog.documents[1].sp_doc.noun_chunks)

[Application,
 Zeeuw,
 Arend-Jan;Bent,
 Mark Geoffrey,
 Huntsman International,
 INTERNATIONAL,
 HUNTSMAN INTERNATIONAL,
 Chemical Installation,
 the present invention,
 a chemical installation,
 The chemical installation,
 A first unit,
 a first aqueous waste stream comprising nitrobenzene,
 At least a second unit,
 a second aqueous waste stream comprising aniline,
 The chemical installation,
 an aniline cleaning apparatus,
 nitrobenzene,
 aniline,
 a stripping column,
 an aqueous stream,
 The first and second aqueous waste stream,
 the stripping column,
 aniline,
 nitrobenzene,
 the aqueous waste stream,
 the stripped aniline,
 nitrobenzene,
 the aniline cleaning apparatus,
 CROSS-REFERENCE,
 RELATED APPLICATIONS,
 application,
 U.S. patent application Ser,
 which,
 the National Phase,
 International Application,
 May,
 which,
 the U.S.,
 which,
 priority,
 The noted applications,
 reference,
 0002]The present invention,
 a chemical installation,
 DADPM,
 an integrated chemical insta

## TF-IDF

In [16]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF vectorizer; options are grouped by what they control.
vectorizer = TfidfVectorizer(
    # Text normalisation and tokenisation: lowercase the input, drop NLTK's
    # English stop words, and build uni-, bi- and tri-grams.
    lowercase=True,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    # Vocabulary pruning: float thresholds are document-frequency proportions
    # (see the scikit-learn docs); the vocabulary is capped at 3000 features.
    min_df=0.1,
    max_df=0.7,
    max_features=3000,
    # Weighting: smoothed IDF with L2-normalised rows.
    use_idf=True,
    smooth_idf=True,
    norm='l2')