## Data Preparation

In [1]:
import os
import pickle
from pprint import pprint
from os.path import join as JP

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml, ensure_directories

from scripts.catalog import (
    Catalog, Document, Corpus,
    load_catalog, load_corpus)

from utils.datahandling import (
    filter_dict_by_keys, 
    filter_dict_by_vals,
    filter_df_mean_thres)


# CONFIG 
# ------

config = parse_yaml('config.yaml')
paths = config['paths']
ensure_directories(paths)

In [2]:
## Load Catalog | Load Corpus | Generate all

In [10]:
catalog = Catalog()
corpus = load_corpus(path=paths['catalog'], name='sample_data')
catalog.load_corpus(corpus=corpus)
# catalog.save(path=paths['catalog'],name='test1_clean')

OF_INTEREST = ['US'] # ['CA','AU']

# Filter down the catalog
filters = dict(
    topic = ['isocyanate'],
    country = OF_INTEREST,
    raw_text_len = 5000)

catalog_sample = catalog.filter_catalog(filters)
print('Catalog recuded from {} to {}'.format(
    len(catalog.documents), len(catalog_sample.documents)))

Catalog recuded from 3013 to 43


In [4]:
# SPLIT CATALOG INTO THE TWO CATEGORIES
# -------------------------------------

filters = dict(label='relevant')
pos_catalog = catalog_sample.filter_catalog(filters)

filters = dict(label='irrelevant')
neg_catalog = catalog_sample.filter_catalog(filters)

print('Positive documents: ',len(pos_catalog.documents))
print('Negative documents: ',len(neg_catalog.documents))

Positive documents:  43
Negative documents:  0


In [5]:
catalog = pos_catalog

## Spacy

In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [7]:
document = catalog.documents[0]
document.clean_text[:50]

'Application published. OKAZOE, Takashi;Nagasaki, Y'

In [18]:
for d,doc in enumerate(catalog.documents[:5]):
    print('[INFO]: Parsing doc ',d)
    catalog.documents[d].sp_doc = nlp(doc.clean_text)
catalog.save(path=paths['catalog'],name='sample_data_spacy')

[INFO]: Parsing doc  0
[INFO]: Parsing doc  1
[INFO]: Parsing doc  2
[INFO]: Parsing doc  3
[INFO]: Parsing doc  4


In [14]:
document.sp_doc

Application published. OKAZOE, Takashi;Nagasaki, Yuko;Okamoto, Hidekazu. Asahi Glass Company, Limited;ASAHI GLASS. ASAHI GLASS. METHOD FOR PRODUCING CARBAMATE COMPOUND, CARBAMATE COMPOUND, AND METHOD FOR PRODUCING ISOCYANATE COMPOUND USING SAME. The present invention relates to a method of producing a carbamate compound, comprising reacting a fluorine containing carbonic diester compound represented by formula (1) and a non aromatic diamine compound represented by formula (2) without using a catalyst, to thereby produce a carbamate compound represented by formula (3), and a method of producing an isocyanate compound represented by formula (20) from the carbamate compound without using a catalyst, wherein R represents a fluorine containing monovalent aliphatic hydrocarbon group, and A represents a divalent aliphatic hydrocarbon group, a divalent alicyclic hydrocarbon group or a divalent aromatic-aliphatic hydrocarbon group.. CROSS REFERENCE TO RELATED APPLICATIONS. This application is a

In [15]:
document.sp_doc[1].lemma_

'publish'

#### Pre-process before TF-IDF making use of Spacy

In [19]:
def spacy_cleaning(
    document,
    tags_to_keep=['JJ', 'NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']):
    
    # Lemmatizing each token and dropping non useful tags
    tokens = [ word.lemma_.lower().strip() for word in document if word.tag_ not in tags_to_keep]

    # Removing stop words and punctuations
    tokens = [ word for word in tokens if not word.is_stop and word not in punctuations ]

    # return preprocessed list of tokens
    return tokens

In [20]:
spacy_cleaning(document.sp_doc)

AttributeError: 'str' object has no attribute 'is_stop'

## TFIDF

In [11]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(
    min_df=.1,
    max_df=.7,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    max_features=3000,
    ngram_range=(1,3),
    lowercase=True,
    stop_words=stopwords.words('english'))