In [1]:
from app.Transformer_Classifier import Transformer_Classifier 
from app.Data_Loader import Data_Loader
from app.TextRank_Extractor import TextRank_Extractor
from app.Keyword_Classifier import Keyword_Classifier
from app.common.MySQLUtility import MySQLUtility
import os 
from app.Risk_Score_Service import Risk_Score_Service

# Risk domains handed to Transformer_Classifier and Risk_Score_Service.
# Order matters: Data_ETL_Pipeline.keyword_testing reads domains[1].
domains = ['esg', 'liabilities']

# Path to the service-account key file, exported for the Google client libraries.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './config/gcp/genuine-wording-key.json'

# MySQL connection settings. Each value can be overridden through an environment
# variable of the same name, so real credentials do not have to live in the notebook.
# The literals are kept only as fallbacks so existing runs keep working.
DB_HOST = os.environ.get('DB_HOST', '34.170.168.203')
DB_USER = os.environ.get('DB_USER', 'root')
# SECURITY: this fallback password is committed in plain text; rotate it and
# drop the default once DB_PASSWORD is supplied by the environment.
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'nu123456')
DB_NAME = os.environ.get('DB_NAME', 'lca_dev')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Data_ETL_Pipeline(object):
    """Bundle the database-backed services used to score and classify text.

    The constructor opens one MySQLUtility and shares it with every service.
    `keyword_testing` runs a single piece of text through the context scorer,
    the keyword classifier and the transformer classifier and prints the
    results side by side.
    """

    # Class-level placeholders; __init__ replaces each one on the instance.
    dbutil = None
    data_load = None
    textrank = None
    key_classifier = None
    class_service = None
    risk_class = None

    def __init__(self):
        """Create the shared MySQL utility and the services built on it."""
        self.dbutil = MySQLUtility(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
        self.data_load = Data_Loader(self.dbutil)
        self.textrank = TextRank_Extractor(self.dbutil)
        self.key_classifier = Keyword_Classifier(self.dbutil)
        self.class_service = Transformer_Classifier(self.dbutil, domains)
        self.risk_class = Risk_Score_Service(self.dbutil, domains)

    def keyword_testing(self, text, domain=None):
        """Print the scores and labels each service produces for `text`.

        Args:
            text: The text passed to every service.
            domain: Domain name forwarded to the services. When omitted it
                defaults to `domains[1]`, which is what this method always
                used before the parameter existed.

        Returns:
            None. All results are written to stdout.
        """
        if domain is None:
            domain = domains[1]

        print('\n', text)

        # NOTE(review): polarity data and the model are (re)loaded on every
        # call; hoist them if the loaders are confirmed to be idempotent.
        self.risk_class.load_polarity_data()
        c_score = self.risk_class.get_context_score(text, domain)
        print('Context Score : ', c_score)

        predict_label, predict_prb = self.key_classifier.predict_text_data(text, domain)
        # Round rather than truncate: int(0.29 * 100) is 28 because of float
        # representation, which under-reports the probability by a point.
        print('Keyword Classifier : ', predict_label, int(round(predict_prb * 100)))

        model = self.class_service.load_model(domain)
        response = self.class_service.process_contract_request(text, model, domain)
        top_result = response[0]
        print('Transform Classifier : ', top_result['label'], top_result['presence_score'])

        # Flag whether the two classifiers agree on the label.
        if top_result['label'] != predict_label:
            print('Mismatched')
        else:
            print('Matched')
        print(response)
        


In [3]:
# NOTE(review): notebook imports conventionally live in the first cell; this
# one is kept here so the cell still runs on its own.
from app.PreProcessText import PreProcessText

# Sample 1: a news excerpt. Each piece ends with a space so that adjacent
# sentences are not glued together (previously "...seize power.A German man...")
# before the text reaches the sentence splitter.
article_text1 = (
    'Twenty-five people have been arrested in raids across Germany on suspicion of plotting to overthrow the government. '
    'German reports say the group of far-right and ex-military figures planned to storm the parliament building, the Reichstag, and seize power. '
    'A German man referred to as a prince called Heinrich XIII, 71, is alleged to have been central to their plans. '
    'Among those arrested in 11 German states were two alleged ringleaders according to federal prosecutors.'
)
# Sample 2: a financial-statement note about assets and liabilities.
article_text2 = 'Notes and other accounts receivable and other investments are financial assets with carrying values that approximate fair value. Accounts payable, other accrued expenses and short-term debt (excluding the current portion of long-term debt and including shortterm finance lease liabilities) are financial liabilities with carrying values that approximate fair value. If measured at fair value in the financial statements, these financial instruments would be classified as Level 3 in the fair value hierarchy, except for short-term debt which would be classified as Level 2. '

etlpipe = Data_ETL_Pipeline()
ppt = PreProcessText()

# Run every sentence of both samples through the pipeline, in order.
for article_text in (article_text1, article_text2):
    stmts = ppt.get_sentences(article_text)
    for text in stmts:
        # 'sentance' (sic) is the key used by the entries get_sentences returns.
        etlpipe.keyword_testing(text['sentance'])



 Twenty-five people have been arrested in raids across Germany on suspicion of plotting to overthrow the government.
Polarity Word Found : ' '
Polarity Word Found : ' government'
Polarity Word Found : ' s'
Polarity Word Found : ' m'
Context Score :  74
Keyword Classifier :  current liabilities 48
Polarity Word Found : ' '
Polarity Word Found : ' government'
Polarity Word Found : ' s'
Polarity Word Found : ' m'
Transform Classifier :  contingent liabilities 99
Mismatched
{0: {'sentence': 'Twenty-five people have been arrested in raids across Germany on suspicion of plotting to overthrow the government.', 'presence_score': 99, 'context_score': 74, 'risk_score': 86, 'label': 'contingent liabilities'}}

 German reports say the group of far-right and ex-military figures planned to storm the parliament building, the Reichstag, and seize power.
Polarity Word Found : ' '
Polarity Word Found : ' plan'
Polarity Word Found : ' building'
Polarity Word Found : ' group'
Polarity Word Found : ' repo

In [4]:
# Disabled PyTextRank example. Everything below is wrapped in a triple-quoted
# string literal, so none of it executes; the cell simply evaluates to that
# string, which is why the cell output is the string's repr.
''' 
import spacy
import pytextrank

# example text
text = """Compatibility of systems of linear constraints over the set of natural numbers.
Criteria of compatibility of a system of linear Diophantine equations, strict inequations,
and nonstrict inequations are considered. Upper bounds for components of a minimal set of
solutions and algorithms of construction of minimal generating sets of solutions for all types
of systems are given. These criteria and the corresponding algorithms for constructing a minimal
supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."""

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_md")
# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

doc = nlp(text)
# examine the top-ranked phrases in the document
for phrase in doc._.phrases:
    print(phrase.text)
    print(phrase.rank, phrase.count)
    print(phrase.chunks)
'''

' \nimport spacy\nimport pytextrank\n\n# example text\ntext = """Compatibility of systems of linear constraints over the set of natural numbers.\nCriteria of compatibility of a system of linear Diophantine equations, strict inequations,\nand nonstrict inequations are considered. Upper bounds for components of a minimal set of\nsolutions and algorithms of construction of minimal generating sets of solutions for all types\nof systems are given. These criteria and the corresponding algorithms for constructing a minimal\nsupporting set of solutions can be used in solving all the considered types systems and systems of mixed types."""\n\n# load a spaCy model, depending on language, scale, etc.\nnlp = spacy.load("en_core_web_md")\n# add PyTextRank to the spaCy pipeline\nnlp.add_pipe("textrank")\n\ndoc = nlp(text)\n# examine the top-ranked phrases in the document\nfor phrase in doc._.phrases:\n    print(phrase.text)\n    print(phrase.rank, phrase.count)\n    print(phrase.chunks)\n'