In [1]:
from app.Transformer_Classifier import Transformer_Classifier 
from app.Data_Loader import Data_Loader
from app.TextRank_Extractor import TextRank_Extractor
from app.Keyword_Classifier import Keyword_Classifier
from app.common.MySQLUtility import MySQLUtility
import os 
from app.Risk_Score_Service import Risk_Score_Service

# Domains processed by the pipeline stages, in processing order.
domains = ['esg', 'liabilities' ] #'liabilities', 'esg'

# Path to the GCP service-account key file, exported for Google client libraries.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './config/gcp/genuine-wording-key.json'

# Database connection settings.
# Each value can be overridden through an LCA_DB_* environment variable so the
# notebook does not have to be edited (and credentials committed) per environment.
# The literal fallbacks keep the previous behaviour when the variables are unset.
# NOTE(review): the fallback password is still stored in the notebook; rotate it
# and drop the defaults once every environment provides LCA_DB_PASSWORD.
DB_HOST = os.environ.get('LCA_DB_HOST', '34.170.168.203')
DB_USER = os.environ.get('LCA_DB_USER', 'root')
DB_PASSWORD = os.environ.get('LCA_DB_PASSWORD', 'nu123456')
DB_NAME = os.environ.get('LCA_DB_NAME', 'lca_dev')

class Data_ETL_Pipeline(object):
    """Drives the data-loading, keyword-model and transformer-model stages.

    Each public method runs one stage for every entry in ``self.domains``,
    printing a progress line before each delegated call.

    The domain list is captured once at construction time. Rebinding the
    module-level ``domains`` afterwards (for example from another notebook
    cell) therefore no longer changes what an existing pipeline iterates, and
    the pipeline stays consistent with the list handed to the classifier and
    risk-score services.
    """

    # Collaborators and configuration; all are assigned in __init__.
    dbutil = None
    data_load = None
    textrank = None
    key_classifier = None
    class_service = None
    risk_class = None
    domains = None

    def __init__(self, domain_names=None):
        """Create the database utility and the services built on top of it.

        Args:
            domain_names: optional iterable of domain names (or a single
                name as a string). Defaults to a copy of the module-level
                ``domains`` list.
        """
        if domain_names is None:
            # Bare `domains` resolves to the module-level list here.
            domain_names = domains
        elif isinstance(domain_names, str):
            # A lone string would otherwise be split into characters by list().
            domain_names = [domain_names]
        self.domains = list(domain_names)

        self.dbutil = MySQLUtility(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
        self.data_load = Data_Loader(self.dbutil)
        self.textrank = TextRank_Extractor(self.dbutil)
        self.key_classifier = Keyword_Classifier(self.dbutil)
        self.class_service = Transformer_Classifier(self.dbutil, self.domains)
        self.risk_class = Risk_Score_Service(self.dbutil, self.domains)

    def create_dataset(self):
        """Call ``create_database()`` on the database utility."""
        # The cleanup step is currently disabled; only its progress line is printed.
        print("dbutil.db_cleanup():")
        #self.dbutil.clean_db()
        print("dbutil.create_database():")
        self.dbutil.create_database()

    def load_seed_training_data(self):
        """For each domain, extract seed keywords and load seeds into training data."""
        # The batch import is currently disabled; only its progress line is printed.
        print("data_load.import_seed_data_batch():")
        #self.data_load.import_seed_data_batch()

        for domain in self.domains:
            print("textrank.extract_keyword_seed_data():" + domain)
            self.textrank.extract_keyword_seed_data(domain)

            # The progress line says "textrank", but the call goes to data_load.
            print("textrank.load_seed_to_training_data_batch():" + domain)
            self.data_load.load_seed_to_training_data_batch(domain)

    def load_contract_data(self):
        """For each domain, import the reports/contract data via the data loader."""
        for domain in self.domains:
            print("self.data_load.import_reports_contract_data()" + domain)
            self.data_load.import_reports_contract_data(domain)

    def process_keyword_model(self):
        """For each domain, prepare data, train, evaluate and apply the keyword classifier."""
        for domain in self.domains:
            print("key_classifier.prepare_training_data():" + domain)
            self.key_classifier.prepare_training_data(domain)

            print("key_classifier.train_model():" + domain)
            self.key_classifier.train_model(domain)

            print("key_classifier.evaluate_model():" + domain)
            self.key_classifier.evaluate_model(domain)

            print("key_classifier.process_contract_data():" + domain)
            self.key_classifier.process_contract_data(domain)

    def process_transformer_model(self):
        """For each domain, train the transformer classifier, run its eval pass, then score polarity."""
        for domain in self.domains:
            print("class_service.training():" + domain)
            self.class_service.training(domain)

            print("class_service.process_contract_training_data_eval():" + domain)
            self.class_service.process_contract_training_data_eval(domain)

            print("risk_class.process_keyword_polarity():" , domain)
            self.risk_class.process_keyword_polarity(domain)

    def evaluate_results(self):
        """For each domain, run the evaluation call of both classifiers."""
        for domain in self.domains:
            print ("key_classifier.Keyword Classifier Accuracy: " + domain)
            self.key_classifier.evaluate_model(domain)

            print ("class_service.Transformer Classifier Accuracy: " + domain)
            # NOTE(review): "evalute_model" is kept exactly as spelled; confirm it
            # matches the method name defined on Transformer_Classifier.
            self.class_service.evalute_model(domain)

# Entry point: build the pipeline and run the stages that are currently enabled.
# Only load_seed_training_data() is active; the remaining stages are left
# commented out so they can be switched on one at a time. They are listed in
# the order the class defines them.
if __name__ == '__main__': 
    dbloader = Data_ETL_Pipeline()
    #dbloader.create_dataset()
    dbloader.load_seed_training_data() 
    #dbloader.load_contract_data()
    #dbloader.process_keyword_model()
    #dbloader.process_transformer_model()
    #dbloader.evaluate_results()


  from .autonotebook import tqdm as notebook_tqdm


data_load.import_seed_data_batch():
textrank.extract_keyword_seed_data():esg
DB Pool Created.
Query:  -1 record(s) affected
textrank.load_seed_to_training_data_batch():esg
Query:  -1 record(s) affected
textrank.extract_keyword_seed_data():liabilities
SELECT * from seed_data where domain='liabilities';
UPDATE seed_data SET keywords = 'emission scandal, contingent liability example, billion liability' where id = '1b4e0255-bf07-4be5-9f43-5ff1fffcdafe' ;
Query:  1064 record(s) affected
textrank.load_seed_to_training_data_batch():liabilities
SELECT * from seed_data where domain='liabilities';
INSERT INTO training_data (id, created, content, type, label, eval_label, score, eval_score, domain, userid) VALUES ('5bab563e-a5d1-4693-a471-857437b98e15', '2022-12-07 13:16:02', 'The $. billion liability for Volkswagen related to its  emissions scandal is one such contingent liabilityÂ example.', 'seed', 'contingent liabilities', '', 0, 0, 'liabilities', 'admin')
Query:  1262 record(s) affected


In [2]:
# Storage cell: constructs a GCP_Storage helper for the configured bucket.
# All loader operations below are commented out, so running this cell as-is
# only builds the `loader` object.
from app.common.GCP_Storage import GCP_Storage

# `os` and the credentials path are set up again here, repeating the first cell.
import os 
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './config/gcp/genuine-wording-key.json'

# NOTE(review): this rebinds the notebook-global `domains` with a different
# order than the first cell (['esg', 'liabilities']); any code that reads the
# global after this cell runs sees this list instead.
domains = ['liabilities', 'esg']
bucket_name = 'lca_dev'
loader = GCP_Storage(domains, bucket_name)

# Bucket and model/seed-data transfer operations; uncomment the one to run.
#loader.setup_bucket()
#loader.upload_models()
#loader.download_models()
#loader.download_seed_data()
#loader.upload_seed_data()