In [1]:
from app.Transformer_Classifier import Transformer_Classifier 
from app.Data_Loader import Data_Loader
from app.TextRank_Extractor import TextRank_Extractor
from app.Keyword_Classifier import Keyword_Classifier
from app.MySQLUtility import MySQLUtility
import os 

# Domains processed by every pipeline stage, in this order.
domains = ['esg', 'liabilities' ] #'liabilities', 'esg'
# Path to a credentials key file, exported through the process environment.
# NOTE(review): presumably read by Google Cloud client code inside app.* --
# confirm which component actually consumes it.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './store/genuine-wording-key.json'

# Database connection settings. An environment variable of the same name
# takes precedence, so credentials can be supplied without editing this file;
# the literals are only fallbacks that keep the previous behaviour.
# NOTE(review): the fallback password is still committed in source -- rotate
# it and drop the default once DB_PASSWORD is provided by the environment.
DB_HOST = os.environ.get('DB_HOST', '34.170.168.203')
DB_USER = os.environ.get('DB_USER', 'root')
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'nu123456')

class Data_ETL_Pipeline(object):
    """Drive the data-loading, training and evaluation stages per domain.

    Each public method runs one stage. Apart from ``create_dataset`` every
    stage loops over the active domain list and, for each domain, prints a
    progress label and then delegates to one of the collaborator objects
    created in ``__init__``.
    """

    # Collaborators; the real objects are assigned in __init__.
    dbutil = None
    data_load = None
    textrank = None
    key_classifier = None
    class_service = None
    # Optional per-instance domain list. None means "use the module-level
    # `domains` list", which is looked up each time a stage runs.
    domain_override = None

    def __init__(self, domain_override=None):
        """Create the DB utility and the services that share it.

        Args:
            domain_override: optional iterable of domain names (or a single
                name) to process instead of the module-level ``domains``.
                Leaving it as ``None`` keeps the original behaviour.
        """
        self.dbutil = MySQLUtility(DB_HOST, DB_USER, DB_PASSWORD)
        self.data_load = Data_Loader(self.dbutil)
        self.textrank = TextRank_Extractor(self.dbutil)
        self.key_classifier = Keyword_Classifier(self.dbutil)
        self.class_service = Transformer_Classifier(self.dbutil)
        if domain_override is not None:
            # A bare string would otherwise be split into single characters.
            if isinstance(domain_override, str):
                self.domain_override = [domain_override]
            else:
                self.domain_override = list(domain_override)

    def _active_domains(self):
        """Return the per-instance override, else the module-level list."""
        if self.domain_override is None:
            return domains
        return self.domain_override

    def create_dataset(self):
        """Call ``clean_db()`` and then ``create_database()`` on the DB utility.

        Running this discards whatever ``clean_db()`` removes; it is not
        domain-specific.
        """
        print("db_cleanup():")
        self.dbutil.clean_db()
        print("create_database():")
        self.dbutil.create_database()

    def load_seed_training_data(self):
        """Per domain: import seed data, extract keywords, load training data."""
        for domain in self._active_domains():
            print("data_load.import_seed_data():" + domain)
            self.data_load.import_seed_data_batch(domain)

            print("extract_keyword_seed_data():" + domain)
            self.textrank.extract_keyword_seed_data(domain)

            print("load_seed_to_training_data():" + domain)
            self.data_load.load_seed_to_training_data_batch(domain)

    def load_contract_data(self):
        """Per domain: import the reports/contract data through the loader."""
        for domain in self._active_domains():
            print("self.data_load.import_reports_data()" + domain)
            self.data_load.import_reports_contract_data(domain)

    def process_keyword_model(self):
        """Per domain: prepare data, train, evaluate, then process contracts
        with the keyword classifier, in that order."""
        for domain in self._active_domains():
            print("key_classifier.prepare_training_data():" + domain)
            self.key_classifier.prepare_training_data(domain)

            print("key_classifier.train_model():" + domain)
            self.key_classifier.train_model(domain)

            print("key_classifier.evaluate_model():" + domain)
            self.key_classifier.evaluate_model(domain)

            print("key_classifier.process_contract_data():" + domain)
            self.key_classifier.process_contract_data(domain)

    def process_transformer_model(self):
        """Per domain: train the transformer classifier, then run its
        contract/training-data evaluation pass."""
        for domain in self._active_domains():
            print("training():" + domain)
            self.class_service.training(domain)

            print("process_contract_training_data_eval():" + domain)
            self.class_service.process_contract_training_data_eval(domain)

    def evaluate_results(self):
        """Per domain: run the keyword and transformer evaluations."""
        for domain in self._active_domains():
            print ("Keyword Classifier Accuracy: " + domain)
            self.key_classifier.evaluate_model(domain)

            print ("Transformer Classifier Accuracy: " + domain)
            # NOTE(review): the method name is spelled `evalute_model` here;
            # presumably it matches Transformer_Classifier's actual spelling
            # -- confirm before renaming either side.
            self.class_service.evalute_model(domain)

# Script / notebook entry point: run every pipeline stage once, in order.
if __name__ == '__main__': 
    # Constructing the pipeline creates the DB utility and all services.
    dbloader = Data_ETL_Pipeline()
    # NOTE: create_dataset() starts by calling dbutil.clean_db(), so each run
    # begins from freshly created tables rather than reusing earlier data.
    dbloader.create_dataset()
    # Load seed/training data and contract data before any model stage runs.
    dbloader.load_seed_training_data() 
    dbloader.load_contract_data()
    # Keyword model stage, then the transformer model stage.
    dbloader.process_keyword_model()
    dbloader.process_transformer_model()
    # Final evaluation pass for both classifiers.
    dbloader.evaluate_results()


  from .autonotebook import tqdm as notebook_tqdm


db_cleanup():
DB Pool Created.
Table contract_data successfully deleted.
Table seed_data successfully deleted.
Table training_data successfully deleted.
create_database():
Table contract_data successfully created.
Table seed_data successfully created.
Table training_data successfully created.
data_load.import_seed_data():esg
Working with  seed_data ESG.csv
INSERT INTO seed_data (id, created, keywords, content, type, label, domain, userid) VALUES ('4061e392-97d7-4ead-bd7d-3455344760c8', '2022-11-24 04:57:06', 'audit committee structure, principle, shareholder right, bribery, policy, procedure, board composition, leadership, political contribution, whistleblower program, executive compensation, lobbying', 'Policies, principles and procedures governing leadership, board composition, executive compensation, audit committee structure, shareholder rights, bribery, lobbying, political contributions, and whistleblower programs', 'curated', 'governance', 'esg', 'admin');
Query:  196 record(s) a

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

{'eval_loss': 0.5979799032211304, 'eval_runtime': 8.3907, 'eval_samples_per_second': 6.674, 'eval_steps_per_second': 0.834, 'epoch': 1.0}


 67%|██████▋   | 56/84 [03:25<01:37,  3.48s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 56
  Batch size = 8
                                               
 67%|██████▋   | 56/84 [03:32<01:37,  3.48s/it]

{'eval_loss': 0.41671305894851685, 'eval_runtime': 7.4525, 'eval_samples_per_second': 7.514, 'eval_steps_per_second': 0.939, 'epoch': 2.0}


100%|██████████| 84/84 [05:15<00:00,  3.91s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 56
  Batch size = 8
                                               
100%|██████████| 84/84 [05:24<00:00,  3.91s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 84/84 [05:24<00:00,  3.86s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

{'eval_loss': 0.42863550782203674, 'eval_runtime': 8.6638, 'eval_samples_per_second': 6.464, 'eval_steps_per_second': 0.808, 'epoch': 3.0}
{'train_runtime': 324.0368, 'train_samples_per_second': 2.074, 'train_steps_per_second': 0.259, 'train_loss': 0.5080723535446894, 'epoch': 3.0}


100%|██████████| 7/7 [00:07<00:00,  1.09s/it]
Configuration saved in ./model/esg/config.json


Metrics :  {'eval_loss': 0.42863550782203674, 'eval_runtime': 8.7255, 'eval_samples_per_second': 6.418, 'eval_steps_per_second': 0.802, 'epoch': 3.0}


Model weights saved in ./model/esg/pytorch_model.bin
loading configuration file ./model/esg/config.json
Model config DistilBertConfig {
  "_name_or_path": "./model/esg/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "environment",
    "1": "social",
    "2": "governance"
  },
  "initializer_range": 0.02,
  "label2id": {
    "environment": 0,
    "governance": 2,
    "social": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file ./model/esg/pytorch_model.bin


process_contract_training_data_eval():esg


All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ./model/esg/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


Query:  SELECT * from training_data where domain='esg';
SELECT * from training_data where domain='esg';
280
UPDATE training_data SET score = 85, eval_label = 'environment', eval_score = 97.01808094978333 where id = '51503621-923e-4719-a6f5-a8064b0f50d8';
280 record(s) affected
training():liabilities
Query:  SELECT * from training_data where domain='liabilities';
SELECT * from training_data where domain='liabilities';
Query:  SELECT * from training_data where domain='liabilities';
SELECT * from training_data where domain='liabilities';


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /Users/saurabhkaushik/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "current liabilities",
    "1": "non-current liabilities",
    "2": "contingent liabilities"
  },
  "initializer_range": 0.02,
  "label2id": {
    "contingent liabilities": 2,
    "current liabilities": 0,
    "non-current liabilities": 1
  },
  "m

{'eval_loss': 0.9778201580047607, 'eval_runtime': 6.6866, 'eval_samples_per_second': 7.179, 'eval_steps_per_second': 0.897, 'epoch': 1.0}


 67%|██████▋   | 48/72 [02:54<01:12,  3.02s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 48
  Batch size = 8
                                               
 67%|██████▋   | 48/72 [03:01<01:12,  3.02s/it]

{'eval_loss': 0.8831308484077454, 'eval_runtime': 6.5258, 'eval_samples_per_second': 7.355, 'eval_steps_per_second': 0.919, 'epoch': 2.0}


100%|██████████| 72/72 [04:25<00:00,  3.11s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 48
  Batch size = 8
                                               
100%|██████████| 72/72 [04:32<00:00,  3.11s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 72/72 [04:32<00:00,  3.78s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

{'eval_loss': 0.8340222239494324, 'eval_runtime': 6.9848, 'eval_samples_per_second': 6.872, 'eval_steps_per_second': 0.859, 'epoch': 3.0}
{'train_runtime': 272.0881, 'train_samples_per_second': 2.073, 'train_steps_per_second': 0.265, 'train_loss': 0.8203470442030165, 'epoch': 3.0}


100%|██████████| 6/6 [00:05<00:00,  1.10it/s]
Configuration saved in ./model/liabilities/config.json


Metrics :  {'eval_loss': 0.8340222239494324, 'eval_runtime': 6.6234, 'eval_samples_per_second': 7.247, 'eval_steps_per_second': 0.906, 'epoch': 3.0}


Model weights saved in ./model/liabilities/pytorch_model.bin
loading configuration file ./model/liabilities/config.json
Model config DistilBertConfig {
  "_name_or_path": "./model/liabilities/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "current liabilities",
    "1": "non-current liabilities",
    "2": "contingent liabilities"
  },
  "initializer_range": 0.02,
  "label2id": {
    "contingent liabilities": 2,
    "current liabilities": 0,
    "non-current liabilities": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522


process_contract_training_data_eval():liabilities


All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ./model/liabilities/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


Query:  SELECT * from training_data where domain='liabilities';
SELECT * from training_data where domain='liabilities';
236
UPDATE training_data SET score = 79, eval_label = 'non-current liabilities', eval_score = 78.9350152015686 where id = '327df715-e310-4716-a831-0688620c421e';
236 record(s) affected
Keyword Classifier Accuracy: esg
SELECT * from seed_data where domain='esg';
0      environment
1      environment
2      environment
3      environment
4      environment
          ...     
186     governance
187     governance
188     governance
189     governance
190     governance
Name: label, Length: 191, dtype: object
Classification Report : 
               precision    recall  f1-score   support

 environment       1.00      1.00      1.00        58
  governance       0.89      0.93      0.91        70
      social       0.92      0.87      0.89        63

    accuracy                           0.93       191
   macro avg       0.94      0.93      0.93       191
weighted avg     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
