In [1]:
from app.Transformer_Classifier import Transformer_Classifier 
from app.Data_Loader import Data_Loader
from app.TextRank_Extractor import TextRank_Extractor
from app.Keyword_Classifier import Keyword_Classifier
from app.MySQLUtility import MySQLUtility
import os 

# Domains handled by every per-domain pipeline stage, processed in this order.
domains = ["esg", "liabilities"]

# Point Google client libraries at the local service-account key file.
# NOTE(review): this unconditionally overwrites any value already set in the
# environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./store/genuine-wording-key.json"

class Data_ETL_Pipeline:
    """Run the data-loading, training and evaluation stages in sequence.

    Each public method is one stage. Stages that work per domain iterate over
    the module-level ``domains`` list and print a progress label before every
    step they delegate to a service object.

    The service objects are created per instance in ``__init__`` instead of
    in the class body, so defining or importing this class does not construct
    them as a side effect, and separate instances do not share them.
    """

    def __init__(self) -> None:
        """Create the service objects the stages delegate to."""
        # Database helper; used for clean_db() and create_database().
        self.dbutil = MySQLUtility()
        # Loader used for the seed, training and contract import calls.
        self.data_load = Data_Loader()
        # Used for extract_keyword_seed_data().
        self.textrank = TextRank_Extractor()
        # Keyword-based classifier (prepare / train / evaluate / process).
        self.key_classifier = Keyword_Classifier()
        # Transformer-based classifier (training / contract eval / evaluate).
        self.class_service = Transformer_Classifier()

    def create_dataset(self):
        """Reset the database by calling clean_db() then create_database()."""
        print("db_cleanup():")
        self.dbutil.clean_db()
        print("create_database():")
        self.dbutil.create_database()

    def load_seed_training_data(self):
        """Run the three seed-data steps for every domain.

        Per domain, in order: ``import_seed_data_batch``,
        ``extract_keyword_seed_data``, ``load_seed_to_training_data_batch``.
        """
        for domain in domains:
            print("data_load.import_seed_data():" + domain)
            self.data_load.import_seed_data_batch(domain)

            print("extract_keyword_seed_data():" + domain)
            self.textrank.extract_keyword_seed_data(domain)

            print("load_seed_to_training_data():" + domain)
            self.data_load.load_seed_to_training_data_batch(domain)

    def load_contract_data(self):
        """Call ``import_reports_contract_data`` once per domain."""
        for domain in domains:
            print("self.data_load.import_reports_data()" + domain)
            self.data_load.import_reports_contract_data(domain)

    def process_keyword_model(self):
        """Run the keyword classifier steps for every domain.

        Per domain, in order: ``prepare_training_data``, ``train_model``,
        ``evaluate_model``, ``process_contract_data``.
        """
        for domain in domains:
            print("key_classifier.prepare_training_data():" + domain)
            self.key_classifier.prepare_training_data(domain)

            print("key_classifier.train_model():" + domain)
            self.key_classifier.train_model(domain)

            print("key_classifier.evaluate_model():" + domain)
            self.key_classifier.evaluate_model(domain)

            print("key_classifier.process_contract_data():" + domain)
            self.key_classifier.process_contract_data(domain)

    def process_transformer_model(self):
        """Run the transformer classifier steps for every domain.

        Per domain, in order: ``training``, then
        ``process_contract_training_data_eval``.
        """
        for domain in domains:
            print("training():" + domain)
            self.class_service.training(domain)

            print("process_contract_training_data_eval():" + domain)
            self.class_service.process_contract_training_data_eval(domain)

    def evaluate_results(self):
        """Call both classifiers' evaluation methods for every domain."""
        for domain in domains:
            print("Keyword Classifier Accuracy: " + domain)
            self.key_classifier.evaluate_model(domain)

            print("Transformer Classifier Accuracy: " + domain)
            # NOTE(review): 'evalute_model' is the spelling this pipeline
            # calls; confirm against Transformer_Classifier before renaming.
            self.class_service.evalute_model(domain)

# Script entry point: build the pipeline and run every stage, in order.
if __name__ == "__main__":
    dbloader = Data_ETL_Pipeline()
    for stage in (
        dbloader.create_dataset,             # reset the database
        dbloader.load_seed_training_data,    # seed data steps per domain
        dbloader.load_contract_data,         # contract/report import per domain
        dbloader.process_keyword_model,      # keyword classifier steps
        dbloader.process_transformer_model,  # transformer classifier steps
        dbloader.evaluate_results,           # evaluation of both classifiers
    ):
        stage()


  from .autonotebook import tqdm as notebook_tqdm


DB Pool Created.
db_cleanup():
Table contract_data successfully deleted.
Table seed_data successfully deleted.
Table training_data successfully deleted.
create_database():
Table contract_data successfully created.
Table seed_data successfully created.
Table training_data successfully created.
data_load.import_seed_data():esg
Working with  seed_data ESG.csv
Query:  196 record(s) affected
extract_keyword_seed_data():esg
Query:  SELECT * from seed_data where domain='esg';
Query:  [('pollution, waste, treatment, natural resource conservation, energy use, corporate climate policy, animal, environmental issue', '8a4d5caa-b12d-467e-bec6-e7323fc0e213'), ('company, environmental risk, risk, esg consideration', '8a4d5caa-b12d-467e-bec6-e7323fc0e213'), ('environmental regulation, compliance, consideration, toxic waste, direct indirect greenhouse gas emission, management', 'ab680cff-9a32-41d6-8a09-0b9aa304fdae'), ('carbon sustainability report', 'c7c0b8db-dfbd-4e02-aa20-37a51d13fefa'), ('limit, ch

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

{'eval_loss': 0.590708315372467, 'eval_runtime': 7.3589, 'eval_samples_per_second': 7.61, 'eval_steps_per_second': 0.951, 'epoch': 1.0}


 67%|██████▋   | 56/84 [03:16<01:30,  3.22s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 56
  Batch size = 8
                                               
 67%|██████▋   | 56/84 [03:24<01:30,  3.22s/it]

{'eval_loss': 0.360762357711792, 'eval_runtime': 7.0673, 'eval_samples_per_second': 7.924, 'eval_steps_per_second': 0.99, 'epoch': 2.0}


100%|██████████| 84/84 [04:59<00:00,  3.29s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 56
  Batch size = 8
                                               
100%|██████████| 84/84 [05:07<00:00,  3.29s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 84/84 [05:07<00:00,  3.66s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

{'eval_loss': 0.35177987813949585, 'eval_runtime': 7.1962, 'eval_samples_per_second': 7.782, 'eval_steps_per_second': 0.973, 'epoch': 3.0}
{'train_runtime': 307.055, 'train_samples_per_second': 2.179, 'train_steps_per_second': 0.274, 'train_loss': 0.5715322494506836, 'epoch': 3.0}


100%|██████████| 7/7 [00:06<00:00,  1.15it/s]
Configuration saved in ./model/esg/config.json


Metrics :  {'eval_loss': 0.35177987813949585, 'eval_runtime': 7.1026, 'eval_samples_per_second': 7.884, 'eval_steps_per_second': 0.986, 'epoch': 3.0}


Model weights saved in ./model/esg/pytorch_model.bin
loading configuration file ./model/esg/config.json
Model config DistilBertConfig {
  "_name_or_path": "./model/esg/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "environment",
    "1": "social",
    "2": "governance"
  },
  "initializer_range": 0.02,
  "label2id": {
    "environment": 0,
    "governance": 2,
    "social": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file ./model/esg/pytorch_model.bin


process_contract_training_data_eval():esg


All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ./model/esg/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


Query:  SELECT * from training_data where domain='esg';
279
279 record(s) affected
training():liabilities
Query:  SELECT * from training_data where domain='liabilities';
Query:  SELECT * from training_data where domain='liabilities';


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /Users/saurabhkaushik/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "current liabilities",
    "1": "non-current liabilities",
    "2": "contingent liabilities"
  },
  "initializer_range": 0.02,
  "label2id": {
    "contingent liabilities": 2,
    "current liabilities": 0,
    "non-current liabilities": 1
  },
  "m

{'eval_loss': 0.9089303016662598, 'eval_runtime': 5.8409, 'eval_samples_per_second': 7.875, 'eval_steps_per_second': 1.027, 'epoch': 1.0}


 67%|██████▋   | 46/69 [02:38<01:13,  3.18s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 46
  Batch size = 8
                                               
 67%|██████▋   | 46/69 [02:44<01:13,  3.18s/it]

{'eval_loss': 0.7978938817977905, 'eval_runtime': 6.29, 'eval_samples_per_second': 7.313, 'eval_steps_per_second': 0.954, 'epoch': 2.0}


100%|██████████| 69/69 [04:01<00:00,  3.17s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 46
  Batch size = 8

100%|██████████| 69/69 [04:08<00:00,  3.17s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 69/69 [04:08<00:00,  3.60s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 46
  Batch size = 8


{'eval_loss': 0.7506449222564697, 'eval_runtime': 6.3771, 'eval_samples_per_second': 7.213, 'eval_steps_per_second': 0.941, 'epoch': 3.0}
{'train_runtime': 248.1913, 'train_samples_per_second': 2.2, 'train_steps_per_second': 0.278, 'train_loss': 0.8727916772814764, 'epoch': 3.0}


100%|██████████| 6/6 [00:05<00:00,  1.13it/s]
Configuration saved in ./model/liabilities/config.json


Metrics :  {'eval_loss': 0.7506449222564697, 'eval_runtime': 6.6874, 'eval_samples_per_second': 6.879, 'eval_steps_per_second': 0.897, 'epoch': 3.0}


Model weights saved in ./model/liabilities/pytorch_model.bin
loading configuration file ./model/liabilities/config.json
Model config DistilBertConfig {
  "_name_or_path": "./model/liabilities/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "current liabilities",
    "1": "non-current liabilities",
    "2": "contingent liabilities"
  },
  "initializer_range": 0.02,
  "label2id": {
    "contingent liabilities": 2,
    "current liabilities": 0,
    "non-current liabilities": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522


process_contract_training_data_eval():liabilities


All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ./model/liabilities/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


Query:  SELECT * from training_data where domain='liabilities';
225
225 record(s) affected
Keyword Classifier Accuracy: esg
Query:  SELECT * from seed_data where domain='esg';
0      environment
1      environment
2      environment
3      environment
4      environment
          ...     
186     governance
187     governance
188     governance
189     governance
190     governance
Name: label, Length: 191, dtype: object
Classification Report : 
               precision    recall  f1-score   support

 environment       1.00      1.00      1.00        58
  governance       0.89      0.93      0.91        70
      social       0.92      0.87      0.89        63

    accuracy                           0.93       191
   macro avg       0.94      0.93      0.93       191
weighted avg       0.93      0.93      0.93       191

Confusion Matrix: 
 [[58  0  0]
 [ 0 65  5]
 [ 0  8 55]]
Accuracy Score: 
 93.19371727748691 %
Transformer Classifier Accuracy: esg
Query:  SELECT * from training_data 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
