In [1]:
from app.Transformer_Classifier import Transformer_Classifier 
from app.Data_Loader import Data_Loader
from app.TextRank_Extractor import TextRank_Extractor
from app.Keyword_Classifier import Keyword_Classifier
from app.MySQLUtility import MySQLUtility

class Data_ETL_Pipeline:
    """Run the database, data-loading and classifier stages in sequence.

    Each public method prints a progress label and then delegates to one of
    the collaborator objects below. No method takes arguments or returns a
    value; everything happens through the collaborators' side effects.
    """

    # The collaborators are class attributes: they are constructed once, when
    # the class body executes, and are shared by every instance.
    dbutil = MySQLUtility()
    data_load = Data_Loader()
    textrank = TextRank_Extractor()
    key_classifier = Keyword_Classifier()
    class_service = Transformer_Classifier()

    def __init__(self) -> None:
        """No per-instance state is needed; see the class attributes."""

    def create_dataset(self):
        """Clean the database, then create it again."""
        database = self.dbutil

        print("db_cleanup():")
        database.clean_db()

        print("create_database():")
        database.create_database()

    def load_seed_training_data(self):
        """Import seed data, extract its keywords, then load training data."""
        loader = self.data_load

        print("data_load.import_seed_data():")
        loader.import_seed_data_batch()

        print("extract_keyword_seed_data():")
        self.textrank.extract_keyword_seed_data()

        print("load_seed_to_training_data():")
        loader.load_seed_to_training_data_batch()

    def load_contract_data(self):
        """Import the reports/contract data through the data loader."""
        print("self.data_load.import_reports_data()")
        self.data_load.import_reports_contract_data()

    def process_keyword_model(self):
        """Prepare, train and evaluate the keyword classifier, then apply it
        to the contract data."""
        classifier = self.key_classifier

        print("key_classifier.prepare_training_data():")
        classifier.prepare_training_data()

        print("key_classifier.train_model():")
        classifier.train_model()

        print("key_classifier.evaluate_model():")
        classifier.evaluate_model()

        print("key_classifier.process_contract_data():")
        classifier.process_contract_data()

    def process_transformer_model(self):
        """Train the transformer classifier, then run its contract
        training-data evaluation step."""
        transformer = self.class_service

        print("training():")
        transformer.training()

        print("process_contract_training_data_eval():")
        transformer.process_contract_training_data_eval()

    def evaluate_results(self):
        """Run the evaluation step of both classifiers, keyword one first."""
        print("Keyword Classifier Accuracy: ")
        self.key_classifier.evaluate_model()

        print("Transformer Classifier Accuracy: ")
        # NOTE(review): "evalute_model" is the spelling this code has always
        # called on Transformer_Classifier; confirm against that class
        # before renaming it here.
        self.class_service.evalute_model()

# Script / notebook entry point: run every pipeline stage once, in order.
# Each stage is called only after the previous one has returned.
if __name__ == '__main__': 
    dbloader = Data_ETL_Pipeline()
    # Clean and re-create the database before loading anything into it.
    dbloader.create_dataset()
    # Load the seed/training data, then the contract data.
    dbloader.load_seed_training_data() 
    dbloader.load_contract_data()
    # Run the keyword-classifier stages, then the transformer-classifier stages.
    dbloader.process_keyword_model()
    dbloader.process_transformer_model()
    # Finally, run the evaluation step of both classifiers.
    dbloader.evaluate_results()


  from .autonotebook import tqdm as notebook_tqdm


db_cleanup():
Table contract_data successfully deleted.
Table seed_data successfully deleted.
Table training_data successfully deleted.
create_database():
Table contract_data successfully created.
Table seed_data successfully created.
Table training_data successfully created.
data_load.import_seed_data():
733 record(s) affected
extract_keyword_seed_data():
SELECT * from seed_data
[('company’s vendors, yettobepaid bills', 'f210666d-0bed-4cb0-b893-969b34044365'), ('accounts, businesses, largest current liability', 'f210666d-0bed-4cb0-b893-969b34044365'), ('interest payable interest expense', '7bcfec56-ee3f-401f-a82b-a598ad5611e2'), ('interest expense, interest, income statement, expense', '7bcfec56-ee3f-401f-a82b-a598ad5611e2'), ('company, income tax, income taxes, government', 'c1f79f46-0133-4e99-a31b-af61961aea85'), ('tax, year', 'c1f79f46-0133-4e99-a31b-af61961aea85'), ('longterm liability, tax', 'c1f79f46-0133-4e99-a31b-af61961aea85'), ('bank, type, insufficient funds, bank account, 

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

{'eval_loss': 0.3474709987640381, 'eval_runtime': 33.7897, 'eval_samples_per_second': 7.961, 'eval_steps_per_second': 1.006, 'epoch': 1.0}


 67%|██████▋   | 270/405 [15:26<06:35,  2.93s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 269
  Batch size = 8
                                                 
 67%|██████▋   | 270/405 [16:03<06:35,  2.93s/it]

{'eval_loss': 0.36202123761177063, 'eval_runtime': 36.3007, 'eval_samples_per_second': 7.41, 'eval_steps_per_second': 0.937, 'epoch': 2.0}


100%|██████████| 405/405 [23:18<00:00,  3.09s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 269
  Batch size = 8
                                                 
100%|██████████| 405/405 [23:58<00:00,  3.09s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 405/405 [23:58<00:00,  3.55s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****


{'eval_loss': 0.3911357522010803, 'eval_runtime': 40.3139, 'eval_samples_per_second': 6.673, 'eval_steps_per_second': 0.843, 'epoch': 3.0}
{'train_runtime': 1438.4738, 'train_samples_per_second': 2.244, 'train_steps_per_second': 0.282, 'train_loss': 0.3171847873263889, 'epoch': 3.0}


100%|██████████| 34/34 [00:36<00:00,  1.07s/it]
Configuration saved in ./model/config.json


Metrics :  {'eval_loss': 0.3911357522010803, 'eval_runtime': 37.5908, 'eval_samples_per_second': 7.156, 'eval_steps_per_second': 0.904, 'epoch': 3.0}


Model weights saved in ./model/pytorch_model.bin
loading configuration file ./model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./model/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "current liabilities",
    "1": "non-current liabilities",
    "2": "contingent liabilities"
  },
  "initializer_range": 0.02,
  "label2id": {
    "contingent liabilities": 2,
    "current liabilities": 0,
    "non-current liabilities": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file ./model/pyto

process_contract_training_data_eval():


All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ./model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


SELECT * from training_data
1341
1341 record(s) affected
Keyword Classifier Accuracy: 
SELECT * from seed_data


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report : 
                          precision    recall  f1-score   support

 contingent liabilities       0.93      0.97      0.95       774
    current liabilities       0.95      0.92      0.93       675
non-current liabilities       0.93      0.91      0.92       708
                  sfsdf       0.00      0.00      0.00         1

               accuracy                           0.94      2158
              macro avg       0.70      0.70      0.70      2158
           weighted avg       0.94      0.94      0.94      2158

Confusion Matrix: 
 [[750  15   9   0]
 [ 12 624  39   0]
 [ 42  21 645   0]
 [  1   0   0   0]]
Accuracy Score: 
 93.55885078776645 %
Transformer Classifier Accuracy: 
SELECT * from training_data
Classification Report : 
                          precision    recall  f1-score   support

                              0.00      0.00      0.00         0
 contingent liabilities       0.96      0.99      0.97       462
    current liabilities       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
