# Overview
* The purpose of this notebook is to test different ML models at **predicting the correct values** of **dirty data**.
* We test the impact of **keeping**/**removing inconsistent samples** during the **training** phase on the ML model's **repair** performance.

Jar to Python : https://www.jython.org/installation.html

## Libraries

In [1]:
import dataset as dt
import training as tr
import preprocessing as tp
import model as m
import evaluation as eva
import utils
# import nn

import pandas as pd
import numpy as np
import joblib
import csv
import json # 2.0.9
import config
import datetime

# import torch # 2.2.0+cu121
import os
import pickle

In [2]:
# Candidate building blocks, both taken from the project configuration:
#   algos -> learning algorithms (classifiers),
#            e.g. ['ann', 'logistic regression l2', 'xgboost', 'multinomial bayesian']
#   langs -> language models (text transformers)
algos, langs = config.classifiers, config.transformers


# Settings

In [3]:
# --- language-model / fine-tuning parameters ---
# alternative: 'bert-base-multilingual-cased'
bert_model_name = 'bert-base-uncased'
num_epochs, learning_rate = 8, 2e-5
sampling = False
splits = 3  # number of folds

# overconfidence compensators, looked up by dataset index below
compensators = [.9, .9, .9]

# working directory that holds the csv files
path = os.path.abspath(os.getcwd())

# --- dataset selection ---
data_index = 2  # dt.Allergens
dataset = config.datasets[data_index]
dataName = dataset['data_dir']
feature = dataset['features'][0]
labels = dataset['labels']
partial_key = dataset['keys'][0]
lambdac = compensators[data_index]

# summary of the selected dataset (one rule line after each entry)
print(
    f"dataset: {dataName}",
    '---------------------------',
    f"feature {feature}",
    '---------------------------',
    f"labels {labels}",
    '---------------------------',
    sep='\n',
)

# labels and features as declared by the dataset schema;
# these rebind `labels` and `partial_key` set from the config above
dataFolder, labels, features, keys = dt.get_datasetSchema(dataName)
partial_key = keys[0]
path

dataset: allergens
---------------------------
feature ingredients
---------------------------
labels ['nuts', 'milk', 'gluten', 'soy', 'peanut', 'eggs']
---------------------------


'/home/lab/Documents/rihem/improving-data-cleaning-with-unstructured-data'

# Run training best ML

In [4]:
# Sanity check: show the name of the dataset selected in the settings cell.
print(dataName)

allergens


In [5]:
#MODEL
# Resolve the best model name for the selected dataset, then pick the transformer
# (from `langs`) and the classifier (from `algos`) whose 'name' is contained in it.
model_name = m.get_best_ml(data_index)
lang = next((t for t in langs if t['name'] in model_name), None)
alg = next((c for c in algos if c['name'] in model_name), None)

# Fail loudly instead of continuing with an unbound or stale `lang` / `alg`.
if lang is None:
    raise ValueError(f"no configured transformer name is contained in model name {model_name!r}")
if alg is None:
    raise ValueError(f"no configured classifier name is contained in model name {model_name!r}")

print('model:', model_name)
# `lang` is the transformer and `alg` the classifier (the two labels used to be swapped)
print('transformer:', lang['name'], 'classifier:', alg['name'])
print('--------------------------------------')

model: count-vect-xgboost
transformer: xgboost classifier: count-vect
--------------------------------------


In [6]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Train one classifier per label under two strategies and record, for every label,
# the stats returned by tr.clf_train, the wall-clock duration and a confidence
# value ('proba') measured on a 10% hold-out split of the training data.
#   i == 0 -> "with_parker"      (training file name carries the "_parker" suffix)
#   i == 1 -> "with_constraints" (plain training file name)
# NOTE: `_with`, `parker`, `encoders` and `strategy_results` keep their last value
# after this cell and are read by the test-evaluation cell further down.
strategies = [
    # (_parker, _with, parker, constraints_check)
    ("_parker", "with_parker", True, False),
    ("", "with_constraints", False, True),
]

# The hold-out confidence estimation below is only written for this combination.
# Previously the other case ended in a NameError (or silently reused stale values)
# after the first model had already been trained.
if alg['name'] == 'ann' or lang['name'] == 'bert':
    raise NotImplementedError(
        "hold-out confidence estimation is not implemented for the 'ann' classifier "
        "or the 'bert' transformer"
    )

# make sure the folder receiving the pickled models exists
os.makedirs("./models", exist_ok=True)

strategy_results = {}
allmodels = {}
for i, (_parker, _with, parker, constraints_check) in enumerate(strategies):
    print(i, '- parker', parker, '- constraints_check', constraints_check)
    all_results = {}
    models = {}

    # read training data and valid data
    train_file_path = utils.get_dir(dataset, file=dataset['data_dir'] + _parker + "_train.csv")
    train_data = dt.read_data_csv(dataset['data_dir'], 'train', parker)
    # encode non-numerical labels
    # NOTE(review): `dtrain` is rebound from `train_data` for every label below, so
    # only `encoders` is used from this call -- confirm that this is intended.
    dtrain, encoders = tp.preprocess(dataset, train_data)
    print('label encoders', encoders)

    # per_model
    result_per_model = {}
    for label in tqdm(labels):
        # timestamp
        start_time = datetime.datetime.now()

        # per label
        result_per_label = {}

        # remove the inconsistent rows (dataset index 4 is trained on the rows as read)
        if data_index != 4:
            dtrain = tr.get_cleaner_train_version(dataName, label, train_data, partial_key)
        else:
            dtrain = train_data.copy()

        # Hold out 10% for validation; re-draw until the validation part contains
        # as many distinct label values as the full training frame.
        # NOTE(review): the split is unseeded, so results vary between runs.
        dtrain1, valid_data = train_test_split(dtrain, test_size=0.1)
        attempts = 0
        while len(valid_data[label].unique()) != len(dtrain[label].unique()):
            dtrain1, valid_data = train_test_split(dtrain, test_size=0.1)
            attempts += 1
            print(str(attempts) + "th iteration")
        print('dtrain, valid_data', dtrain.shape, valid_data.shape)

        # class statistics; the weights are printed for inspection only and are not
        # passed to tr.clf_train in this cell
        y = dtrain1[label].astype(int)
        n_class, unique_classes, class_counts, ir = tp.get_class_stats(y)
        class_weights = {c: len(y) / (count * n_class) for c, count in zip(unique_classes, class_counts)}
        print(class_weights)

        # Feed the training rows in chunks of `batch_size`. With batch_size equal to
        # the size of the full frame this is a single chunk holding all of dtrain1;
        # slicing past the end is harmless, so no explicit bound check is needed.
        batch_size = dtrain.shape[0]
        for start in range(0, len(dtrain1), batch_size):
            batch = dtrain1[start:start + batch_size]

            # first training
            if start == 0:
                model, result_per_label = tr.clf_train(batch, dataset, label, alg, lang, config.root_seed, True)
                models[label] = model
            else:
                print(f"Batch {(start // batch_size) + 1}:\n")
                model.fit(batch[feature], batch[label])

        current_time = datetime.datetime.now()
        result_per_label['duration'] = (current_time - start_time).total_seconds()

        # save the model (context manager so the file is flushed and closed before re-reading)
        file_model_name = f"./models/_{label}_classifier_{model_name}_{_with}.pth"
        with open(file_model_name, "wb") as f:
            pickle.dump(model, f)

        print()
        # save the encoder for each label
        if len(encoders) > 0:
            if len(encoders[label]) > 0:
                result_per_label['encoder'] = encoders[label]
        print()

        # evaluate the ML model on the hold-out rows
        print('valid data', valid_data.shape)
        if len(encoders) > 0:
            encoder = encoders[label]
        else:
            encoder = {}

        # load the saved model back (only unpickle files written by this notebook:
        # pickle can execute arbitrary code)
        with open(file_model_name, 'rb') as f:
            model = pickle.load(f)

        valid_text = valid_data[feature].str.lower()
        outputs = model.predict_proba(valid_text)
        y_pred = model.predict(valid_text)
        y = valid_data[label].values

        avg_confidence = []
        eces = []
        # iterate over the classes; column `class_idx` of `outputs` holds the
        # predicted probabilities of that class
        for class_idx in range(len(set(y))):
            probabilities = outputs[:, class_idx]

            ece, avg_confidence_in_bin = eva.expected_calibration_error(y, probabilities, class_idx, n_bins=5)
            eces.append(ece)

            acf = eva.avg_conf_correct_pred(y, y_pred, probabilities, class_idx)
            avg_confidence.append(acf)

        # compensation for the overconfidence of the model
        result_per_label['proba'] = lambdac * sum(avg_confidence)/len(avg_confidence)
        # the 'ece' printed here is the value of the last class; all values are in `eces`
        print(label, 'avg proba', result_per_label['proba'], 'ece', ece)

        result_per_model[label] = result_per_label # for specific model
        all_results[model_name] = result_per_model

        print('+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++')
        print()

    # all records of training and evaluation
    strategy_results[_with] = all_results
    allmodels[_with] = models

strategy_results['lambda'] = lambdac
# target file for the training results (the JSON itself is written in a later cell)
recordFile = f"./results/{dataset['data_dir']}/results_training_best_ml.json"
print('recordFile', recordFile)

0 - parker True - constraints_check False
relative path ./data/allergens --before delete (1635, 18)
--after delete (1337, 18)
label encoders {}


  0%|                                                     | 0/6 [00:00<?, ?it/s]

inconsistencies related to: nuts 0
dtrain, valid_data (1337, 18) (134, 18)
{0: 0.5470668485675307, 1: 1.2570532915360502, 2: 2.6556291390728477}
imbalance ratio 0.206 unique_classes [0 1 2] class_counts [733 319 151]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 17%|███████▌                                     | 1/6 [00:00<00:02,  2.41it/s]



valid data (134, 18)
nuts avg proba 0.8263266921043396 ece 1.8672471791505814
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: milk 0


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 33%|███████████████                              | 2/6 [00:00<00:01,  2.51it/s]

dtrain, valid_data (1337, 18) (134, 18)
{0: 0.5440976933514247, 1: 1.474264705882353, 2: 2.0670103092783507}
imbalance ratio 0.263 unique_classes [0 1 2] class_counts [737 272 194]


valid data (134, 18)
milk avg proba 0.8662378370761871 ece 1.6989072216674685
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: gluten 0
dtrain, valid_data (1337, 18) (134, 18)
{0: 0.5478142076502732, 1: 4.05050505050505, 2: 1.0779569892473118}
imbalance ratio 0.135 unique_classes [0 1 2] class_counts [732  99 372]


valid data (134, 18)


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 50%|██████████████████████▌                      | 3/6 [00:01<00:01,  2.76it/s]

gluten avg proba 0.827025318145752 ece 3.704355299472809
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: soy 0
dtrain, valid_data (1337, 18) (134, 18)
{0: 0.514102564102564, 1: 1.3875432525951557, 2: 2.9925373134328357}
imbalance ratio 0.172 unique_classes [0 1 2] class_counts [780 289 134]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 67%|██████████████████████████████               | 4/6 [00:01<00:00,  2.90it/s]



valid data (134, 18)
soy avg proba 0.8736280202865601 ece 2.889617685228586
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: peanut 0
dtrain, valid_data (1337, 18) (134, 18)
{0: 0.3904576436222006, 1: 2.7094594594594597, 2: 14.321428571428571}
imbalance ratio 0.027 unique_classes [0 1 2] class_counts [1027  148   28]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 83%|█████████████████████████████████████▌       | 5/6 [00:01<00:00,  2.87it/s]



valid data (134, 18)
peanut avg proba 0.8451979815959931 ece 0.48403830314055085
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: eggs 0
dtrain, valid_data (1337, 18) (134, 18)
{0: 0.3678899082568807, 1: 5.985074626865671, 2: 8.717391304347826}
imbalance ratio 0.042 unique_classes [0 1 2] class_counts [1090   67   46]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

100%|█████████████████████████████████████████████| 6/6 [00:02<00:00,  2.80it/s]




valid data (134, 18)
eggs avg proba 0.8626466274261474 ece 1.658609487581998
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

1 - parker False - constraints_check True
relative path ./data/allergens --before delete (1337, 14)
--after delete (1333, 14)
label encoders {}


  0%|                                                     | 0/6 [00:00<?, ?it/s]

inconsistencies related to: nuts 116
dtrain, valid_data (945, 14) (95, 14)
{0: 0.39297272306981046, 1: 3.586497890295359, 2: 5.666666666666667}
imbalance ratio 0.069 unique_classes [0 1 2] class_counts [721  79  50]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 17%|███████▌                                     | 1/6 [00:00<00:01,  3.78it/s]



valid data (95, 14)
nuts avg proba 0.8472086727619171 ece 0.9391599129885435
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: milk 111
dtrain, valid_data (983, 14) (99, 14)
{0: 0.41796690307328604, 1: 2.9764309764309766, 2: 3.683333333333333}
imbalance ratio 0.113 unique_classes [0 1 2] class_counts [705  99  80]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 33%|███████████████                              | 2/6 [00:00<00:01,  3.72it/s]



valid data (99, 14)
milk avg proba 0.8854178488254547 ece 0.5811132900416851
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: gluten 115
dtrain, valid_data (970, 14) (97, 14)
{0: 0.4041666666666667, 1: 12.125, 2: 2.255813953488372}
imbalance ratio 0.033 unique_classes [0 1 2] class_counts [720  24 129]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 50%|██████████████████████▌                      | 3/6 [00:00<00:00,  3.74it/s]



valid data (97, 14)
gluten avg proba 0.8719265878200532 ece 1.6712698489427567
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: soy 111
dtrain, valid_data (961, 14) (97, 14)
{0: 0.38605898123324395, 1: 3.096774193548387, 2: 11.52}
imbalance ratio 0.034 unique_classes [0 1 2] class_counts [746  93  25]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 67%|██████████████████████████████               | 4/6 [00:01<00:00,  3.81it/s]



valid data (97, 14)
soy avg proba 0.8694289326667786 ece 0.35265794745646417
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: peanut 56
dtrain, valid_data (1139, 14) (114, 14)
{0: 0.3437290409121395, 1: 14.855072463768115, 2: 42.708333333333336}
imbalance ratio 0.008 unique_classes [0 1 2] class_counts [994  23   8]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

 83%|█████████████████████████████████████▌       | 5/6 [00:01<00:00,  4.05it/s]



valid data (114, 14)
peanut avg proba 0.8155520439147949 ece 0.32407428190344945
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: eggs 26
dtrain, valid_data (1259, 14) (126, 14)
{0: 0.34775936157151627, 1: 11.802083333333334, 2: 25.177777777777777}
imbalance ratio 0.014 unique_classes [0 1 2] class_counts [1086   32   15]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.

100%|█████████████████████████████████████████████| 6/6 [00:01<00:00,  3.62it/s]



valid data (126, 14)
eggs avg proba 0.8744678020477296 ece 0.3304985645227134
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

recordFile ./results/allergens/results_training_best_ml.json





In [7]:
# Display the nested results: strategy -> model name -> label -> stats, plus the 'lambda' entry.
strategy_results

{'with_parker': {'count-vect-xgboost': {'nuts': {'ir': 0.206,
    'duration': 0.399526,
    'proba': 0.8263266921043396},
   'milk': {'ir': 0.263, 'duration': 0.368092, 'proba': 0.8662378370761871},
   'gluten': {'ir': 0.135, 'duration': 0.302608, 'proba': 0.827025318145752},
   'soy': {'ir': 0.172, 'duration': 0.302749, 'proba': 0.8736280202865601},
   'peanut': {'ir': 0.027, 'duration': 0.333878, 'proba': 0.8451979815959931},
   'eggs': {'ir': 0.042, 'duration': 0.329187, 'proba': 0.8626466274261474}}},
 'with_constraints': {'count-vect-xgboost': {'nuts': {'ir': 0.069,
    'duration': 0.250957,
    'proba': 0.8472086727619171},
   'milk': {'ir': 0.113, 'duration': 0.257598, 'proba': 0.8854178488254547},
   'gluten': {'ir': 0.033, 'duration': 0.251381, 'proba': 0.8719265878200532},
   'soy': {'ir': 0.034, 'duration': 0.239615, 'proba': 0.8694289326667786},
   'peanut': {'ir': 0.008, 'duration': 0.20171, 'proba': 0.8155520439147949},
   'eggs': {'ir': 0.014, 'duration': 0.364554, 'prob

In [8]:
# Persist the per-strategy training results as JSON.
recordFile = f"./results/{dataset['data_dir']}/results_training_best_ml.json"
# create the dataset-specific results folder if it is missing (e.g. on a fresh checkout)
os.makedirs(os.path.dirname(recordFile), exist_ok=True)
with open(recordFile, "w") as outfile:
    json.dump(strategy_results, outfile)

In [9]:
# Repair each label of the test set with its saved classifier and score the
# repairs against the gold-standard columns.
# NOTE: relies on `parker`, `_with`, `encoders`, `model_name` and `strategy_results`
# as left by the training cell (i.e. the values of its last strategy).
stats = {}
dtest = dt.read_test_csv(dataName, parker)
dtest1 = dtest.copy()
print(dtest1.shape)
print('+++++++++++++++++++++Start+++++++++++++++++++++++++++++')

for a in labels:
    # test repaired by parker do not have the following columns: need to fix it!!
    # NOTE(review): this joins on a partial key; if the key is not unique the inner
    # merge multiplies rows -- confirm against the gold-standard file.
    if a + '_gs' not in dtest1.columns:
        dtest1 = dtest1.merge(dt.read_gs_csv(dataName)[[partial_key, a]],
                              how='inner', on=partial_key, suffixes=('', '_gs'))
    # confidence score for each attribute
    conf_score = round(strategy_results[_with][model_name][a]['proba'], 2)
    # load saved model (only unpickle files written by this notebook:
    # pickle can execute arbitrary code)
    file_model_name = f"./models/_{a}_classifier_{model_name}_{_with}.pth"
    with open(file_model_name, 'rb') as f:
        model = pickle.load(f)

    # get the encoder if exists and encode y_orig  y_gs
    enc, y_orig, y_gs = tp.encode(encoders, a, dtest1)
    print("------ done encoding ----------")

    # predict the values for the labels to be repaired
    y_pred, outputs, dtest, accuracy = tr.clf_test(model, dtest1, a, dataset, enc)
    print("------ done predicting ----------")

    # Keep a copy of the original (dirty) value next to the label. This is a plain
    # column copy: the former inner self-merge on the partial key duplicated rows
    # and mixed values between rows whenever several rows shared the same key.
    if a + '_orig' not in dtest1.columns:
        dtest1 = dtest1.assign(**{a + '_orig': dtest1[a]})
        print('current columns:', dtest1.columns)

    # evaluate on ground truth
    y_repair = eva.assign_repair(outputs, y_orig.values, y_pred, conf_score)
    # stats
    correct_repair, repair, errors = eva.get_stats(y_repair, y_orig.values, y_gs.values)
    # metrics
    metrics = eva.get_metrics(y_repair, y_orig.values, y_gs.values)
    print(' th', conf_score)
    print(a, 'stats: PRECISION, RECALL, F1', metrics)

crs, rs, es = eva.get_all_stats(dtest1, labels)
print('correct_repairs, repairs, errors', crs, rs, es)
# guard every ratio: no repairs (rs == 0) or no errors (es == 0) must not crash
precision = crs / rs if rs else 0.0
recall = crs / es if es else 0.0
# F1 from the unrounded precision/recall; rounding happens only for display
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print('precision', round(precision, 2), 'recall', round(recall, 2))
print('F1', round(f1, 2))

relative path ./data/allergens --before delete (298, 20)
--after delete (298, 20)
(298, 20)
+++++++++++++++++++++Start+++++++++++++++++++++++++++++
------ done encoding ----------
------ done predicting ----------
correct repair 17 repairs 40 errors 48
 th 0.85
nuts stats: PRECISION, RECALL, F1 (0.42, 0.35, 0.38)
------ done encoding ----------
------ done predicting ----------
correct repair 14 repairs 27 errors 30
 th 0.89
milk stats: PRECISION, RECALL, F1 (0.52, 0.47, 0.49)
------ done encoding ----------
------ done predicting ----------
correct repair 15 repairs 35 errors 34
 th 0.87
gluten stats: PRECISION, RECALL, F1 (0.43, 0.44, 0.43)
------ done encoding ----------
------ done predicting ----------
correct repair 16 repairs 33 errors 34
 th 0.87
soy stats: PRECISION, RECALL, F1 (0.48, 0.47, 0.47)
------ done encoding ----------
------ done predicting ----------
correct repair 6 repairs 18 errors 20
 th 0.82
peanut stats: PRECISION, RECALL, F1 (0.33, 0.3, 0.31)
------ done enco

In [10]:
import random

# Pick a random label and show one random test row whose value differs from the
# gold-standard column, together with its text feature and partial key.
a = random.choice(labels)
df1 = dtest.copy()
diff = df1[df1[a] != df1[a + '_gs']][[a, a + '_gs']]
# draw the row only when there is at least one mismatch
# (random.choice raises IndexError on an empty index)
if diff.shape[0] > 0:
    i = random.choice(diff.index)
    print(i)
    print(diff.loc[i])
    print(df1.loc[i, feature])
    print(df1.loc[i, partial_key])
    # all rows sharing the same partial key
    print(dtest[dtest[partial_key] == df1.loc[i, partial_key]][a])
else:
    print('no mismatch with the gold standard for label', a)

227
peanut       0.0
peanut_gs    1.0
Name: 227, dtype: float64
38% kokosnootschilfers|palmolie|rijstdrank poeder|15% amandelen|ruwe rietsuiker|gemalen bourbon vanillestokjes|zeezout
4104420182479.0
225    0
226    0
227    0
Name: peanut, dtype: int64


# Correlation textual field

In [11]:
# Instantiate the text transformer that belongs to the best model of the dataset.
model_name = m.get_best_ml(data_index)
lang = next((t for t in langs if t['name'] in model_name), None)
# Fail loudly instead of silently reusing a `lang` left over from an earlier cell.
if lang is None:
    raise ValueError(f"no configured transformer name is contained in model name {model_name!r}")
transformer = lang
trans = transformer["fn"](**transformer["fixed_params"])

# NOTE: rebinds `features` (set from dt.get_datasetSchema in the settings cell)
# to the first feature name of the dataset.
features = dataset['features'][0]

# dataset 0 uses a ratio of 0.35, every other dataset uses 1.0
ratio = 0.35 if data_index == 0 else 1.0

### Calculate the correlation between each text's embedded vector and the label values

inconsistencies related to: elderly 85
data shape initial: 14257
data shape now: 14257
to save in ./features/_elderly_top_features_tf-idf_with_constraints.csv


KeyboardInterrupt: 

## Train and repair data