# Overview
* The purpose of this notebook is to test different ML models in **predicting correct values** of a **dirty data**.
* We test the impact on **keeping**/**removing inconsistencies samples** during the **training** phase on the ML model **repair** performence.

Jar to Python : https://www.jython.org/installation.html

## Libraries

In [1]:
import dataset as dt
import training as tr
import preprocessing as tp
import model as m
import evaluation as eva
import utils
# import nn

import pandas as pd
import numpy as np
import joblib
import csv
import json # 2.0.9
import config
import datetime

# import torch # 2.2.0+cu121
import os
import pickle

In [2]:
#LEARNING ALGORITHM
algos = config.classifiers #['ann', 'logistic regression l2', 'xgboost', 'multinomial bayesian', ]

#LANGUAGE MODEL
langs = config.transformers


## Settings

In [3]:
#bert_model_name = 'bert-base-multilingual-cased'
bert_model_name = 'bert-base-uncased'
 
# Set up parameters
num_epochs = 8
learning_rate = 2e-5
sampling = False
# nb of folds
splits = 3

#overconfidence compensators
compensators = [.9,.97,.5]

#path to csv files
path = os.path.abspath(os.getcwd())

# dataset names
data_index = 1 #dt.Allergens
dataset = config.datasets[data_index] 
partial_key = dataset['keys'][0]
dataName = dataset['data_dir']
labels = dataset['labels']
feature = dataset['features'][0]
lambdac = compensators[data_index]

print('dataset:', dataName)
print('---------------------------')
print('feature', feature)
print('---------------------------')
print('labels', labels)
print('---------------------------')

#labels and features
dataFolder, labels, features, keys = dt.get_datasetSchema(dataName)
partial_key = keys[0]
path

dataset: trials_population
---------------------------
feature inclusion
---------------------------
labels ['elderly', 'adults', 'adolescents', 'children', 'female', 'male', 'healthy_volunteers']
---------------------------


'/home/lab/Documents/rihem/improving-data-cleaning-with-unstructured-data'

## Run training best ML

In [4]:
print(dataName)

trials_population


In [5]:
#MODEL
model_name = m.get_best_ml(data_index)
for l in langs:
    if l['name'] in model_name:
        lang = l
        break
for al in algos:
    if al['name'] in model_name:
        alg = al
        break

print('model:', model_name)
print('transformer:', alg['name'], 'classifier:', lang['name'])
print('--------------------------------------')        

model: tf-idf-xgboost
transformer: xgboost classifier: tf-idf
--------------------------------------


In [6]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

strategy_results = {}
allmodels = {}
for i in range(2):
    if i == 1:
        _parker = ""
        _with = "with_constraints"        
        parker = False
        constraints_check = True
    else:
        _parker = "_parker"
        _with = "with_parker"        
        parker = True
        constraints_check = False
    print(i, '- parker', parker, '- constraints_check', constraints_check)
    all_results = {}
    models = {}
    
#     start_time = datetime.datetime.now()
    
    # read training data and valid data
    train_file_path = utils.get_dir(dataset, file=dataset['data_dir'] + _parker + "_train.csv")
    train_data = dt.read_data_csv(dataset['data_dir'], 'train', parker) 
    #utils.load_df(dataset, train_file_path)
    # encode non-numerical labels
    dtrain, encoders = tp.preprocess(dataset, train_data)
    print('label encoders', encoders)
    
#     feature_processing_time  = (datetime.datetime.now() - start_time).total_seconds()
    
    # per_model
    result_per_model = {} 
#     for label in tqdm(labels):
    for label in tqdm(labels):
        #timestamp
        start_time = datetime.datetime.now()
        
        #per label       
        result_per_label = {}
        
        # remove the inconsistent rows
        dtrain = tr.get_cleaner_train_version(dataName, label, train_data, partial_key)
        dtrain1, valid_data = train_test_split(dtrain, test_size=0.1)
        iter = 0
        while len(valid_data[label].unique()) != len(dtrain[label].unique()):
            dtrain1, valid_data = train_test_split(dtrain, test_size=0.1)
            iter += 1
            print(iter + "th iteration")
        print('dtrain, valid_data', dtrain.shape, valid_data.shape)

        y = dtrain1[label].astype(int)
        n_class, unique_classes, class_counts, ir = tp.get_class_stats(y)
        class_weights = {c: len(y) / (count * n_class) for c, count in zip(unique_classes, class_counts)}
        print(class_weights)
        
        batch_size = dtrain.shape[0]
        # Loop through DataFrame in chunks without numpy
        for i in range(0, len(dtrain1), batch_size):
            if i + batch_size < len(dtrain): batch = dtrain1[i:i + batch_size]
            else: batch = dtrain1[i:len(dtrain)]
            
            # first training
            if i == 0:            
                model, result_per_label = tr.clf_train(batch, dataset, label, alg, lang, config.root_seed, True)
                models[label] = model
            else:
                print(f"Batch {(i // batch_size) + 1}:\n")
                model.fit(batch[feature], batch[label])

        current_time = datetime.datetime.now()
        result_per_label['duration'] = (current_time - start_time).total_seconds()
        
        # save the model
        file_model_name = f"./models/_{label}_classifier_{model_name}_{_with}.pth"
        pickle.dump(model, open(file_model_name, "wb"))
        
        print()
        # save the encoder for each label
        if len(encoders)>0:
            if len(encoders[label])>0: 
                result_per_label['encoder'] = encoders[label]
        print()
            
        
        # evalutae ML model
        print('valid data', valid_data.shape)
        #dtest = dt.read_test_csv(dataName, parker)
        if alg['name'] != 'ann' and lang['name'] != 'bert':
            if len(encoders)>0:
                encoder = encoders[label]
            else:
                encoder = {}
            ## load saved model
            file_model_name = f"./models/_{label}_classifier_{model_name}_{_with}.pth"
            with open(file_model_name, 'rb') as f: model = pickle.load(f)                 
            
            avg_confidence = []
            eces = []
            outputs = model.predict_proba(valid_data[feature].str.lower())
            y_pred = model.predict(valid_data[feature].str.lower())
            
            y = valid_data[label].values #map(encoder) # encoded
    
            for i in range(len(set(y))):# iterate over the number of classes
                probabilities = outputs[:, i]  # Probabilities for the positive class (class 1)

                ece, avg_confidence_in_bin = eva.expected_calibration_error(y, probabilities, i, n_bins=5)
                
                acf = eva.avg_conf_correct_pred(y, y_pred, probabilities, i)
                avg_confidence.append(acf)

      
        #max_proba = [np.max(p) for p in outputs ]
        # compensation for the overconfidence of the model
        result_per_label['proba'] = lambdac * sum(avg_confidence)/len(avg_confidence)
        print(label, 'avg proba', result_per_label['proba'], 'ece', ece)

        result_per_model[label] = result_per_label # for specific model
        all_results[model_name] = result_per_model
        
        print('+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++')
        print()

      
    # sll records of training and evaluation 
    strategy_results[_with] = all_results
    allmodels[_with] = models
    
strategy_results['lambda'] = lambdac    
# save training and test accuracy
recordFile = f"./results/{dataset['data_dir']}/results_training_best_ml.json"
print('recordFile', recordFile)
#with open(recordFile, "w") as outfile: json.dump(strategy_results, outfile)

0 - parker True - constraints_check False
relative path ./data/trials_population --before delete (19213, 16)
--after delete (17944, 16)
label encoders {}


  0%|                                                     | 0/7 [00:00<?, ?it/s]

inconsistencies related to: elderly 0
dtrain, valid_data (17944, 16) (1795, 16)
{0: 2.308319039451115, 1: 0.6382499407161489}
imbalance ratio 0.276 unique_classes [0 1] class_counts [ 3498 12651]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1795, 16)


 14%|██████▍                                      | 1/7 [00:15<01:30, 15.06s/it]

elderly avg proba 0.8723912268877029 ece 68.8116294145584
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: adults 0
dtrain, valid_data (17944, 16) (1795, 16)
{0: 5.195945945945946, 1: 0.5532374100719425}
imbalance ratio 0.106 unique_classes [0 1] class_counts [ 1554 14595]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1795, 16)


 29%|████████████▊                                | 2/7 [00:27<01:07, 13.54s/it]

adults avg proba 0.9096833354234695 ece 14.376107305288263
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: adolescents 0
dtrain, valid_data (17944, 16) (1795, 16)
{0: 0.6138903672166046, 1: 2.6950934579439254}
imbalance ratio 0.228 unique_classes [0 1] class_counts [13153  2996]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1795, 16)


 43%|███████████████████▎                         | 3/7 [00:40<00:53, 13.42s/it]

adolescents avg proba 0.8598654255270958 ece 67.83170127868652
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: children 0
dtrain, valid_data (17944, 16) (1795, 16)
{0: 0.5931462572540953, 1: 3.1839511041009465}
imbalance ratio 0.186 unique_classes [0 1] class_counts [13613  2536]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1795, 16)


 57%|█████████████████████████▋                   | 4/7 [00:53<00:39, 13.22s/it]

children avg proba 0.8646958211064338 ece 42.63611858710647
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: female 0
dtrain, valid_data (17944, 16) (1795, 16)
{0: 15.648255813953488, 1: 0.5165035501823066}
imbalance ratio 0.033 unique_classes [0 1] class_counts [  516 15633]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1795, 16)


 71%|████████████████████████████████▏            | 5/7 [01:04<00:24, 12.30s/it]

female avg proba 0.9579240399599075 ece 3.8574822042138264
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: male 0
dtrain, valid_data (17944, 16) (1795, 16)
{0: 8.776630434782609, 1: 0.5302055289250771}
imbalance ratio 0.06 unique_classes [0 1] class_counts [  920 15229]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1795, 16)


 86%|██████████████████████████████████████▌      | 6/7 [01:15<00:12, 12.03s/it]

male avg proba 0.941824947297573 ece 6.817829430103298
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: healthy_volunteers 0
dtrain, valid_data (17944, 16) (1795, 16)
{0: 0.5137430807405994, 1: 18.69097222222222}
imbalance ratio 0.027 unique_classes [0 1] class_counts [15717   432]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1795, 16)


100%|█████████████████████████████████████████████| 7/7 [01:26<00:00, 12.37s/it]

healthy_volunteers avg proba 0.926705960035324 ece 14.52439729962498
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

1 - parker False - constraints_check True





relative path ./data/trials_population --before delete (17944, 16)
--after delete (14607, 16)
label encoders {}


  0%|                                                     | 0/7 [00:00<?, ?it/s]

inconsistencies related to: elderly 85
dtrain, valid_data (14257, 16) (1426, 16)
{0: 2.1166281755196303, 1: 0.6546428571428572}
imbalance ratio 0.309 unique_classes [0 1] class_counts [3031 9800]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1426, 16)


 14%|██████▍                                      | 1/7 [00:11<01:06, 11.13s/it]

elderly avg proba 0.8781195706129074 ece 50.481734067201664
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: adults 29
dtrain, valid_data (14503, 16) (1451, 16)
{0: 4.488308115543329, 1: 0.5626832212450422}
imbalance ratio 0.125 unique_classes [0 1] class_counts [ 1454 11598]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1451, 16)


 29%|████████████▊                                | 2/7 [00:21<00:53, 10.73s/it]

adults avg proba 0.9228563058376312 ece 6.832053780555696
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: adolescents 59
dtrain, valid_data (14379, 16) (1438, 16)
{0: 0.5841910436980859, 1: 3.469436997319035}
imbalance ratio 0.168 unique_classes [0 1] class_counts [11076  1865]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1438, 16)


 43%|███████████████████▎                         | 3/7 [00:32<00:42, 10.66s/it]

adolescents avg proba 0.9112786373496056 ece 28.859265433624387
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: children 18
dtrain, valid_data (14516, 16) (1452, 16)
{0: 0.5654432132963989, 1: 4.3201058201058204}
imbalance ratio 0.131 unique_classes [0 1] class_counts [11552  1512]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1452, 16)


 57%|█████████████████████████▋                   | 4/7 [00:42<00:31, 10.60s/it]

children avg proba 0.9289646196365356 ece 18.39736930280923
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: female 3
dtrain, valid_data (14594, 16) (1460, 16)
{0: 14.925, 1: 0.5173310225303293}
imbalance ratio 0.035 unique_classes [0 1] class_counts [  440 12694]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1460, 16)


 71%|████████████████████████████████▏            | 5/7 [00:51<00:20, 10.12s/it]

female avg proba 0.9608811229467392 ece 5.504823535680753
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: male 6
dtrain, valid_data (14577, 16) (1458, 16)
{0: 9.968844984802432, 1: 0.5264023754112832}
imbalance ratio 0.053 unique_classes [0 1] class_counts [  658 12461]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1458, 16)


 86%|██████████████████████████████████████▌      | 6/7 [01:01<00:09,  9.94s/it]

male avg proba 0.9502085718512535 ece 4.473736330866764
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

inconsistencies related to: healthy_volunteers 7
dtrain, valid_data (14579, 16) (1458, 16)
{0: 0.5140249157721539, 1: 18.325418994413408}
imbalance ratio 0.028 unique_classes [0 1] class_counts [12763   358]


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators", "clf__objective" } are not used.





valid data (1458, 16)


100%|█████████████████████████████████████████████| 7/7 [01:10<00:00, 10.07s/it]

healthy_volunteers avg proba 0.9577457049489021 ece 6.138123796670698
+++++++++++++++++++++done with this label+++++++++++++++++++++++++++++

recordFile ./results/trials_population/results_training_best_ml.json





In [7]:
strategy_results

{'with_parker': {'tf-idf-xgboost': {'elderly': {'ir': 0.276,
    'duration': 14.546749,
    'proba': 0.8723912268877029},
   'adults': {'ir': 0.106, 'duration': 11.971939, 'proba': 0.9096833354234695},
   'adolescents': {'ir': 0.228,
    'duration': 12.777874,
    'proba': 0.8598654255270958},
   'children': {'ir': 0.186,
    'duration': 12.418311,
    'proba': 0.8646958211064338},
   'female': {'ir': 0.033, 'duration': 10.136167, 'proba': 0.9579240399599075},
   'male': {'ir': 0.06, 'duration': 11.017528, 'proba': 0.941824947297573},
   'healthy_volunteers': {'ir': 0.027,
    'duration': 10.198905,
    'proba': 0.926705960035324}}},
 'with_constraints': {'tf-idf-xgboost': {'elderly': {'ir': 0.309,
    'duration': 10.715564,
    'proba': 0.8781195706129074},
   'adults': {'ir': 0.125, 'duration': 9.994932, 'proba': 0.9228563058376312},
   'adolescents': {'ir': 0.168,
    'duration': 10.13669,
    'proba': 0.9112786373496056},
   'children': {'ir': 0.131,
    'duration': 10.085442,
    

In [8]:
recordFile = f"./results/{dataset['data_dir']}/results_training_best_ml.json"
with open(recordFile, "w") as outfile: json.dump(strategy_results, outfile)

In [9]:
stats = {}
dtest = dt.read_test_csv(dataName, parker)
dtest1 = dtest.copy()
print(dtest1.shape)
print('+++++++++++++++++++++Start+++++++++++++++++++++++++++++')

for a in labels:
    # test repaired by parker do not have the following columns: need to fix it!!
    if a + '_gs'not in dtest1.columns:
        dtest1 = dtest1.merge(dt.read_gs_csv(dataName)[[partial_key, a ]], 
                              how='inner', on=partial_key, suffixes=('', '_gs'))
    # confidence score for each attribute
    conf_score = round(strategy_results[_with][model_name][a]['proba'],2)
    ## load saved model
    file_model_name = f"./models/_{a}_classifier_{model_name}_{_with}.pth"
    with open(file_model_name, 'rb') as f: model = pickle.load(f) 
    #model = models[label]

    # get the encoder if exists and encode y_orig  y_gs
    enc = {}
    enc, y_orig, y_gs = tp.encode(encoders, a, dtest1)
    print("------ done encoding ----------")      
    
    # predict the values for the labels to be repaired
    y_pred, outputs, dtest, accuracy = tr.clf_test(model, dtest1, a, dataset, enc)
    print("------ done predicting ----------")

    if a + '_orig' not in dtest1.columns:
        dtest1 = dtest1.merge(dtest1[[partial_key, a ]], 
                              how='inner', on=partial_key, suffixes=('', '_orig')) 
        print('current columns:', dtest1.columns)

    # evaluate on ground truth
    y_repair = eva.assign_repair(outputs, y_orig.values, y_pred, conf_score)
    # stats
    correct_repair, repair, errors = eva.get_stats(y_repair, y_orig.values, y_gs.values)
    # metrics
    metrics = eva.get_metrics(y_repair, y_orig.values, y_gs.values)
    print(' th', conf_score)
    print(a, 'stats: PRECISION, RECALL, F1', metrics)

    #dtest1[a] = y_pred

crs, rs, es = eva.get_all_stats(dtest1, labels)
print('correct_repairs, repairs, errors', crs, rs, es)
print('precision', round(crs/rs,2), 'recall', round(crs/es,2))
if es !=0: 
    print('recall', round(crs/es,2))
    print('F1', 2 * round(crs/rs,2) * round(crs/es,2)/(round(crs/rs,2) + round(crs/es,2)))

relative path ./data/trials_population --before delete (1269, 23)
--after delete (1269, 23)
(1269, 23)
+++++++++++++++++++++Start+++++++++++++++++++++++++++++
------ done encoding ----------
------ done predicting ----------
correct repair 23 repairs 29 errors 34
 th 0.88
elderly stats: PRECISION, RECALL, F1 (0.79, 0.68, 0.73)
------ done encoding ----------
------ done predicting ----------
correct repair 10 repairs 19 errors 12
 th 0.92
adults stats: PRECISION, RECALL, F1 (0.53, 0.83, 0.65)
------ done encoding ----------
------ done predicting ----------
correct repair 207 repairs 218 errors 210
 th 0.91
adolescents stats: PRECISION, RECALL, F1 (0.95, 0.99, 0.97)
------ done encoding ----------
------ done predicting ----------
correct repair 208 repairs 222 errors 217
 th 0.93
children stats: PRECISION, RECALL, F1 (0.94, 0.96, 0.95)
------ done encoding ----------
------ done predicting ----------
correct repair 1 repairs 28 errors 1
 th 0.96
female stats: PRECISION, RECALL, F1 (0.

In [13]:
import random
a = random.choice(labels)
df1 = dtest.copy()
diff = df1[df1[a] != df1[a + '_gs']][[a,a+'_gs']]
i = random.choice(diff.index)
if diff.shape[0] > 0: 
    i = random.choice(diff.index)
    print(i)
    print(diff.loc[i])
    print(df1.loc[i, feature])
    print(df1.loc[i, partial_key])
    print(dtest[dtest[partial_key]== df1.loc[i, partial_key]][a])

19
adults       1.0
adults_gs    0.0
Name: 19, dtype: float64
1  Written informed consent 2  Pediatric patients 5  17 years old with clinically stable chronic renal anemia 3  Hemodialysis treatment for at least 8 weeks 4  Body weight   10 kg 5  Adequate hemodialysis  URR of   65  or Kt V  1 2 for patients on thrice weekly HD  Patients with fewer or with more HD sessions per week should have a weekly Kt V   3 6 6  Baseline pre dialysis Hb concentration 10 0   12 0 g dL determined from the mean of weekly Hb values measured between weeks  2 to  1 7  Intravenous maintenance epoetin alfa  epoetin beta  or darbepoetin alfa with same dosing interval for at least 8 weeks before screening 8  Stable maintenance epoetin alfa  epoetin beta  or darbepoetin alfa treatment with no weekly dose change   25   increase or decrease  during the 2 weeks of screening  Patients who had been previously treated by the sc route could only participate if they have been receiving their ESA by the iv route for at l

## Correlation textual field

In [11]:
model_name = m.get_best_ml(data_index)
for l in langs:
    if l['name'] in model_name:
        lang = l
        break
transformer = lang
trans = transformer["fn"](**transformer["fixed_params"])

features = dataset['features'][0]

if data_index == 0: ratio = 0.35
else: ratio = 1.0

### calculate correlation between each text's embedded vector and the label values**

In [5]:
import random
from sklearn.feature_selection import chi2
from scipy.stats import pearsonr

def count_samples(feature_words, data, feature_column):
    c = []
    for i, w in feature_words.iterrows():
        c.append(data[data[feature_column].str.contains(w['feature'], case=False)].shape[0])
    return c
for parker in [False, True]:
    if not parker:
        _parker = ""
        _with = "with_constraints"
    else:
        _parker = "_parker"
        _with = 'with_parker'
    # read training data and test data
    train_file_path = utils.get_dir(dataset, file=dataset['data_dir'] + _parker + "_train.csv")
    test_file_path = utils.get_dir(dataset, file=dataset['data_dir'] + _parker + "_test.csv")
    train_data = utils.load_df(dataset, train_file_path)
    test_data = utils.load_df(dataset, test_file_path)
        
    # encode non-numerical labels
    dtrain1, encoders = tp.preprocess(dataset, train_data)
    for a in labels:
        dtrain1 = tr.get_cleaner_train_version(dataName, a, dtrain1, partial_key)
        print('data shape initial:', dtrain1.shape[0])
        rd_samples = random.sample(sorted(dtrain1.index), round(dtrain1.shape[0] * ratio))     
        dtrain = dtrain1[dtrain1.index.isin(rd_samples)]
        print('data shape now:', dtrain.shape[0])
        
        X = dtrain[features].str.lower()
        X_trans = trans.fit_transform(X)
        
        y = dtrain[a].astype(int)
        chi2_scores, p_values = chi2(X_trans, y)
        feature_scores = pd.DataFrame(
         {'feature': trans.get_feature_names_out(), f'chi2_score_{a}': chi2_scores, f'p_values_{a}': p_values})
    
        feature_scores = feature_scores.sort_values(by=f'chi2_score_{a}', ascending=False)
        feature_scores = feature_scores[feature_scores[f'p_values_{a}'] < .002].copy()
        feature_scores[f'sample_count_{a}'] = count_samples(feature_scores, dtrain1, features)
        file_top_features = f"./features/_{a}_top_features_{lang['name']}_{_with}.csv"
        print('to save in', file_top_features)
        
        top_features = feature_scores.copy()
    
        correlations = []
        for i in range(X_trans.toarray().shape[1]):
            if i in top_features.index:
                feature_values = X_trans.toarray()[:, i]
                corr,s  = pearsonr(feature_values, y)
                correlations.append(corr)
        
        # compute the average correlation
        correlations = correlations #, [c for c in correlations if c > 0]
        average_correlation =  np.nanmean(correlations)
        print(a, average_correlation)
        top_features[f'correlation_to_{a}'] = correlations
        top_features.to_csv(file_top_features, quoting=csv.QUOTE_NONNUMERIC, index=False)
    
    print('+++++++++++++++++++++NEXT STRATEGY+++++++++++++++++++++++++++++')
    print()

top_features

inconsistencies related to: arms 0
data shape initial: 57928
data shape now: 20275
to save in ./features/_arms_top_features_tf-idf_with_constraints.csv
arms -0.030639067284978994
inconsistencies related to: open 76
data shape initial: 57477
data shape now: 20117
to save in ./features/_open_top_features_tf-idf_with_constraints.csv
open 0.007942268558438414
inconsistencies related to: double_blind 21
data shape initial: 57330
data shape now: 20066
to save in ./features/_double_blind_top_features_tf-idf_with_constraints.csv
double_blind 0.00016379781110094325
inconsistencies related to: single_blind 0
data shape initial: 57330
data shape now: 20066
to save in ./features/_single_blind_top_features_tf-idf_with_constraints.csv
single_blind 0.0558260560638072
inconsistencies related to: controlled 0
data shape initial: 57242
data shape now: 20035
to save in ./features/_controlled_top_features_tf-idf_with_constraints.csv
controlled -0.03526732440204452
inconsistencies related to: parallel_grou

Unnamed: 0,feature,chi2_score_randomised,p_values_randomised,sample_count_randomised,correlation_to_randomised
16161,label,264.242174,2.041041e-59,14512,-0.121358
21118,open,247.309612,1.002262e-55,15767,-0.051725
10151,extension,202.313030,6.532677e-46,3712,-0.081928
29585,term,194.575435,3.189137e-44,5661,0.103522
17105,long,192.216250,1.043725e-43,4345,-0.057023
...,...,...,...,...,...
18506,migalastat,9.719923,1.822812e-03,11,0.069593
18574,minitablets,9.693017,1.849696e-03,16,0.080634
23638,preceding,9.677989,1.864887e-03,58,-0.036602
1729,antecedent,9.585031,1.961701e-03,31,-0.045874


## Train and repair data