## Libraries

In [1]:
# Environment sanity check: print the installed imbalanced-learn version.
import imblearn
print(imblearn.__version__)

0.12.4


In [2]:
import random

import dataset as dt
import training as tr
import preprocessing as tp
import model as m
import evaluation as eva
import utils
import config
# import nn

import pandas as pd
import numpy as np
import joblib
import datetime
import time

import json
import csv

import os
import pickle
import warnings

import sklearn
from sklearn.metrics import accuracy_score, classification_report, recall_score, precision_score#

## Settings

In [3]:
# --- Dataset selection -------------------------------------------------
# `data_index` picks one entry of `config.datasets`; the key column, labels,
# feature column and data directory below are all read from that entry.
data_index = 1
# Dataset 2 uses the first configured transformer, every other dataset the second.
j = 0 if data_index == 2 else 1
dataset = config.datasets[data_index] #dt.Allergens
partial_key = dataset['keys'][0]
labels = dataset['labels']
feature = dataset['features'][0]
dataName = dataset['data_dir']

#LEARNING ALGORITHM
alg = config.classifier #'multinomial bayesian' #'xgboost'
#LANGUAGE MODEL
lang = config.transformers[j] #'tf-idf'
#MODEL
model_name = m.get_best_ml(data_index) #get_modelName(lang["name"], alg["name"])
print('model:', model_name)
print('---------------------------')
# Repair-mode switch read by the cells below (selects file names / result sections).
parker = False

# Apply the seed instead of only declaring it, so the random draws made later
# in the notebook (random.sample / random.choice) repeat on Restart & Run All.
seed = 42
random.seed(seed)
np.random.seed(seed)

print('dataset:', dataName)
print('---------------------------')
print('feature', feature)
print('---------------------------')
print('labels', labels)
print('---------------------------')

model: tf-idf-xgboost
---------------------------
dataset: trials_population
---------------------------
feature inclusion
---------------------------
labels ['elderly', 'adults', 'adolescents', 'children', 'female', 'male', 'healthy_volunteers']
---------------------------


## Evaluate ML model

#### read predicted values

In [4]:
# Derive the two mode-dependent strings from the `parker` switch:
#   _parker -> suffix string for the Parker variant (empty otherwise)
#   _with   -> section key / file-name part naming the active repair mode
if parker:
    _parker, _with = "_parker", 'with_parker'
else:
    _parker, _with = "", "with_constraints"


In [5]:
# Training results of the best ML model for the selected dataset.
configFile = f"./results/{dataset['data_dir']}/results_training_best_ml.json"

print('configFile', configFile)

# The context manager closes the handle even if parsing raises.
with open(configFile) as f:
    records = json.load(f)

# Keep the stored encoder of every label that has one; labels without an
# 'encoder' entry are simply absent from the mapping.
encoder = {
    label: records[_with][model_name][label]['encoder']
    for label in labels
    if 'encoder' in records[_with][model_name][label]
}
encoder

configFile ./results/trials_population/results_training_best_ml.json


{}

#### test robustness of the trained model

#### test the trained model and save the predicted values

In [6]:
dtest = dt.read_test_csv(dataName, parker)

for a in labels:
    # Attach the gold-standard column (suffix `_gs`) when the test frame lacks it.
    if a + '_gs' not in dtest.columns:
        dtest = dtest.merge(dt.read_gs_csv(dataName)[[partial_key, a]],
                            how='inner', on=partial_key, suffixes=('', '_gs'))

    ## load saved model
    file_model_name = f"./models/_{a}_classifier_{model_name}_{_with}.pth"
    # NOTE: unpickling runs arbitrary code; only load model files produced by this project.
    with open(file_model_name, 'rb') as f:
        model = pickle.load(f)

    # predict the values for the labels to be repaired
    enc, y_orig, y_gs = tp.encode(encoder, a, dtest)
    y_pred, outputs, dtest, accuracy = tr.clf_test(model, dtest, a, dataset, enc)

    print('a=', a, dtest[a].unique())
    # metrics
    metrics = eva.get_metrics(y_pred, y_orig.values, y_gs.values)
    print(a, 'stats: PRECISION, RECALL, F1', metrics)
    # Only the first label is evaluated in this cell.
    break

print('----------------------------------------')
crs, rs, es = eva.get_all_stats(dtest, [a])
print('correct_repairs, repairs, errors', crs, rs, es)
# Precision is undefined without repairs, recall without errors: skip the
# corresponding line instead of raising ZeroDivisionError.
if rs != 0:
    print('precision', round(crs/rs,2))
if es != 0:
    print('recall', round(crs/es,2))
if rs != 0 and es != 0:
    precision_2dp = round(crs/rs,2)
    recall_2dp = round(crs/es,2)
    # F1 is likewise undefined when both rounded terms are zero.
    if precision_2dp + recall_2dp > 0:
        print('F1', 2 * precision_2dp * recall_2dp/(precision_2dp + recall_2dp))

relative path ./data/trials_population --before delete (1269, 23)
--after delete (1269, 23)
a= elderly [0 1]
correct repair 24 repairs 68 errors 34
elderly stats: PRECISION, RECALL, F1 (0.35, 0.71, 0.47)
----------------------------------------
correct_repairs, repairs, errors 24 68 34
precision 0.35
recall 0.71
F1 0.4688679245283018


##### save last repaired dataset

In [7]:
if not parker:
    a = labels[0]
    # Jupyter only displays the last top-level expression of a cell, so a bare
    # expression inside this `if` would be silently discarded: print it instead.
    mismatches = dtest[[a, a+'_gs']][dtest[a] != dtest[a+'_gs']]
    print('cells differing from gold standard:', mismatches.shape, 'test data', dtest.shape)
file = f"data/{dataset['data_dir']}/repaired/{dataset['data_dir']}_{model_name}_ML_repair_{_with}.csv"

print('File to be saved:', file)
# Saving is currently disabled; uncomment to write the repaired frame.
#dtest.to_csv(file, quoting=csv.QUOTE_NONNUMERIC, index=False)

File to be saved: data/trials_population/repaired/trials_population_tf-idf-xgboost_ML_repair_with_constraints.csv


#### test confidence score

In [8]:
stats = {}
dtest = dt.read_test_csv(dataName, parker)
dtest1 = dtest.copy()
print(dtest1.shape)
print('+++++++++++++++++++++Start+++++++++++++++++++++++++++++')

for a in labels:
    # test repaired by parker do not have the following columns: need to fix it!!
    if a + '_gs' not in dtest1.columns:
        dtest1 = dtest1.merge(dt.read_gs_csv(dataName)[[partial_key, a]],
                              how='inner', on=partial_key, suffixes=('', '_gs'))
    # confidence score for each attribute (stored 'proba', rounded to 2 decimals)
    conf_score = round(records[_with][model_name][a]['proba'],2)
    ## load saved model
    file_model_name = f"./models/_{a}_classifier_{model_name}_{_with}.pth"
    # NOTE: unpickling runs arbitrary code; only load model files produced by this project.
    with open(file_model_name, 'rb') as f:
        model = pickle.load(f)

    # get the encoder if exists and encode y_orig  y_gs
    enc, y_orig, y_gs = tp.encode(encoder, a, dtest1)
    print("------ done encoding ----------")

    # predict the values for the labels to be repaired
    # (`dtest` is rebound here to the frame returned by clf_test)
    y_pred, outputs, dtest, accuracy = tr.clf_test(model, dtest1, a, dataset, enc)
    print("------ done predicting ----------")

    # Keep a copy of the label column under the `_orig` suffix if none exists yet.
    if a + '_orig' not in dtest1.columns:
        dtest1 = dtest1.merge(dtest1[[partial_key, a]],
                              how='inner', on=partial_key, suffixes=('', '_orig'))
        print('current columns:', dtest1.columns)

    # evaluate on ground truth
    y_repair = eva.assign_repair(outputs, y_orig.values, y_pred, conf_score)
    # stats
    correct_repair, repair, errors = eva.get_stats(y_repair, y_orig.values, y_gs.values)
    # metrics
    metrics = eva.get_metrics(y_repair, y_orig.values, y_gs.values)
    print(' th', conf_score)
    print(a, 'stats: PRECISION, RECALL, F1', metrics)

    #dtest1[a] = y_pred

crs, rs, es = eva.get_all_stats(dtest1, labels)
print('correct_repairs, repairs, errors', crs, rs, es)
# Every ratio is guarded before it is computed: precision is undefined without
# repairs, recall without errors, F1 when both rounded terms are zero.
if rs != 0:
    print('precision', round(crs/rs,2))
if es != 0:
    print('recall', round(crs/es,2))
if rs != 0 and es != 0:
    precision_2dp = round(crs/rs,2)
    recall_2dp = round(crs/es,2)
    if precision_2dp + recall_2dp > 0:
        print('F1', 2 * precision_2dp * recall_2dp/(precision_2dp + recall_2dp))

relative path ./data/trials_population --before delete (1269, 23)
--after delete (1269, 23)
(1269, 23)
+++++++++++++++++++++Start+++++++++++++++++++++++++++++
------ done encoding ----------
------ done predicting ----------
correct repair 16 repairs 20 errors 34
 th 0.95
elderly stats: PRECISION, RECALL, F1 (0.8, 0.47, 0.59)
------ done encoding ----------
------ done predicting ----------
correct repair 4 repairs 9 errors 12
 th 0.99
adults stats: PRECISION, RECALL, F1 (0.44, 0.33, 0.38)
------ done encoding ----------
------ done predicting ----------
correct repair 207 repairs 210 errors 210
 th 0.99
adolescents stats: PRECISION, RECALL, F1 (0.99, 0.99, 0.99)
------ done encoding ----------
------ done predicting ----------
correct repair 207 repairs 212 errors 217
 th 0.99
children stats: PRECISION, RECALL, F1 (0.98, 0.95, 0.96)
------ done encoding ----------
------ done predicting ----------
correct repair 1 repairs 1 errors 1
 th 1.0
female stats: PRECISION, RECALL, F1 (1.0, 1.

In [9]:
# Destination of the threshold-based repair; `dtest` is whatever frame the
# preceding cells last bound to that name.
file = f"./data/{dataset['data_dir']}/repaired/{dataset['data_dir']}_{model_name}_ML_repair_{_with}_threshold.csv"
print('File to be saved:', file, 'test data', dtest.shape)
# Path of the full dataset CSV; not used in this cell.
file2 = f"data/{dataset['data_dir']}/{dataset['data_dir']}.csv"

# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(file), exist_ok=True)
dtest.to_csv(file, quoting=csv.QUOTE_NONNUMERIC, index=False)



File to be saved: ./data/trials_population/repaired/trials_population_tf-idf-xgboost_ML_repair_with_constraints_threshold.csv test data (1269, 30)


### to be removed

In [10]:
import random
def get_additional_rows(dataset, data, additional_file_dir, fraction=.78, seed=None):
    """Sample rows of an additional CSV file that do not overlap the test data.

    Returns a (allergen) dataset that can be added to evaluate the predictions;
    seems useful to feed to the Parker engine. Specific to datasets with two
    key columns, because both ``dataset['keys'][0]`` and ``dataset['keys'][1]``
    are used.

    A row of the additional file counts as overlapping when its first key value
    occurs in ``data`` AND its second key value occurs in ``data``. The two
    membership checks are independent; they are not a joint match on the pair.

    Args:
        dataset (dict): dataset description; ``dataset['keys']`` lists the key columns.
        data (DataFrame): test data. It is copied, never modified.
        additional_file_dir (String): path of the CSV file the extra rows are
            read from (parsed with ``csv.QUOTE_NONNUMERIC``).
        fraction (float): share of the non-overlapping rows to sample, in [0, 1].
            The default .78 is the value that used to be hard-coded.
        seed (int | None): when given, sampling uses a dedicated
            ``random.Random(seed)`` and is repeatable; ``None`` keeps using the
            module-level generator.

    Returns:
        to_be_added (DataFrame): sampled rows that can be added to the predictions.

    Raises:
        ValueError: if ``fraction`` is outside [0, 1].
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be within [0, 1], got {fraction!r}")

    # work on a copy of the test data
    test = data.copy() #pd.read_csv(test_file_dir, quoting=csv.QUOTE_NONNUMERIC)
    print('test size', test.shape)

    # read the dataset allergens from ledc gitlab
    more_rows = pd.read_csv(additional_file_dir, quoting=csv.QUOTE_NONNUMERIC)
    print('test file', additional_file_dir)
    print('test size', more_rows.shape)

    first_key, second_key = dataset['keys'][0], dataset['keys'][1]
    overlap = more_rows[more_rows[first_key].isin(test[first_key])
                        & more_rows[second_key].isin(test[second_key])]
    print('overlap', overlap.shape)

    # candidate rows, kept in file order
    indices = [i for i in more_rows.index if i not in overlap.index]
    nbs = round(len(indices) * fraction)
    sampler = random if seed is None else random.Random(seed)
    to_be_added = more_rows.loc[sampler.sample(indices, nbs)]
    return to_be_added

def put_more_into_test_dataset(dataset, file1, file2, repaired, save_file):
    """Extend ``repaired`` with extra rows sampled from ``file2`` and save the result.

    Args:
        dataset (dict): dataset description passed on to ``get_additional_rows``.
        file1: not used by this function.
        file2 (String): path of the CSV file the extra rows are sampled from.
        repaired (DataFrame): repaired test data the extra rows are appended to.
        save_file (String): path the combined frame is written to
            (CSV, ``csv.QUOTE_NONNUMERIC``, no index).

    Returns:
        DataFrame: ``repaired`` followed by the sampled rows, re-indexed from 0.
    """
    extra_rows = get_additional_rows(dataset, repaired, file2)

    # Restrict the extra rows to columns that `repaired` also has, in its order.
    shared_columns = [
        column for column in repaired.columns
        if column in extra_rows.columns
    ]
    combined = pd.concat(
        [repaired, extra_rows[shared_columns]],
        ignore_index=True,
    )

    combined.to_csv(save_file, index=False, quoting=csv.QUOTE_NONNUMERIC)
    return combined

### test different thresholds

In [11]:
# Per-label statistics for every tested threshold; filled inside the loop.
stats = {}
dtest = dt.read_test_csv(dataName, parker)

print('+++++++++++++++++++++Start+++++++++++++++++++++++++++++')

dtest1 = dtest.copy()
print(dtest1.shape)

for a in labels:
    # test repaired by parker do not have the following columns: need to fix it!!
    if a + '_gs' not in dtest1.columns:
        dtest1 = dtest1.merge(dt.read_gs_csv(dataName)[[partial_key, a]],
                              how='inner', on=partial_key, suffixes=('', '_gs'))

    avg_proba = round(records[_with][model_name][a]['proba'],2)

    # Candidate thresholds: 0, the stored 'proba' of the label, and the 0.2-step grid.
    thresholds = [0, avg_proba] + [th for th in np.arange(0, 1.1, 0.2)]

    ## load saved model
    file_model_name = f"./models/_{a}_classifier_{model_name}_{_with}.pth"
    # NOTE: unpickling runs arbitrary code; only load model files produced by this project.
    with open(file_model_name, 'rb') as f:
        model = pickle.load(f)

    thresholds.sort()
    print('label',a, 'avg proba', avg_proba, 'ths', thresholds)

    # get the encoder if exists and encode y_orig  y_gs
    enc, y_orig, y_gs = tp.encode(encoder, a, dtest1)
    print("------ done encoding ----------")

    # predict the values for the labels to be repaired
    # (`dtest` is rebound here to the frame returned by clf_test)
    y_pred, outputs, dtest, accuracy = tr.clf_test(model, dtest1, a, dataset, enc)
    print("------ done predicting ----------")

    # Keep a copy of the label column under the `_orig` suffix if none exists yet.
    if a + '_orig' not in dtest1.columns:
        dtest1 = dtest1.merge(dtest1[[partial_key, a]],
                              how='inner', on=partial_key, suffixes=('', '_orig'))
        print('current columns:', dtest1.columns)

    repairs = []
    correct_repairs = []
    precisions = []
    recalls = []
    f1s = []

    # evaluate on ground truth
    for th in thresholds:
        y_repair = eva.assign_repair(outputs, y_orig.values, y_pred, th)
        # stats
        correct_repair, repair, errors = eva.get_stats(y_repair, y_orig.values, y_gs.values)
        correct_repairs.append(correct_repair)
        repairs.append(repair)

        # metrics
        metrics = eva.get_metrics(y_repair, y_orig.values, y_gs.values)
        recalls.append(round(metrics[1],2))
        precisions.append(round(metrics[0],2))
        f1s.append(round(metrics[2],2))

        print(' th', th, )
        # `metrics` is the (precision, recall, F1) tuple, so label it as such.
        print('stats: PRECISION, RECALL, F1', metrics)

    # `errors` is taken from the last threshold evaluated above.
    stats[a] = {"errors": errors, "avg_proba": avg_proba,
                "threshold": [round(th,2) for th in thresholds], 'repairs': repairs, 'correct_repairs': correct_repairs, "precision": precisions, "recall": recalls, "F-1": f1s}
    print(f"+++++++++++++++++++++done with {a}+++++++++++++++++++++++++++++")
    print()
print('+++++++++++++++++++++more sources+++++++++++++++++++++++++++++')
print()

relative path ./data/trials_population --before delete (1269, 23)
--after delete (1269, 23)
+++++++++++++++++++++Start+++++++++++++++++++++++++++++
(1269, 23)
label elderly avg proba 0.95 ths [0, 0.0, 0.2, 0.4, 0.6000000000000001, 0.8, 0.95, 1.0]
------ done encoding ----------
------ done predicting ----------
correct repair 24 repairs 68 errors 34
 th 0
stats: correct_repairs, repairs, errors (0.35, 0.71, 0.47)
correct repair 24 repairs 68 errors 34
 th 0.0
stats: correct_repairs, repairs, errors (0.35, 0.71, 0.47)
correct repair 24 repairs 68 errors 34
 th 0.2
stats: correct_repairs, repairs, errors (0.35, 0.71, 0.47)
correct repair 24 repairs 68 errors 34
 th 0.4
stats: correct_repairs, repairs, errors (0.35, 0.71, 0.47)
correct repair 24 repairs 52 errors 34
 th 0.6000000000000001
stats: correct_repairs, repairs, errors (0.46, 0.71, 0.56)
correct repair 23 repairs 34 errors 34
 th 0.8
stats: correct_repairs, repairs, errors (0.68, 0.68, 0.68)
correct repair 16 repairs 20 errors 34

##### save statistics of model performance

In [12]:
# Write the `stats` dictionary as JSON next to the other results of this dataset.
statFile = f"./results/{dataset['data_dir']}/{dataset['data_dir']}_stats_{model_name}_{_with}.json"
print(statFile)
with open(statFile, "w") as stats_handle:
    json.dump(stats, stats_handle)

./results/trials_population/trials_population_stats_tf-idf-xgboost_with_constraints.json


In [13]:
# Show the (rows, columns) of `dtest` as last bound by the preceding cells.
dtest.shape

(1269, 30)

### Check an instance of repairing an erroneous cell

In [15]:
# Pick a random label and show one row whose value differs from the gold standard.
a = random.choice(labels)
df1 = dtest.copy()
diff = df1[df1[a] != df1[a + '_gs']][[a,a+'_gs']]
# Draw an index only once it is known that there is something to draw from:
# random.choice raises IndexError on an empty sequence.
if diff.shape[0] > 0:
    i = random.choice(diff.index)
    print(i)
    print(diff.loc[i])
    print(df1.loc[i, feature])
    print(df1.loc[i, partial_key])
    # every row sharing the key of the chosen row, for the chosen label
    print(dtest[dtest[partial_key]== df1.loc[i, partial_key]][a])
else:
    print('no differing cells for', a)

609
elderly       0.0
elderly_gs    1.0
Name: 609, dtype: float64
Patient must be  18 and  90 years of age   Female patients can participate if they are surgically sterile or completed menopause or females capable of having children and agree not to attempt pregnancy while receiving IV study therapy and for a period of 7 days after   Patient has a ceftazidime resistant Gram negative pathogen that was isolated from an appropriate culture within 5 days prior to study entry  ie  within 5 days prior to Screening  the study qualifying culture   which was determined to be the causative agent of the entry infection
2012-000726-21
607    0
608    0
609    0
610    0
611    0
Name: elderly, dtype: int64


# End