# First, let's generate a multi-label dataset for the CTBNC classifier

In [2]:
import os, sys
import pandas as pd
import glob
import shutil
from sklearn.metrics import accuracy_score

# Let's use the default CTBNC

https://github.com/dcodecasa/CTBNCToolkit

In [3]:
# Print the installed Java version; the toolkit is launched with `java -jar` in later cells
!java -version

java version "1.8.0_241"
Java(TM) SE Runtime Environment (build 1.8.0_241-b07)
Java HotSpot(TM) 64-Bit Server VM (build 25.241-b07, mixed mode)


In [4]:
# Move up to the toolkit root (the directory that holds CTBNCToolkit.jar, data/ and results/).
# The guard keeps the cell idempotent: re-running it does not climb another level.
if not os.path.isfile('CTBNCToolkit.jar'):
    os.chdir('..')

In [5]:
# List the contents of the current working directory (`dir` is the Windows shell command)
!dir

 El volumen de la unidad C no tiene etiqueta.
 El número de serie del volumen es: 9CE1-B000

 Directorio de C:\Users\berna\Downloads\MUIA\TFM\ctbnc-chain\CTBNCToolkit

04/07/2021  22:54    <DIR>          .
04/07/2021  22:54    <DIR>          ..
07/03/2021  20:15    <DIR>          .vscode
04/03/2021  01:10    <DIR>          CTBNCToolkit
06/04/2021  22:57           175.327 CTBNCToolkit.jar
05/07/2021  03:57    <DIR>          data
04/03/2021  01:10    <DIR>          lib
04/03/2021  01:10             1.518 LICENSE
04/03/2021  01:10             5.313 makefile
04/03/2021  01:10               124 MANIFEST.MF
05/07/2021  03:57    <DIR>          notebooks
04/03/2021  01:10             2.126 README.md
05/07/2021  03:58    <DIR>          results
04/07/2021  22:58    <DIR>          results-base
               5 archivos        184.408 bytes
               9 dirs  413.304.320.000 bytes libres


# CTBNC chain

In [6]:
def read_results(res_dir):
    """Load the per-trajectory results file written into ``res_dir``.

    The first file (in sorted order) matching ``*-results.txt`` is parsed. Every line is
    split on ``:`` or ``,`` and the fields at positions 0, 2, 4 and 6 are kept as the
    columns ``id``, ``label``, ``pred`` and ``pred_prob``.

    Parameters
    ----------
    res_dir : str
        Directory in which to look for the results file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the results file, with the four columns named above.

    Raises
    ------
    FileNotFoundError
        If no file in ``res_dir`` matches ``*-results.txt``.
    """
    cols = ['id', 'label', 'pred', 'pred_prob']
    valid_cols = list(range(0, 7, 2))  # [0, 2, 4, 6]

    pattern = os.path.join(res_dir, '*-results.txt')
    # Sorted so the chosen file is deterministic when several match
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail here with a clear message instead of continuing with an undefined file name
        raise FileNotFoundError(f"Error while trying to read {pattern}: no matching results file")

    # A regex separator requires the python parsing engine
    df_res = pd.read_csv(matches[0], names=cols, skipinitialspace=True, usecols=valid_cols, sep=':|,', engine='python')

    return df_res

def reset_data(orig_dir, dest_dir):
    """Replace ``dest_dir`` with a fresh copy of ``orig_dir``.

    Parameters
    ----------
    orig_dir : str
        Directory to copy from. It is never modified.
    dest_dir : str
        Directory to (re)create. Any existing content is removed first.

    Returns
    -------
    bool
        True when the copy succeeded; False when ``orig_dir`` is not a directory or a
        filesystem error occurred while deleting or copying.
    """
    # Check the source before touching the destination, so a wrong source path
    # does not wipe the existing working copy.
    if not os.path.isdir(orig_dir):
        return False
    try:
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        shutil.copytree(orig_dir, dest_dir)
    except OSError:
        # shutil.Error is a subclass of OSError; anything else (e.g. KeyboardInterrupt) propagates
        return False
    return True

def get_different_values(column):
    """Return the value of each run of consecutive equal elements in ``column``.

    For example ``[1, 1, 2, 2, 1]`` yields ``[1, 2, 1]``: a value is repeated in the
    result whenever it reappears after a different value.

    Parameters
    ----------
    column : iterable
        Sequence of values (list, pandas Series, ...). It is iterated in order, so the
        result does not depend on the index labels of a Series.

    Returns
    -------
    list
        One entry per run, in order of appearance; empty when ``column`` is empty.
    """
    diff_vals = []
    for value in column:
        # Start a new run on the first element or whenever the value changes
        if not diff_vals or value != diff_vals[-1]:
            diff_vals.append(value)
    return diff_vals

In [7]:
# Input dataset and the scratch copy the experiments are allowed to modify
inDir, tmpDir = os.path.join('data', 'd5'), os.path.join('data', 'd5-tmp')

# Column names: features X1..X10, the time column, class variables C10..C1
features = [f"X{k}" for k in range(1, 11)]
time = 't'
labels = [f"C{k}" for k in range(10, 0, -1)]

# Classifier identifiers passed to --CTBNC and the hold-out training fraction
models = ['CTNB', 'CTBNC2-LL', 'CTBNC4-LL', 'CTBNC8-LL']
train_perc = 0.75

# Comma-separated feature list, in the form passed to --validColumns
features_cmd = ','.join(features)

# Restore the tmp directory
reset_data(inDir, tmpDir)

True

In [8]:
for mod in models:
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    os.mkdir(os.path.join('results', 'ch-d5' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'ch-d5' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)

        # Run the application to generate, train and test the classifier
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )
        sys.stdout.flush()

        classes = []
        files = df_res["id"]
        preds = df_res["pred"]
        ss_dynamic_count = 0
        for i in range(len(files)):
            file = files[i]
            classes.append(preds[i].strip())
            try:
                next_file = files[i + 1]
            except:
                next_file = files[i]
                pass
            #print(file, next_file)
            #sys.stdout.flush()
            data = file.split("_") # filename_ind
            ind = data[-1]
            filename = data[0]
            next_ind = next_file.split("_")[-1] # filename_nextind
            #print(ind, next_ind)
            #sys.stdout.flush()
            if int(next_ind) <= int(ind):
                #print("Updating", filename, "with", classes)
                #sys.stdout.flush()
                # update the file
                tmp_file = os.path.join(tmpDir, filename)
                df_tmp = pd.read_csv(tmp_file)
                different_values = get_different_values(df_tmp[item])
                if len(different_values) != len(classes):
                    print("Error", tmp_file, different_values, classes)
                    sys.stdout.flush()
                    #print("Error con el número de cambios de trjSeparator")
                    #sys.stdout.flush()
                    ss_dynamic_count += 1

                new_col = []
                j = 0
                old_col = df_tmp[item]
                max_val = len(classes)
                for i_tmp in range(len(old_col)):
                    actual_val = old_col[i_tmp]
                    try:
                        next_val = old_col[i_tmp + 1]
                    except:
                        next_val = old_col[i_tmp]
                        pass
                    new_col.append(classes[j])
                    if actual_val != next_val:
                        j += 1
                        if j >= max_val:
                            j -= 1

                df_tmp[item] = new_col
                df_tmp.to_csv(tmp_file, header=True, index=False)
                classes = []

        # Finally append the label to the features for the next model to use it as input and reset the classes
        features.append(item)
        features_cmd = ','.join(f"{item}" for item in features)

        print("Ss dynamic count", ss_dynamic_count)
        sys.stdout.flush()

M0_CTNB results printing
Accuracy score for C10 -> 0.736
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C9 -> 0.644
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C8 -> 0.768
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C7 -> 0.738
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C6 -> 0.648
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C5 -> 0.69
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C4 -> 0.74
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C3 -> 0.662
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C2 -> 0.688
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for C1 -> 0.814
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for C10 -> 0.87
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for C9 -> 0.764
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for C8 -> 0.87
Ss dynamic count 0
M0_CTBNC2-LL

In [9]:
# Restore the tmp directory: tmpDir is replaced by a fresh copy of inDir
reset_data(inDir, tmpDir)

True

# Binary relevance

In [10]:
# Input dataset and the scratch copy handed to the toolkit
inDir, tmpDir = os.path.join('data', 'd5'), os.path.join('data', 'd5-tmp')

# Column names: features X1..X10, the time column, class variables C10..C1
features = [f"X{k}" for k in range(1, 11)]
time = 't'
labels = [f"C{k}" for k in range(10, 0, -1)]

# Classifier identifiers passed to --CTBNC and the hold-out training fraction
models = ['CTNB', 'CTBNC2-LL', 'CTBNC4-LL', 'CTBNC8-LL']
train_perc = 0.75

# Comma-separated feature list, in the form passed to --validColumns
features_cmd = ','.join(features)

# Restore the tmp directory
reset_data(inDir, tmpDir)

True

In [11]:
for mod in models:
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    os.mkdir(os.path.join('results', 'br-d5' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'br-d5' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)

        # Run the application to generate, train and test the classifier
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --v --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )

M0_CTNB results printing
Accuracy score for C10 -> 0.736
M0_CTNB results printing
Accuracy score for C9 -> 0.644
M0_CTNB results printing
Accuracy score for C8 -> 0.768
M0_CTNB results printing
Accuracy score for C7 -> 0.738
M0_CTNB results printing
Accuracy score for C6 -> 0.648
M0_CTNB results printing
Accuracy score for C5 -> 0.69
M0_CTNB results printing
Accuracy score for C4 -> 0.738
M0_CTNB results printing
Accuracy score for C3 -> 0.662
M0_CTNB results printing
Accuracy score for C2 -> 0.688
M0_CTNB results printing
Accuracy score for C1 -> 0.814
M0_CTBNC2-LL results printing
Accuracy score for C10 -> 0.87
M0_CTBNC2-LL results printing
Accuracy score for C9 -> 0.764
M0_CTBNC2-LL results printing
Accuracy score for C8 -> 0.87
M0_CTBNC2-LL results printing
Accuracy score for C7 -> 0.862
M0_CTBNC2-LL results printing
Accuracy score for C6 -> 0.794
M0_CTBNC2-LL results printing
Accuracy score for C5 -> 0.838
M0_CTBNC2-LL results printing
Accuracy score for C4 -> 0.912
M0_CTBNC2-LL r

In [12]:
# Restore the tmp directory: tmpDir is replaced by a fresh copy of inDir
reset_data(inDir, tmpDir)

True