# First, let's generate a multi-label dataset for the CTBNC classifier

In [1]:
import os, sys
import pandas as pd
import glob
import shutil
from sklearn.metrics import accuracy_score

# Let's use the default CTBNC

CTBNCToolkit source repository: https://github.com/dcodecasa/CTBNCToolkit

In [2]:
# Print the installed Java version; the toolkit jar is launched with `java -jar` in later cells.
!java -version

java version "1.8.0_241"
Java(TM) SE Runtime Environment (build 1.8.0_241-b07)
Java HotSpot(TM) 64-Bit Server VM (build 25.241-b07, mixed mode)


In [3]:
# Move the working directory one level up. Every relative path used later
# ('data', 'results', 'CTBNCToolkit.jar') is resolved against this directory.
# NOTE: not idempotent - re-running this cell moves up one more level each time.
os.chdir('..')

In [8]:
# List the working directory to confirm the jar and the data/results folders are present.
# `dir` is the Windows shell command; other platforms need a different listing command.
!dir

 El volumen de la unidad C no tiene etiqueta.
 El número de serie del volumen es: 9CE1-B000

 Directorio de C:\Users\berna\Downloads\MUIA\TFM\ctbnc-chain\CTBNCToolkit

04/07/2021  22:54    <DIR>          .
04/07/2021  22:54    <DIR>          ..
07/03/2021  20:15    <DIR>          .vscode
04/03/2021  01:10    <DIR>          CTBNCToolkit
06/04/2021  22:57           175.327 CTBNCToolkit.jar
04/07/2021  23:09    <DIR>          data
04/03/2021  01:10    <DIR>          lib
04/03/2021  01:10             1.518 LICENSE
04/03/2021  01:10             5.313 makefile
04/03/2021  01:10               124 MANIFEST.MF
04/07/2021  23:09    <DIR>          notebooks
04/03/2021  01:10             2.126 README.md
04/07/2021  23:09    <DIR>          results
04/07/2021  22:58    <DIR>          results-base
               5 archivos        184.408 bytes
               9 dirs  413.960.622.080 bytes libres


# CTBNC classifier chain

In [9]:
def read_results(res_dir):
    """Load the results file found in ``res_dir`` into a DataFrame.

    The directory is searched for files matching ``*-results.txt`` and the
    first match in sorted order is parsed. Every line is split on ``:`` or
    ``,`` and only the fields at positions 0, 2, 4 and 6 are kept, named
    ``id``, ``label``, ``pred`` and ``pred_prob`` respectively.

    Parameters
    ----------
    res_dir : str
        Directory expected to contain a ``*-results.txt`` file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the results file, with the four columns above.

    Raises
    ------
    FileNotFoundError
        If no file in ``res_dir`` matches ``*-results.txt``.
    """
    cols = ['id', 'label', 'pred', 'pred_prob']
    # Fields at odd positions are skipped.
    # NOTE(review): presumably they hold textual captions - confirm against the toolkit output.
    valid_cols = [0, 2, 4, 6]

    pattern = os.path.join(res_dir, '*-results.txt')
    # Sort so the chosen file is deterministic when several files match.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail here with a clear message instead of later with an unbound-variable error.
        raise FileNotFoundError(f"No results file matching {pattern!r}")

    # A regex separator is only supported by the python parsing engine.
    df_res = pd.read_csv(matches[0], names=cols, skipinitialspace=True, usecols=valid_cols, sep=':|,', engine='python')

    return df_res

def reset_data(orig_dir, dest_dir):
    """Replace ``dest_dir`` with a fresh copy of ``orig_dir``.

    Any existing ``dest_dir`` is removed first and then ``orig_dir`` is copied
    in its place. If ``orig_dir`` is not an existing directory nothing is
    touched, so a wrong source path cannot wipe the destination.

    Parameters
    ----------
    orig_dir : str
        Directory to copy from.
    dest_dir : str
        Directory to (re)create as a copy of ``orig_dir``.

    Returns
    -------
    bool
        ``True`` when the copy succeeded, ``False`` when the source is missing
        or a filesystem error occurred while removing or copying.
    """
    # Check the source before deleting anything: otherwise a missing source
    # would leave the destination removed with nothing to replace it.
    if not os.path.isdir(orig_dir):
        return False
    try:
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        shutil.copytree(orig_dir, dest_dir)
    except OSError:
        # shutil.Error derives from OSError, so copy failures are covered too.
        # KeyboardInterrupt and programming errors are deliberately not swallowed.
        return False
    return True

def get_different_values(column):
    """Collapse runs of equal consecutive values in ``column``.

    Returns a list holding the first element followed by every element that
    differs from the one kept just before it, in order. If ``column[0]``
    cannot be read for any reason, an empty list is returned.
    """
    try:
        previous = column[0]
    except:
        # Any failure to read position 0 yields an empty result.
        return []

    collapsed = [previous]
    for position in range(len(column)):
        current = column[position]
        if current != previous:
            # A new run starts here: record it and compare against it from now on.
            collapsed.append(current)
            previous = current
    return collapsed

In [10]:
# Source dataset directory and the scratch copy that is handed to the toolkit.
inDir = os.path.join('data', 'd1')
tmpDir = os.path.join('data', 'd1-tmp')

# Column names passed to the toolkit as --validColumns.
features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
# Column name passed as --timeName.
time = 't'
# Class columns, processed in this order; each is passed as --className and --trjSeparator.
labels = ["C10","C9","C8","C7","C6","C5","C4","C3","C2","C1"]
# Values passed one at a time as --CTBNC.
models = ['CTNB', 'CTBNC2-LL', 'CTBNC4-LL', 'CTBNC8-LL']
# Passed as --validation=HO,{train_perc}.
# NOTE(review): presumably the hold-out training fraction - confirm in the toolkit docs.
train_perc = 0.75

# Comma-separated form of `features` for the command line.
features_cmd = ','.join(f"{item}" for item in features)
# Bare expression: it is not the last statement of the cell, so nothing is displayed.
features_cmd

# Recreate tmpDir as a fresh copy of inDir; the returned flag is the cell's output.
reset_data(inDir, tmpDir)

True

In [11]:
# Classifier chain: for every model, train one classifier per label in the order given
# by `labels`, adding each processed label to the input columns of the following ones.
for mod in models:
    # Start every model from a fresh copy of the data and from the base feature list.
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    # os.mkdir raises FileExistsError if the directory is left over from an earlier run.
    os.mkdir(os.path.join('results', 'ch-perfect-d1' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'ch-perfect-d1' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)

        # Run the application to generate, train and test the classifier
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )
        # Flush so the Python print is not reordered against the shell command's output.
        sys.stdout.flush()
        
        # Append this label's column name to the feature list so the next label's
        # classifier receives it as an input column, then rebuild the command-line string.
        # NOTE(review): no Python code in this cell writes predictions into tmpDir, so the
        # chained column appears to be the original label values - confirm this is intended.
        features.append(item)
        features_cmd = ','.join(f"{item}" for item in features)

M0_CTNB results printing
Accuracy score for C10 -> 0.53
M0_CTNB results printing
Accuracy score for C9 -> 0.518
M0_CTNB results printing
Accuracy score for C8 -> 0.508
M0_CTNB results printing
Accuracy score for C7 -> 0.496
M0_CTNB results printing
Accuracy score for C6 -> 0.546
M0_CTNB results printing
Accuracy score for C5 -> 0.594
M0_CTNB results printing
Accuracy score for C4 -> 0.712
M0_CTNB results printing
Accuracy score for C3 -> 0.716
M0_CTNB results printing
Accuracy score for C2 -> 0.836
M0_CTNB results printing
Accuracy score for C1 -> 0.856
M0_CTBNC2-LL results printing
Accuracy score for C10 -> 0.532
M0_CTBNC2-LL results printing
Accuracy score for C9 -> 0.644
M0_CTBNC2-LL results printing
Accuracy score for C8 -> 0.496
M0_CTBNC2-LL results printing
Accuracy score for C7 -> 0.546
M0_CTBNC2-LL results printing
Accuracy score for C6 -> 0.604
M0_CTBNC2-LL results printing
Accuracy score for C5 -> 0.604
M0_CTBNC2-LL results printing
Accuracy score for C4 -> 0.714
M0_CTBNC2-LL

In [12]:
# Recreate tmpDir as a fresh copy of inDir now that the chain runs are done.
reset_data(inDir, tmpDir)

True

# Binary relevance

In [13]:
# Source dataset directory and the scratch copy that is handed to the toolkit.
inDir = os.path.join('data', 'd1')
tmpDir = os.path.join('data', 'd1-tmp')

# Column names passed to the toolkit as --validColumns.
features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
# Column name passed as --timeName.
time = 't'
# Class columns; each is passed as --className and --trjSeparator.
labels = ["C10","C9","C8","C7","C6","C5","C4","C3","C2","C1"]
# Values passed one at a time as --CTBNC.
models = ['CTNB', 'CTBNC2-LL', 'CTBNC4-LL', 'CTBNC8-LL']
# Passed as --validation=HO,{train_perc}.
# NOTE(review): presumably the hold-out training fraction - confirm in the toolkit docs.
train_perc = 0.75

# Comma-separated form of `features` for the command line.
features_cmd = ','.join(f"{item}" for item in features)
# Bare expression: it is not the last statement of the cell, so nothing is displayed.
features_cmd

# Recreate tmpDir as a fresh copy of inDir; the returned flag is the cell's output.
reset_data(inDir, tmpDir)

True

In [14]:
# Binary relevance: for every model, train one classifier per label using the same
# base feature columns each time (the feature list is never extended in this cell).
for mod in models:
    # Start every model from a fresh copy of the data and from the base feature list.
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    # os.mkdir raises FileExistsError if the directory is left over from an earlier run.
    os.mkdir(os.path.join('results', 'br-perfect-d1' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'br-perfect-d1' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)

        # Run the application to generate, train and test the classifier
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --v --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        # Fraction of rows whose 'pred' equals 'label' for this label.
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )

M0_CTNB results printing
Accuracy score for C10 -> 0.53
M0_CTNB results printing
Accuracy score for C9 -> 0.518
M0_CTNB results printing
Accuracy score for C8 -> 0.508
M0_CTNB results printing
Accuracy score for C7 -> 0.496
M0_CTNB results printing
Accuracy score for C6 -> 0.546
M0_CTNB results printing
Accuracy score for C5 -> 0.594
M0_CTNB results printing
Accuracy score for C4 -> 0.712
M0_CTNB results printing
Accuracy score for C3 -> 0.714
M0_CTNB results printing
Accuracy score for C2 -> 0.836
M0_CTNB results printing
Accuracy score for C1 -> 0.856
M0_CTBNC2-LL results printing
Accuracy score for C10 -> 0.532
M0_CTBNC2-LL results printing
Accuracy score for C9 -> 0.644
M0_CTBNC2-LL results printing
Accuracy score for C8 -> 0.496
M0_CTBNC2-LL results printing
Accuracy score for C7 -> 0.606
M0_CTBNC2-LL results printing
Accuracy score for C6 -> 0.578
M0_CTBNC2-LL results printing
Accuracy score for C5 -> 0.604
M0_CTBNC2-LL results printing
Accuracy score for C4 -> 0.714
M0_CTBNC2-LL

In [15]:
# Recreate tmpDir as a fresh copy of inDir now that the binary-relevance runs are done.
reset_data(inDir, tmpDir)

True