# First, let's generate a multi-label dataset for the CTBNC classifier

In [12]:
import os, sys
import pandas as pd
import glob
import shutil
from sklearn.metrics import accuracy_score

# Let's use the default CTBNC

https://github.com/dcodecasa/CTBNCToolkit

In [2]:
# Print the installed Java version; the toolkit jar is launched with `java -jar` later in this notebook.
!java -version

java version "1.8.0_241"
Java(TM) SE Runtime Environment (build 1.8.0_241-b07)
Java HotSpot(TM) 64-Bit Server VM (build 25.241-b07, mixed mode)


In [3]:
# Move up to the toolkit root, i.e. the directory that holds CTBNCToolkit.jar,
# which later cells reference with relative paths. The guard makes the cell
# idempotent: re-running it no longer climbs one directory further each time.
if not os.path.exists('CTBNCToolkit.jar'):
    os.chdir('..')

In [10]:
# List the current working directory (`dir` is the Windows shell command).
!dir

 El volumen de la unidad C no tiene etiqueta.
 El número de serie del volumen es: 9CE1-B000

 Directorio de C:\Users\berna\Downloads\MUIA\TFM\ctbnc-chain\CTBNCToolkit

14/06/2021  16:06    <DIR>          .
14/06/2021  16:06    <DIR>          ..
07/03/2021  20:15    <DIR>          .vscode
04/03/2021  01:10    <DIR>          CTBNCToolkit
06/04/2021  22:57           175.327 CTBNCToolkit.jar
14/06/2021  16:33    <DIR>          data
04/03/2021  01:10    <DIR>          lib
04/03/2021  01:10             1.518 LICENSE
04/03/2021  01:10             5.313 makefile
04/03/2021  01:10               124 MANIFEST.MF
14/06/2021  16:34    <DIR>          notebooks
04/03/2021  01:10             2.126 README.md
14/06/2021  16:11    <DIR>          results
               5 archivos        184.408 bytes
               8 dirs  408.697.098.240 bytes libres


# Define the parameters

In [5]:
# Input directory, its scratch copy (same path with a "-tmp" suffix) and the results directory.
inDir = os.path.join('data', 'energy')
tmpDir = inDir + '-tmp'
outDir = os.path.join('results', 'real-data')

In [6]:
# Feature columns: every prefix I, V, S, P, Q combined with every suffix A, B, C (IA, IB, ..., QC).
features = [prefix + suffix for prefix in 'IVSPQ' for suffix in 'ABC']
# Name of the time column.
time = 'timestamp'
# Label columns M1..M6.
labels = ['M' + str(k) for k in range(1, 7)]

# Let's make predictions for each one of the labels independently

In [77]:
def read_results(res_dir):
    """Load the predictions file stored in ``res_dir`` into a DataFrame.

    Looks for a file matching ``*-results.txt`` inside ``res_dir`` and parses
    it with pandas, splitting every line on ':' or ',' and keeping only the
    fields at positions 0, 2, 4 and 6.

    Parameters
    ----------
    res_dir : str
        Directory that contains the results file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns ``id``, ``label``,
        ``pred`` and ``pred_prob``.

    Raises
    ------
    FileNotFoundError
        If no ``*-results.txt`` file exists in ``res_dir``.
    """
    cols = ['id', 'label', 'pred', 'pred_prob']
    valid_cols = [0, 2, 4, 6]  # every other field; the ones in between are dropped

    pattern = os.path.join(res_dir, '*-results.txt')
    # Sorted so the choice is deterministic if several files match.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail loudly here instead of printing and then crashing on an
        # undefined variable a few lines later.
        raise FileNotFoundError(f"Error while trying to read {pattern}: no matching file")

    return pd.read_csv(matches[0], names=cols, skipinitialspace=True,
                       usecols=valid_cols, sep=':|,', engine='python')

In [8]:
def reset_data(orig_dir, dest_dir):
    """Replace ``dest_dir`` with a fresh copy of ``orig_dir``.

    Any existing ``dest_dir`` is deleted first, then ``orig_dir`` is copied
    recursively to ``dest_dir``.

    Parameters
    ----------
    orig_dir : str
        Directory to copy from.
    dest_dir : str
        Directory to (re)create.

    Returns
    -------
    bool
        ``True`` on success, ``False`` if a filesystem error occurred (the
        error is printed). Note that ``dest_dir`` may already have been
        removed when ``False`` is returned.
    """
    try:
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        shutil.copytree(orig_dir, dest_dir)
    except OSError as err:
        # shutil.Error is an OSError subclass, so copy failures land here too.
        # Anything else (KeyboardInterrupt, programming errors) now propagates.
        print("Error while resetting", dest_dir, "from", orig_dir, "->", err)
        return False
    return True

True

# Rename all the files: file names cannot contain ','

In [17]:
import re

# Sanity check on one sample file name: the only run of digits is the leading numeric id.
re.findall(r"\d+", "1096893478_['inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive'].csv")

['1096893478']

In [19]:
import os
import re

# Rename every file in inDir to "<first run of digits>-<counter>.csv" so that
# file names no longer contain ','. The counter keeps names unique when two
# files share the same leading digits.
#
# The cell is safe to re-run:
#   * files that already look like "<digits>-<digits>.csv" are left alone;
#   * a target name that already exists is never overwritten (os.rename would
#     silently replace it on POSIX);
#   * names without any digit are reported and skipped instead of raising
#     IndexError;
#   * the listing is sorted so the counter assignment is deterministic.
already_renamed = re.compile(r"^\d+-\d+\.csv$")

i = 0 # unique ID; just in case
for file in sorted(os.listdir(inDir)):
    if already_renamed.match(file):
        continue
    digit_runs = re.findall(r"\d+", file)
    if not digit_runs:
        print("Skipping file without a numeric id:", file)
        continue
    file_path = os.path.join(inDir, file)
    file_renamed_path = os.path.join(inDir, digit_runs[0] + "-" + str(i) + '.csv')
    while os.path.exists(file_renamed_path):
        # Counter value already taken (e.g. by a previous run): try the next one.
        i += 1
        file_renamed_path = os.path.join(inDir, digit_runs[0] + "-" + str(i) + '.csv')
    os.rename(file_path, file_renamed_path)
    i += 1

# Refresh the scratch copy so it sees the new file names.
reset_data(inDir, tmpDir)

True

# CTBNC chain

In [24]:
def read_results(res_dir):
    """Load the predictions file stored in ``res_dir`` into a DataFrame.

    Looks for a file matching ``*-results.txt`` inside ``res_dir`` and parses
    it with pandas, splitting every line on ':' or ',' and keeping only the
    fields at positions 0, 2, 4 and 6.

    Parameters
    ----------
    res_dir : str
        Directory that contains the results file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns ``id``, ``label``,
        ``pred`` and ``pred_prob``.

    Raises
    ------
    FileNotFoundError
        If no ``*-results.txt`` file exists in ``res_dir``.
    """
    cols = ['id', 'label', 'pred', 'pred_prob']
    valid_cols = [0, 2, 4, 6]  # every other field; the ones in between are dropped

    pattern = os.path.join(res_dir, '*-results.txt')
    # Sorted so the choice is deterministic if several files match.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail loudly here instead of printing and then crashing on an
        # undefined variable a few lines later.
        raise FileNotFoundError(f"Error while trying to read {pattern}: no matching file")

    return pd.read_csv(matches[0], names=cols, skipinitialspace=True,
                       usecols=valid_cols, sep=':|,', engine='python')

def reset_data(orig_dir, dest_dir):
    """Replace ``dest_dir`` with a fresh copy of ``orig_dir``.

    Any existing ``dest_dir`` is deleted first, then ``orig_dir`` is copied
    recursively to ``dest_dir``.

    Parameters
    ----------
    orig_dir : str
        Directory to copy from.
    dest_dir : str
        Directory to (re)create.

    Returns
    -------
    bool
        ``True`` on success, ``False`` if a filesystem error occurred (the
        error is printed). Note that ``dest_dir`` may already have been
        removed when ``False`` is returned.
    """
    try:
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        shutil.copytree(orig_dir, dest_dir)
    except OSError as err:
        # shutil.Error is an OSError subclass, so copy failures land here too.
        # Anything else (KeyboardInterrupt, programming errors) now propagates.
        print("Error while resetting", dest_dir, "from", orig_dir, "->", err)
        return False
    return True

def get_different_values(column):
    """Collapse consecutive repeated values of ``column`` into one entry each.

    Walks ``column`` in order and records a value every time it differs from
    the one recorded just before it, e.g. ``['a', 'a', 'b', 'a']`` gives
    ``['a', 'b', 'a']``.

    Parameters
    ----------
    column : iterable
        Sequence of values (list, tuple, pandas Series, ...). It is iterated
        by position, so a Series with a non-default index is handled too.

    Returns
    -------
    list
        The values at the start of every run; ``[]`` for an empty input.
    """
    diff_vals = []
    for value in column:
        # First element, or a change with respect to the last recorded value.
        if not diff_vals or value != diff_vals[-1]:
            diff_vals.append(value)
    return diff_vals

In [25]:
# Input directory and its scratch copy (same path with a "-tmp" suffix).
inDir = os.path.join('data', 'energy')
tmpDir = inDir + '-tmp'

# Feature columns: every prefix I, V, S, P, Q combined with every suffix A, B, C.
features = [prefix + suffix for prefix in 'IVSPQ' for suffix in 'ABC']
time = 'timestamp'
labels = ['M' + str(k) for k in range(1, 7)]
# Values substituted into the toolkit's --CTBNC option.
models = ['CTNB'] + ['CTBNC' + str(k) + '-LL' for k in (2, 4, 8)]
# Second value of the toolkit's --validation=HO,<value> option.
train_perc = 0.75

# Comma-separated feature list for the --validColumns option.
features_cmd = ','.join(features)

# Restore the tmp directory
reset_data(inDir, tmpDir)

True

In [26]:
for mod in models:
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    os.mkdir(os.path.join('results', 'ch-real' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'ch-real' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)
        
        # Run the application to generate, train and test the classifier
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )
        sys.stdout.flush()

        classes = []
        files = df_res["id"]
        preds = df_res["pred"]
        ss_dynamic_count = 0
        for i in range(len(files)):
            file = files[i]
            classes.append(preds[i].strip())
            try:
                next_file = files[i + 1]
            except:
                next_file = files[i]
                pass
            #print(file, next_file)
            #sys.stdout.flush()
            data = file.split("_") # filename_ind
            ind = data[-1]
            filename = data[0]
            next_ind = next_file.split("_")[-1] # filename_nextind
            #print(ind, next_ind)
            #sys.stdout.flush()
            if int(next_ind) <= int(ind):
                #print("Updating", filename, "with", classes)
                #sys.stdout.flush()
                # update the file
                tmp_file = os.path.join(tmpDir, filename)
                df_tmp = pd.read_csv(tmp_file)
                different_values = get_different_values(df_tmp[item])
                if len(different_values) != len(classes):
                    print("Error", tmp_file, different_values, classes)
                    sys.stdout.flush()
                    #print("Error con el número de cambios de trjSeparator")
                    #sys.stdout.flush()
                    ss_dynamic_count += 1

                new_col = []
                j = 0
                old_col = df_tmp[item]
                max_val = len(classes)
                for i_tmp in range(len(old_col)):
                    actual_val = old_col[i_tmp]
                    try:
                        next_val = old_col[i_tmp + 1]
                    except:
                        next_val = old_col[i_tmp]
                        pass
                    new_col.append(classes[j])
                    if actual_val != next_val:
                        j += 1
                        if j >= max_val:
                            j -= 1

                df_tmp[item] = new_col
                df_tmp.to_csv(tmp_file, header=True, index=False)
                classes = []

        # Finally append the label to the features for the next model to use it as input and reset the classes
        features.append(item)
        features_cmd = ','.join(f"{item}" for item in features)

        print("Ss dynamic count", ss_dynamic_count)
        sys.stdout.flush()

M0_CTNB results printing
Accuracy score for M1 -> 0.5760456273764258
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for M2 -> 0.5760456273764258
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for M3 -> 0.6210131332082551
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for M4 -> 0.7142857142857143
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for M5 -> 0.7494969818913481
Ss dynamic count 0
M0_CTNB results printing
Accuracy score for M6 -> 0.7494969818913481
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for M1 -> 0.5760456273764258
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for M2 -> 0.5760456273764258
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for M3 -> 0.6210131332082551
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for M4 -> 0.7142857142857143
Ss dynamic count 0
M0_CTBNC2-LL results printing
Accuracy score for M5 -> 0.7494969818913481
Ss dynamic count 0
M0_CTBN

In [27]:
# Restore the tmp directory: delete it and copy the input directory over again.
reset_data(inDir, tmpDir)

True

# Binary relevance

In [28]:
# Input directory, its scratch copy (same path with a "-tmp" suffix) and the results directory.
inDir = os.path.join('data', 'energy')
tmpDir = inDir + '-tmp'
outDir = os.path.join('results', 'real-data')

# Feature columns: every prefix I, V, S, P, Q combined with every suffix A, B, C.
features = [prefix + suffix for prefix in 'IVSPQ' for suffix in 'ABC']
time = 'timestamp'
labels = ['M' + str(k) for k in range(1, 7)]
# Values substituted into the toolkit's --CTBNC option.
models = ['CTNB'] + ['CTBNC' + str(k) + '-LL' for k in (2, 4, 8)]
# Second value of the toolkit's --validation=HO,<value> option.
train_perc = 0.75

# Comma-separated feature list for the --validColumns option.
features_cmd = ','.join(features)

# Restore the tmp directory
reset_data(inDir, tmpDir)

True

In [29]:
for mod in models:
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    os.mkdir(os.path.join('results', 'br-real' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'br-real' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)

        # Run the application to generate, train and test the classifier
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --v --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )

M0_CTNB results printing
Accuracy score for M1 -> 0.5760456273764258
M0_CTNB results printing
Accuracy score for M2 -> 0.5760456273764258
M0_CTNB results printing
Accuracy score for M3 -> 0.6210131332082551
M0_CTNB results printing
Accuracy score for M4 -> 0.7142857142857143
M0_CTNB results printing
Accuracy score for M5 -> 0.7494969818913481
M0_CTNB results printing
Accuracy score for M6 -> 0.7494969818913481
M0_CTBNC2-LL results printing
Accuracy score for M1 -> 0.5760456273764258
M0_CTBNC2-LL results printing
Accuracy score for M2 -> 0.5760456273764258
M0_CTBNC2-LL results printing
Accuracy score for M3 -> 0.6210131332082551
M0_CTBNC2-LL results printing
Accuracy score for M4 -> 0.7142857142857143
M0_CTBNC2-LL results printing
Accuracy score for M5 -> 0.7494969818913481
M0_CTBNC2-LL results printing
Accuracy score for M6 -> 0.7494969818913481
M0_CTBNC4-LL results printing
Accuracy score for M1 -> 0.5760456273764258
M0_CTBNC4-LL results printing
Accuracy score for M2 -> 0.57604562737

In [None]:
# Restore the tmp directory: delete it and copy the input directory over again.
reset_data(inDir, tmpDir)