# First, let's generate a multi-label dataset for the CTBNC classifier

In [1]:
import os, sys
import pandas as pd
import glob
import shutil
from sklearn.metrics import accuracy_score

# Let's use the default CTBNC

https://github.com/dcodecasa/CTBNCToolkit

In [2]:
# Confirm a Java runtime is reachable: the toolkit is launched with `java -jar` in later cells.
!java -version

java version "1.8.0_241"
Java(TM) SE Runtime Environment (build 1.8.0_241-b07)
Java HotSpot(TM) 64-Bit Server VM (build 25.241-b07, mixed mode)


In [3]:
# Move up to the folder that holds CTBNCToolkit.jar so the relative 'data' and
# 'results' paths used by the following cells resolve.
# The guard makes the cell idempotent: without it every re-run climbs one more
# directory and all later relative paths break.
if not os.path.isfile('CTBNCToolkit.jar'):
    os.chdir('..')

In [14]:
# List the working directory to check that the jar and the 'data' / 'results' folders are visible from here.
# NOTE(review): `dir` is a Windows shell command; on Linux/macOS the equivalent is `!ls`.
!dir

 El volumen de la unidad C no tiene etiqueta.
 El número de serie del volumen es: 9CE1-B000

 Directorio de C:\Users\berna\Downloads\MUIA\TFM\ctbnc-chain\CTBNCToolkit

06/07/2021  01:24    <DIR>          .
06/07/2021  01:24    <DIR>          ..
07/03/2021  20:15    <DIR>          .vscode
04/03/2021  01:10    <DIR>          CTBNCToolkit
06/04/2021  22:57           175.327 CTBNCToolkit.jar
05/07/2021  08:33    <DIR>          data
04/03/2021  01:10    <DIR>          lib
04/03/2021  01:10             1.518 LICENSE
04/03/2021  01:10             5.313 makefile
04/03/2021  01:10               124 MANIFEST.MF
06/07/2021  01:29    <DIR>          notebooks
04/03/2021  01:10             2.126 README.md
06/07/2021  01:23    <DIR>          results
04/07/2021  22:58    <DIR>          results-base
05/07/2021  08:11    <DIR>          results-second
               5 archivos        184.408 bytes
              10 dirs  411.965.693.952 bytes libres


# Define the parameters

In [None]:
# Project-relative folders: the input data, the working copy that reset_data()
# rebuilds from it, and the folder for results.
inDir, tmpDir = (os.path.join('data', folder) for folder in ('energy', 'energy-tmp'))
outDir = os.path.join('results', 'real-data')

In [None]:
# Column names handed to the toolkit on the command line:
# the 15 feature columns, the time column and the 6 class (label) columns.
features = 'IA IB IC VA VB VC SA SB SC PA PB PC QA QB QC'.split()
time = 'timestamp'
labels = 'M6 M5 M4 M3 M2 M1'.split()

# Let's make predictions for each one of the labels independently

In [15]:
def read_results(res_dir):
    """Load the prediction file found in ``res_dir`` into a DataFrame.

    The first file matching ``*-results.txt`` is parsed. Each line is split on
    ':' or ',' and only the fields at positions 0, 2, 4 and 6 are kept.

    Parameters
    ----------
    res_dir : str
        Directory expected to contain a ``*-results.txt`` file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns ``id``, ``label``,
        ``pred`` and ``pred_prob``.

    Raises
    ------
    FileNotFoundError
        If no ``*-results.txt`` file exists in ``res_dir``. Previously this
        case only printed a message and then crashed on an unbound variable.
    """
    cols = ['id', 'label', 'pred', 'pred_prob']
    # presumably the skipped odd positions hold the field captions — TODO confirm
    valid_cols = [0, 2, 4, 6]
    pattern = os.path.join(res_dir, '*-results.txt')

    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"Error while trying to read {pattern}: no such file")

    # A regex separator requires the (slower) python parsing engine.
    return pd.read_csv(matches[0], names=cols, skipinitialspace=True,
                       usecols=valid_cols, sep=':|,', engine='python')

In [16]:
def reset_data(orig_dir, dest_dir):
    """Replace ``dest_dir`` with a fresh copy of ``orig_dir``.

    Parameters
    ----------
    orig_dir : str
        Directory to copy from.
    dest_dir : str
        Directory to (re)create; any existing content is deleted first.

    Returns
    -------
    bool
        True when the copy succeeded, False when ``orig_dir`` is not a
        directory or a filesystem error occurred.
    """
    # Validate the source before touching the destination: otherwise a wrong
    # ``orig_dir`` would wipe ``dest_dir`` and only then fail to copy.
    if not os.path.isdir(orig_dir):
        return False
    try:
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        shutil.copytree(orig_dir, dest_dir)
    except OSError:
        # shutil.Error derives from OSError, so copy failures land here too.
        # Unlike a bare ``except``, KeyboardInterrupt is no longer swallowed.
        return False
    return True

# Rename all the files: file names must not contain ','

In [None]:
import re

# Sanity check of the pattern used by the renaming cell: the first match is the run of digits at the start of a raw file name.
re.findall(r"\d+", "1096893478_['inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive'].csv")

In [None]:
import os
import re

# Give every raw file in inDir a comma-free name "<first digits>-<unique id>.csv",
# then rebuild the working copy in tmpDir.
renamed_pattern = re.compile(r"\d+-\d+\.csv")

i = 0 # unique ID; just in case
for file in sorted(os.listdir(inDir)):  # sorted: os.listdir order is arbitrary
    file_path = os.path.join(inDir, file)

    # Skip sub-directories and files already renamed by an earlier run, so the
    # cell can be executed again without renaming everything a second time.
    if not os.path.isfile(file_path) or renamed_pattern.fullmatch(file):
        continue

    digits = re.findall(r"\d+", file)
    if not digits:
        # Previously this raised IndexError and aborted half-way through.
        print("Skipping file without any digits in its name:", file)
        continue

    file_renamed_path = os.path.join(inDir, f"{digits[0]}-{i}.csv")
    # Never rename onto an existing file: os.rename silently replaces it on
    # POSIX and raises FileExistsError on Windows.
    while os.path.exists(file_renamed_path):
        i += 1
        file_renamed_path = os.path.join(inDir, f"{digits[0]}-{i}.csv")

    os.rename(file_path, file_renamed_path)
    i += 1

reset_data(inDir, tmpDir)

# CTBNC chain

In [26]:
def read_results(res_dir):
    """Load the prediction file found in ``res_dir`` into a DataFrame.

    The first file matching ``*-results.txt`` is parsed. Each line is split on
    ':' or ',' and only the fields at positions 0, 2, 4 and 6 are kept.

    Parameters
    ----------
    res_dir : str
        Directory expected to contain a ``*-results.txt`` file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns ``id``, ``label``,
        ``pred`` and ``pred_prob``.

    Raises
    ------
    FileNotFoundError
        If no ``*-results.txt`` file exists in ``res_dir``. Previously this
        case only printed a message and then crashed on an unbound variable.
    """
    cols = ['id', 'label', 'pred', 'pred_prob']
    # presumably the skipped odd positions hold the field captions — TODO confirm
    valid_cols = [0, 2, 4, 6]
    pattern = os.path.join(res_dir, '*-results.txt')

    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"Error while trying to read {pattern}: no such file")

    # A regex separator requires the (slower) python parsing engine.
    return pd.read_csv(matches[0], names=cols, skipinitialspace=True,
                       usecols=valid_cols, sep=':|,', engine='python')

def reset_data(orig_dir, dest_dir):
    """Replace ``dest_dir`` with a fresh copy of ``orig_dir``.

    Parameters
    ----------
    orig_dir : str
        Directory to copy from.
    dest_dir : str
        Directory to (re)create; any existing content is deleted first.

    Returns
    -------
    bool
        True when the copy succeeded, False when ``orig_dir`` is not a
        directory or a filesystem error occurred.
    """
    # Validate the source before touching the destination: otherwise a wrong
    # ``orig_dir`` would wipe ``dest_dir`` and only then fail to copy.
    if not os.path.isdir(orig_dir):
        return False
    try:
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        shutil.copytree(orig_dir, dest_dir)
    except OSError:
        # shutil.Error derives from OSError, so copy failures land here too.
        # Unlike a bare ``except``, KeyboardInterrupt is no longer swallowed.
        return False
    return True

def get_different_values(column):
    """Collapse runs of equal consecutive values into a single entry.

    For example ``['a', 'a', 'b', 'b', 'a']`` becomes ``['a', 'b', 'a']``.

    Parameters
    ----------
    column : iterable
        Values in order (list, tuple, pandas Series, ...). The values are
        iterated rather than looked up as ``column[i]``, so a Series whose
        index is not 0..n-1 is handled as well.

    Returns
    -------
    list
        The first value of every run, in order. Empty when ``column`` is
        empty or not iterable (e.g. ``None``).
    """
    try:
        values = iter(column)
    except TypeError:
        # Keep the lenient contract: unusable input yields an empty list.
        return []

    diff_vals = []
    for actual_val in values:
        # diff_vals[-1] is always the value that opened the current run.
        if not diff_vals or actual_val != diff_vals[-1]:
            diff_vals.append(actual_val)
    return diff_vals

In [27]:
# Configuration for the classifier-chain experiment.
inDir, tmpDir = (os.path.join('data', folder) for folder in ('energy', 'energy-tmp'))

features = 'IA IB IC VA VB VC SA SB SC PA PB PC QA QB QC'.split()
time = 'timestamp'
labels = 'M6 M5 M4 M3 M2 M1'.split()
models = ['ACTNB2-LL', 'ACTNB4-LL']
train_perc = 0.75  # passed to the toolkit as --validation=HO,<train_perc>

# Comma-separated form expected by the --validColumns option
features_cmd = ','.join(features)

# Restore the tmp directory
reset_data(inDir, tmpDir)

True

In [28]:
for mod in models:
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    os.mkdir(os.path.join('results', 'ch-perfect-real' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'ch-perfect-real' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)
        
        # Run the application to generate, train and test the classifier
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )
        sys.stdout.flush()

        # Finally append the label to the features for the next model to use it as input and reset the classes
        features.append(item)
        features_cmd = ','.join(f"{item}" for item in features)

M0_ACTNB2-LL results printing
Accuracy score for M6 -> 0.7494969818913481
M0_ACTNB2-LL results printing
Accuracy score for M5 -> 0.9486921529175051
M0_ACTNB2-LL results printing
Accuracy score for M4 -> 0.8630952380952381
M0_ACTNB2-LL results printing
Accuracy score for M3 -> 0.7335834896810507
M0_ACTNB2-LL results printing
Accuracy score for M2 -> 0.7357414448669202
M0_ACTNB2-LL results printing
Accuracy score for M1 -> 0.9980988593155894
M0_ACTNB4-LL results printing
Accuracy score for M6 -> 0.7494969818913481
M0_ACTNB4-LL results printing
Accuracy score for M5 -> 0.9486921529175051
M0_ACTNB4-LL results printing
Accuracy score for M4 -> 0.8630952380952381
M0_ACTNB4-LL results printing
Accuracy score for M3 -> 0.7335834896810507
M0_ACTNB4-LL results printing
Accuracy score for M2 -> 0.7357414448669202
M0_ACTNB4-LL results printing
Accuracy score for M1 -> 0.9980988593155894


In [29]:
# Restore the tmp directory: tmpDir is deleted and copied again from inDir.
# The cell output is reset_data's return value (True on success, False on failure).
reset_data(inDir, tmpDir)

True

# Binary relevance

In [30]:
# Configuration for the binary-relevance experiment.
inDir = os.path.join('data', 'energy')
tmpDir = os.path.join('data', 'energy-tmp')
outDir = os.path.join('results', 'real-data')

features = ['IA','IB','IC','VA','VB','VC','SA','SB','SC','PA','PB','PC','QA','QB','QC']
time = 'timestamp'
labels = ['M6','M5','M4','M3','M2','M1']
models = ['ACTNB2-LL', 'ACTNB4-LL']
train_perc = 0.75  # passed to the toolkit as --validation=HO,<train_perc>

# Comma-separated form expected by the --validColumns option.
# (A bare `features_cmd` expression used to follow; it was dropped because a
# notebook only displays the LAST expression of a cell, so it showed nothing.)
features_cmd = ','.join(features)

# Restore the tmp directory
reset_data(inDir, tmpDir)

True

In [31]:
for mod in models:
    reset_data(inDir, tmpDir)
    features = ["X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"]
    features_cmd = ','.join(f"{item}" for item in features)
    os.mkdir(os.path.join('results', 'br-perfect-real' + mod), 0o777)
    for item in labels:
        outDir = os.path.join('results', 'br-perfect-real' + mod, item) # Generate the directory to store the results
        os.mkdir(outDir, 0o777)

        # Run the application to generate, train and test the classifier
        #!java -jar CTBNCToolkit.jar --CTBNC={models} --v --validation=HO,{train_perc} --validColumns={features_cmd} --timeName={time} --className={item} --rPath={outDir} {tmpDir}
        !java -jar CTBNCToolkit.jar --CTBNC={mod} --validation=HO,{train_perc} --validColumns={features_cmd} --trjSeparator={item} --timeName={time} --className={item} --rPath={outDir} {tmpDir}

        # Get the predictions for the test subset
        df_res = read_results(outDir)
        acc = accuracy_score(df_res['label'], df_res['pred'])
        print('Accuracy score for', item, '->', acc )

M0_ACTNB2-LL results printing
Accuracy score for M6 -> 0.7494969818913481
M0_ACTNB2-LL results printing
Accuracy score for M5 -> 0.7494969818913481
M0_ACTNB2-LL results printing
Accuracy score for M4 -> 0.7142857142857143
M0_ACTNB2-LL results printing
Accuracy score for M3 -> 0.6210131332082551
M0_ACTNB2-LL results printing
Accuracy score for M2 -> 0.5760456273764258
M0_ACTNB2-LL results printing
Accuracy score for M1 -> 0.5760456273764258
M0_ACTNB4-LL results printing
Accuracy score for M6 -> 0.7494969818913481
M0_ACTNB4-LL results printing
Accuracy score for M5 -> 0.7494969818913481
M0_ACTNB4-LL results printing
Accuracy score for M4 -> 0.7142857142857143
M0_ACTNB4-LL results printing
Accuracy score for M3 -> 0.6210131332082551
M0_ACTNB4-LL results printing
Accuracy score for M2 -> 0.5760456273764258
M0_ACTNB4-LL results printing
Accuracy score for M1 -> 0.5760456273764258


In [32]:
# Restore the tmp directory: tmpDir is deleted and copied again from inDir.
# The cell output is reset_data's return value (True on success, False on failure).
reset_data(inDir, tmpDir)

True