<a href="https://colab.research.google.com/github/sfortz/VaryMinions/blob/jupyter-notebook/VaryMinions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Imports**

In [4]:
%tensorflow_version 2.x
import argparse
import sys
import time
import numpy as np
import numpy.ma as ma
import tensorflow as tf
import keras  # this activates tensorflow backend directly - do not remove
from sklearn.model_selection import train_test_split
from os import path
from os import environ
from google.colab import drive
import pandas as pd
from sklearn import preprocessing
from tensorflow.keras import backend as kb
from tensorflow.keras.callbacks import TerminateOnNaN
from tensorflow import test as tf_test
from tensorflow.keras import layers

drive.mount('/content/drive')
drive_dir = '/content/drive/My Drive/VaryMinions-Claroline/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Preprocessing**

In [5]:
# load dataset
def load_dataset(dataset_filename):
    file_to_load = drive_dir + dataset_filename
    print(f"Trying to load: {file_to_load}")
    if not path.exists(drive_dir):
        print(f"The expected dataset directory {drive_dir} was not found.")
        sys.exit()
    elif not path.exists(file_to_load):
        print(file_to_load)
        print("The dataset file was not found")
        sys.exit()

    # dealing with process of different lengths by filling missing values
    dataset = np.genfromtxt(file_to_load, delimiter=',', missing_values='', filling_values='', skip_header=0,
                            names=True, dtype=None, encoding="utf-8")
    pd_dataset = pd.DataFrame(dataset)
    return pd_dataset


# retrieve unique values in all dimensions of a dataset -> that would be the 'alphabet' of possible events
def retrieve_unique_values(dataset):
    flat_dataset = dataset.values.flatten()
    unique_value = list(set(flat_dataset))
    # print(len(unique_value))
    # print(unique_value)
    return unique_value


# preprocessing dataset
def preprocessing_to_num(dataset):
    values = retrieve_unique_values(dataset)

    # Creating a dictionary that maps integers to the events/actions (adapted from https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/)
    int2event = dict(enumerate(values))
    # Creating another dictionary that maps events/actions to integers (adapted from https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/)
    event2int = {char: ind for ind, char in int2event.items()}
    # print(event2int)

    # encoding events with mapping dictionary
    encoded_events = []
    # print(range(len(dataset)))
    for i, row in dataset.iterrows():
        encoded_events.append(row)

    for i, row in dataset.iterrows():
        encoded_events[i] = [event2int[event] for event in row]

    return encoded_events, event2int, int2event


# preprocessing categories
def preprocessing_cat(categories):
    lb = preprocessing.LabelBinarizer()
    lb.fit(np.array(categories))

    print("Categories:")
    print(lb.classes_)
    cl_enc = lb.transform(categories)

    return lb, pd.DataFrame(cl_enc)


# preprocessing
def preproc(dataset_filename):
    df_dataset = load_dataset(dataset_filename)
    classes = df_dataset.filter(regex="Category.*")
    df_dataset = df_dataset.drop(classes.columns, axis=1)
    df_cl_decod, df_cl_encoded = preprocessing_cat(classes)
    encoded_ev, event2int, int2event = preprocessing_to_num(df_dataset)
    df_ev_encoded = pd.DataFrame(encoded_ev)
    return df_ev_encoded, df_cl_encoded, event2int, int2event


## **Losses**

In [6]:
# used for segementation, not sure that what we want here :(
def ioU_jaccard_distance(y_true, y_pred, smooth=100):
    """ This distance is copied verbatim from:
    https://github.com/keras-team/keras-contrib/blob/master/keras_contrib/losses/jaccard.py"""

    intersection = K.sum(K.abs(y_true * y_pred), axis=-1)
    sum_ = K.sum(K.abs(y_true) + K.abs(y_pred), axis=-1)
    print("intersection")
    kb.print_tensor(intersection)
    print("union")
    kb.print_tensor(sum_)

    jac = (intersection + smooth) / (sum_ - intersection + smooth)
    return (1 - jac) * smooth


def dummy_indicator(y_predicted, thres=0.0):
    """
    This indicator return a tensor with values 0 if values in y_predicted < thres, else 1.
    """

    y_predicted = K.tf.where(K.greater(y_predicted, thres),K.ones_like(y_predicted), K.zeros_like(y_predicted))

    return y_predicted

# TODO: remove this method not suitable as a loss
def vary_boolean_jaccard(y_true, y_predicted):
    """
    This function implements the classic jaccard distance over an indicator functions that converts float predictions
    into boolean values.
    """
    y_predicted = dummy_indicator(y_predicted)
    y_true = K.cast(y_true, tf.float32)
    intersection = K.sum(K.abs(y_true - y_predicted), axis=-1)
    union = K.sum(K.ones_like(y_predicted), axis=-1) # number of possible matches (tensors y_predicted, y_true have the same number
    # of values)
    j_index = K.tf.divide(intersection, union)

    return j_index

# a different form of the jaccard distance which considers minimum and maximum values of the tensors 
def vary_weighted_jaccard(y_true, y_predicted):
    """ This function implements the weighted jaccard distance also known as Soergel:
    https://en.wikipedia.org/wiki/Jaccard_index. Since it works on real and positive numbers no indicator is needed
    to translate probabilities into Boolean values. """

    # we convert actual labels to float to ease comparisons
    y_true = tf.cast(y_true, tf.float32)
    num = kb.sum(kb.minimum(y_true, y_predicted), axis=-1)
    denom = kb.sum(kb.maximum(y_true, y_predicted), axis=-1)
    j_index = tf.divide(num, denom)

    ones = tf.ones_like(j_index)
    return ones - j_index

# bounded version of the weighted jaccard distance putting negative values to 0
def vary_weighted_jaccard_rectified(y_true, y_predicted):
    """ This function implements the weighted jaccard distance also known as Soergel:
    https://en.wikipedia.org/wiki/Jaccard_index. Since it works on real and positive numbers no indicator is needed
    to translate probabilities into Boolean values. This function "rectifies" negative values to zero. """

    # rectification of negative values to zero
    zeros = tf.zeros_like(y_predicted)
    y_predicted = kb.maximum(y_predicted, zeros)
    # we convert actual labels to float to ease comparisons
    y_true = tf.cast(y_true, tf.float32)

    num = kb.sum(kb.minimum(y_true, y_predicted), axis=-1)
    denom = kb.sum(kb.maximum(y_true, y_predicted), axis=-1)
    j_index = tf.divide(num, denom)

    ones = tf.ones_like(j_index)
    return ones - j_index

# the manhattan distance between two tensors
def vary_manhattan_dist(y_true,y_predicted):
    """
    Implements Manatthan distance a loss
    """
    y_actual_float = tf.cast(y_true, tf.float32)
    manh_dist = kb.sum(kb.abs(y_actual_float - y_predicted), axis=-1)
    return manh_dist

# TODO remove this method
def vary_manhattan_dist_indiv(y_true,y_predicted):
    """
    Implements Manatthan distance a loss. Reports detailed losses per classes. DO NOT USE
    """
    y_actual_float = tf.cast(y_true, tf.float32)
    manh_dist = kb.abs(y_actual_float - y_predicted)
    return manh_dist


## **Training Model**

### **Training LSTM**

In [7]:
def get_LSTM_model(alpha_size=128, nb_classes=1, nb_col=128, nb_unit=10, activation='tanh', loss='mse'):
    
    model = tf.keras.Sequential()
    model.add(layers.Embedding(alpha_size, alpha_size, mask_zero=True))
    #model.add(layers.Bidirectional(layers.LSTM(nb_unit, activation='relu', recurrent_activation = 'sigmoid')))
    model.add(layers.Bidirectional(layers.LSTM(nb_unit, activation='tanh', recurrent_activation = 'sigmoid', use_bias=True, recurrent_dropout=0.0, unroll=False)))
###
   
   # softmax is well suited when we predict multiple label for multiple classes:
    if activation == 'tanh':
        model.add(layers.Dense(nb_classes, activation='tanh'))
    elif activation == 'sigmoid':
        model.add(layers.Dense(nb_classes, activation='sigmoid'))
    else:
        raise ValueError("activation = " + activation)

    if loss == 'bin_ce':
        model.compile(optimizer='adam', loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy'])
    elif loss == 'bin_ce-logits':
        model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])
    elif loss == 'mse':
        model.compile(optimizer='adam', loss=tf.keras.losses.mean_squared_error, metrics=['accuracy'])
    elif loss == 'jaccard':
        model.compile(optimizer='adam', loss=vary_weighted_jaccard, metrics=['accuracy'])
    elif loss == 'manhattan':
        model.compile(optimizer='adam', loss=vary_manhattan_dist, metrics=['accuracy'])
    else:
        raise ValueError("loss = " + loss)

    return model


### **Training GRU**

In [8]:
def get_GRU_model(alpha_size=128, nb_classes=1, nb_col=128, nb_unit=10, activation='tanh', loss='mse'):
  
    model = tf.keras.Sequential()
    model.add(layers.Embedding(alpha_size, alpha_size, input_length=nb_col,mask_zero=True))
    model.add(layers.Bidirectional(layers.GRU(nb_unit, activation='relu', recurrent_activation='sigmoid', reset_after=TRUE,)))

    if activation == 'tanh':
        model.add(layers.Dense(nb_classes, activation='tanh'))
    elif activation == 'sigmoid':
        model.add(layers.Dense(nb_classes, activation='sigmoid'))
    else:
        raise ValueError("activation = " + activation)

    if loss == 'bin_ce':
        model.compile(optimizer='adam', loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy'])
    elif loss == 'bin_ce-logits':
        model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])
    elif loss == 'mse':
        model.compile(optimizer='adam', loss=tf.keras.losses.mean_squared_error, metrics=['accuracy'])
    elif loss == 'jaccard':
        model.compile(optimizer='adam', loss=vary_weighted_jaccard, metrics=['accuracy'])
    elif loss == 'manhattan':
        model.compile(optimizer='adam', loss=vary_manhattan_dist, metrics=['accuracy'])
    else:
        raise ValueError("loss = " + loss)

    return model


### **Training Model**

In [9]:
def get_compiled_model(model_type="RNN", alpha_size=128, nb_classes=1, nb_col=128, nb_unit=10, activation='tanh', loss='mse'):
    if model_type == "RNN":
        print("Training a RNN")
        model = get_RNN_model(alpha_size, nb_classes, nb_col, nb_unit, activation, loss)
    elif model_type == "LSTM":
        print("Training a LSTM")
        model = get_LSTM_model(alpha_size, nb_classes, nb_col, nb_unit, activation, loss)
    elif model_type == "GRU":
        print("Training a GRU")
        model = get_GRU_model(alpha_size, nb_classes, nb_col, nb_unit, activation, loss)
    else:
        sys.exit("ERROR: " + model_type + " is not recognized as a model type")

    return model


def analyze_predictions(predictions, tfclass):
    samples = predictions[:15]
    np_class = tfclass.numpy()

    for i in range(0, len(samples)):  # for multi-label we look at the highest predictions
        print(" ============ " + "Sample: " + str(i) + " ============")
        pred = (-samples[i]).argsort()  # predictions[i]
        print("pred sorted: " + str(pred) + " pred: " + str(samples[i]))
        np_masked_class = ma.masked_equal(np_class[i], 0)
        label = np_masked_class.nonzero()
        print(" Top predictions indices: " + str(pred[:len(label[0])]) + " vs. real classes indices: " + str(label[0]))
        intersect = np.intersect1d(pred[:len(label[0])], label[0])
        union = np.union1d(pred[:len(label[0])], label[0])
        jaccard_score = len(intersect) / len(union)
        print(f"Jaccard Score: {jaccard_score}")
        print(" ")

In [10]:
def main(dataset_filename, ev_encoded, cl_encoded, event2int, int2event, model_type, nb_epochs, nb_unit, batch_size, percent_training, activation, loss):    
    
    start_time = time.time()

    # we can give both 0.66 and 66 for instance
    if percent_training > 1.0:
        percent_training = percent_training / 100

    x_tr, x_ts, y_tr, y_ts = train_test_split(ev_encoded, cl_encoded, train_size=percent_training)

    print("output y_tr")
    # print(y_tr)
    print(x_tr.shape)
    print(y_tr.shape)
    print(x_ts.shape)
    print(y_ts.shape)

    # Our vectorized labels

    print("Test generation tensorFlow datasets")

    # turn into tensorFlow dataset
    # tf_train = tf.data.Dataset.from_tensor_slices((x_tr.values, y_tr.values))
    tf_train = tf.convert_to_tensor(x_tr)
    tf_label = tf.convert_to_tensor(y_tr)
    # for element in tf_train:
    #    print(element)
    # tf_test = tf.data.Dataset.from_tensor_slices((x_ts.values, y_ts.values))
    tf_test = tf.convert_to_tensor(x_ts)
    tf_class = tf.convert_to_tensor(y_ts)

    # reshape for tensorflow/keras RNN -> df_dataset.columns = number of features; one class to retrieve
    # tf_train = tf.reshape(tf_train,[-1,1,df_dataset.columns])

    # tf_test = tf.reshape(tf_test,[-1,1,df_dataset.columns])

    print("End generating tensorFlow datasets")

    print("Alphabet size:")
    print(len(event2int))
    model = get_compiled_model(model_type=model_type, alpha_size=len(event2int),
                               nb_classes=len(cl_encoded.columns),
                               nb_col=len(ev_encoded.columns), nb_unit=nb_unit, activation=activation, loss=loss)

    print(model.summary())

    callbacks = [TerminateOnNaN()]

    history = model.fit(tf_train, tf_label, epochs=nb_epochs, batch_size=batch_size, callbacks=callbacks)

    #print('Last loss value:')
    #print(list(history.history['loss'])[-1])

    print("Evaluate on test data")
    results = model.evaluate(tf_test, tf_class, batch_size=batch_size)

    print("Generate predictions")
    pred_noarg = model.predict(tf_test)
    print("Analyzing predictions")
    analyze_predictions(pred_noarg, tf_class)

    output_directory = drive_dir + 'results/training_metrics/' # "../../results/training_metrics/"
    output_filename_base = path.basename(dataset_filename)
    output_filename = output_filename_base + '_metrics_' + str(model_type) + '_nb_unit_' + str(
        nb_unit) + '_training_set_size_' + str(
        percent_training) + '_nb_epochs_' + str(nb_epochs) + '_batch_size_' + str(batch_size) + \
                      '_' + TENSORFLOW_DEVICE + '_tensorflow_' + str(loss) + '_' + str(activation)
  
    output_filename = output_filename + '_multi.txt'

    output_file = output_directory + output_filename
    f = open(output_file, "a")
    orig_stdout = sys.stdout
    sys.stdout = f

    seconds = time.time() - start_time
    exec_time = time.strftime("%H:%M:%S", time.gmtime(seconds))
    results.append(exec_time)

    print("test loss, test acc, exec time:", results)

    sys.stdout = orig_stdout
    f.close()
    if np.isnan(results[0]):
        print("LOSS IS NAN! LOOP AGAIN.")
        main(dataset_filename, model_type, nb_epochs, nb_unit, batch_size, percent_training, activation, loss)


## **Main**

### **Configuration**

In [11]:
dataset_filename = "claroline-dis_10.csv"
model_type = "LSTM"
nb_epochs = 20
nb_unit = 30
batch_size = 128
percent_training = 0.66 
activation = "tanh" 
loss = "bin_ce"
nb_iterations = 10

### **Checking for GPU**

In [12]:
if tf_test.is_gpu_available():
  device_name = tf.test.gpu_device_name()
  if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
  print('Found GPU at: {}'.format(device_name))
  !nvidia-smi -L
else:
  print('GPU device not found, running Tensforflow with CPU')

Found GPU at: /device:GPU:0
GPU 0: Tesla K80 (UUID: GPU-65db67d6-48b7-62c5-6382-59f9df755112)


### **Loading Dataset**

In [13]:
print(dataset_filename)
ev_encoded, cl_encoded, event2int, int2event = preproc(dataset_filename)

claroline-dis_10.csv
Trying to load: /content/drive/My Drive/VaryMinions-Claroline/claroline-dis_10.csv
Categories:
[0 1 2 3 4 5 6 7 8 9]


### **Execution**

In [14]:
print(model_type)
for i in range(0, nb_iterations):
      print("Exécution " + str(i) + " : ")
      main(dataset_filename, ev_encoded, cl_encoded, event2int, int2event, model_type, nb_epochs, nb_unit, batch_size, percent_training, activation, loss)

LSTM
Exécution 0 : 
output y_tr
(33000, 300)
(33000, 10)
(17000, 300)
(17000, 10)
Test generation tensorFlow datasets
End generating tensorFlow datasets
Alphabet size:
50
Training a LSTM
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          2500      
                                                                 
 bidirectional (Bidirectiona  (None, 60)               19440     
 l)                                                              
                                                                 
 dense (Dense)               (None, 10)                610       
                                                                 
Total params: 22,550
Trainable params: 22,550
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

KeyboardInterrupt: ignored

## Archives

In [53]:
file1 = ev_encoded.to_numpy()
print("ev_encoded:")
print(file1)
file2 = cl_encoded.to_numpy()
print("cl_encoded:")
print(file2)
print("event2int:")
print(event2int)
#file3 = event2int.to_numpy()
#file4 = int2event.to_numpy()
storing_dataset("ev_encoded", file1)
storing_dataset("cl_encoded", file2)
#storing_dataset("event2int", file3)
#storing_dataset("int2event", file4)

ev_encoded:
[[49 32 32 ...  1  1  1]
 [49 32 32 ...  1  1  1]
 [49 32 32 ...  1  1  1]
 ...
 [49 32 32 ...  1  1  1]
 [49 32 32 ...  1  1  1]
 [49 32 32 ...  1  1  1]]
cl_encoded:
[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
event2int:
{'': 0, False: 1, 'clic(/claroline/tracking/delete_course_stats.php)': 2, 'clic(/claroline/learnPath/learningPath.php)': 3, 'clic(/claroline/phpbb/viewforum.php)': 4, 'clic(/claroline/phpbb/index.php)': 5, 'clic(/claroline/exercise/exercise.php)': 6, 'clic(/claroline/wiki/page.php)': 7, 'clic(/claroline/announcements/announcements.php)': 8, 'clic(/claroline/learnPath/module.php)': 9, 'clic(/claroline/user/user.php)': 10, 'clic(/claroline/messaging/readmessage.php)': 11, 'clic(/claroline/auth/login.php)': 12, 'clic(/claroline/tracking/course_access_details.php)': 13, 'clic(/claroline/calendar/agenda.php)': 14, 'clic(/claroline/tracking/userReport.php)': 15, 'clic(/claroline/wiki/wi

In [47]:
import h5py

def storing_dataset(fileName, fileData):
    # Create a new HDF5 file
    with h5py.File(drive_dir + fileName + '.h5', 'w') as file:
      # Create a dataset in the file
        print("Type: ")
        #arr = ev_encoded.to_numpy()
        print(fileData.dtype) 
        #print(image.astype('<U50').dtype)

        dataset = file.create_dataset(fileName, np.shape(fileData), h5py.h5t.NATIVE_INT64, data=fileData)
        print(dataset)
        #meta_set = file.create_dataset("meta", np.shape(label), h5py.h5t.STD_U8BE, data=label)