# Machine learning using NLP and BERT
## Import 

In [None]:
import os
import shutil
import time

import zipfile 

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
tf.get_logger().setLevel('ERROR')

## Functions 

In [None]:
def split_csv_into_dir(csv_name, label, feature):
    """
    Split a labelled CSV into train/test text files, one directory per class.

    The CSV is read with pandas and its rows are divided into a positive set
    (rows whose ``label`` column equals True) and a negative set (all other
    rows). Each set is split 80/20 into a train and a test part. The
    ``feature`` text of every row is then written to its own UTF-8 file
    ``<idx>.txt`` under ``train/pos``, ``train/neg``, ``test/pos`` or
    ``test/neg`` relative to the current working directory. A class directory
    that already exists is deleted and recreated so files from an earlier run
    cannot mix with the new split.

    :param csv_name: name or url of the csv containing the data
    :param label: the label column inside the csv is expected to be bool
    :param feature: the feature column inside the csv is expected to be string
    """

    # Read in dataset
    frame = pd.read_csv(csv_name)
    total_data = len(frame)
    print("---")
    print("Data has been read in")
    print("Columns:\n")
    print(frame.columns)
    print("Head()\n")
    print(frame.head())
    print("Info()\n")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would add a stray "None" line to the output.
    frame.info()
    print("Describe()\n")
    print(frame.describe())
    print("There are", total_data, "entries")
    print("")

    # Use mask to split the rows into pos and neg.
    # The comparison is element-wise on the column, so `== True` is intended.
    mask = frame[label] == True
    pos, neg = frame[mask], frame[~mask]
    print("---")
    print("Data has been split into True and False\n")
    print("pos len:", len(pos))
    print(pos.head())
    print("\n-\n")
    print("neg len:", len(neg))
    print(neg.head())
    print("")

    # Split pos data and neg data into train and test set (80/20 each)
    train_pos, test_pos = train_test_split(pos, test_size=0.2)
    train_neg, test_neg = train_test_split(neg, test_size=0.2)
    splits = {
        'train': {'pos': train_pos, 'neg': train_neg},
        'test': {'pos': test_pos, 'neg': test_neg},
    }
    print("---")
    print("Datasets Have been Split into train and test")
    print("")

    # Create folder structure and save files into it
    print("---")
    for train_test_dir in ('test', 'train'):
        try:
            os.mkdir(train_test_dir)
        except FileExistsError:
            # Re-using an existing top-level directory is fine; only the
            # class directories below are wiped.
            pass

        for pos_neg_dir in ('pos', 'neg'):
            new_dir = os.path.join(train_test_dir, pos_neg_dir)
            try:
                os.mkdir(new_dir)
            except FileExistsError:
                print("Cleaning Dir:", new_dir)
                shutil.rmtree(new_dir)
                os.mkdir(new_dir)
            print("Dir", new_dir, "has been created")

            texts = splits[train_test_dir][pos_neg_dir][feature]
            data_len = len(texts)

            # One file per row, named after its position inside the split.
            for idx, text in enumerate(texts):
                # Explicit UTF-8 so the files do not depend on the platform's
                # default encoding.
                with open(os.path.join(new_dir, f"{idx}.txt"), "w", encoding="utf-8") as file:
                    file.write(text)

            # Sanity check: the directory must hold exactly one file per row.
            with os.scandir(new_dir) as entries:
                folder_content_len = sum(1 for entry in entries if entry.is_file())
            if data_len != folder_content_len:
                print('Dir ', new_dir ,'is missing Jokes', "Data:", data_len, "Dir Content", folder_content_len)
            else:
                print("All files have been Created!")
    print("Success")



In [None]:
def build_classifier_model(tfhub_handle_preprocess, tfhub_handle_encoder, var_layer_dropout, var_activation_funct):
    """
    Assemble the classifier: string input -> preprocessing -> encoder -> dropout -> dense.

    :param tfhub_handle_preprocess: handle given to hub.KerasLayer for the 'preprocessing' layer
    :param tfhub_handle_encoder: handle given to hub.KerasLayer for the trainable 'BERT_encoder' layer
    :param var_layer_dropout: rate passed to the Dropout layer
    :param var_activation_funct: activation passed to the final Dense(1) 'classifier' layer
    :return: Keras Model mapping the text input to the classifier output
    """
    # Scalar string input named 'text'
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Both hub layers are created first, then chained
    preprocess = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    encoder_outputs = bert(preprocess(raw_text))
    print(encoder_outputs)

    # Classification head on top of the pooled encoder output
    pooled = encoder_outputs['pooled_output']
    dropped = tf.keras.layers.Dropout(var_layer_dropout)(pooled)  # HYPERPARAMETER dropout
    classifier_head = tf.keras.layers.Dense(  # HYPERPARAMETER layer size (1) and activation
        1,
        activation=var_activation_funct,
        name='classifier',
    )
    scores = classifier_head(dropped)

    print("Return Model")
    return tf.keras.Model(raw_text, scores)
    

In [None]:
def is_funny(inputs, results):
    """
    Print every input text together with its funny score.

    Each entry is the text left-aligned to 30 characters, followed on the next
    line by ``FUNNY SCORE:`` and the score with six decimals.

    :param inputs: iterable of input strings
    :param results: positionally indexable; ``results[i][0]`` is the score
        belonging to the i-th input
    """
    # enumerate() pairs texts and scores by position, so a pandas Series
    # whose index is not 0..n-1 works as well.
    result_for_printing = [
        f'{text:<30}\n FUNNY SCORE: {results[i][0]:.6f}\n'
        for i, text in enumerate(inputs)
    ]
    # Print the entries themselves; printing the list object would show its
    # repr, with brackets, quotes and escaped "\n" sequences.
    print(*result_for_printing, sep='\n')

In [None]:
def save_trained_model(bert_model_name, classifier_model):
    """
    save_trained_model Saves the model in a folder and creates a zip of it

    The folder is ``./model_<bert_model_name>_<timestamp>`` with every ``/``
    in the name replaced by ``_``; the archive is the same path with ``.zip``
    appended. Paths inside the archive are relative to the model folder.

    :param bert_model_name: name of the model used
    :param classifier_model: classifier model; its ``save(path, include_optimizer=True)``
        method is called to write the folder
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S")

    dataset_name = "model_" + bert_model_name + "_" + timestamp
    saved_model_path = './{}'.format(dataset_name.replace('/', '_'))
    print(f'Saving of {bert_model_name}')
    classifier_model.save(saved_model_path, include_optimizer=True)

    # The context manager guarantees the archive is finalised and the file
    # handle released even if adding a file fails.
    with zipfile.ZipFile(saved_model_path + '.zip', 'w', zipfile.ZIP_DEFLATED) as zipobj:
        for base, _dirs, files in os.walk(saved_model_path):
            for file in files:
                fn = os.path.join(base, file)
                # Store the entry relative to the model folder
                zipobj.write(fn, os.path.relpath(fn, saved_model_path))
    # todo: upload to cos

In [None]:
def export_data_frame(df, name):
    """
    export_data_frame Saves dataframe as an Excel file

    :param df: Data frame
    :param name: name of the file to save to; ``.xlsx`` is appended unless the
        name already ends with it
    """
    # The dot is required: without it 'metrics' became 'metricsxlsx', a path
    # with no file extension.
    target = name if name.endswith('.xlsx') else name + '.xlsx'
    df.to_excel(target, sheet_name='sheet_1')
    #todo: add csv export
    #todo: copy to cos

In [None]:
# Training accuracy above which the callback below stops training.
DESIRED_ACCURACY = 0.95


class myCallback(tf.keras.callbacks.Callback):
    """Stop training once the epoch's accuracy exceeds DESIRED_ACCURACY."""

    def on_epoch_end(self, epoch, logs=None):
        """
        Check the accuracy reported for the finished epoch.

        :param epoch: index of the epoch that just ended (unused)
        :param logs: dict of metric values for the epoch; may be None
        """
        logs = logs or {}
        # A BinaryAccuracy metric is logged as 'binary_accuracy' by default,
        # so accept that key as well as 'accuracy'.
        accuracy = logs.get('accuracy', logs.get('binary_accuracy'))
        if accuracy is None:
            # No accuracy metric was logged; comparing None would raise.
            return
        if accuracy > DESIRED_ACCURACY:
            print(f"\nReached {DESIRED_ACCURACY * 100}% accuracy so cancelling training!")
            self.model.stop_training = True

## Import Dataset

In [None]:
# todo: change url
# Source CSV plus the names of its label and text columns
csv_name = "https://raw.githubusercontent.com/skywalkeretw/DBE-Humor-Prototype/master/Datasets/dataset.csv"
label, feature = 'humor', 'text'

# Write the train/test directory tree from the CSV
split_csv_into_dir(csv_name=csv_name, label=label, feature=feature)

## Set Params

In [None]:
# Hyperparameters for the training run.
# TODO: put params in loop / csv
var_batch_size = 512 # batch size used when the text directories are read
var_seed = 42 # seed for the train/validation split
var_layer_dropout = 0.1 # dropout rate in front of the classifier layer
var_activation_funct = None # alternatives: sigmoid, relu
var_epochs = 10 # TODO: use the callback code
var_init_lr = 3e-5 # initial learning rate passed to the optimizer
var_optimizer = 'adamw' # optimizer_type passed to create_optimizer

## Get the data from the folders

In [None]:
# Build the training, validation and test datasets from the text directories
AUTOTUNE = tf.data.AUTOTUNE
batch_size, seed = var_batch_size, var_seed  # HYPERPARAMETERS

# Training part of 'train' (validation_split=0.2 holds out the rest)
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'train', batch_size=batch_size, validation_split=0.2, subset='training', seed=seed)
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Validation part of 'train'; same split fraction and seed as above
val_ds = (
    tf.keras.utils.text_dataset_from_directory(
        'train', batch_size=batch_size, validation_split=0.2, subset='validation', seed=seed)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

# The 'test' directory is read as a whole
test_ds = (
    tf.keras.utils.text_dataset_from_directory('test', batch_size=batch_size)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

## BERT models

In [None]:
# Expected JSON layout: a list with one record per model
#   [{
#      "name":"small_bert/bert_en_uncased_L-2_H-128_A-2",
#      "encoder":"https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1",
#      "preprocess":"https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
#   },...]
# Keep the full list of records: the training loop iterates over bert_models
# and reads model["name"] etc., which fails when only the first record (a
# dict) is kept. Slice with [:1] to train just the first model.
bert_models = pd.read_json('https://raw.githubusercontent.com/skywalkeretw/DBE-Humor-Prototype/master/Datasets/bert_models.json').to_dict('records')


## Train and evaluate the models

In [None]:
# Test-set results per model: {model name: {'loss', 'accuracy', 'duration'}}
metrics = {}

# Accept a single model record (dict) as well as a list of records, so the
# loop always iterates over records and never over a dict's keys.
models_to_train = [bert_models] if isinstance(bert_models, dict) else bert_models

for model in models_to_train:
    # start timer
    st = time.time()
    # use data from bert models as parameters
    bert_model_name = model["name"]
    tfhub_handle_encoder = model["encoder"]
    tfhub_handle_preprocess = model["preprocess"]

    # Preprocessing with Bert
    bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

    # Encoding with Bert
    bert_model = hub.KerasLayer(tfhub_handle_encoder)
    # NOTE(review): the two layers above are not used in this cell;
    # build_classifier_model creates its own layers from the same handles.
    # Confirm nothing else needs them, then consider removing them.

    # Build Classifier Model (the builder requires all four arguments)
    classifier_model = build_classifier_model(
        tfhub_handle_preprocess,
        tfhub_handle_encoder,
        var_layer_dropout,
        var_activation_funct)

    # Define loss and metric.
    # The metric gets its own name so it does not overwrite the `metrics`
    # result dict that is filled at the end of each iteration.
    # NOTE(review): from_logits=True matches var_activation_funct = None;
    # revisit if an activation such as sigmoid is configured.
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    accuracy_metric = tf.metrics.BinaryAccuracy()

    # Init Params
    epochs = var_epochs # HYPERPARAMETER
    steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
    num_train_steps = steps_per_epoch * epochs
    # Warm up over the first 10 % of the training steps
    num_warmup_steps = int(0.1*num_train_steps)

    init_lr = var_init_lr # HYPERPARAMETER
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type=var_optimizer)

    # Compile Model using Params
    classifier_model.compile(optimizer=optimizer,
                             loss=loss,
                             metrics=[accuracy_metric])

    print(f'Training of {bert_model_name}')
    # Train Model
    classifier_model.fit(x=train_ds,
                         validation_data=val_ds,
                         epochs=epochs)

    # Model is evaluated using the test_ds; separate names keep the loss
    # function object intact.
    test_loss, test_accuracy = classifier_model.evaluate(test_ds)

    # end timer and calculate duration
    duration = time.time() - st
    # record the test_ds results for this model
    metrics[bert_model_name] = {
        'loss': test_loss,
        'accuracy': test_accuracy,
        'duration': duration
    }

    save_trained_model(bert_model_name, classifier_model)

# convert Metrics dict to dataframe and save as excel
metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
export_data_frame(metrics_df, 'metrics')

## Load Models and Test Performance

In [None]:
#todo rename dataframe
# CSV with the sentences that are scored by the saved models
df = pd.read_csv('https://raw.githubusercontent.com/skywalkeretw/DBE-Humor-Prototype/master/Datasets/ourExamples.csv')
# Sub-directories of the current directory, sorted by name
directories_in_curdir = sorted(entry for entry in os.listdir(os.curdir) if os.path.isdir(entry))
# Saved models are the directories whose name starts with 'model_'
available_models = list(filter(lambda name: name.startswith('model_'), directories_in_curdir))

## Test using the saved models

In [None]:
# Score the 'Sentence' column with every saved model
for model in available_models:
    sentences = df['Sentence']

    # Load the saved model and run it on the sentences
    reloaded_model = tf.saved_model.load(model)
    raw_outputs = reloaded_model(tf.constant(sentences))
    # Apply a sigmoid to the model outputs
    reloaded_results = tf.sigmoid(raw_outputs)

    is_funny(sentences, reloaded_results)

    # Store the scores in a column named after the model directory
    df[model] = reloaded_results
