# Machine learning using NLP and BERT
## Import 

In [None]:
import os
import shutil
import time

import zipfile 

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
tf.get_logger().setLevel('ERROR')

import ibm_boto3
from ibm_botocore.client import Config, ClientError

## Functions 

In [None]:
DEBUG = True


def debug(d, override=False):
    """
    Print a value when debug output is switched on.

    :param d: data that should be displayed
    :param override: when truthy, print even if the module-level DEBUG flag is off
    """
    if not (DEBUG or override):
        return
    print(d)

### COS

In [None]:


# Connection settings for IBM COS
# Current list of endpoints is available at https://control.cloud-object-storage.cloud.ibm.com/v2/endpoints
COS_ENDPOINT = "https://" + "s3.eu-de.cloud-object-storage.appdomain.cloud"
COS_API_KEY_ID = "<apikey>"  # eg "W00YixxxxxxxxxxMB-odB-2ySfTrFBIQQWanc--P3byk"
COS_INSTANCE_CRN = "<>"  # eg "crn:v1:bluemix:public:cloud-object-storage:global:a/3bf0d9003xxxxxxxxxx1c3e97696b71c:d6f04d83-6c4f-4a62-a165-696756d63903::"


def _cos_connection_args():
    """Return a fresh set of the keyword arguments shared by the COS resource and client."""
    return {
        "ibm_api_key_id": COS_API_KEY_ID,
        "ibm_service_instance_id": COS_INSTANCE_CRN,
        "config": Config(signature_version="oauth"),
        "endpoint_url": COS_ENDPOINT,
    }


# Resource handle: used below to create and list buckets
cos_resource = ibm_boto3.resource("s3", **_cos_connection_args())

# Client handle: used below to upload and download files
cos_client = ibm_boto3.client("s3", **_cos_connection_args())
# "eu-de-standard"
def create_bucket(bucket_name, location):
    """
    Create a COS bucket and report the outcome on stdout.

    Failures are printed, never raised.

    :param bucket_name: name of the bucket to create
    :param location: value sent as the bucket's LocationConstraint
    """
    print(f"Creating new bucket: {bucket_name}")
    bucket_config = {"LocationConstraint": location}
    try:
        cos_resource.Bucket(bucket_name).create(CreateBucketConfiguration=bucket_config)
    except ClientError as be:
        print(f"CLIENT ERROR: {be}\n")
    except Exception as e:
        print(f"Unable to create bucket: {e}")
    else:
        print(f"Bucket: {bucket_name} created!")

def get_buckets():
    """
    Print the name of every bucket returned by the COS resource.

    Failures are printed, never raised.
    """
    print("Retrieving list of buckets")
    try:
        for entry in cos_resource.buckets.all():
            print(f"Bucket Name: {entry.name}")
    except ClientError as be:
        print(f"CLIENT ERROR: {be}\n")
    except Exception as e:
        print(f"Unable to retrieve list buckets: {e}")

def upload_file(file, bucket_name):
    """
    Upload a local file to a COS bucket, using the file's base name as object key.

    Failures are printed, never raised.

    :param file: path of the local file to upload
    :param bucket_name: bucket that receives the file
    :return: the object key on success, None when the upload failed
    """
    print(f"Uploading {file} to bucket  {bucket_name}")
    try:
        object_key = os.path.basename(file)
        cos_client.upload_file(file, bucket_name, object_key)
    except ClientError as be:
        print(f"CLIENT ERROR: {be}\n")
    except Exception as e:
        print(f"Unable to upload: {e}")
    else:
        print(f"File {file} uploaded to {bucket_name}")
        return object_key
    return None

def download_file(bucket_name, key, file):
    """
    Download one object from a COS bucket into a local file.

    Failures are printed, never raised.

    :param bucket_name: bucket that holds the object
    :param key: key of the object to download
    :param file: local path the object is written to
    """
    print(f"Downloading {key} from bucket  {bucket_name}")
    request = {"Bucket": bucket_name, "Key": key, "Filename": file}
    try:
        cos_client.download_file(**request)
    except ClientError as be:
        print(f"CLIENT ERROR: {be}\n")
    except Exception as e:
        print(f"Unable to download: {e}")
    else:
        print(f"File {file} downloaded from {bucket_name}")

In [None]:
def split_csv_into_dir(csv_name, label, feature, test_size=0.2, random_state=None):
    """
    Split a labelled csv into train/test directories with one text file per row.

    The rows are separated by the label into a pos and a neg set, each set is
    split into a train and a test part, and the feature text of every row is
    written to ``<train|test>/<pos|neg>/<n>.txt`` relative to the current
    working directory. Existing pos/neg directories are removed first so files
    from an earlier run cannot leak into the new data set.

    :param csv_name: name or url of the csv containing the data
    :param label: the label column inside the csv is expected to be bool
    :param feature: the feature column inside the csv is expected to be string
    :param test_size: share of each class that goes into the test set
    :param random_state: seed forwarded to train_test_split; None keeps the split random
    """
    import io  # function-scope import: only used to capture DataFrame.info() output

    # Read in dataset
    frame = pd.read_csv(csv_name)
    total_data = len(frame)
    debug("---")
    debug("Data has been read in")
    debug("Columns:\n")
    debug(frame.columns)
    debug("Head()\n")
    debug(frame.head())
    debug("Info()\n")
    # DataFrame.info() writes to a stream and returns None, so capture the text
    # and route it through debug() instead of printing unconditionally.
    info_text = io.StringIO()
    frame.info(buf=info_text)
    debug(info_text.getvalue())
    debug("Describe()\n")
    debug(frame.describe())
    debug(f"There are {total_data} entries")
    debug("")

    # Use mask to split list into pos and neg
    mask = frame[label] == True  # noqa: E712 - element-wise comparison on a Series
    pos, neg = frame[mask], frame[~mask]
    debug("---")
    debug("Data has been split into True and False\n")
    debug(f"pos len: {len(pos)}")
    debug(pos.head())
    debug("\n-\n")
    debug(f"neg len: {len(neg)}")
    debug(neg.head())
    debug("")

    # Split pos data and neg into train and test set
    splits = {'train': {}, 'test': {}}
    for class_name, class_rows in (('pos', pos), ('neg', neg)):
        splits['train'][class_name], splits['test'][class_name] = train_test_split(
            class_rows, test_size=test_size, random_state=random_state)
    debug("---")
    debug("Datasets Have been Split into train and test")
    debug("")

    # Create folder structure and save files into it
    debug("---")
    for train_test_dir in ('test', 'train'):
        for pos_neg_dir in ('pos', 'neg'):
            new_dir = os.path.join(train_test_dir, pos_neg_dir)
            if os.path.isdir(new_dir):
                debug(f"Cleaning Dir: {new_dir}")
                shutil.rmtree(new_dir)
            os.makedirs(new_dir)
            debug(f"Dir {new_dir} has been created")

            sentences = splits[train_test_dir][pos_neg_dir][feature]
            data_len = len(sentences)

            # One file per row, named after the row's position in the split
            for idx, sentence in enumerate(sentences):
                # Explicit UTF-8 so the result does not depend on the platform default
                with open(os.path.join(new_dir, f"{idx}.txt"), "w", encoding="utf-8") as handle:
                    handle.write(sentence)

            # Sanity check: the directory must hold exactly one file per row
            with os.scandir(new_dir) as entries:
                folder_content_len = sum(1 for entry in entries if entry.is_file())
            if data_len != folder_content_len:
                debug(f'Dir {new_dir} is missing Jokes Data: {data_len} Dir Content  {folder_content_len}')
            else:
                debug("All files have been Created!")
    debug("Success")



In [None]:
def build_classifier_model(tfhub_handle_preprocess, tfhub_handle_encoder, var_layer_dropout, var_activation_funct):
    """
    build_classifier_model assembles the Keras classifier:
    string input -> preprocessing layer -> BERT encoder -> dropout -> single-unit dense layer.

    :param tfhub_handle_preprocess: handle passed to hub.KerasLayer for the preprocessing layer
    :param tfhub_handle_encoder: handle passed to hub.KerasLayer for the trainable encoder layer
    :param var_layer_dropout: rate given to the Dropout layer
    :param var_activation_funct: activation given to the Dense output layer
    :return: Keras Model
    """
    layers = tf.keras.layers
    text_input = layers.Input(shape=(), dtype=tf.string, name='text')

    preprocessed = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(text_input)
    encoder_outputs = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')(preprocessed)
    debug(encoder_outputs)

    # HYPERPARAMETER dropout
    dropped = layers.Dropout(var_layer_dropout)(encoder_outputs['pooled_output'])
    # HYPERPARAMETER layer size (1) and activation
    scores = layers.Dense(1, activation=var_activation_funct, name='classifier')(dropped)

    debug("Return Model")
    return tf.keras.Model(text_input, scores)
    

In [None]:
def is_funny(inputs, results):
    """
    Print a single list holding one formatted "text / FUNNY SCORE" entry per input.

    :param inputs: indexable collection of texts
    :param results: indexable collection; the score of inputs[i] is read from results[i][0]
    """
    formatted = []
    for position in range(len(inputs)):
        sentence = inputs[position]
        score = results[position][0]
        formatted.append(f'{sentence:<30}\n FUNNY SCORE: {score:.6f}\n')
    print(formatted)

In [None]:
def save_trained_model(bert_model_name, classifier_model, bucket_name):
    """
    save_trained_model saves the model into a timestamped folder, zips that
    folder and uploads the zip to COS.

    The folder is named ``model_<bert_model_name>_<timestamp>`` (with ``/``
    replaced by ``_``) and created in the current working directory; the zip
    sits next to it with the same name plus ``.zip``.

    :param bert_model_name: name of the model used
    :param classifier_model: classifier model that saves the data
    :param bucket_name: bucket the zip file is uploaded to
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S")

    dataset_name = "model_" + bert_model_name + "_" + timestamp
    saved_model_path = './{}'.format(dataset_name.replace('/', '_'))
    debug(saved_model_path)
    debug(f'Saving of {bert_model_name}')
    classifier_model.save(saved_model_path, include_optimizer=True)

    # Zip the saved model folder. The context manager closes the archive
    # before the upload, so the zip on disk is complete when it is sent.
    zip_path = saved_model_path + '.zip'
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for base, _dirs, files in os.walk(saved_model_path):
            for file_name in files:
                full_path = os.path.join(base, file_name)
                # Store entries relative to the model folder
                archive.write(full_path, os.path.relpath(full_path, saved_model_path))

    # Upload the archive itself; zip_path is kept separate from the loop
    # variable so it cannot be overwritten by the last file that was walked.
    upload_file(zip_path, bucket_name)

In [None]:
def export_data_frame(df, name, bucket_name):
    """
    export_data_frame saves a dataframe as ``<name>.xlsx`` and ``<name>.csv``
    and uploads both files to COS.

    :param df: Data frame
    :param name: name of the file to save to, without extension
    :param bucket_name: bucket the two files are uploaded to
    """

    debug(name)
    # The dot is required: pandas picks the Excel writer from the file extension
    excel_path = name + '.xlsx'
    csv_path = name + '.csv'
    df.to_excel(excel_path, sheet_name='sheet_1')
    df.to_csv(csv_path, sep=',')

    # Upload files to COS
    upload_file(excel_path, bucket_name)
    upload_file(csv_path, bucket_name)

In [None]:
# this is the default value; it should be overridden
DESIRED_ACCURACY = 0.95


class myCallback(tf.keras.callbacks.Callback):
    """Stop training once the epoch's 'binary_accuracy' exceeds DESIRED_ACCURACY."""

    def on_epoch_end(self, epoch, logs=None):
        """
        Check the epoch's accuracy and request a training stop when it is high enough.

        :param epoch: index of the finished epoch (not used)
        :param logs: mapping of metric names to values; only 'binary_accuracy' is read.
            The key must match the name of the metric the model was compiled with.
        """
        # A missing logs dict or a missing metric means there is nothing to compare
        current = (logs or {}).get('binary_accuracy')
        if current is None:
            return
        if current > DESIRED_ACCURACY:
            print(f"\nReached {current * 100}% accuracy Desired {DESIRED_ACCURACY * 100}% so cancelling training!", )
            self.model.stop_training = True

## Import Dataset

In [None]:
# todo: change url
# Source csv and the names of its label / text columns
csv_name = "https://raw.githubusercontent.com/skywalkeretw/DBE-Humor-Prototype/master/Datasets/dataset.csv"
label = 'humor'
feature = 'text'

# Writes the train/ and test/ directory trees into the current working directory
split_csv_into_dir(csv_name, label, feature)

## Setup COS

In [None]:
# Bucket that receives the zipped models and the exported data frames
bucket_name="trained-bert-models"
# Passed to create_bucket as the bucket's LocationConstraint
location="eu-de-standard"
# create_bucket prints errors instead of raising them
create_bucket(bucket_name, location)

## Set Params

In [None]:
# TODO: put params in loop / csv
var_batch_size = 512 # batch size used when loading the train/val/test datasets
var_seed = 42 # seed of the train/validation split
var_layer_dropout = 0.1 # rate of the Dropout layer in front of the classifier layer
var_activation_funct = None # activation of the Dense output layer; alternatives: sigmoid, relu
var_epochs = 10 # upper bound; myCallback may stop the training earlier
var_init_lr = 3e-5 # initial learning rate handed to the optimizer
var_optimizer = 'adamw' # optimizer_type handed to optimization.create_optimizer
DESIRED_ACCURACY = 0.95 # threshold read by myCallback

## Get the data from the folders

In [None]:
# Load the train, validation and test datasets from the directory trees
AUTOTUNE = tf.data.AUTOTUNE
batch_size = var_batch_size  # HYPERPARAMETER
seed = var_seed  # HYPERPARAMETER

# Options shared by the two subsets taken from the 'train' directory:
# both use the same split fraction and the same seed.
_train_dir_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

debug("raw train dataset")
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'train', subset='training', **_train_dir_options)
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

debug("val data set")
val_ds = tf.keras.utils.text_dataset_from_directory(
    'train', subset='validation', **_train_dir_options
).cache().prefetch(buffer_size=AUTOTUNE)

debug("test dataset")
test_ds = tf.keras.utils.text_dataset_from_directory(
    'test', batch_size=batch_size
).cache().prefetch(buffer_size=AUTOTUNE)

## BERT models

In [None]:
# Expected layout of the JSON file, one record per model:
#   [{
#      "name":"small_bert/bert_en_uncased_L-2_H-128_A-2",
#      "encoder":"https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1",
#      "preprocess":"https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
#   },...]
debug("readin bert_models as datafram and convert to dict")
_bert_models_url = 'https://raw.githubusercontent.com/skywalkeretw/DBE-Humor-Prototype/master/Datasets/bert_models.json'
_bert_models_frame = pd.read_json(_bert_models_url)
# One dict per row, keyed by column name
bert_models = _bert_models_frame.to_dict('records')


## Train and evaluate the models

In [None]:
# Test-set metrics and run time per model, keyed by model name
run_metrics = {}

debug("Generate Models for all enteries in bert_model list")
for model in bert_models:
    # start timer
    debug("start timer")
    st = time.time()

    # use data from bert models as parameters
    debug(f'name: {model["name"]} encoder: {model["encoder"]} preprocess: {model["preprocess"]}')
    bert_model_name = model["name"]
    tfhub_handle_encoder = model["encoder"]
    tfhub_handle_preprocess = model["preprocess"]

    # Build Classifier Model
    # build_classifier_model creates the preprocessing and encoder layers
    # itself, so no stand-alone hub.KerasLayer copies are loaded here; the
    # previous copies were never used.
    debug("Build Classifer Model")
    classifier_model = build_classifier_model(tfhub_handle_preprocess,tfhub_handle_encoder,var_layer_dropout,var_activation_funct)

    # Define Metrics
    debug("Define The Metrics")
    # Without an activation the Dense output layer emits raw logits. The loss
    # must be told so, and the accuracy threshold has to sit at logit 0.0
    # (which is probability 0.5) instead of the default 0.5.
    outputs_are_logits = var_activation_funct is None
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=outputs_are_logits)
    metrics = tf.metrics.BinaryAccuracy(threshold=0.0 if outputs_are_logits else 0.5)

    # Init Params
    epochs = var_epochs # HYPERPARAMETER
    steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
    num_train_steps = steps_per_epoch * epochs
    # The first 10% of the training steps are warm-up steps
    num_warmup_steps = int(0.1*num_train_steps)

    init_lr = var_init_lr # HYPERPARAMETER
    debug("Create Optimizer")
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type=var_optimizer)

    # Compile Model using Params
    debug("Compile Model")
    classifier_model.compile(optimizer=optimizer,
                             loss=loss,
                             metrics=metrics)

    debug(f'Training of {bert_model_name}')
    # Train Model; the callback stops early once DESIRED_ACCURACY is exceeded
    callbacks = myCallback()
    classifier_model.fit(x=train_ds,
                            validation_data=val_ds,
                            epochs=epochs,
                            callbacks=[callbacks])

    # Model is evaluated using the test_ds
    debug(f"Evaluate {bert_model_name}")
    # Separate names so the loss object above is not rebound to a float
    test_loss, test_accuracy = classifier_model.evaluate(test_ds)

    # end timer and calculate duration
    duration = time.time() - st
    debug("End Timer")
    # store the test_ds metrics of this run
    run_metrics[bert_model_name] = {
        'loss': test_loss,
        'accuracy': test_accuracy,
        'duration': duration
    }

    debug(f"Save Trained Model: {bert_model_name}")
    save_trained_model(bert_model_name, classifier_model, bucket_name)

# convert Metrics dict to dataframe and save as excel and csv
export_data_frame(pd.DataFrame.from_dict(run_metrics, orient='index'), 'metrics', bucket_name)

## Load Models and Test Performance

In [None]:
# TODO: rename dataframe
ourExamples = pd.read_csv('https://raw.githubusercontent.com/skywalkeretw/DBE-Humor-Prototype/master/Datasets/ourExamples.csv')
# Sub-directories of the current working directory, sorted by name
directories_in_curdir = sorted(entry for entry in os.listdir(os.curdir) if os.path.isdir(entry))
# Saved models are the directories whose name starts with 'model_'
available_models = [name for name in directories_in_curdir if name.startswith('model_')]

## test using models

In [None]:
# Score every example sentence with every saved model
for model in available_models:

    reloaded_model = tf.saved_model.load(model)
    # The sigmoid is applied to the model output before it is used as a score
    reloaded_results = tf.sigmoid(reloaded_model(tf.constant(ourExamples['Sentence'])))

    is_funny(ourExamples['Sentence'], reloaded_results)

    # Store one plain float per sentence instead of the (n, 1) tensor
    ourExamples[model] = reloaded_results.numpy().ravel()
debug(ourExamples.head())
# Exported under its own name: 'metrics' is already used by the export of the
# training metrics and would be overwritten otherwise.
export_data_frame(ourExamples, 'example_predictions', bucket_name)

## Test Used

In [None]:
# Empty result table: a MODEL column followed by one column per example sentence
header = ["MODEL", *ourExamples['Sentence'].to_numpy().tolist()]
evaluation = pd.DataFrame(columns=header)
evaluation.head()

In [None]:
# Add one row per saved model: the model name followed by its score for every sentence
for model in available_models:

    reloaded_model = tf.saved_model.load(model)
    reloaded_results = tf.sigmoid(reloaded_model(tf.constant(ourExamples['Sentence'])))
    # Flatten the (n, 1) scores through the tensor's own numpy array; this
    # avoids the `np` name, which this file never imports.
    results = reloaded_results.numpy().ravel().tolist()
    row = [model] + results
    print(len(row))
    evaluation.loc[len(evaluation)] = row

evaluation.head(20)

In [None]:
# Write the result table locally; the row index is only a running number,
# so it is left out of both files.
evaluation.to_excel('bert_models_result.xlsx', index=False)
evaluation.to_csv('bert_models_result.csv', sep=',', encoding='utf-8', index=False)