# Load the model

In [None]:
import keras

model = keras.models.load_model("colbert-trained/")
model.summary()

# complete code

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# import tensorflow_hub as hub
import tensorflow as tf
# import bert_tokenization as tokenization
import tensorflow.keras.backend as K
from tensorflow import keras 

import os
from scipy.stats import spearmanr
from math import floor, ceil
from transformers import *

import seaborn as sns
import string
import re    #for regex

np.set_printoptions(suppress=True)
print(tf.__version__)

# Prep / tokenizer

#### 1. Read data and tokenizer

Read tokenizer and data, as well as defining the maximum sequence length that will be used for the input to Bert (maximum is usually 512 tokens)

In [None]:
training_sample_count = 1000 # 4000
test_count = 1000

MAX_SENTENCE_LENGTH = 20
MAX_SENTENCES = 5
MAX_LENGTH = 100

In [None]:
#!dir /kaggle/input/200k-short-texts-for-humor-detection

In [None]:
os.getcwd()

### original dataset

In [None]:
df = pd.read_csv(r'C:\Users\spark\OneDrive\Desktop\zero21\bert_humour_detection\Colbert\Data\dataset.csv')

df_train = pd.read_csv(r'C:\Users\spark\OneDrive\Desktop\zero21\bert_humour_detection\Colbert\Data\train.csv')
display(df_train.head(3))
df_train = df_train[:training_sample_count]

df_test = pd.read_csv(r'C:\Users\spark\OneDrive\Desktop\zero21\bert_humour_detection\Colbert\Data\dev.csv')
display(df_test.head(3))
df_test = df_test[:test_count]

In [None]:
test_df_y = df_test.copy()
del df_test['humor']

df_sub = test_df_y.copy()

print(len(df),len(df_train),len(df_test))
display(df_train.head())
display(df_test.head())

In [None]:
print(list(df_train.columns))

In [None]:
output_categories = list(df_train.columns[[1]])
input_categories = list(df_train.columns[[0]])

TARGET_COUNT = len(output_categories)

print('\ninput categories:\n\t', input_categories)
print('\noutput TARGET_COUNT:\n\t', TARGET_COUNT)
print('\noutput categories:\n\t', output_categories)

## 2. Preprocessing functions

These are some functions that will be used to preprocess the raw text data into useable Bert inputs.<br>


In [None]:
from transformers import BertTokenizer

MODEL_TYPE = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

In [None]:
def return_id(str1, str2, truncation_strategy, length):

    inputs = tokenizer.encode_plus(str1, str2,
        add_special_tokens=True,
        max_length=length,
        truncation_strategy=truncation_strategy)

    input_ids =  inputs["input_ids"]
    input_masks = [1] * len(input_ids)
    input_segments = inputs["token_type_ids"]
    padding_length = length - len(input_ids)
    padding_id = tokenizer.pad_token_id
    input_ids = input_ids + ([padding_id] * padding_length)
    input_masks = input_masks + ([0] * padding_length)
    input_segments = input_segments + ([0] * padding_length)

    return [input_ids, input_masks, input_segments]


def compute_input_arrays(df, columns, tokenizer):
    model_input = []
    for xx in range((MAX_SENTENCES*3)+3):
        model_input.append([])
    
    for _, row in tqdm(df[columns].iterrows()):
        i = 0
        
        # sent
        sentences = sent_tokenize(row.text)
        for xx in range(MAX_SENTENCES):
            s = sentences[xx] if xx<len(sentences) else ''
            ids_q, masks_q, segments_q = return_id(s, None, 'longest_first', MAX_SENTENCE_LENGTH)
            model_input[i].append(ids_q)
            i+=1
            model_input[i].append(masks_q)
            i+=1
            model_input[i].append(segments_q)
            i+=1
        
        # full row
        ids_q, masks_q, segments_q = return_id(row.text, None, 'longest_first', MAX_LENGTH)
        model_input[i].append(ids_q)
        i+=1
        model_input[i].append(masks_q)
        i+=1
        model_input[i].append(segments_q)
        
    for xx in range((MAX_SENTENCES*3)+3):
        model_input[xx] = np.asarray(model_input[xx], dtype=np.int32)
        
    print(model_input[0].shape)
    return model_input



In [None]:
inputs      = compute_input_arrays(df_train, input_categories, tokenizer)
test_inputs = compute_input_arrays(df_test, input_categories, tokenizer)

In [None]:
print(len(inputs), len(inputs[0]), len(inputs[0][0]))

# check out input for 7th row
xx = 7
print(df_train.iloc[xx,0])
print(sent_tokenize(df_train.iloc[xx,0]))
inputs[0][xx], inputs[3][xx], inputs[6][xx], inputs[15][xx]

In [None]:
def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

outputs = compute_output_arrays(df_train, output_categories)
outputs[:3]

## 5. Training, validation and testing

Loops over the folds in gkf and trains each fold for 3 epochs --- with a learning rate of 3e-5 and batch_size of 6. A simple binary crossentropy is used as the objective-/loss-function. 

In [None]:
# Evaluation Metrics
import sklearn
def print_evaluation_metrics(y_true, y_pred, label='', is_regression=True, label2=''):
    print('==================', label2)
    ### For regression
    if is_regression:
        print('mean_absolute_error',label,':', sklearn.metrics.mean_absolute_error(y_true, y_pred))
        print('mean_squared_error',label,':', sklearn.metrics.mean_squared_error(y_true, y_pred))
        print('r2 score',label,':', sklearn.metrics.r2_score(y_true, y_pred))
        #     print('max_error',label,':', sklearn.metrics.max_error(y_true, y_pred))
        return sklearn.metrics.mean_squared_error(y_true, y_pred)
    else:
        ### FOR Classification
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('average_precision_score',label,':', sklearn.metrics.average_precision_score(y_true, y_pred))
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('accuracy_score',label,':', sklearn.metrics.accuracy_score(y_true, y_pred))
        print('f1_score',label,':', sklearn.metrics.f1_score(y_true, y_pred))
        
        matrix = sklearn.metrics.confusion_matrix(y_true, y_pred)
        print(matrix)
        TP,TN,FP,FN = matrix[1][1],matrix[0][0],matrix[0][1],matrix[1][0]
        Accuracy = (TP+TN)/(TP+FP+FN+TN)
        Precision = TP/(TP+FP)
        Recall = TP/(TP+FN)
        F1 = 2*(Recall * Precision) / (Recall + Precision)
        print('Acc', Accuracy, 'Prec', Precision, 'Rec', Recall, 'F1',F1)
        return sklearn.metrics.accuracy_score(y_true, y_pred)

print_evaluation_metrics([1,0], [0.9,0.1], '', True)
print_evaluation_metrics([1,0], [1,1], '', False)

### Loss function selection
Regression problem between 0 and 1, so binary_crossentropy and mean_absolute_error seem good.

Here are the explanations: https://www.dlology.com/blog/how-to-choose-last-layer-activation-and-loss-function/

In [None]:
valid_inputs = inputs
valid_outputs = outputs

In [None]:
preds = model.predict(valid_inputs)


In [None]:
len(valid_inputs[0])

In [None]:
print(valid_outputs.shape, preds.shape)
print_evaluation_metrics(np.array(valid_outputs), np.array(preds), '')

In [None]:
test_preds = model.predict(test_inputs)

In [None]:
len(test_preds)

## Binary submission

In [None]:
for split in np.arange(0.1, 0.99, 0.1).tolist():
    df_sub['pred_bi'] = (test_preds > split)

    print_evaluation_metrics(df_sub['humor'], df_sub['pred_bi'], '', False, 'SPLIT on '+str(split))

    df_sub.to_csv('sub3.csv', index=False)
    df_sub.head()

In [None]:
df_sub['pred_bi'] = (test_preds > 0.5)

print_evaluation_metrics(df_sub['humor'], df_sub['pred_bi'], '', False, 'SPLIT on '+str(split))

df_sub.to_csv('sub.csv', index=False)
df_sub.head()

In [None]:
print('Texts that the model failed to correctly predict:')
df_sub[df_sub['pred_bi']!=df_sub['humor']]

In [None]:
valid_inputs

## Testing

In [None]:
input_string=["are you nuts inside your brain"]

In [None]:
import pandas as pd
input_df=pd.DataFrame(data=input_string,columns=['text'])

In [None]:
input_df

In [None]:
pred_input = compute_input_arrays(input_df, ['text'], tokenizer)

In [None]:
pred_input = model.predict(pred_input)

In [None]:
pred_input

In [None]:
for split in np.arange(0.1, 0.99, 0.1).tolist():
    input_df['pred_bi'] = (pred_input > split)

    #print_evaluation_metrics(df_sub['humor'], df_sub['pred_bi'], '', False, 'SPLIT on '+str(split))

    input_df.to_csv('sub3.csv', index=False)
    input_df.head()

##### Binary Submission

In [None]:
if input_df['pred_bi'][0]==True:
    print("Hahah you are funny")
else:
    print("you are not funny")

#### FInal; TEst

In [1]:
#load Pretrained model
import keras
def load_model(model_file_path):
    model = keras.models.load_model(model_file_path)
    return model

#load tokenizer
from transformers import BertTokenizer
def load_tokenizer():
    MODEL_TYPE = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)
    return tokenizer

model=load_model("colbert-trained/")
tokenizer=load_tokenizer()



In [9]:
def return_id(str1, str2, truncation_strategy, length):
    training_sample_count = 1000 # 4000
    test_count = 1000
    MAX_SENTENCE_LENGTH = 20
    MAX_SENTENCES = 5
    MAX_LENGTH = 100

    inputs = tokenizer.encode_plus(str1, str2,
        add_special_tokens=True,
        max_length=length,
        truncation_strategy=truncation_strategy)

    input_ids =  inputs["input_ids"]
    input_masks = [1] * len(input_ids)
    input_segments = inputs["token_type_ids"]
    padding_length = length - len(input_ids)
    padding_id = tokenizer.pad_token_id
    input_ids = input_ids + ([padding_id] * padding_length)
    input_masks = input_masks + ([0] * padding_length)
    input_segments = input_segments + ([0] * padding_length)

    return [input_ids, input_masks, input_segments]


def compute_input_arrays(df, columns, tokenizer):
    training_sample_count = 1000 # 4000
    test_count = 1000
    MAX_SENTENCE_LENGTH = 20
    MAX_SENTENCES = 5
    MAX_LENGTH = 100
    model_input = []
    for xx in range((MAX_SENTENCES*3)+3):
        model_input.append([])
    
    for _, row in tqdm(df[columns].iterrows()):
        i = 0
        
        # sent
        sentences = sent_tokenize(row.text)
        for xx in range(MAX_SENTENCES):
            s = sentences[xx] if xx<len(sentences) else ''
            ids_q, masks_q, segments_q = return_id(s, None, 'longest_first', MAX_SENTENCE_LENGTH)
            model_input[i].append(ids_q)
            i+=1
            model_input[i].append(masks_q)
            i+=1
            model_input[i].append(segments_q)
            i+=1
        
        # full row
        ids_q, masks_q, segments_q = return_id(row.text, None, 'longest_first', MAX_LENGTH)
        model_input[i].append(ids_q)
        i+=1
        model_input[i].append(masks_q)
        i+=1
        model_input[i].append(segments_q)
        
    for xx in range((MAX_SENTENCES*3)+3):
        model_input[xx] = np.asarray(model_input[xx], dtype=np.int32)
        
    print(model_input[0].shape)
    return model_input

In [16]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# import tensorflow_hub as hub
import tensorflow as tf
# import bert_tokenization as tokenization
import tensorflow.keras.backend as K
from tensorflow import keras 

import os
from scipy.stats import spearmanr
from math import floor, ceil
from transformers import *

import seaborn as sns
import string
import re    #for regex
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def input_for_humor_detection(input_str: list,model,tokenizer):
    
    
    input_df=pd.DataFrame(data=input_str,columns=['text'])
    pred_input = compute_input_arrays(input_df, ['text'], tokenizer)
    pred_input = model.predict(pred_input)
    for split in np.arange(0.1, 0.99, 0.1).tolist():
        input_df['pred_bi'] = (pred_input > split)

    #print_evaluation_metrics(df_sub['humor'], df_sub['pred_bi'], '', False, 'SPLIT on '+str(split))

    #input_df.to_csv('sub3.csv', index=False)
    print(input_df.head())
    if input_df['pred_bi'][0]==True:
        print("Hahah you are funny")
    else:
        print("you are not funny")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spark\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
input_for_humor_detection(["All good"],model,tokenizer)

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(1, 20)
       text  pred_bi
0  All good    False
you are not funny


NameError: name 'input_str' is not defined