In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re  # for regex
import seaborn as sns
import os
import unidecode
from autocorrect import Speller
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, confusion_matrix, 
                             ConfusionMatrixDisplay, roc_auc_score, roc_curve, auc, make_scorer)
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
# import tensorflow_hub as hub
# import bert_tokenization as tokenization
from transformers import *
from scipy.stats import spearmanr
from math import floor, ceil
from nltk.corpus import stopwords
np.set_printoptions(suppress=True)

print(tf.__version__)









2.16.1


In [2]:
from transformers import BertTokenizer, TFBertModel, BertModel, BertConfig


tf_checkpoint_path = 'bert'
MAX_SIZE = 200
BATCH_SIZE = 500

tokenizer = BertTokenizer.from_pretrained(tf_checkpoint_path)
model = BertModel.from_pretrained(tf_checkpoint_path)

loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file bert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "ColBERT"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LAB

In [3]:
MAX_SENTENCE_LENGTH = 20
MAX_SENTENCES = 5
MAX_LENGTH = 100

In [4]:
humorousQ = pd.read_csv("Humorous.csv")
non_humorousQ = pd.read_csv("Non-humorous-unbiased.csv")

In [5]:
questions = pd.concat([humorousQ, non_humorousQ], ignore_index=True)
questions.rename(columns={'label': 'humor'}, inplace=True)
questions.drop(['product_description', 'image_url'], axis=1, inplace=True)
questions

Unnamed: 0,question,humor
0,Will the volca sample get me a girlfriend?,1
1,Can u communicate with spirits even on Saturday?,1
2,I won't get hunted right?,1
3,I have a few questions.. Can you get possessed...,1
4,Has anyone asked where the treasure is? What w...,1
...,...,...
19137,Serve na f800r 2013?,0
19138,Can it run mine sweeper abve 10 fps?,0
19139,What is the difference between the pro weight ...,0
19140,Can you provide me a phone number to the compa...,0


In [6]:
def case_convert():
    questions.question = [i.lower() for i in questions.question.values]

def remove_specials():
    questions.text =  [re.sub(r"[^a-zA-Z]"," ",text) for text in questions.question.values]

def remove_shorthands():
    CONTRACTION_MAP = {
    "u": "you",
    "abve": "above",
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }
    texts = []
    for text in questions.question.values:
        string = ""
        for word in text.split(" "):
            if word.strip() in list(CONTRACTION_MAP.keys()):
                string = string + " " + CONTRACTION_MAP[word]
            else:
                string = string + " " + word
        texts.append(string.strip())
    questions.question = texts

def remove_stopwords():
    texts = []
    stopwords_list = stopwords.words('english')
    for item in questions.question.values:
        string = ""
        for word in item.split(" "):
            if word.strip() in stopwords_list:
                continue
            else:
                string = string + " " + word
        texts.append(string)
                
def remove_links():
    texts = []
    for text in questions.question.values:
        remove_https = re.sub(r'http\S+', '', text)
        remove_com = re.sub(r"\ [A-Za-z]*\.com", " ", remove_https)
        texts.append(remove_com)
    questions.question = texts

def remove_accents():
    questions.question = [unidecode.unidecode(text) for text in questions.question.values]

def normalize_spaces():
    questions.question = [re.sub(r"\s+"," ",text) for text in questions.question.values]

case_convert()
remove_links()
remove_shorthands()
remove_accents()
remove_specials()
remove_stopwords()
normalize_spaces()
print(questions)

  questions.text =  [re.sub(r"[^a-zA-Z]"," ",text) for text in questions.question.values]


                                                question  humor
0             will the volca sample get me a girlfriend?      1
1      can you communicate with spirits even on satur...      1
2                           i will not get hunted right?      1
3      i have a few questions.. can you get possessed...      1
4      has anyone asked where the treasure is? what w...      1
...                                                  ...    ...
19137                               serve na f800r 2013?      0
19138              can it run mine sweeper above 10 fps?      0
19139  what is the difference between the pro weight ...      0
19140  can you provide me a phone number to the compa...      0
19141                 is the blenders sponge latex free?      0

[19142 rows x 2 columns]


In [7]:
df_train, df_test = train_test_split(questions, test_size=0.2)

In [8]:
test_df_y = df_test.copy()
del df_test['humor']

df_sub = test_df_y.copy()

print(len(questions),len(df_train),len(df_test))
display(df_train.head())
display(df_test.head())

19142 15313 3829


Unnamed: 0,question,humor
8159,can i use these to fight crime or will the oth...,1
3574,is there any chance that this tie can make ame...,1
17908,does it come packaged in its original package?,0
5948,does this product prevent broken hearts?,1
6602,if i seed over established st. augustine will ...,1


Unnamed: 0,question
16437,what kind of horn is this made from?
11612,is this one single ticket? i need a roll of them.
6657,do my kids need the hepatitis vaccine before p...
11036,why would you buy this its probably a small qu...
9058,can it make me a sandwich?


In [9]:
output_categories = list(df_train.columns[[1]])
input_categories = list(df_train.columns[[0]])

TARGET_COUNT = len(output_categories)

print('\ninput categories:\n\t', input_categories)
print('\noutput TARGET_COUNT:\n\t', TARGET_COUNT)
print('\noutput categories:\n\t', output_categories)


input categories:
	 ['question']

output TARGET_COUNT:
	 1

output categories:
	 ['humor']


In [10]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\grbha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
def return_id(str1, str2, truncation_strategy, length):

    inputs = tokenizer.encode_plus(str1, str2,
        add_special_tokens=True,
        max_length=length,
        truncation_strategy=truncation_strategy)

    input_ids =  inputs["input_ids"]
    input_masks = [1] * len(input_ids)
    input_segments = inputs["token_type_ids"]
    padding_length = length - len(input_ids)
    padding_id = tokenizer.pad_token_id
    input_ids = input_ids + ([padding_id] * padding_length)
    input_masks = input_masks + ([0] * padding_length)
    input_segments = input_segments + ([0] * padding_length)

    return [input_ids, input_masks, input_segments]


def compute_input_arrays(df, columns, tokenizer):
    model_input = []
    for xx in range((MAX_SENTENCES*3)+3):
        model_input.append([])
    
    for _, row in tqdm(df[columns].iterrows()):
        i = 0
        
        # sent
        sentences = sent_tokenize(row.question)
        for xx in range(MAX_SENTENCES):
            s = sentences[xx] if xx<len(sentences) else ''
            ids_q, masks_q, segments_q = return_id(s, None, 'longest_first', MAX_SENTENCE_LENGTH)
            model_input[i].append(ids_q)
            i+=1
            model_input[i].append(masks_q)
            i+=1
            model_input[i].append(segments_q)
            i+=1
        
        # full row
        ids_q, masks_q, segments_q = return_id(row.question, None, 'longest_first', MAX_LENGTH)
        model_input[i].append(ids_q)
        i+=1
        model_input[i].append(masks_q)
        i+=1
        model_input[i].append(segments_q)
        
    for xx in range((MAX_SENTENCES*3)+3):
        model_input[xx] = np.asarray(model_input[xx], dtype=np.int32)
        
    print(model_input[0].shape)
    return model_input

In [12]:
inputs      = compute_input_arrays(df_train, input_categories, tokenizer)
test_inputs = compute_input_arrays(df_test, input_categories, tokenizer)

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(15313, 20)


0it [00:00, ?it/s]

(3829, 20)


In [13]:
print(len(inputs), len(inputs[0]), len(inputs[0][0]))

# check out input for 7th row
xx = 7
print(df_train.iloc[xx,0])
print(sent_tokenize(df_train.iloc[xx,0]))
inputs[0][xx], inputs[3][xx], inputs[6][xx], inputs[15][xx]

18 15313 20
the second picture shows serrations. so this knife comes with them?
['the second picture shows serrations.', 'so this knife comes with them?']


(array([  101,  1996,  2117,  3861,  3065, 22737,  9285,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0]),
 array([ 101, 2061, 2023, 5442, 3310, 2007, 2068, 1029,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]),
 array([101, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]),
 array([  101,  1996,  2117,  3861,  3065, 22737,  9285,  1012,  2061,
         2023,  5442,  3310,  2007,  2068,  1029,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
          

In [14]:
def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

outputs = compute_output_arrays(df_train, output_categories)
outputs[:3]

array([[1],
       [1],
       [0]], dtype=int64)

In [15]:
# Evaluation Metrics
import sklearn
def print_evaluation_metrics(y_true, y_pred, label='', is_regression=True, label2=''):
    print('==================', label2)
    ### For regression
    if is_regression:
        print('mean_absolute_error',label,':', sklearn.metrics.mean_absolute_error(y_true, y_pred))
        print('mean_squared_error',label,':', sklearn.metrics.mean_squared_error(y_true, y_pred))
        print('r2 score',label,':', sklearn.metrics.r2_score(y_true, y_pred))
        #     print('max_error',label,':', sklearn.metrics.max_error(y_true, y_pred))
        return sklearn.metrics.mean_squared_error(y_true, y_pred)
    else:
        ### FOR Classification
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('average_precision_score',label,':', sklearn.metrics.average_precision_score(y_true, y_pred))
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('accuracy_score',label,':', sklearn.metrics.accuracy_score(y_true, y_pred))
        print('f1_score',label,':', sklearn.metrics.f1_score(y_true, y_pred))
        
        matrix = sklearn.metrics.confusion_matrix(y_true, y_pred)
        print(matrix)
        TP,TN,FP,FN = matrix[1][1],matrix[0][0],matrix[0][1],matrix[1][0]
        Accuracy = (TP+TN)/(TP+FP+FN+TN)
        Precision = TP/(TP+FP)
        Recall = TP/(TP+FN)
        F1 = 2*(Recall * Precision) / (Recall + Precision)
        print('Acc', Accuracy, 'Prec', Precision, 'Rec', Recall, 'F1',F1)
        return sklearn.metrics.accuracy_score(y_true, y_pred)


In [16]:
valid_inputs = inputs
valid_outputs = outputs

In [17]:
preds = model.predict(valid_inputs)


AttributeError: 'BertModel' object has no attribute 'predict'