In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found underneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from tqdm import tqdm
# from keras.models import Sequential
# from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
# from keras.initializers import Constant
# from keras.optimizers import Adam
# from keras import regularizers
# import keras.backend as K

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import tensorflow_hub as hub

try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import bert


In [None]:
def recall_m(y_true, y_pred):
    """Recall over the given tensors, built from Keras backend ops.

    ``y_true * y_pred`` and ``y_true`` are each clipped to [0, 1] and rounded
    before being summed; the ratio of the two sums is returned, with
    ``K.epsilon()`` added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors, built from Keras backend ops.

    ``y_true * y_pred`` and ``y_pred`` are each clipped to [0, 1] and rounded
    before being summed; the ratio of the two sums is returned, with
    ``K.epsilon()`` added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score combining ``precision_m`` and ``recall_m``.

    Returns ``2 * (P * R) / (P + R + K.epsilon())``; the epsilon keeps the
    result finite when both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
# Load the pre-processed training table and preview its first five rows.
train = pd.read_csv('../input/extensive-pre-processing-for-bert/processed train.csv')
train.head()

In [None]:
# Show the processed text of the row labelled 4 as a spot check.
train.at[4, 'processed_text']

In [None]:
# Load the pre-processed test table.
test = pd.read_csv('../input/extensive-pre-processing-for-bert/processed test.csv')
# Index the rows by 'id' while keeping 'id' available as an ordinary column.
test = test.set_index('id', drop=False)
test.head()

In [None]:
# TF-Hub handle of the BERT module in use. Alternatives tried previously:
#   https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1
#   https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/1
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"

# trainable=True so the BERT weights are updated when the model is fitted.
bert_layer = hub.KerasLayer(handle=module_url, trainable=True)


# BERT Encodings

In [None]:
"""BERT Methods Predefined"""
def bert_encode(texts, tokenizer, max_len=50):
    """Convert raw texts into the three fixed-length arrays passed to the BERT layer.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, mapped to ids and right-padded with 0 up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: total sequence length including the two special tokens;
            must be at least 2.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of integer arrays, each of
        shape ``(len(texts), max_len)``. ``pad_masks`` is 1 on real tokens and 0
        on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: if ``max_len`` is smaller than 2, because there would be no
            room for the ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be >= 2 to fit [CLS] and [SEP], got {}".format(max_len)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens added below.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # The reshape keeps a (0, max_len) integer shape even when `texts` is empty.
    return (
        np.array(all_tokens, dtype=int).reshape(-1, max_len),
        np.array(all_masks, dtype=int).reshape(-1, max_len),
        np.array(all_segments, dtype=int).reshape(-1, max_len),
    )


In [None]:
# Report the dimensions of both tables as a sanity check after loading.
print('There are {} rows and {} columns in train'.format(train.shape[0],train.shape[1]))
# The second message now names the table it actually describes (it said "train").
print('There are {} rows and {} columns in test'.format(test.shape[0],test.shape[1]))

In [None]:
# Build the tokenizer from the vocabulary file and lower-casing flag exposed by
# the loaded BERT hub module, so tokenization uses the module's own settings.
BertTokenizer = bert.bert_tokenization.FullTokenizer
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = BertTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode every training text (no hold-out) and keep its labels in two forms:
# the raw integer vector and a two-column one-hot matrix.
full_input = bert_encode(train['processed_text'].values, tokenizer, max_len=50)
full_labels = train['target'].values.copy()
full_labels_oe = to_categorical(full_labels, num_classes=2)

In [None]:
# Hold out 15% of the labeled rows for validation; the fixed seed keeps the
# split reproducible across runs.
train_data, val_data, train_labels, val_labels = train_test_split(
    train.processed_text.values,
    train.target.values,
    test_size=0.15,
    random_state=10,
)

# Encode each split with the same tokenizer and sequence length.
train_input = bert_encode(train_data, tokenizer, max_len=50)
val_input = bert_encode(val_data, tokenizer, max_len=50)
test_input = bert_encode(test.processed_text.values, tokenizer, max_len=50)

# One-hot encode the labels of the two labeled splits. The original line used
# `test_labels`, which is never assigned in this notebook and raised NameError.
train_labels_oe = to_categorical(train_labels, 2)
val_labels_oe = to_categorical(val_labels, 2)

In [None]:
# Optimiser settings consumed when the model is compiled: both values are
# passed to Adam (as its learning rate and its `decay` argument).
learning_rate=9e-6
decay=9e-2
# Sequence length used for the shape of the model's Input layers.
# NOTE(review): the bert_encode(...) calls hard-code max_len=50 separately; keep them in sync.
max_len=50

# Clear the Keras backend session before the model graph is built below.
# NOTE(review): bert_layer was created before this call — confirm it is unaffected.
K.clear_session()

In [None]:
# Three integer inputs of length `max_len`, handed to the BERT layer as one list.
input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
bert_inputs = [input_word_ids, input_mask, segment_ids]
pooled_output, sequence_output = bert_layer(bert_inputs)

# Use the vector at sequence position 0 (where bert_encode places "[CLS]")
# as the sentence representation.
clf_output = sequence_output[:, 0, :]

# Classification head: three 1024-unit ReLU layers, the first two with L2
# activity regularisation, followed by a single sigmoid unit.
# (Earlier variants tried: 100-unit layers, dropout before the head, no hidden layers.)
out = clf_output
for head_regularizer in (regularizers.l2(9e-5), regularizers.l2(9e-5), None):
    out = Dense(1024, activation='relu', activity_regularizer=head_regularizer)(out)

out = Dense(1, activation='sigmoid')(out)


## Type4
# out = clf_output

In [None]:
# Assemble the end-to-end classifier: three BERT inputs -> sigmoid probability.
sBERT = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

# Binary cross-entropy matches the single sigmoid output; accuracy and the
# custom F1 metric are tracked during training. `learning_rate=` replaces the
# deprecated `lr=` alias of the Adam constructor.
sBERT.compile(
    optimizer=Adam(learning_rate=learning_rate, decay=decay),
    loss='binary_crossentropy',
    metrics=['accuracy', f1_m],
)
sBERT.summary()

In [None]:
# init_weights = sBERT.get_weights()

In [None]:
# sBERT.set_weights(init_weights)
# sBERT.layers[3].trainable = True
# sBERT.compile(Adam(lr=9e-6, decay=9e-2), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Checkpoint callbacks that keep only the best weights according to the
# monitored quantity (validation F1 and validation loss respectively).
# NOTE(review): neither callback is passed to fit() below, and fit() receives no
# validation data, so the monitored 'val_*' values would not exist — confirm intent.
checkpoint1 = ModelCheckpoint(filepath='best_accuracy.h5', monitor='val_f1_m', save_best_only=True)
checkpoint2 = ModelCheckpoint(filepath='best_loss.h5', monitor='val_loss', save_best_only=True)

# Fine-tune on the complete labeled set (no hold-out) for three epochs.
train_history = sBERT.fit(
    x=full_input,
    y=full_labels,
    batch_size=12,
    epochs=3,
)

In [None]:
# # K.set_value(sBERT.optimizer.lr, 9e-7)
# train_history = sBERT.fit(
#     full_input, full_labels,
#     epochs = 3,
#     batch_size = 16
# #     callbacks = [checkpoint1]
# )

In [None]:
# sBERT.layers[3].trainable = False
# sBERT.compile(Adam(lr=5e-4, decay=5e-2), loss='binary_crossentropy', metrics=['accuracy',f1_m])

In [None]:
# # K.set_value(sBERT.optimizer.lr, 5e-3)
# sBERT.fit(
#     full_input, full_labels,
#     epochs = 5,
#     callbacks=[checkpoint1],
#     batch_size = 16
# )

In [None]:
# sBERT.load_weights('testacc.h5')

In [None]:
# Sub-model sharing sBERT's inputs and weights, ending at sBERT's layer index 4.
# NOTE(review): index 4 is presumably the `sequence_output[:, 0, :]` slice that
# feeds the dense head — confirm against the summary printed below.
bert_encoder = Model(inputs=sBERT.inputs, outputs=sBERT.layers[4].output)
bert_encoder.summary()

# bert_encoder.layers[-3].set_weights(sBERT.layers[-4].get_weights())
# bert_encoder.layers[-2].set_weights(sBERT.layers[-3].get_weights())

In [None]:
%%time
# Run each encoded split through the encoder, in the order full, train, test.
full_embed, train_embed, test_embed = [
    bert_encoder.predict(encoded_split)
    for encoded_split in (full_input, train_input, test_input)
]

# Pickle BERT Encodings for further use

In [None]:
import pickle

# Write mode ('wb') replaces any earlier file. The previous append mode ('ab')
# stacked a new pickle after the old one on every re-run, so a later
# pickle.load() would silently return the oldest, stale embeddings.
with open('Train BERT 1024d Embed', 'wb') as fo:
    pickle.dump(full_embed, fo)

with open('Test BERT 1024d Embed', 'wb') as fo:
    pickle.dump(test_embed, fo)

In [None]:
# with open('Train BERT 1024d Embed', 'rb') as fo:
#     tt = pickle.load(fo, encoding='latin1')

# Support Vector Machine

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.svm import SVC

In [None]:
%%time
# RBF-kernel SVM trained on the embeddings of the complete labeled set.
svc_model = SVC(C=3, kernel='rbf', gamma=0.01)
svc_model.fit(full_embed, full_labels)
# Alternative: fit on the training split only.
# svc_model.fit(train_embed, train_labels)

In [None]:
import xgboost as xgb

In [None]:
%%time

# Gradient-boosted trees trained on the embeddings of the complete labeled set.
clf = xgb.XGBClassifier(
    max_depth=200,
    n_estimators=400,
    subsample=1,
    learning_rate=0.07,
    reg_lambda=0.1,
    reg_alpha=0.1,
    gamma=1,
)
clf.fit(full_embed, full_labels)
# clf.fit(train_embed, train_labels)

predictions = clf.predict(full_embed)
# predictions = clf.predict(train_embed)

# Score against the labels of the rows that were predicted. The original
# compared full-set predictions with `train_labels` (the 85% split), whose
# length does not match.
print ("Training set f1_score :", np.round(f1_score(full_labels, predictions),5))

In [None]:
# Hard 0/1 predictions on the test set; these feed the submission files.
test_pred1 = clf.predict(test_embed).round().astype(int)
test_pred2 = svc_model.predict(test_embed).round().astype(int)
# The network has a single sigmoid output column, so predict() yields shape
# (n, 1); flatten it so all three prediction arrays are 1-D.
test_pred3 = sBERT.predict(test_input).round().astype(int).ravel()

# `test_labels` is never assigned in this notebook, so scoring against it raised
# NameError. Score on the labeled validation split instead.
# NOTE(review): all three models were fitted on the full labeled data, which
# contains the validation rows, so these figures are optimistic.
val_embed = bert_encoder.predict(val_input)
val_pred1 = clf.predict(val_embed).round().astype(int)
val_pred2 = svc_model.predict(val_embed).round().astype(int)
val_pred3 = sBERT.predict(val_input).round().astype(int).ravel()

print("XGBOOST: ", accuracy_score(val_labels, val_pred1), f1_score(val_labels, val_pred1))
print("SVC: ",accuracy_score(val_labels, val_pred2), f1_score(val_labels, val_pred2))
print("BERT: ",accuracy_score(val_labels, val_pred3), f1_score(val_labels, val_pred3))

In [None]:
# Start from the competition's sample submission and fill in 'target' per model.
# NOTE(review): assumes the sample file's row order matches the test table — confirm.
sub = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# One CSV per model; 'target' is overwritten on every pass, so `sub` finally
# holds the predictions of the last entry.
for out_name, model_pred in (
    ('submission_xgboost.csv', test_pred1),
    ('submission_svc.csv', test_pred2),
    ('submission_bertnn.csv', test_pred3),
):
    sub['target'] = model_pred
    sub.to_csv(out_name, index=False)