Transfer learning is when a model developed for one task is reused to work on a second task. 
Fine tuning is one approach to transfer learning.

BERT (Bidirectional Encoder Representations from Transformers) is a large neural network architecture whose parameter count ranges from about 100 million to over 300 million. Training a BERT model from scratch on a small dataset would therefore result in overfitting.

So, it is better to use a pre-trained BERT model that was trained on a huge dataset, as a starting point. We can then further train the model on our relatively smaller dataset and this process is known as model fine-tuning.
“BERT stands for Bidirectional Encoder Representations from Transformers. It is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of NLP tasks.”
Different Fine-Tuning Techniques
Train the entire architecture – We can further train the entire pre-trained model on our dataset and feed the output to a softmax layer. In this case, the error is back-propagated through the entire architecture and the pre-trained weights of the model are updated based on the new dataset.
Train some layers while freezing others – Another way to use a pre-trained model is to train it partially. We can keep the weights of the initial layers of the model frozen while we retrain only the higher layers. How many layers to freeze and how many to train can be determined by experiment.
Freeze the entire architecture – We can even freeze all the layers of the model and attach a few neural network layers of our own and train this new model. Note that the weights of only the attached layers will be updated during model training.


# Install Needed libraries

In [None]:
!pip install tensorflow_hub
!pip install bert-for-tf2
!pip install tensorflow
!pip install sentencepiece
!pip install transformers

# Import Needed Libraries

In [None]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub
import bert
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers import DistilBertModel,DistilBertTokenizer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow.keras.layers import Dense, Input
from sklearn.metrics import classification_report
from tensorflow.keras import backend as K
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
import os
import warnings
warnings.filterwarnings("ignore")
# List every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# load Bert tokenizer

In [None]:
# Bind the name BertTokenizer to bert-for-tf2's FullTokenizer class.
# NOTE(review): this rebinds the BertTokenizer name imported from `transformers`
# in the import cell — confirm that the shadowing is intended.
BertTokenizer = bert.bert_tokenization.FullTokenizer
# Load the TF-Hub BERT layer with trainable=False; in this cell it is only used
# to read the vocabulary file path and the lower-casing flag.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Module-level tokenizer used by the helper functions below.
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

# Helper Functions

In [None]:
import re
import emoji
import nltk
#import stopwords corpus from nltk
from nltk.corpus import stopwords
import string #load punctuation charachers

# Set of English stop words, built once so preprocess_text can do fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available — confirm.
stopwrds = set(stopwords.words('english'))

# Matches one tag-like span: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every span matched by ``TAG_RE`` deleted."""
    without_tags = re.sub(TAG_RE, '', text)
    return without_tags

def preprocess_text(sen):
    """Clean one raw text string and return the cleaned string.

    Steps, in order: convert emojis with ``emoji.demojize``, drop URLs,
    drop tokens found in ``stopwrds``, strip tag-like spans via
    ``remove_tags``, delete punctuation, drop isolated single letters and
    collapse whitespace.

    Args:
        sen: the input string. Callers in this notebook lower-case it first;
            the stop-word test is a case-sensitive set lookup.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    # translate emojis
    sentence = emoji.demojize(sen)

    # Remove URLs. `https?` covers both http: and https: links; the previous
    # pattern only matched the plain "http:" scheme.
    sentence = re.sub(r"https?:\S+", '', sentence)

    # remove stop words
    sentence = ' '.join(tok for tok in nltk.word_tokenize(sentence) if tok not in stopwrds)

    # Removing html tags
    sentence = remove_tags(sentence)

    # Remove punctuation (digits are kept). The characters are escaped so that
    # ']', '\\', '^' and '-' are treated literally inside the character class;
    # unescaped, the backslash was consumed as an escape and never removed.
    sentence = re.sub('[' + re.escape(string.punctuation) + ']', '', sentence)

    # Single character removal (only letters surrounded by whitespace on both sides)
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.strip()

In [None]:
def tokenize_bert(data):
    """Map every text in ``data`` to a list of token ids prefixed with the [CLS] id.

    ``data`` must provide ``.apply`` (presumably a pandas Series — confirm
    against callers). Uses the module-level ``tokenizer``. No [SEP] id is
    appended.
    """
    def _to_ids(text):
        cls_ids = tokenizer.convert_tokens_to_ids(['[CLS]'])
        word_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        return cls_ids + word_ids

    return data.apply(_to_ids)
def pad_mask(data_tokenized, max_len):
    """Pad id sequences to ``max_len`` and build the matching 0/1 mask.

    Returns a ``(padded, masked)`` pair: ``padded`` is the int32 array produced
    by Keras ``pad_sequences`` with zeros appended at the end of each row, and
    ``masked`` holds 1 where ``padded`` is non-zero and 0 elsewhere.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    padded = pad_sequences(
        data_tokenized,
        maxlen=max_len,
        dtype='int32',
        padding='post',
        value=0.0,
    )
    # Any non-zero id counts as a real token; zeros are padding.
    is_token = padded != 0
    masked = np.where(is_token, 1, 0)
    return padded, masked
def get_max_len(data):
    """Return the largest token count over the texts in ``data`` (0 if it is empty).

    Each text is split with the module-level ``tokenizer``; special tokens such
    as [CLS] are not counted.
    """
    token_counts = (len(tokenizer.tokenize(text)) for text in data)
    return max(token_counts, default=0)

In [None]:
def summarize_model(history):
    """Plot training vs. validation loss and accuracy from a Keras ``History``.

    Reads the 'loss', 'val_loss', 'accuracy' and 'val_accuracy' series from
    ``history.history`` and draws them in two stacked subplots.
    """
    # (subplot position, title, training series key, validation series key)
    panels = (
        (211, 'Loss', 'loss', 'val_loss'),
        (212, 'Accuracy', 'accuracy', 'val_accuracy'),
    )
    for position, heading, train_key, val_key in panels:
        pyplot.subplot(position)
        pyplot.title(heading)
        pyplot.plot(history.history[train_key], label='train')
        pyplot.plot(history.history[val_key], label='test')
        pyplot.legend()
    pyplot.show()

In [None]:
def encode(df):
    """Turn an iterable of texts into the three dense input tensors for BERT.

    Every text is tokenized with the module-level ``tokenizer`` and prefixed
    with the [CLS] id (no [SEP] id is appended). Rows are padded with zeros to
    the longest row in ``df``.

    Returns:
        dict with keys 'input_ids' (token ids), 'input_mask' (1 for real
        tokens, 0 for padding) and 'input_type_ids' (all zeros).
    """
    token_ids = tf.ragged.constant(
        [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for text in df]
    )
    # One [CLS] id per row, to be concatenated in front of each token row.
    cls_column = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * token_ids.shape[0]
    word_ids = tf.concat([cls_column, token_ids], axis=-1)

    # Segment ids are zero for the [CLS] position and for every token.
    segment_ids = tf.concat(
        [tf.zeros_like(cls_column), tf.zeros_like(token_ids)], axis=-1
    )

    return {
        'input_ids': word_ids.to_tensor(),
        # Ones on the ragged rows become 1/0 once to_tensor() pads with zeros.
        'input_mask': tf.ones_like(word_ids).to_tensor(),
        'input_type_ids': segment_ids.to_tensor(),
    }

# Load Data

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
# Print the column/dtype summary of the training frame.
train.info()

# Data Statistics

In [None]:
# Add a "clean" column to both frames: lower-case the raw text, then run preprocess_text on it.
for frame in (train, test):
    frame["clean"] = frame["text"].apply(lambda raw: preprocess_text(raw.lower()))

print("length of train set:", len(train))
print("length of test set:", len(test))

In [None]:
# Display the cleaned training texts as the cell output.
train["clean"]

# Prepare Data for Bert 

In [None]:
# Stack train and test rows (train first) so both are encoded in one pass.
# Later cells split the encoded rows back into train and test by position,
# so the train-then-test order matters.
all_df = pd.concat([train,test])

In [None]:
# encode and prepare data for Bert input (train and test rows together)
encode_ds_all = encode(all_df["clean"])
# Sequence length handed to the model. It is read from the encoded tensor
# itself: encode() pads every row to the longest row in train + test
# (including the leading [CLS] id), which can be longer than the train-only
# maximum. The model's Input shape has to match that padded width.
max_len = int(encode_ds_all["input_ids"].shape[1])

In [None]:
# all_df was built as train followed by test, so the first len(train) encoded
# rows are the training rows. Using len(train) instead of a literal row count
# keeps the split correct if the training file changes.
n_train = len(train)
encode_ds_tr = {name: tensor[:n_train, :] for name, tensor in encode_ds_all.items()}
encode_ds_tr

# Build Model

In [None]:
def build_bert(max_len):
    """Build a binary classifier: TF-Hub BERT encoder followed by one sigmoid unit.

    Args:
        max_len: length of each input sequence; all three inputs have shape
            ``(max_len,)`` and dtype int32.

    Returns:
        An uncompiled ``keras.Model`` whose inputs are a dict keyed by
        'input_ids', 'input_mask' and 'input_type_ids' and whose output comes
        from a ``Dense(1, activation='sigmoid')`` layer on the encoder's first output.
    """
    def _int_input(name):
        return keras.layers.Input(shape=(max_len,), name=name, dtype=tf.int32)

    word_ids = _int_input("input_ids")
    segment_ids = _int_input("input_type_ids")
    attention_mask = _int_input("input_mask")

    ## BERT encoder (trainable, so its weights are updated during fine-tuning)
    encoder = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=True,
        name='keraslayer',
    )
    # Only the first of the two encoder outputs feeds the classification head.
    pooled, _unused = encoder([word_ids, attention_mask, segment_ids])
    probability = keras.layers.Dense(1, activation='sigmoid')(pooled)

    return keras.Model(
        inputs={
            "input_ids": word_ids,
            "input_mask": attention_mask,
            'input_type_ids': segment_ids,
        },
        outputs=probability,
    )

# Instantiate the classifier for the computed sequence length and print its layer summary.
model = build_bert(max_len)
model.summary()

In [None]:
# Render the model graph with tensor shapes as the cell output.
tf.keras.utils.plot_model(model, show_shapes=True, dpi=48)

# Fine-tune BERT: training settings

In [None]:
# Training labels: the "target" column of the training frame.
y_enc = train["target"]

In [None]:
# The model ends in a sigmoid unit, so the loss works on probabilities (from_logits=False).
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# `learning_rate` replaces the deprecated `lr` alias.
# NOTE(review): `decay` is a legacy-optimizer argument — confirm it is accepted
# by the installed TensorFlow version.
optimizer = keras.optimizers.Adam(learning_rate=1e-6, decay=1e-6/32)
# The model has a single output, so a single loss is passed (not a list of two).
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
# Keep the weights of the best epoch on disk and stop once validation accuracy
# stops improving.
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy')
# The callbacks must be passed to fit(); they were previously created but never used.
fine_history = model.fit(
    encode_ds_tr,
    y_enc,
    validation_split=0.34,
    shuffle=True,
    epochs=3,
    batch_size=32,
    verbose=1,
    callbacks=[checkpoint, earlystopping],
)

In [None]:
# Plot the loss and accuracy curves recorded during fine-tuning.
summarize_model(fine_history)

# Evaluation

In [None]:
# Predict on the training inputs and round the sigmoid outputs to 0/1.
# Note: this report covers the same rows that were passed to fit().
y_pred = model.predict(encode_ds_tr).round()
print(classification_report(y_enc, y_pred))

In [None]:
# all_df was built as train followed by test, so every encoded row after the
# first len(train) rows belongs to the test set. Using len(train) instead of a
# literal row count keeps the split correct if the training file changes.
n_train = len(train)
encode_ds_ts = {name: tensor[n_train:, :] for name, tensor in encode_ds_all.items()}
encode_ds_ts

In [None]:
# Predict on the test rows and round the sigmoid outputs to 0/1.
y_pred = model.predict(encode_ds_ts).round()

# Start from the sample submission file, then overwrite its id and target columns.
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
submission['id'] = test['id']
submission['target'] = y_pred
submission['target'] = submission['target'].astype(int)
submission.head(10)


In [None]:
# Write the submission frame to the working directory without the index column.
submission.to_csv('sample_submission.csv',index=False)