The author Fanglida Yan has used code from these references in the notebook. <br>
BERT for classification: https://github.com/dipanjanS/deep_transfer_learning_nlp_dhs2019/blob/master/notebooks/6%20-%20Transformers%20-%20DistilBERT.ipynb <br>
remove @, # and http://... : https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression <br>

0. feature preprocessing <br>
1. split the data into training and held-out validation sets <br>
2. create features for BERT<br>
3. build the model in Keras <br>
4. model tuning and cross validation <br>
5. make prediction for test set <br>

**read in the data**

In [None]:
import re

import numpy as np
import pandas as pd
# Read the training and test CSVs from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

**take a look at the tweets**

In [None]:
# Print the value in column position 3 for the first 100 rows of `train`.
for row_pos in range(0, 100):
    tweet = train.iat[row_pos, 3]
    print(tweet)

**0. remove #, @, punctuation and web links** <br>
**copied from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression**

In [None]:
# Clean every training tweet in one vectorised pass:
#   * drop @mentions and URLs,
#   * replace every character that is not an ASCII letter, digit, space or tab
#     with a space (this also strips '#' and punctuation),
#   * collapse the resulting runs of whitespace into single spaces.
# The pattern is a raw string so that \w, \/ and \S reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.
# The 'text' column is used because that is what the later cells feed to the
# tokenizer; the original loop addressed the same data by position (column 3).
train['text'] = train['text'].map(
    lambda tweet: ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split()
    )
)

In [None]:
# Apply the same cleaning to the test tweets as to the training tweets:
# remove @mentions and URLs, turn every character that is not an ASCII letter,
# digit, space or tab into a space, then collapse whitespace.
# The pattern is a raw string so that \w, \/ and \S are interpreted by the
# regex engine rather than as (invalid) Python string escapes.
# The 'text' column is used because that is what the later cells feed to the
# tokenizer; the original loop addressed the same data by position (column 3).
test['text'] = test['text'].map(
    lambda tweet: ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split()
    )
)

**the tweets now look much cleaner**

In [None]:
# Print column position 3 for 100 consecutive rows, starting at row 137.
for row_pos in range(137, 137 + 100):
    tweet = train.iat[row_pos, 3]
    print(tweet)

**create bert input features**

In [None]:
!pip3 install tqdm
!pip3 install transformers

In [None]:
import transformers
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint, the same
# checkpoint name the model cell loads.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

In [None]:
import tqdm

def create_bert_input_features(tokenizer, docs, max_seq_length):
    """Turn raw documents into fixed-length token-id and attention-mask rows.

    Each document is tokenized, cut to ``max_seq_length - 2`` tokens, wrapped
    in ``'[CLS]'`` / ``'[SEP]'`` and converted to ids. Rows shorter than
    ``max_seq_length`` are right-padded with 0; the mask holds 1 for real
    tokens and 0 for padding.

    Args:
        tokenizer: object providing ``tokenize(doc)`` and
            ``convert_tokens_to_ids(tokens)``.
        docs: iterable of documents to encode.
        max_seq_length: target length of every encoded row.

    Returns:
        ``np.array([ids, masks])`` -- index 0 holds the id rows and index 1
        the mask rows, so the result can be unpacked into two arrays.
    """
    body_budget = max_seq_length - 2  # room left after [CLS] and [SEP]
    id_rows, mask_rows = [], []

    for text in tqdm.tqdm(docs, desc="Converting docs to features"):
        pieces = tokenizer.tokenize(text)
        if len(pieces) > body_budget:
            pieces = pieces[:body_budget]
        token_ids = tokenizer.convert_tokens_to_ids(['[CLS]', *pieces, '[SEP]'])

        n_real = len(token_ids)
        # A non-positive count yields an empty list, i.e. no padding.
        padding = [0] * (max_seq_length - n_real)

        id_rows.append(token_ids + padding)
        mask_rows.append([1] * n_real + padding)

    return np.array([id_rows, mask_rows])

**separate the training data into train and validation sets**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the rows for validation; the fixed random_state makes the
# split reproducible.
train_X, val_X, train_Y, val_Y = train_test_split(
    train['text'],
    train['target'],
    test_size=0.15,
    random_state=42,
)

In [None]:
# Convert each split to a plain NumPy array. np.asarray (unlike `.values`)
# also accepts something that is already an array, so re-running this cell is
# harmless instead of raising AttributeError on the second run.
train_X, val_X, train_Y, val_Y = (
    np.asarray(part) for part in (train_X, val_X, train_Y, val_Y)
)

In [None]:
# Test-set tweet texts as an array, matching the form of train_X / val_X.
test_X = test.loc[:, 'text'].values

In [None]:
# Length of every encoded row (token ids and masks are padded to this size).
# NOTE(review): 500 looks generous for tweet-sized texts and drives the cost
# of every training step -- confirm this value is intentional.
MAX_SEQ_LENGTH = 500

# Encode the training split.
train_encoded = create_bert_input_features(
    tokenizer, train_X, max_seq_length=MAX_SEQ_LENGTH
)
train_features_ids, train_features_masks = train_encoded

# Encode the validation split.
val_encoded = create_bert_input_features(
    tokenizer, val_X, max_seq_length=MAX_SEQ_LENGTH
)
val_features_ids, val_features_masks = val_encoded

print('Train Features:', train_features_ids.shape, train_features_masks.shape)
print('Val Features:', val_features_ids.shape, val_features_masks.shape)

In [None]:
# Encode the test tweets with the same tokenizer and sequence length as the
# training and validation splits.
test_encoded = create_bert_input_features(
    tokenizer, test_X, max_seq_length=MAX_SEQ_LENGTH
)
test_features_ids, test_features_masks = test_encoded
print('Test Features:', test_features_ids.shape, test_features_masks.shape)

**define model**

In [None]:
import tensorflow as tf

In [None]:
# Two int32 inputs of length MAX_SEQ_LENGTH: token ids and attention masks.
inp_id = tf.keras.layers.Input(
    name="bert_input_ids", shape=(MAX_SEQ_LENGTH,), dtype='int32'
)
inp_mask = tf.keras.layers.Input(
    name="bert_input_masks", shape=(MAX_SEQ_LENGTH,), dtype='int32'
)
inputs = [inp_id, inp_mask]

# Pretrained DistilBERT encoder; element 0 of its output is used as the
# per-position hidden states.
distilbert = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
hidden_state = distilbert(inputs)[0]
# Keep only position 0 of each sequence -- the slot where the feature builder
# places the '[CLS]' token.
pooled_output = hidden_state[:, 0]

# Classification head: two Dense(256, relu) + Dropout(0.3) stages followed by
# a single sigmoid unit.
dense1 = tf.keras.layers.Dense(units=256, activation='relu')(pooled_output)
drop1 = tf.keras.layers.Dropout(rate=0.3)(dense1)
dense2 = tf.keras.layers.Dense(units=256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(rate=0.3)(dense2)
output = tf.keras.layers.Dense(units=1, activation='sigmoid')(drop2)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=2e-6, epsilon=1e-08),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)

**fit and tune model**

In [None]:
# Early stopping on validation loss with a patience of 2 epochs;
# restore_best_weights=True puts the best weights back when training stops.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    verbose=1,
    restore_best_weights=True,
)

In [None]:
# Train for up to 10 epochs, validating on the held-out split after each one;
# the `es` callback may end training earlier.
model.fit(
    x=[train_features_ids, train_features_masks],
    y=train_Y,
    validation_data=([val_features_ids, val_features_masks], val_Y),
    batch_size=20,
    epochs=10,
    shuffle=True,
    callbacks=[es],
    verbose=1,
)

**evaluate on the held-out validation set**

In [None]:
# Flattened sigmoid outputs for the validation set.
val_scores = model.predict(
    [val_features_ids, val_features_masks], batch_size=200, verbose=0
).ravel()

# Hard labels: 1 when the score is strictly above 0.5, otherwise 0.
predictions = [int(score > 0.5) for score in val_scores]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
# Positive-class probabilities for the validation set. ROC AUC measures how
# well the scores rank positives above negatives, so it has to be computed
# from these continuous outputs; passing the thresholded 0/1 labels (as the
# previous version did) reduces the curve to a single operating point.
val_probabilities = model.predict(
    [val_features_ids, val_features_masks], batch_size=200, verbose=0
).ravel()

# Null accuracy: the accuracy of always predicting the majority class.
positive_rate = val_Y.mean()
print('null accuracy:', max(positive_rate, 1 - positive_rate))
print("Accuracy: %.2f%%" % (accuracy_score(val_Y, predictions)*100))
print("roc auc:", roc_auc_score(val_Y, val_probabilities))
print(classification_report(val_Y, predictions))
# Confusion matrix as the cell's displayed output (rows: true, columns: predicted).
pd.DataFrame(confusion_matrix(val_Y, predictions))

**predict the test set**

In [None]:
# Sigmoid outputs for the test set, one row per tweet.
test_Y = model.predict(
    [test_features_ids, test_features_masks], batch_size=200, verbose=0
)

# Label a tweet 1 when its output is at least 0.5, otherwise 0.
test_label = [1 if row >= 0.5 else 0 for row in test_Y]

**submit**

In [None]:
filename = 'submission_nlp_tweets_bert.csv'

# Pair each test id with its predicted label, show the first rows, then write
# the CSV without the DataFrame index.
submission = pd.DataFrame({'id': test['id'], 'target': test_label})
print(submission.head(10))
submission.to_csv(filename, index=False)