<a href="https://colab.research.google.com/github/serdarbozoglan/My_NLP/blob/master/My_BERT_tokenizer2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stage 1: Importing dependencies

In [0]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [71]:
!pip install bert-for-tf2
!pip install sentencepiece



In [0]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [73]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    '/content/drive/My Drive/DS_Projects/BERT/sentiment_data/train.csv',
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)


In [0]:
# Kolaylik olmasi icin sadece ilk 20K ve son 20K yi alacagim data'dan (sirali oludgu icin ilk 20K negative sentiment, last 20K positive sentiment)
data1 = data[:20000]
data2 = data[-20000:]
data = pd.concat([data1, data2], axis=0)

In [0]:
## Drop unnecessary columns
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [77]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Preprocessing

### Cleaning

In [0]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @, mentions such as @tigers
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet) # [^] means "not" yani a-zA-Z etc olmayanlari degistir anlaminda
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [0]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [0]:
data_labels = data.sentiment.values

# We will convert 4 to 1 because in dataset positive is represented by 4 rather than 1
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [0]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [82]:
tokenizer.tokenize("My dog loves, strawberries.")

['my', 'dog', 'loves', ',', 'straw', '##berries', '.']

In [83]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('My dog loves, strawberries.'))

[2026, 3899, 7459, 1010, 13137, 20968, 1012]

In [0]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [0]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

### Dataset creation

We will create padded batches (so we pad sentences for each batch independently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [0]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]

## Initial/original file has ordered labels, first comes 0s then 4s(we converted to 1s later) so we need to shuffle
random.shuffle(data_with_len)

# data_with_len in elemanlari siranyla sentence, label ve sent lenght (row number 42)
# we're sorting the list based on the sentence length which is the index of [2] means 3rd element in the list
data_with_len.sort(key=lambda x: x[2])


In [87]:
# Those ones the last 3 longest sentences in our data set
print(data_with_len[-3:])

[[[1045, 2293, 2017, 1998, 3335, 2017, 1012, 1045, 2064, 1005, 1056, 3524, 2005, 2017, 1012, 2123, 1005, 1056, 2022, 5305, 999, 999, 1998, 1012, 1012, 1012, 1045, 3246, 2115, 7514, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999], 1, 64], [[1045, 2293, 2017, 1998, 3335, 2017, 1012, 1045, 2064, 1005, 1056, 3524, 2005, 2017, 1012, 2123, 1005, 1056, 2022, 5305, 999, 999, 1998, 1012, 1012, 1012, 1045, 3246, 2115, 7514, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999], 1, 66], [[2420, 6229, 2047, 2327, 6718, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029, 1029,

In [88]:
print(data_with_len[:3])
# As we can see we have some sentences actually no word (those were cleanded actually) in in but we have label for those

[[[], 1, 0], [[], 0, 0], [[], 0, 0]]


In [0]:
# We'll only keep the sentences with at least 7 tokens and get rid of less
sorted_all = [(sent_lab[0], sent_lab[1]) # sent_lab means sentence label
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [0]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))
# first tf.int32 for inputs [word ids] and second one is for the labels [for 0 and 1]

In [0]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
# padded_shapes, (None, ) is used for inputs/token ids and () is used for labels, we will not padd the labels so it shoud be ()

In [92]:
next(iter(all_batched))
# What we get here is, for the first batch we have 32 different inputs with 8 tokens each one and corresponding labels
# We don't see any padding here actulaly

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2196,  2568,  2009,  2134,  1005,  1056,  2147,  4312],
        [ 2003,  4469,  8945,  4103, 10094,  2000,  5438, 13255],
        [24547, 22345,  2038, 17021,  2545,  1029,   999,  1029],
        [ 2038,  2053,  6501,  2005,  1037,  2452,  1997,  5572],
        [ 2003,  5305,  1997,  2009,  2182,   999,   999,   999],
        [ 2003,  2635,  1037,  3338,  2013,  3752, 16012,   999],
        [ 3791,  3637,  2021,  2064,  1005,  1056,  2079,  2009],
        [ 4530,  2242,  2011,  2032,  8840,  3363,  3363,  2140],
        [ 3398,  2008,  1005,  1055,  1037,  2204,  2028,  2205],
        [ 2667,  2000,  2147,  2041,  2129,  2023,  2035,  2573],
        [ 2021,  6266,  4665,  3475,  1005,  1056,  2648,  1012],
        [ 1998,  2054,  1005,  1055,  2187,  1997,  2256, 16324],
        [ 2003,  1037, 10474, 10459,  2099,   999,   999,   999],
        [ 7459,  1996,  1046,  2497,  1005,  1055,  2047,  2201],
        [ 8038,  2100,  1045

In [0]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10

In [94]:
print("Number of Batches in Test Set :", NB_BATCHES)
print("Number of Batches in Test Set :", NB_BATCHES_TEST) # Actually validation set is this one

Number of Batches in Test Set : 1032
Number of Batches in Test Set : 103


In [0]:
## all_batched i shuflle etmemiz gerekmektedir yoksa en kisa cumlelerden en uzun cumlelere dogru bir siralma var halihaizrda
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)  # we grap first NUMBER_BATCHES_TEST for validation
train_dataset = all_batched.skip(NB_BATCHES_TEST) # we skip first BUMBER_BATCHES_TEST but rest for training set

# Stage 3: Model building

In [0]:
# We're creating a Deep CNN (Convolutional Neural Netowrk) Class which intherits from tf.keras.Model class
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,       # embedding dimension is 128 as default now
                 nb_filters=50,     # we'll use 50 filters/ feature detectors as default, we will use 50 filters for filter size 2 and 50 filters for 3 and 50 filters for 4 filter size
                 FFN_units=512,     # Number of Hidden Units we will use in Dense Layers at the end. We'll have 2 Dense Layers. we'll use FFN_units there
                 nb_classes=2,      # We have 2 classes 
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):      # model name we gave

        super(DCNN, self).__init__(name=name)# We're using super class and initiliaze tf.keras.Model class

        # We're staring to create layers
        # We're startting with embedding layer
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)  # for vocab_size we create a vector embedding of each word and emd dim is embd_dim here

        # Srarting Creating CNN layers
        # First one will be the size of 2, means it will focus on 2 consecutive words, let's call it bigram
        # the width of feature detector is the same as input size, so will have 1 Dimensional vector when we aplly filter, remeber the figure from the lesson. We do not use smaller filter than input size width becasue it is nonnse to split the embedding vector. Whole vector represents a single word
        # out stride will be 1
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,          # We shift feature detector to only 1D  # kernel_size= 2 for bigrams
                                    padding="valid",        # sometimes feature detectors exceeds the inputs size when it is strided then padding='valid handle this 
                                    activation="relu")      # we only keep the positive results or 0 for the neagive results

        # We will create the same thing for filter size 3 and 4 as well, they will check the 3 and 4 consecutive words
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                     activation="relu")
        
        # This layer will get the max pool of feature detector/filter
        self.pool = layers.GlobalMaxPool1D()

        # We're creating the Feed Forward Neural Network parts (We'll use 2 dense layers)
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu") # Neuron numbers

        # We will apply droputs to avoid overfitting
        # Dropout will be applied only in TRAINING not in PREDICTION
        self.dropout = layers.Dropout(rate=dropout_rate)
        
        # Let's create the last Dense layer which is output layer
        if nb_classes == 2: # For binary classification
            self.last_dense = layers.Dense(units=1, # Don't confused here, if we have binary classes we will have 1 neuron
                                           activation="sigmoid")
        else: # multi-class
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training): #training shows if we are in training or not, it is boolean

        # training will decide if we apply droput or not, if it is True it means we're in Trainig  and will aplly droput 
        # if it is False it measn we're in Prediction and not apply dropout
    
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # # we get the shape of (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # # we get the shape of (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # # we get the shape of (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # we concat the results based on the last parameter of shape, it is nb_filters
        # merged shape is (batch_size, 3*nb_filters)

        # We're gonna apply our first dense layer
        merged = self.dense_1(merged)

        # Dropout will be applied if it is in TRAINING
        merged = self.dropout(merged, training)

        # output layer
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

In [0]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200           # you can play around with this number as a hyper parameter
NB_FILTERS = 100        # you can play around with this number as a hyper parameter
FFN_UNITS = 256         # you can play around with this number as a hyper parameter
NB_CLASSES = 2

DROPOUT_RATE = 0.2      # you can play around with this number as a hyper parameter

NB_EPOCHS = 5           # you can play around with this number as a hyper parameter

In [0]:
# We're creating our Neural Network
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [0]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [0]:
## We will save the weight of the trained model to use it later as well
checkpoint_path = "./drive/My Drive/DS_Projects/BERT/ckpt_bert_tok/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)
#max_to_keep shows how many checkpoints will be kept in this folder, we may increase this number if we wanna keep previuos checkpoints as well

if ckpt_manager.latest_checkpoint: # if we have a checkpoint in our relevant folder we get True if so not it will return None
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [0]:
# If we want to do anything custom in any epoch or any batch we can do it in the way below
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None): # on each epoch end we will save it and print out 
        ckpt_manager.save() # we will save the state of model at the end of the each epoch 
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [102]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
    929/Unknown - 60s 65ms/step - loss: 0.5337 - accuracy: 0.7277Checkpoint saved at ./drive/My Drive/DS_Projects/BERT/ckpt_bert_tok/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f0d0bfa3748>

# Stage 5: Evaluation

In [103]:
results = Dcnn.evaluate(test_dataset)
print(results)

    103/Unknown - 1s 8ms/step - loss: 0.9869 - accuracy: 0.7418[0.9868806210681073, 0.74180824]


In [0]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)

    # We're trying to simulate a batch 
    # So we add a dimention to tokens and it will be the first one
    # We add an empty dimension which simulates the batch 
    inputs = tf.expand_dims(tokens, 0)

    # we don't want to apply dropout so training is False 
    output = Dcnn(inputs, training=False)

    # by multiplying the output by 2 we get a number between 0 and 2
    # if it is between 0 and 1 the predicted class will be 0 & if the output is between 1 and 2 the predicition class will be 1
    # Instead of this we can also use round() fuction which will yield the same result
    sentiment = math.floor(output*2)

    
    if sentiment == 0:
        print("Ouput of the model: {}\nPredicted sentiment: negative.".format(
            output))
    elif sentiment == 1:
        print("Ouput of the model: {}\nPredicted sentiment: positive.".format(
            output))

In [105]:
get_prediction("This movie was pretty interesting.")
# Training set'i bayagi azalttigimiz icin bu sekilde sonuc almamiz normal

Ouput of the model: [[0.0769823]]
Predicted sentiment: negative.


In [106]:
get_prediction("I like your sweater")

Ouput of the model: [[0.62157565]]
Predicted sentiment: positive.


In [107]:
get_prediction("I'm so so")

Ouput of the model: [[0.47023496]]
Predicted sentiment: negative.


In [108]:
get_prediction("I am not bad at all")

Ouput of the model: [[0.0204423]]
Predicted sentiment: negative.
