<a href="https://colab.research.google.com/github/sljm12/machine_learning_notebooks/blob/master/2020USElection/HuggingFace_Bert_Trump_Biden_Kamala_Classifier_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Intro ##
This notebook uses Hugging Faces TFBertForSequenceClassification to do speaker identification based on the US election speeches.

https://github.com/ralphbrooks/tensorflow-tutorials/blob/master/2-Sentiment-Classification-with-BERT.ipynb


In [1]:
!nvidia-smi

Thu Oct 29 00:29:08 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip -q install transformers

[K     |████████████████████████████████| 1.3MB 9.4MB/s 
[K     |████████████████████████████████| 890kB 54.8MB/s 
[K     |████████████████████████████████| 2.9MB 51.1MB/s 
[K     |████████████████████████████████| 1.1MB 47.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")

import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

import json
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [4]:
!wget https://www.dropbox.com/s/fna7obll05a8dmi/2020USElection.zip
!unzip 2020USElection.zip

--2020-10-29 00:29:21--  https://www.dropbox.com/s/fna7obll05a8dmi/2020USElection.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.1, 2620:100:601c:1::a27d:601
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/fna7obll05a8dmi/2020USElection.zip [following]
--2020-10-29 00:29:21--  https://www.dropbox.com/s/raw/fna7obll05a8dmi/2020USElection.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc1480993e7b1f951589516899d3.dl.dropboxusercontent.com/cd/0/inline/BCJxN3PLlZkBn1lG7F8cZCjNmmi62jG1fpv8FWjQThGTH7bim63SL68wOpcaFIe_uc0LIXrQyXLSZJzluxA3tjUfneQ6gtIJZDYks8u81HH9H8FkDbKX21dDDgWqJIGu8ug/file# [following]
--2020-10-29 00:29:22--  https://uc1480993e7b1f951589516899d3.dl.dropboxusercontent.com/cd/0/inline/BCJxN3PLlZkBn1lG7F8cZCjNmmi62jG1fpv8FWjQThGTH7bim63SL68wOpcaFIe_uc0LIXrQyXLSZJzluxA3tjUfneQ6gtIJ

## Prepping the data ##

In [None]:
df = pd.read_csv("/content/2020USElection-BreakSentence.csv")

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,file,text
0,0,Bernie Sanders,2020 Democratic National Convention (DNC) Nigh...,"We must come together to defeat Donald Trump, ..."
1,1,Joe Biden,2020 Democratic National Convention (DNC) Nigh...,I’ll see you on Thursday.
2,2,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,"In this election, we have a chance to change t..."
3,3,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,We’re all in this fight together.
4,4,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,What an awesome responsibility.


In [None]:
df["text"].values

array(['We must come together to defeat Donald Trump, and elect Joe Biden and Kamala Harris as our next President and Vice President.',
       'I’ll see you on Thursday.',
       'In this election, we have a chance to change the course of history.',
       ..., 'We’re going to be in Detroit Monday night.',
       'Come join us Monday night.', 'I’ll see you later.'], dtype=object)

## One Hot Encoding of Labels

In [None]:
names = df["name"].unique()

In [None]:
num_categories = len(names)

In [None]:
i, m = pd.factorize(df["name"])

In [None]:
m

Index(['Bernie Sanders', 'Joe Biden', 'Kamala Harris', 'Donald Trump',
       'Mike Pence'],
      dtype='object')

In [None]:
i

array([0, 1, 2, ..., 2, 2, 2])

In [None]:
df["cat_num"]=i

In [None]:
labels = tf.one_hot(i, depth=len(m))

In [None]:
(train_df, others)=train_test_split(df,shuffle=True,test_size=0.4, stratify=df['name'])
(val_df, test_df)=train_test_split(others,shuffle=True,test_size=0.5, stratify=others['name'])

In [None]:
print(len(train_df))
print(len(val_df))
print(len(test_df))

53659
17887
17887


In [None]:
X_train = train_df["text"].astype(str).values
y_train = tf.one_hot(train_df["cat_num"], num_categories)
X_val = val_df["text"].astype(str).values
y_val = tf.one_hot(val_df["cat_num"], num_categories)
X_test = test_df["text"].astype(str).values
y_test= tf.one_hot(test_df["cat_num"], num_categories)

In [None]:
len(X_train)

53659

## Tokenizer the text ##

In [None]:
max_len =100
max_features = 20000
batch_size=64
dims=50

## Token ##

In [None]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
X_train_pad = []
for i in X_train[0:(53659-11)]:
  encoding = tokenizer.encode_plus(
  i,
  add_special_tokens=True,
  max_length=100, # truncates if len(s) > max_length
  return_token_type_ids=True,
  return_attention_mask=True,
  pad_to_max_length=True, # pads to the right by default
  return_tensors="tf"
  )
  X_train_pad.append(encoding)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
def encode_text(texts):
  results = []
  for i in texts:
    encoding = tokenizer.encode_plus(
    i,
    add_special_tokens=True,
    max_length=100, # truncates if len(s) > max_length
    return_token_type_ids=True,
    return_attention_mask=True,
    pad_to_max_length=True, # pads to the right by default
    return_tensors="tf"
    )
    results.append(encoding)
  return results

In [None]:
X_train_pad = encode_text(X_train)



In [None]:
len(X_train_pad)

53659

In [None]:
X_val_pad = encode_text(X_val)



In [None]:
len(X_val_pad)

17887

In [None]:
X_val_pad[0]

{'input_ids': <tf.Tensor: shape=(1, 100), dtype=int32, numpy=
array([[ 101, 1220,  787, 1231, 1280, 1106, 1474,  117,  789, 1284, 1328,
        1106, 1198, 1474, 1380,  119,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 100), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0

In [None]:
X_train_pad[0]["input_ids"][0]

<tf.Tensor: shape=(100,), dtype=int32, numpy=
array([ 101,  146, 1221, 1128,  787, 1231, 1303, 4476,  119,  102,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)>

In [None]:
y_train[1].shape

TensorShape([5])

In [None]:
def create_data(texts, labels):
  input_ids = []
  attention = []
  tokens = []
  labels = []
  for i in texts:
    encoding = tokenizer.encode_plus(
    i,
    add_special_tokens=True,
    max_length=100, # truncates if len(s) > max_length
    return_token_type_ids=True,
    return_attention_mask=True,
    pad_to_max_length=True, # pads to the right by default
    return_tensors="tf"
    )
    input_ids.append(encoding["input_ids"][0])
    attention.append(encoding["attention_mask"][0])
    tokens.append(encoding["token_type_ids"][0])
  
  return [input_ids,attention, tokens, labels]

In [None]:
dataset = create_data(X_train[0:(53659-11)], y_train[0:(53659-11)])



Define a generator for TFDataset

In [None]:
def gen_dataset(tokenise_text, labels):

  def gen():
    for i in range(len(tokenise_text)):
      train = tokenise_text[i]
      yield (
                  {
                      "input_ids": train["input_ids"][0],
                      "attention_mask": train["attention_mask"][0],
                      "token_type_ids": train["token_type_ids"][0],
                  },
                  labels[i],
              )
  dataset = tf.data.Dataset.from_generator(
      gen,
      ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.float32),
      (
          {
              "input_ids": tf.TensorShape([100]),
              "attention_mask": tf.TensorShape([100]),
              "token_type_ids": tf.TensorShape([100]),
          },
          tf.TensorShape([5]),
      ),
  )

  return dataset

In [None]:
X_train_dataset = gen_dataset(X_train_pad, y_train)

In [None]:
X_val_dataset = gen_dataset(X_val_pad, y_val)

In [None]:
for i in X_train_dataset:
  print(i)
  break

({'input_ids': <tf.Tensor: shape=(100,), dtype=int32, numpy=
array([ 101,  146, 1221, 1128,  787, 1231, 1303, 4476,  119,  102,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(100,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
def gen():
    for i in range(len(X_train_pad[0:(53659-11)])):
      train = X_train_pad[i]
      yield (
                  {
                      "input_ids": train["input_ids"][0],
                      "attention_mask": train["attention_mask"][0],
                      "token_type_ids": train["token_type_ids"][0],
                  },
                  y_train[i],
              )


In [None]:
dataset = tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([100]),
                "attention_mask": tf.TensorShape([100]),
                "token_type_ids": tf.TensorShape([100]),
            },
            tf.TensorShape([5]),
        ),
    )
dataset = dataset.shuffle(buffer_size=len(X_train_pad), reshuffle_each_iteration=True).batch(BATCH_SIZE).repeat(-1)

In [None]:
for i in gen():
  print(i)
  break

({'input_ids': <tf.Tensor: shape=(100,), dtype=int32, numpy=
array([ 101,  138, 1974, 1104, 1172,  119,  102,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(100,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
len(X_train_pad[0]["input_ids"])

100

## Model ##

In [None]:
model = TFBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=len(m), return_dict=True)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier', 'dropout_151']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
len(X_train_pad)/16
#len(X_train)-3353*16

0.0625

In [None]:
BATCH_SIZE = 16


optimizer = tf.keras.optimizers.Adam(learning_rate=3e-05, epsilon=1e-08)
loss = tf.keras.losses.CategoricalCrossentropy()
metric = tf.keras.metrics.CategoricalCrossentropy(name='accuracy')
model.compile(optimizer=optimizer,
              loss="categorical_crossentropy",
              metrics=[metric])

train_steps = len(X_train_pad)//BATCH_SIZE
EPOCHS = 1


In [None]:
history = model.fit(dataset, 
                    epochs=EPOCHS,
                    steps_per_epoch=train_steps)

In [None]:
X_train_dataset = X_train_dataset.shuffle(buffer_size=len(X_train_pad), reshuffle_each_iteration=True).batch(BATCH_SIZE).repeat(-1)
history = model.fit(X_train_dataset, 
                    epochs=EPOCHS,
                    steps_per_epoch=train_steps)

