# Using FakeBERT Architecture for Sequence Classification

[Original FakeBERT Paper](https://link.springer.com/content/pdf/10.1007/s11042-020-10183-2.pdf)

FakeBERT Architecture:

![FakeBERT](fakebert.PNG "FakeBERT Architecture")

In [1]:
import logging
import time
from platform import python_version

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from torch.autograd import Variable
from transformers import BertTokenizer, BertModel
from torch.optim import Adam
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from platform import python_version

In [2]:
print("python version==%s" % python_version())
print("pandas==%s" % pd.__version__)
print("numpy==%s" % np.__version__)
print("torch==%s" % torch.__version__)
print("sklearn==%s" % sklearn.__version__)
print("transformers==%s" % transformers.__version__)
print("matplotlib==%s" % matplotlib.__version__)

python version==3.7.4
pandas==0.25.1
numpy==1.19.5
torch==1.10.2+cpu
sklearn==0.24.1
transformers==4.16.2
matplotlib==3.1.1


In [3]:
# inputs
train_fp = '../data/train.csv'
test_fp = '../data/test.csv'
pretrained_fp = 'bert-base-uncased'

### load data

In [4]:
train = pd.read_csv(train_fp, usecols=['text', 'label'])
test = pd.read_csv(test_fp, usecols=['text', 'label'])
df = pd.concat([train, test], ignore_index=True)
df.head()

Unnamed: 0,text,label
0,ATHENS (Reuters) - Turkish President Tayyip Er...,1
1,"Ted Cruz would be fair, honest and most of all...",0
2,WASHINGTON (Reuters) - White House Chief of St...,1
3,DUBAI (Reuters) - Saudi Arabia welcomed the ne...,1
4,"SIGONELLA, Italy (Reuters) - U.S. President Do...",1


In [5]:
print(f'full data: {df.shape[0]} rows, {df.shape[1]} features')

full data: 44898 rows, 2 features


In [6]:
# target skew?
df.label.value_counts(normalize=True)

0    0.522985
1    0.477015
Name: label, dtype: float64

In [7]:
# any missing values?
df.isna().sum().sum()

0

In [8]:
# what text lengths (# characters)?
print(f'avg text length (chars): {df.text.str.len().mean():0.2f}')
print(f'median text length (chars): {df.text.str.len().median()}')
print(f'min text length (chars): {df.text.str.len().min():0.2f}')
print(f'max text length (chars): {df.text.str.len().max():0.2f}')

avg text length (chars): 2469.11
median text length (chars): 2186.0
min text length (chars): 1.00
max text length (chars): 51794.00


In [9]:
# what text lengths (# words)?
print(f'avg text length (words): {df.text.str.split().str.len().mean():0.2f}')
print(f'median text length (words): {df.text.str.split().str.len().median()}')
print(f'min text length (words): {df.text.str.split().str.len().min():0.2f}')
print(f'max text length (words): {df.text.str.split().str.len().max():0.2f}')

avg text length (words): 405.28
median text length (words): 362.0
min text length (words): 0.00
max text length (words): 8135.00


In [10]:
# what do texts look like with only 1 character?
print(f'{df[df.text.str.len() == 1].shape[0]} rows with no text')
df[df.text.str.len() == 1][:3]

627 rows with no text


Unnamed: 0,text,label
145,,0
199,,0
251,,0


In [11]:
df[df.text.str.len() == 1]['text'].values[0]

' '

In [12]:
# any other short texts?
min_chars = 150
df[df.text.str.len() < min_chars].shape

(1299, 2)

In [13]:
df[(df.text.str.len() > 1) & (df.text.str.len() < min_chars)].head()

Unnamed: 0,text,label
42,WOW This woman absolutely nails it!,0
56,https://youtu.be/0J4xPRYbsLU,0
86,,0
107,2 Corinthians 9:7 Each one must give as he ha...,0
116,Ronald Reagan shut down the Berkeley protests ...,0


In [14]:
# what do text lengths look like for only positive samples?
df[df.label == 1].text.str.len().describe()

count    21417.000000
mean      2383.278517
std       1684.835730
min          1.000000
25%        914.000000
50%       2222.000000
75%       3237.000000
max      29781.000000
Name: text, dtype: float64

In [15]:
df[(df.label == 1) & (df.text.str.len() < min_chars)]

Unnamed: 0,text,label
13911,,1


In [16]:
df[df.text.str.len() < min_chars].label.value_counts()

0    1298
1       1
Name: label, dtype: int64

In [17]:
# characters
df.groupby('label').text.apply(lambda x: x.str.len().describe()).to_frame(name='text_stats')

Unnamed: 0_level_0,Unnamed: 1_level_0,text_stats
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,count,23481.0
0,mean,2547.396235
0,std,2532.884399
0,min,1.0
0,25%,1433.0
0,50%,2166.0
0,75%,3032.0
0,max,51794.0
1,count,21417.0
1,mean,2383.278517


In [18]:
# words
df.groupby('label').text.apply(lambda x: x.str.split().str.len().describe()).to_frame(name='text_stats')

Unnamed: 0_level_0,Unnamed: 1_level_0,text_stats
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,count,23481.0
0,mean,423.197905
0,std,408.38889
0,min,0.0
0,25%,240.0
0,50%,363.0
0,75%,506.0
0,max,8135.0
1,count,21417.0
1,mean,385.640099


In [19]:
df[(df.label == 1) & (df.text.str.len() > 1)].text.str.len().describe()

count    21416.000000
mean      2383.389755
std       1684.796417
min        152.000000
25%        914.000000
50%       2222.000000
75%       3237.000000
max      29781.000000
Name: text, dtype: float64

In [23]:
# what about those starters with location and source?
# could we take those out to focus on where most of the information is?
df.head(10)

Unnamed: 0,text,label
0,ATHENS (Reuters) - Turkish President Tayyip Er...,1
1,"Ted Cruz would be fair, honest and most of all...",0
2,WASHINGTON (Reuters) - White House Chief of St...,1
3,DUBAI (Reuters) - Saudi Arabia welcomed the ne...,1
4,"SIGONELLA, Italy (Reuters) - U.S. President Do...",1
5,WASHINGTON (Reuters) - Acting Secretary of Hom...,1
6,WASHINGTON (Reuters) - U.S. President Barack O...,1
7,It has just been announced that Donald Trump a...,0
8,It s no secret that team Trump is all in a tiz...,0
9,"Recently, fans of DC Comics Suicide Squad s...",0


In [24]:
df.text.str[30:].head(10)

0    sident Tayyip Erdogan said on Thursday that U....
1     and most of all, he would follow the law. He ...
2    ouse Chief of Staff John Kelly’s comment that ...
3     welcomed the new U.S. policy toward Iran and ...
4    .S. President Donald Trump arrived in Sicily f...
5    Secretary of Homeland Security Elaine Duke on ...
6    esident Barack Obama on Monday defended his ef...
7    t Donald Trump and his transition team have fo...
8     is all in a tizzy over Jill Stein s recount a...
9    Suicide Squad  started a petition to shut down...
Name: text, dtype: object

### preprocess data

In [25]:
# clip first 30 characters to eliminate location and source information
clip_idx = 30
df.text = df.text.str[clip_idx:]
df.head()

Unnamed: 0,text,label
0,sident Tayyip Erdogan said on Thursday that U....,1
1,"and most of all, he would follow the law. He ...",0
2,ouse Chief of Staff John Kelly’s comment that ...,1
3,welcomed the new U.S. policy toward Iran and ...,1
4,.S. President Donald Trump arrived in Sicily f...,1


In [26]:
# remove rows with fewer than 150 characters
df = df[df.text.str.len() > min_chars]
df.text.str.len().describe()

count    43436.000000
mean      2520.316304
std       2161.483203
min        151.000000
25%       1332.000000
50%       2206.000000
75%       3123.000000
max      51764.000000
Name: text, dtype: float64

In [27]:
# recheck data skew
# it's a little more balanced this way (for better or worse)
df.label.value_counts(normalize=True)

0    0.507344
1    0.492656
Name: label, dtype: float64

In [28]:
# only include file notes less than 500 words
max_words = 500
print(f'original # rows: {df.shape[0]}')
df = df[df.text.str.split().str.len() < max_words]
print(f'clipped data: {df.shape[0]} rows')
df.label.value_counts(normalize=True)

original # rows: 43436
clipped data: 31792 rows


0    0.507235
1    0.492765
Name: label, dtype: float64

In [29]:
df.text.str.split().str.len().describe()

count    31792.000000
mean       276.494118
std        133.347719
min         16.000000
25%        158.000000
50%        302.000000
75%        388.000000
max        499.000000
Name: text, dtype: float64

### split data for train/test

In [30]:
df.head()

Unnamed: 0,text,label
0,sident Tayyip Erdogan said on Thursday that U....,1
1,"and most of all, he would follow the law. He ...",0
2,ouse Chief of Staff John Kelly’s comment that ...,1
3,welcomed the new U.S. policy toward Iran and ...,1
4,.S. President Donald Trump arrived in Sicily f...,1


In [31]:
print(f'processed data: {df.shape[0]} rows, {df.shape[1]} features')

processed data: 31792 rows, 2 features


In [32]:
# start with smaller sample
samp = df.sample(frac=0.2)
samp.shape

(6358, 2)

In [33]:
(X_train, X_val, y_train, y_val) = train_test_split(samp['text'],
                                                      samp['label'],
                                                      test_size=0.2)
(X_val, X_test, y_val, y_test) = train_test_split(X_val,
                                                  y_val,
                                                  test_size=0.2)
print(f'train size: {X_train.shape}')
print(f'val size: {X_val.shape}')
print(f'test size: {X_test.shape}')

train size: (5086,)
val size: (1017,)
test size: (255,)


### get embeddings using BERT

Each file note becomes a 2D tensor:
* Each row is a token or subtoken in the sequence
* Each column is a value in the embedding (vector) for that token

In [34]:
tokenizer = BertTokenizer.from_pretrained(pretrained_fp)
bert_model = BertModel.from_pretrained(pretrained_fp)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
for text in X_train.values[:5]:
    print(text, '\n')

retive actions toward the press are raising a lot of eyebrows and concerns.While Trump is on taxpayer-paid golf vacation in Florida, the undeserving POTUS has made sure that no one will know what the hell he s up to. To symbolize just how transparent his administration is, Trump has blacked out all the windows of his  Winter White House  and banned the press from taking any pictures and keeping tabs on him.Here s what AP reported Jill Colvin saw in the press pool area:Here s another photo of this disturbing activity:PoliticusUSA obtained a White House pool report, which stated: The pool was escorted into the clubhouse after Trump and Abe were out of view and then led downstairs to a filing room, where we re told we ll be staying for awhile. The door and windows are covered with black plastic so we can t see out. Photographers were instructed not to take pictures on the grounds of a  private club. While reporters may not always be blessed with ideal conditions when covering the presiden

In [38]:
texts = np.array(["[CLS] " + text + " [SEP]" for text in X_train.values])
labels = y_train.values

In [52]:
rand_idx = np.random.randint(texts.shape[0])
print(texts[rand_idx])

[CLS] cellor Angela Merkel said on Monday her conservatives would sound out coalition possibilities with the pro-business Free Democrats (FDP) and the Greens as well as with the so far reluctant center-left Social Democrats (SPD).  I think all parties ... have a responsibility to ensure that there will be a stable government,  Merkel told reporters after her conservative CDU/CSU bloc won Sunday s election albeit with its weakest result since 1949. Merkel added that sustainable budget policies and domestic security would be priorities for her conservatives in the upcoming coalition talks.  [SEP]


In [54]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(text) for text in texts]
print (f'Tokenize the first sentence:\n{tokenized_texts[0]}')

Tokenize the first sentence:
['[CLS]', 're', '##tive', 'actions', 'toward', 'the', 'press', 'are', 'raising', 'a', 'lot', 'of', 'eyebrows', 'and', 'concerns', '.', 'while', 'trump', 'is', 'on', 'taxpayer', '-', 'paid', 'golf', 'vacation', 'in', 'florida', ',', 'the', 'und', '##ese', '##r', '##ving', 'pot', '##us', 'has', 'made', 'sure', 'that', 'no', 'one', 'will', 'know', 'what', 'the', 'hell', 'he', 's', 'up', 'to', '.', 'to', 'symbol', '##ize', 'just', 'how', 'transparent', 'his', 'administration', 'is', ',', 'trump', 'has', 'black', '##ed', 'out', 'all', 'the', 'windows', 'of', 'his', 'winter', 'white', 'house', 'and', 'banned', 'the', 'press', 'from', 'taking', 'any', 'pictures', 'and', 'keeping', 'tab', '##s', 'on', 'him', '.', 'here', 's', 'what', 'ap', 'reported', 'jill', 'col', '##vin', 'saw', 'in', 'the', 'press', 'pool', 'area', ':', 'here', 's', 'another', 'photo', 'of', 'this', 'disturbing', 'activity', ':', 'pol', '##itic', '##us', '##usa', 'obtained', 'a', 'white', 'hous

In [56]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
print(f'Input id for first sentence:\n{input_ids[0]}')

Input id for first sentence:
[101, 2128, 6024, 4506, 2646, 1996, 2811, 2024, 6274, 1037, 2843, 1997, 8407, 1998, 5936, 1012, 2096, 8398, 2003, 2006, 26980, 1011, 3825, 5439, 10885, 1999, 3516, 1010, 1996, 6151, 6810, 2099, 6455, 8962, 2271, 2038, 2081, 2469, 2008, 2053, 2028, 2097, 2113, 2054, 1996, 3109, 2002, 1055, 2039, 2000, 1012, 2000, 6454, 4697, 2074, 2129, 13338, 2010, 3447, 2003, 1010, 8398, 2038, 2304, 2098, 2041, 2035, 1996, 3645, 1997, 2010, 3467, 2317, 2160, 1998, 7917, 1996, 2811, 2013, 2635, 2151, 4620, 1998, 4363, 21628, 2015, 2006, 2032, 1012, 2182, 1055, 2054, 9706, 2988, 10454, 8902, 6371, 2387, 1999, 1996, 2811, 4770, 2181, 1024, 2182, 1055, 2178, 6302, 1997, 2023, 14888, 4023, 1024, 14955, 18291, 2271, 10383, 4663, 1037, 2317, 2160, 4770, 3189, 1010, 2029, 3090, 1024, 1996, 4770, 2001, 13127, 2046, 1996, 22067, 2044, 8398, 1998, 14863, 2020, 2041, 1997, 3193, 1998, 2059, 2419, 10025, 2000, 1037, 15242, 2282, 1010, 2073, 2057, 2128, 2409, 2057, 2222, 2022, 6595, 200

In [60]:
from keras.preprocessing.sequence import pad_sequences

In [61]:
input_ids = pad_sequences(input_ids, maxlen=200, dtype='long', truncating='post', padding='post')
print(f'padded input id for first sentence:\n{input_ids[0]}')

padded input id for first sentence:
[  101  2128  6024  4506  2646  1996  2811  2024  6274  1037  2843  1997
  8407  1998  5936  1012  2096  8398  2003  2006 26980  1011  3825  5439
 10885  1999  3516  1010  1996  6151  6810  2099  6455  8962  2271  2038
  2081  2469  2008  2053  2028  2097  2113  2054  1996  3109  2002  1055
  2039  2000  1012  2000  6454  4697  2074  2129 13338  2010  3447  2003
  1010  8398  2038  2304  2098  2041  2035  1996  3645  1997  2010  3467
  2317  2160  1998  7917  1996  2811  2013  2635  2151  4620  1998  4363
 21628  2015  2006  2032  1012  2182  1055  2054  9706  2988 10454  8902
  6371  2387  1999  1996  2811  4770  2181  1024  2182  1055  2178  6302
  1997  2023 14888  4023  1024 14955 18291  2271 10383  4663  1037  2317
  2160  4770  3189  1010  2029  3090  1024  1996  4770  2001 13127  2046
  1996 22067  2044  8398  1998 14863  2020  2041  1997  3193  1998  2059
  2419 10025  2000  1037 15242  2282  1010  2073  2057  2128  2409  2057
  2222  2022  6

In [69]:
rand_idx = np.random.randint(0, len(input_ids))
input_ids[rand_idx]

array([  101,  2129,  2041,  1011,  1997,  1011,  3543,  1998,  2540,
        3238,  1996,  3951,  2283,  2003,  2875,  2308,  1055,  9871,
        1010,  3580,  2343,  3505,  7279,  3401, 19596,  1037,  3116,
        2000,  6848,  1996,  8476,  1997, 23987, 23676,  2729,  5918,
        2013,  2740,  5427,  1998,  9471,  2000, 13260,  2130,  1037,
        2309,  2450,  1012, 17044,  2015,  2000,  2360,  1010,  2019,
        3746,  1997,  1996,  3116,  2003,  2085, 11221,  2039,  2006,
        2591,  2865,  2004,  4841,  4133,  1999,  9860, 21606,  2008,
        2023,  2003,  2129,  3519,  1998,  1996,  2317,  2160,  2228,
        1997,  1996,  2308,  2040,  2191,  2039,  2431,  1996,  2406,
        1012,  3537,  8801,  2020,  8053, 23558,  1012,  4404,  4387,
        3958, 11338,  3995, 23062,  1056, 28394,  3064,  2010,  4963,
        2058,  1996, 11591,  8740,  2850, 12972,  1997,  2023,  3116,
        1012,  1996,  3861,  4415,  4930,  1037,  9113,  2138,  1996,
        1056, 28394,

In [57]:
# enc = tokenizer.encode(X_train.values[0], add_special_tokens=True)
# print(f'encoded file note dimensions: {len(enc)}')

In [151]:
max_seq_len = 100

def tokenize_text(text_arr, max_seq):
    return [tokenizer.encode(text, add_special_tokens=True)[:max_seq] for text in text_arr.values]

def pad_text(tokenized_text, max_seq):
    return np.array([el + [0] * (max_seq - len(el)) for el in tokenized_text])

def tokenize_and_pad_text(text_arr, max_seq):
    tokenized_text = tokenize_text(text_arr, max_seq)
    padded_text = pad_text(tokenized_text, max_seq)
    return torch.tensor(padded_text)

def targets_to_tensor(label_arr):
    return torch.tensor(label_arr.values, dtype=torch.float32)

In [152]:
train_indices = tokenize_and_pad_text(X_train, max_seq_len)
val_indices = tokenize_and_pad_text(X_val, max_seq_len)
test_indices = tokenize_and_pad_text(X_test, max_seq_len)

In [153]:
from time import time
start = time()
with torch.no_grad():
    X_train_bert = bert_model(train_indices)[0]  # Models outputs are tuples
    X_val_bert = bert_model(val_indices)[0]
    X_test_bert = bert_model(test_indices)[0]
end = time()
elapsed = end - start
if elapsed < 180:
    print(f'code took {elapsed:0.2f} seconds to execute')
else:
    print(f'code took {elapsed / 60:0.2f} minutes to execute')

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:76] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2337484800 bytes.

In [None]:
X_train_bert[0].shape

In [None]:
y_train_bert = targets_to_tensor(y_train)
y_val_bert = targets_to_tensor(y_val)
y_test_bert = targets_to_tensor(y_test)

### build cnn for classification

### run training data through BERT and CNN

### evaluate model performance on test data

### References

https://romanorac.github.io/machine/learning/2019/12/02/identifying-hate-speech-with-bert-and-cnn.html