### BERT Model Implementation For Predicting Unreliable News

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk import ne_chunk
from textblob import TextBlob

# Modelling Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Modelling Helpers
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn import metrics

import matplotlib.pyplot as plt

# Download NLTK resources (if not already downloaded)
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jackieglasheen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jackieglasheen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jackieglasheen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jackieglasheen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jackieglasheen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/jackieglasheen/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to


True

In [2]:

# Additional BERT model packages 
# May need tensorflow==2.10  to run properly 

from transformers import BertModel, TFBertModel, BertTokenizer, AutoTokenizer
import tensorflow as tf

from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D,Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.ops.numpy_ops import np_utils
from tensorflow.keras import regularizers



2023-05-24 12:40:49.094590: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#Read in the data
training_data = pd.read_csv(
    '/Applications/Documents/Uchicago/2022_2023/3_Machine_Learning/project/all_preprocessed_data.csv')

### Prepare Data
Complete final data cleaning steps, and then split the data into training and testing sets.

In [5]:
training_data.loc[:,'clean_text'] = training_data.loc[:,'clean_text'].fillna('')
training_data['clean_text'] = training_data['clean_text'].str.replace("'", "")
training_data['clean_text'] = training_data['clean_text'].str.replace("’", "")

#Split Data into testing and training
X_train, X_test, y_train, y_test = train_test_split(training_data.loc[:,'clean_text'], 
                                    training_data.label, test_size=0.20, random_state=0)
print(X_train.head)

<bound method NDFrame.head of 1479     decade-old audio exposes then-senator hillary ...
15559    he called nato obsolete he said germanys accep...
6156     a new report from u s immigration and customs ...
12987    more than five hundred and fifty zero people h...
13964    by kurt nimmo blacklisted news in the video be...
                               ...                        
13123    marietta ga — jen cox bit her tongue for years...
19648    on her february nineteenth episode of “full me...
9845     an armed murder suspect allegedly opened fire ...
10799    be the first to comment leave a reply click he...
2732     chris mcdaniel mississippi state senator said ...
Name: clean_text, Length: 16132, dtype: object>


### Initialize BERT
Initialize the BERT tokenizer and model using the pretrained "bert-base-uncased" version


In [6]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained('bert-base-uncased')


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


We created a function to tokenize our text with the BERT Tokenizer. We chose a max_length of 75 (due to runtime concerns) and we truncate upon hitting that length. We return the token id and the attention mask, which informs whether the respective "ID" was padding or real text.

In [7]:
def bert_tokenize(text_input):  
    """
    Tokenize text for the BERT model
    
    Input (list of string): list of text to tokenize
    Returns a "BatchEncoding" object holding the output of the tokenizers encoding. 
        Returns both the token encodings tensor constants(shape is (num_of_articles, 75))
        and attention masks for the constants (shape is (num_of_articles, 75)).
    
    """
    return  bert_tokenizer(text = text_input, max_length = 75, truncation = True, padding = 'max_length',
                    return_tensors = 'tf', return_attention_mask = True, return_token_type_ids = False)


In [8]:
# Tokenize testing and training data
# Caution: will take minutes to run
X_train_token = bert_tokenize(list(X_train))
X_test_token = bert_tokenize(list(X_test))


In [9]:
print("sample return: ",X_train_token)

sample return:  {'input_ids': <tf.Tensor: shape=(16132, 75), dtype=int32, numpy=
array([[  101,  5476,  1011, ...,  1524,  2059,   102],
       [  101,  2002,  2170, ...,  1998,  1996,   102],
       [  101,  1037,  2047, ..., 12211,  2197,   102],
       ...,
       [  101,  2019,  4273, ..., 15106,  3573,   102],
       [  101,  2022,  1996, ...,  4638,  2000,   102],
       [  101,  3782, 11338, ...,  1996,  2627,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(16132, 75), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}


###  Model Architecture
We start by setting the appropriate shape for the first layer and initializing the BERT model. We then define the model architecture, including drop and dense layers, and then initialize the overall network model.


In [10]:
#tf.keras.backend.clear_session() #uncomment if you want to run tokens on different
                                  #models without restarting the kernel
    
# set the shape of the first layer
input_ids=Input(shape=75,dtype=tf.int32)
input_mask=Input(shape=75,dtype=tf.int32)
bert_layer=bert_model([input_ids,input_mask])[1]

# Define the model layers, nest the layers
x=Dropout(0.2)(bert_layer)
x=Dense(50, activation="tanh")(x)
x=Dropout(0.2)(x)
x=Dense(1, activation="sigmoid")(x)

# Initialize the model
model = Model(inputs=[input_ids, input_mask], outputs=x)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 75)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 75)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 75,                                            

### Model Execution

We use “Adam” for stochastic gradient descent optimization, and define the loss function as binary cross entropy. Then, run the model and produce training and test accuracy scores. We batch at 30 observations to allow the model to learn within an epoch, and limit the analysis to one epoch due to run-time concerns.

In [11]:
optimizer = Adam(learning_rate=1e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)
model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = 'accuracy')


In [12]:
# Caution: Will take hours to run
output = model.fit(x = {'input_1':X_train_token['input_ids'],'input_2':X_train_token['attention_mask']}, y = y_train, epochs = 1, batch_size = 30,  validation_data=({'input_1':X_test_token['input_ids'],'input_2':X_test_token['attention_mask']},  y_test))


