In [25]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [26]:
from transformers import AutoTokenizer
from transformers import TFAutoModel

In [27]:
# Raw-GitHub URL of the SMS spam CSV; pd.read_csv downloads it each time this cell runs.
spam_path = 'https://raw.githubusercontent.com/srk-practicum/2022-knu-nlp/MakarenkoValeriia_Branch1/SPAM%20text%20message%2020170820%20-%20Data.csv'
df = pd.read_csv(spam_path)
# Display the first rows (columns: Category, Message) as the cell output.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
# Model input: the message text column.
X = df.Message
# Target: the label column, still the strings 'ham' / 'spam' at this point.
Y = df.Category

In [29]:
def encoding(text):
    """Map a category label to its integer class.

    Args:
        text: The label value, expected to be 'ham' or 'spam'.

    Returns:
        0 for 'ham', 1 for 'spam', and -1 for anything else.
    """
    label_codes = (('ham', 0), ('spam', 1))
    for label, code in label_codes:
        if text == label:
            return code
    # Unknown label: signal it with the sentinel instead of raising.
    return -1
# Encode from the label column of `df` rather than from Y itself: with
# `Y = Y.apply(...)`, re-running this cell fed the already-numeric labels back
# into encoding() and turned every one of them into -1.
Y = df.Category.apply(encoding)

In [30]:
# NLTK's English stop-word list plus the extra token 'u'.
stop_words = [*stopwords.words("english"), 'u']
def clean_text(text):
    """Normalise a raw message into lowercase, space-separated words.

    Steps, in order: lowercase the text; blank out @mentions, anything that
    starts with "http", anything that starts with "www", #hashtags, every
    digit together with the non-space characters that follow it, and any
    remaining single digits; replace punctuation with spaces; finally drop
    every word found in the module-level ``stop_words``.

    Args:
        text: The raw message string.

    Returns:
        The surviving words joined by single spaces, without leading or
        trailing whitespace (an empty string if nothing survives).
    """
    text = text.lower()           # converting to lowercase
    # Patterns are raw strings so that \S and \d reach the regex engine as-is
    # instead of being (invalid) Python string escapes.
    text = re.sub(r"@\S+", " ", text)
    text = re.sub(r"https*\S+", " ", text)      # \S = any non-whitespace character
    text = re.sub(r"www\S+", " ", text)
    text = re.sub(r"#\S+", " ", text)
    text = re.sub(r"\d\S+", " ", text)
    text = re.sub(r"\d", " ", text) # remove all numbers
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)     # remove punctuations
    # split() with no argument splits on any whitespace run (spaces, tabs,
    # newlines) and never yields empty strings, so the result has no stray
    # leading/trailing spaces and no separate whitespace-collapsing pass is needed.
    words = text.split()
    return ' '.join(word for word in words if word not in stop_words)

def convert(text):
    """Clean a raw message and lemmatize each of its words.

    Args:
        text: The raw message string.

    Returns:
        A single string of the lemmatized tokens joined by spaces.
    """
    text = clean_text(text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    # Return the lemmas themselves: previously they were computed into a local
    # and then discarded, so the function handed back the un-lemmatized text.
    return ' '.join(lemmatizer.lemmatize(t) for t in tokens)

X_clean = X.apply(convert)

In [31]:
# NOTE(review): this rebinds `df`, replacing the raw frame loaded from the CSV;
# cells that read df.Message / df.Category will see the cleaned text and
# numeric labels if they are re-run after this one.
df = pd.DataFrame(data = {'Category': Y, 'Message': X_clean})
# Show the first two cleaned rows as the cell output.
df.head(2)

Unnamed: 0,Category,Message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif oni


In [32]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_clean, Y, test_size=0.2, random_state=42
)

# NumPy versions of the splits: plain arrays for the texts, float32 arrays for the labels.
x_train, x_test = np.array(X_train), np.array(X_test)
y_train = np.asarray(Y_train).astype("float32")
y_test = np.asarray(Y_test).astype("float32")

In [33]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint.
# NOTE(review): clean_text() lower-cases every message before it gets here, so
# the cased checkpoint never sees capital letters — confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [34]:
def tokenize(sequence):
    """Encode one text with the module-level ``tokenizer``.

    The text is truncated or padded to 256 tokens, special tokens are added,
    and TensorFlow tensors are requested from the tokenizer.

    Args:
        sequence: The text to encode.

    Returns:
        A ``(input_ids, attention_mask)`` pair taken from the tokenizer output.
    """
    encoded = tokenizer.encode_plus(
        sequence,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    ids = encoded['input_ids']
    attention = encoded['attention_mask']
    return ids, attention

In [35]:
# One row per training message. Sizing the buffers from the data instead of a
# hard-coded 4457 keeps them in step with the split if the dataset or the
# test_size ever changes (too few rows would raise IndexError in the loop,
# too many would leave all-zero rows that get trained on).
num_of_elements = len(X_train)

# 256 columns = the max_length that tokenize() pads/truncates every message to.
Xids = np.zeros((num_of_elements, 256))
Xmask = np.zeros((num_of_elements, 256))

for i, sequence in enumerate(X_train):
    # tokenize() returns (input_ids, attention_mask) for one message;
    # store them in row i of the two buffers.
    Xids[i, :], Xmask[i, :] = tokenize(sequence)

In [36]:
 Xids[0:10]  # peek at the token ids of the first ten training rows

array([[  101.,  7163.,  1782., ...,     0.,     0.,     0.],
       [  101., 19082.,  3271., ...,     0.,     0.,     0.],
       [  101.,  1435.,   176., ...,     0.,     0.,     0.],
       ...,
       [  101., 14844.,  5093., ...,     0.,     0.,     0.],
       [  101.,  1930.,  1253., ...,     0.,     0.,     0.],
       [  101., 20049.,  1324., ...,     0.,     0.,     0.]])

In [37]:
# Load the pretrained TensorFlow model for the 'bert-base-cased' checkpoint
# (the same checkpoint name the tokenizer was loaded from).
bert = TFAutoModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [38]:
# Print the layer/parameter overview of the loaded model.
bert.summary()

Model: "tf_bert_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Two int32 inputs of length 256 (the padded length produced by tokenize()),
# named after the tokenizer's output keys.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

# Call the main transformer layer inside the loaded model (bert.bert rather than
# bert). Element [1] of its output is taken as the per-message embedding that
# feeds the classifier head.
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

# Classifier head
x = tf.keras.layers.Dense(32, activation ='relu')(embeddings)
y = tf.keras.layers.Dense(1, activation ='sigmoid', name='outputs')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Freeze the transformer so only the classifier head is trained. The layer is
# referenced directly instead of through model.layers[2]: a positional index
# silently freezes the wrong layer as soon as the graph gains or reorders layers.
bert.bert.trainable = False


In [40]:
# Inspect what the main transformer layer returns for the symbolic inputs
# (the displayed fields are last_hidden_state and pooler_output).
bert.bert(input_ids, attention_mask=mask)

TFBaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                                 <KerasTensor: shape=(None, 256, 768) dtype=float32 (created by layer 'bert')>),
                                                ('pooler_output',
                                                 <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'bert')>)])

In [22]:
# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
model.summary()
# Train on y_train, the float32 NumPy label array prepared in the split cell,
# rather than the pandas Series Y_train: it has the same values in the same
# row order as Xids/Xmask, and a plain array is what validation_split is
# documented to slice.
history = model.fit(
    [Xids, Xmask], y_train,
    validation_split=0.2,  # last 20% of the training rows are held out for validation
    batch_size = 512,
    epochs=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           