In [1]:
import transformers
from tokenizers import BertWordPieceTokenizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow.keras import backend as K
from sklearn.metrics import f1_score

In [2]:
def encode_data(tokenizer, text_list, max_length):
    tokenizer.enable_truncation(max_length)
    tokenizer.enable_padding(length=max_length)
    
    encoded = tokenizer.encode_batch(text_list)
    
    id_list = []
    
    for item in encoded:
        id_list.append(item.ids)
        
    return np.array(id_list)

def load_data(train_data_path, test_data_path):
    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)
    
    return train_data, test_data

def impute(dataset : pd.DataFrame):
    dataset_copy = dataset.copy()
    dataset_copy['keyword'].fillna('0', inplace=True)
    dataset_copy['keyword'] = dataset_copy['keyword'].str.split('%20').str.join(',')
    
    return dataset_copy

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall_keras = true_positives / (possible_positives + K.epsilon())
    return recall_keras


def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision_keras = true_positives / (predicted_positives + K.epsilon())
    return precision_keras

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

def convert_probabilities_to_predictions(probabilities):
    predictions = [np.rint(x) for x in probabilities]
    return predictions

In [3]:
train_data, test_data = load_data('../input/nlp-getting-started/train.csv',
                                 '../input/nlp-getting-started/test.csv')

#Filled keyword column for missing values and split values with '%20' and joined them together with ',' delimiter.
train_data = impute(train_data) 
test_data = impute(test_data)


#Using the train_data, split it into train and valid sets
X_train, X_valid, y_train, y_valid = train_test_split(train_data[['text', 'keyword']],
                                                      train_data.target,   
                                                      train_size=0.8, 
                                                      random_state=1,
                                                      shuffle=True)

#Converted dataframe to numpy since I got errors while trying to train the model.
y_train = y_train.to_numpy()
y_valid = y_valid.to_numpy()

train_text = X_train['text'].to_numpy()
valid_text = X_valid['text'].to_numpy()
test_text = test_data['text'].to_numpy()

train_keyword = X_train['keyword'].to_numpy()
valid_keyword = X_valid['keyword'].to_numpy()
test_keyword = test_data['keyword'].to_numpy()

In [4]:
#Creates tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
tokenizer.save_pretrained('.')
bertWordPieceTokenizer = BertWordPieceTokenizer('vocab.txt', strip_accents=True)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
#Encodes all the data
train_text_encoded = encode_data(bertWordPieceTokenizer, train_text, 200)
valid_text_encoded = encode_data(bertWordPieceTokenizer, valid_text, 200)
test_text_encoded = encode_data(bertWordPieceTokenizer, test_text, 200)

train_keyword_encoded = encode_data(bertWordPieceTokenizer, train_keyword, 20)
valid_keyword_encoded = encode_data(bertWordPieceTokenizer, valid_keyword, 20)
test_keyword_encoded = encode_data(bertWordPieceTokenizer, test_keyword, 20)

In [6]:
text_tranformer_layer = transformers.TFBertForSequenceClassification.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

2022-03-10 10:20:27.173130: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-10 10:20:27.174213: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-10 10:20:27.174978: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-10 10:20:27.175945: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [7]:
inputs = layers.Input(shape=(None,), dtype=tf.int32)
outputs = text_tranformer_layer(inputs)[0]
outputs = layers.Dense(1, activation='sigmoid')(outputs)

model = Model(inputs=inputs, outputs=outputs)

In [8]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf_bert_for_sequence_classif TFSequenceClassifierOutpu 108311810 
_________________________________________________________________
dense (Dense)                (None, 1)                 3         
Total params: 108,311,813
Trainable params: 108,311,813
Non-trainable params: 0
_________________________________________________________________


In [9]:
# tf.keras.utils.plot_model(model, show_shapes=True)

In [10]:
learning_rate_scheduler = tf.keras.optimizers.schedules.InverseTimeDecay(
    0.01,
    100,
    0.001,
    staircase=False, 
    name=None
)

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)


Adam = tf.keras.optimizers.Adam(
    learning_rate=learning_rate_scheduler,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=0.1,
    amsgrad=False,
    name="Adam"
)

model.compile(optimizer=Adam, loss='binary_crossentropy', 
                  metrics=['accuracy',f1]
                 )
model.fit(train_text_encoded, y_train,
          validation_data=(valid_text_encoded, y_valid),
          callbacks=callback,
          batch_size=32,
          epochs=10)

2022-03-10 10:20:41.055435: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.callbacks.History at 0x7f6b4436bcd0>

In [11]:
probabilities = model.predict(valid_text_encoded)
predictions = convert_probabilities_to_predictions(probabilities)

score = f1_score(y_valid, predictions)
score

0.7698744769874478

In [12]:
submission = pd.DataFrame()
submission['id'] = test_data['id']

probabilities = model.predict(test_text_encoded)
predictions = convert_probabilities_to_predictions(probabilities)

submission['target'] = predictions
submission['target'] = submission['target'].astype(int)

submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [13]:
submission.to_csv('submission.csv', index=False)
s = pd.read_csv('submission.csv')
s

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
