In [1]:
import transformers
import numpy as np
import pandas as pd
import torch
import tensorflow as tf

In [2]:
def read_data(filename):
    df = pd.read_csv(filename,engine='python', delimiter=r'\t+', names=['id', 'Label','Statement', 'subject', 'speaker','speaker_title','state', 'party','barely_true','false_counts','half_true','mostly_true','pants_on_fire','context'])
    labels=[]
    for l in df['Label']:
        if l in ['false','barely-true','pants-fire']:
            labels.append(1)
        elif l in ['half-true','mostly-true', 'true']:
            labels.append(0)
        else:
            raise Exception("Error encountered in labelling")
    data = pd.concat([df['Statement'], pd.Series(labels)], axis=1)
    data.columns = ['Statement', 'Label']
    return data

In [50]:
data_train = read_data('./LIAR_dataset/train.tsv')
data_test = read_data('./LIAR_dataset/test.tsv')
data_valid = read_data('./LIAR_dataset/valid.tsv')
data_train = pd.concat([data_train, data_valid])


train_labels = np.asarray(data_train['Label'])
test_labels = np.asarray(data_test['Label'])

In [51]:
data_train = data_train[:5000]

In [52]:
train_labels = train_labels[:5000]

In [53]:
dbert_tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
dbert_model = transformers.TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use i

In [55]:
# small_train = data_train.Statement.iloc[:10]
# train_tokenized = data_train.Statement.apply((lambda x: dbert_tokenizer.encode(x, add_special_tokens=True, max_length=128, padding='max_length')))
train_tokenized = data_train.Statement.apply((lambda x: dbert_tokenizer.encode(x, add_special_tokens=True)))

In [56]:
test_tokenized = data_test.Statement.apply((lambda x: dbert_tokenizer.encode(x, add_special_tokens=True)))

In [59]:
max_len = 0
for i in train_tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in train_tokenized.values])

In [64]:
max_len = 98
for i in test_tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in test_tokenized.values])

In [63]:
attention_mask = np.where(padded != 0, 1, 0)
print(attention_mask.shape)

(5000, 98)


In [None]:
train_labels = torch.tensor(train_labels)
train_labels

In [60]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-05, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
dbert_model.compile(optimizer=optimizer,
              loss=loss,
              metrics=[metric])


In [61]:
dbert_model.fit(padded,train_labels, epochs=10)

Train on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa838682160>

In [65]:
score=dbert_model.evaluate(padded,test_labels,verbose=1)
print('Test Accuracy: '+str(score[1]))
# print('Test F1 Score: '+str(score[2]))

from sklearn.metrics import precision_recall_fscore_support,classification_report
y_pred=dbert_model.predict(padded)
#print(y_pred)
y2=[]
for q in y_pred:
    if(q[0]>0.5):
        y2.append(True)
    else:
        y2.append(False)
print('Classification report:\n',classification_report(test_labels,y2))
#print('Classification report:\n',precision_recall_fscore_support(y_test,y_pred))
#print(y_pred)

Test Accuracy: 0.61340606
Classification report:
               precision    recall  f1-score   support

           0       0.43      0.27      0.33       727
           1       0.36      0.53      0.43       556

   micro avg       0.38      0.38      0.38      1283
   macro avg       0.39      0.40      0.38      1283
weighted avg       0.40      0.38      0.37      1283

