In [None]:
import transformers
import numpy as np
import pandas as pd
import torch
import tensorflow as tf

In [None]:
def read_data(filename):
    """Load one LIAR split and reduce it to statement text plus a binary label.

    The file is read as a header-less, tab-separated table with the fourteen
    column names listed below. The six-way truthfulness rating in ``Label`` is
    collapsed to two classes:

    * ``1`` for 'false', 'barely-true' and 'pants-fire'
    * ``0`` for 'half-true', 'mostly-true' and 'true'

    Parameters
    ----------
    filename : str or path-like
        Location of the ``.tsv`` file to read.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Statement'`` and ``'Label'``, one row per input row.

    Raises
    ------
    ValueError
        If a row carries a label outside the six known ratings.
    """
    # The regex r'\t+' treats a run of consecutive tabs as ONE separator, so a
    # row with an empty field has its later columns shifted left.
    # NOTE(review): only 'Label' and 'Statement' are used below; confirm that
    # no empty field can occur before them in the data.
    df = pd.read_csv(
        filename,
        engine='python',
        delimiter=r'\t+',
        names=['id', 'Label', 'Statement', 'subject', 'speaker', 'speaker_title',
               'state', 'party', 'barely_true', 'false_counts', 'half_true',
               'mostly_true', 'pants_on_fire', 'context'],
    )

    fake_ratings = ('false', 'barely-true', 'pants-fire')
    real_ratings = ('half-true', 'mostly-true', 'true')

    labels = []
    for row, rating in zip(df.index, df['Label']):
        if rating in fake_ratings:
            labels.append(1)
        elif rating in real_ratings:
            labels.append(0)
        else:
            # Name the offending value so a malformed row can be located.
            raise ValueError(
                "Error encountered in labelling: unexpected label "
                f"{rating!r} at row {row!r} of {filename!r}"
            )

    # Build the label column on the frame's own index so each label stays
    # attached to the statement it was derived from.
    label_column = pd.Series(labels, index=df.index, name='Label', dtype='int64')
    data = pd.concat([df['Statement'], label_column], axis=1)
    data.columns = ['Statement', 'Label']
    return data

In [None]:
# Load the three LIAR splits; read_data returns a frame with a 'Statement'
# column and a binary 'Label' column for each.
data_train = read_data('./LIAR_dataset/train.tsv')
data_test = read_data('./LIAR_dataset/test.tsv')
data_valid = read_data('./LIAR_dataset/valid.tsv')

# Fold the validation split into the training data. ignore_index=True
# renumbers the rows 0..n-1 so the combined frame carries no duplicated
# index labels (both inputs start counting at 0).
data_train = pd.concat([data_train, data_valid], ignore_index=True)

# Label vectors as plain NumPy arrays, in row order.
train_labels = np.asarray(data_train['Label'])
test_labels = np.asarray(data_test['Label'])

In [None]:
# The tokenizer and the sequence-classification model are loaded from the
# same pretrained checkpoint, so the name is written once and reused.
_dbert_checkpoint = 'distilbert-base-uncased'
dbert_tokenizer = transformers.DistilBertTokenizer.from_pretrained(_dbert_checkpoint)
dbert_model = transformers.DistilBertForSequenceClassification.from_pretrained(_dbert_checkpoint)

In [None]:
# Encode every training statement as a list of token ids, with the model's
# special tokens added. truncation=True caps each sequence at the tokenizer's
# maximum model input length, so an unusually long statement cannot produce
# more ids than the model accepts.
train_tokenized = data_train.Statement.apply(
    lambda text: dbert_tokenizer.encode(text, add_special_tokens=True, truncation=True)
)

In [None]:
# Encode every test statement as a list of token ids, with the model's
# special tokens added. truncation=True caps each sequence at the tokenizer's
# maximum model input length, so an unusually long statement cannot produce
# more ids than the model accepts.
test_tokenized = data_test.Statement.apply(
    lambda text: dbert_tokenizer.encode(text, add_special_tokens=True, truncation=True)
)

In [None]:
# Length of the longest encoded training statement (0 when there are none).
max_len = max((len(token_ids) for token_ids in train_tokenized.values), default=0)

# Right-pad every id list with zeros up to max_len and stack the rows into
# one 2-D array.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in train_tokenized.values
])

In [None]:
# 1 wherever `padded` holds a non-zero id, 0 wherever it holds the zero
# padding value, so padded positions can be masked out.
is_real_token = padded != 0
attention_mask = is_real_token.astype(int)
print(attention_mask.shape)

In [None]:
# Convert the training labels to an integer (long) tensor. Going through
# np.asarray keeps this cell safe to re-run: on a second execution
# train_labels is already a tensor, and passing a tensor straight to
# torch.tensor triggers a copy-construct warning.
# NOTE(review): np.asarray on a tensor assumes it lives on the CPU.
train_labels = torch.tensor(np.asarray(train_labels), dtype=torch.long)
train_labels

In [None]:
# Turn the padded id matrix and its mask into tensors, then run the model
# once with the labels supplied alongside the inputs.
# NOTE(review): this passes every training row through the model in a single
# call; confirm that fits in memory or switch to mini-batches.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
outputs = dbert_model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=train_labels,
)