In [None]:
import warnings, re
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

## Data Pre-processing

In [None]:
# data = pd.read_csv('../input/politifact-factcheck-data/politifact.csv')
# data = data[['sources_quote', 'fact']]
# data = data.rename(columns={'sources_quote': 'text', 'fact': 'label'})

In [None]:
# data = data[data.label != 'full-flop']
# data = data[data.label != 'half-flip']
# data = data[data.label != 'no-flip']

# #Change the news' labels into numbers
# for x in range(len(data)):
#     rating_values = data['label'].iloc[x]
#     if rating_values == "true":
#         data['label'].iloc[x] = 0
#     if rating_values == "mostly-true":
#         data['label'].iloc[x] = 1   
#     if rating_values == "half-true":
#         data['label'].iloc[x] = 2    
#     if rating_values == "barely-true":
#         data['label'].iloc[x] = 3    
#     if rating_values == "false":
#         data['label'].iloc[x] = 4
#     if rating_values == "pants-fire":
#         data['label'].iloc[x] = 5

In [None]:
# def text_cleaning(text):
#     """
#     Replace every character that is not an ASCII letter (a-z, A-Z) with a space
#     """
#     text = re.sub(r'[^a-zA-Z]', ' ', text)
#     return text

# data.fillna('', inplace=True)

# data['text'] = data['text'].apply(text_cleaning)

In [None]:
# train_data, val_test_data = train_test_split(data, test_size=0.2, shuffle=False)
# val_data, test_data = train_test_split(val_test_data, test_size=0.5, shuffle=False)

# len(train_data), len(val_data), len(test_data)

In [None]:
# train_data.to_csv('train6.csv', index=False)
# val_data.to_csv('val6.csv', index=False)
# test_data.to_csv('test6.csv', index=False)

## Import pre-processed data

In [None]:
# Read the pre-split train / validation / test CSVs into one DataFrame each.
# Later cells access the 'text' and 'label' columns of these frames.
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        '../input/splitted-6-labels-fake-news/train6.csv',
        '../input/splitted-6-labels-fake-news/val6.csv',
        '../input/splitted-6-labels-fake-news/test6.csv',
    ),
)

## Train

In [None]:
import transformers
import pkg_resources

# Show which transformers release is installed, for reproducibility.
print(pkg_resources.get_distribution('transformers').version)

# ---------------------------------------------------------------------------
# Configuration: every tunable used by the cells below lives here.
# ---------------------------------------------------------------------------
MAX_LEN = 512          # token length each example is truncated / zero-padded to
Train_batch_size = 8   # batch size of the training DataLoader
Valid_batch_size = 4   # batch size of the validation DataLoader
Epochs = 4             # number of epochs; also sizes the LR schedule
Accumulation = 2       # NOTE(review): not referenced by any visible cell
Bert_path = '../input/bert-base-uncased'   # directory with the pretrained BERT files
model_path = 'model.pt'                    # checkpoint path used by the commented-out training loop
Train_file = '../input/splitted-6-labels-fake-news/train6.csv'  # NOTE(review): not referenced by any visible cell

# Single shared tokenizer instance, used by the dataset class and by infer().
Tokenizer = transformers.BertTokenizer.from_pretrained(
    Bert_path,
    do_lower_case=True,
)

In [None]:
import torch

class FakeNewsDataset:
    """Indexable dataset that converts (text, label) pairs into BERT inputs.

    Args:
        text: indexable collection of raw statements; each item is passed
            through ``str()`` before tokenisation.
        label: indexable collection of class ids aligned with ``text``.

    Indexing returns a dict with the keys ``ids``, ``mask``,
    ``token_type_ids`` (LongTensors, right-padded with zeros up to
    ``max_len``) and ``targets`` (a float tensor holding the label).
    """

    def __init__(self, text, label):
        self.text = text
        self.label = label
        # Tokenizer and sequence length come from the notebook's config cell.
        self.tokenizer = Tokenizer
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        encoded = self.tokenizer.encode_plus(
            str(self.text[idx]),
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
        )

        # Number of zeros needed to bring this example up to max_len.
        n_pad = self.max_len - len(encoded['input_ids'])

        # (output key, tokenizer key) pairs, in the order the batch dict exposes them.
        field_map = (
            ('ids', 'input_ids'),
            ('mask', 'attention_mask'),
            ('token_type_ids', 'token_type_ids'),
        )
        sample = {
            out_key: torch.LongTensor(encoded[src_key] + [0] * n_pad)
            for out_key, src_key in field_map
        }
        sample['targets'] = torch.tensor(self.label[idx], dtype=torch.float)
        return sample
    
    
# Training batches are reshuffled every epoch. Without shuffle=True the model
# sees the rows in CSV order on every pass (and the commented-out split above
# was made with shuffle=False, so that order is presumably the source order).
trainLoader = torch.utils.data.DataLoader(
    FakeNewsDataset(train_data['text'].values, train_data['label'].values),
    batch_size=Train_batch_size,
    shuffle=True,
    num_workers=4,
)

# Validation order is kept fixed so predictions line up with val_data rows.
valLoader = torch.utils.data.DataLoader(
    FakeNewsDataset(val_data['text'].values, val_data['label'].values),
    batch_size=Valid_batch_size,
    shuffle=False,
    num_workers=1,
)

In [None]:
import torch.nn as nn

class FakeBERTuncased(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: number of output classes (default 6, as used in this notebook).
        dropout: dropout probability applied to the pooled output (default 0.3).

    The submodule names (``bert``, ``bert_drop``, ``fc``) are unchanged, so
    state dicts saved from the previous definition keep the same keys.
    """

    def __init__(self, num_labels=6, dropout=0.3):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(Bert_path)
        self.bert_drop = nn.Dropout(dropout)
        # Size the head from the loaded encoder instead of hard-coding 768.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw class logits for a batch of token ids.

        Args:
            ids: token id tensor passed to BERT as input ids.
            mask: attention mask tensor.
            token_type_ids: segment id tensor.

        Returns:
            Tensor of logits produced by ``self.fc``.
        """
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: newer transformers releases return
        # a dict-like ModelOutput whose iteration yields key *strings*, so
        # `_, pool = ...` would bind `pool` to 'pooler_output'. Integer
        # indexing works for both the legacy tuple and ModelOutput.
        pool = outputs[1]
        pool = self.bert_drop(pool)
        return self.fc(pool)
    
# Use the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the classifier and move its parameters onto the selected device.
model = FakeBERTuncased()
model.to(device)

print("Training on ", device)
print(
    sum(param.numel() for param in model.parameters() if param.requires_grad),
    ' trainable prams',
)

In [None]:
# Multi-class loss over the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()

# Split the parameters into two groups: weights that receive weight decay and
# biases / LayerNorm parameters that do not.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

optimizer_params = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.001},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0},
]

# train() calls scheduler.step() once per batch, so the schedule length must
# be the real number of batches per epoch times the number of epochs. The
# previous int(len(train_data) / Train_batch_size * Epochs) under-counted
# whenever the dataset size is not a multiple of the batch size (the last,
# partial batch of each epoch was not counted), which drives the learning
# rate to zero before training finishes.
num_train_steps = len(trainLoader) * Epochs

# NOTE(review): transformers.AdamW is deprecated in recent transformers
# releases in favour of torch.optim.AdamW - confirm against the installed
# version printed in the configuration cell before upgrading.
optimizer = transformers.AdamW(optimizer_params, lr=2e-5)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

def train(model, iterator, optimizer, scheduler, device, accumulation_steps=None):
    """Run one training epoch over ``iterator``.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` and
            returning class logits.
        iterator: sized iterable of batches; each batch is a dict with the
            keys 'ids', 'mask', 'token_type_ids' and 'targets'.
        optimizer: optimizer stepped after every accumulation group.
        scheduler: LR scheduler stepped together with the optimizer.
        device: device the batch tensors are moved to.
        accumulation_steps: number of batches whose gradients are summed
            before one optimizer/scheduler step. ``None`` (default) or 1
            steps after every batch, which is the behaviour this function
            had before the argument was honoured.

    Raises:
        ValueError: if ``accumulation_steps`` is smaller than 1.

    Uses the module-level ``criterion`` as the loss function.
    """
    steps = 1 if accumulation_steps is None else int(accumulation_steps)
    if steps < 1:
        raise ValueError("accumulation_steps must be a positive integer or None")

    model.train()
    optimizer.zero_grad()

    n_batches = len(iterator)
    for i, batch in tqdm(enumerate(iterator), total=n_batches):
        ids = batch['ids'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        # CrossEntropyLoss expects integer class indices.
        targets = batch['targets'].to(device, dtype=torch.long)

        # squeeze(1) is kept from the original code; it is a no-op for the
        # 6-logit head defined in this notebook.
        preds = model(ids, mask, token_type_ids).squeeze(1)

        loss = criterion(preds, targets)
        # Scale so the accumulated gradient is the mean over the group.
        # NOTE: a trailing partial group is scaled by the same factor.
        (loss / steps).backward()

        # Step at the end of every group and on the last batch, so leftover
        # gradients are never dropped.
        if (i + 1) % steps == 0 or (i + 1) == n_batches:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
def evaluation(model, iterator, device):
    """Predict a class for every example in ``iterator`` without gradients.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        iterator: sized iterable of batch dicts with the keys 'ids', 'mask',
            'token_type_ids' and 'targets'.
        device: device the batch tensors are moved to.

    Returns:
        ``(predictions, labels)``: two flat Python lists. Predictions are the
        argmax class ids; labels are the targets as floats.
    """
    model.eval()

    collected_preds = []
    collected_labels = []

    with torch.no_grad():
        for _, batch in tqdm(enumerate(iterator), total=len(iterator)):
            ids = batch['ids'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)

            # Highest log-probability along the class dimension is the prediction.
            log_probs = torch.nn.functional.log_softmax(logits, dim=1)
            batch_preds = torch.argmax(log_probs, dim=1)

            collected_labels.extend(targets.detach().cpu().numpy().tolist())
            collected_preds.extend(batch_preds.detach().cpu().numpy().tolist())

    return collected_preds, collected_labels

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# #Done with training
# best_accuracy=0

# for epoch in range(Epochs):
#     train(model, trainLoader, optimizer, scheduler, device)
    
#     preds, targets=evaluation(model, valLoader, device)
    
#     accuracy=accuracy_score(targets, preds)
#     print(epoch+1," Accuracy: ",accuracy)
    
#     if accuracy>best_accuracy:
#         torch.save(model.state_dict(),model_path)
#         best_accuracy=accuracy

In [None]:
#torch.cuda.empty_cache()

## Inference

In [None]:
def infer(text, max_len=None):
    """Predict the class id of a single statement with the global ``model``.

    Args:
        text: the statement to classify. It is passed through ``str()``
            first, the same way ``FakeNewsDataset.__getitem__`` does, so
            non-string values (e.g. NaN cells read from a CSV) are tokenised
            instead of raising inside the tokenizer.
        max_len: length the token sequence is truncated / zero-padded to.
            Defaults to the notebook-wide ``MAX_LEN``.

    Returns:
        numpy array of shape (1,) holding the predicted class id.

    Relies on the module-level ``Tokenizer``, ``model`` and ``device``.
    Callers are expected to have put ``model`` in eval mode.
    """
    if max_len is None:
        # Keep inference in sync with the length used to build the datasets.
        max_len = MAX_LEN

    with torch.no_grad():
        inputs = Tokenizer.encode_plus(str(text), None, max_length=max_len, truncation=True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        # Right-pad with zeros up to max_len (same scheme as FakeNewsDataset).
        padding_length = max_len - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        # Add the batch dimension and move to the model's device.
        ids = torch.LongTensor(ids).unsqueeze(0).to(device)
        mask = torch.LongTensor(mask).unsqueeze(0).to(device)
        token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(device)

        preds = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        preds = torch.argmax(torch.nn.functional.log_softmax(preds, dim=1), dim=1).cpu().detach().numpy()
    return preds

In [None]:
# Test the model accuracy using validation data.
# map_location=device remaps the checkpoint's tensors onto the device chosen
# earlier, so a checkpoint saved on a GPU still loads when the notebook falls
# back to CPU.
model.load_state_dict(torch.load('../input/bert6labelsmodel/model6.pt', map_location=device))
model.eval()

rating = list(val_data.label.values)
predicted = [infer(text) for text in tqdm(val_data.text.values)]

# NOTE(review): these display names differ from the raw label strings in the
# commented-out preprocessing ('barely-true', 'false', 'pants-fire' for ids
# 3-5) - confirm the intended wording.
print(classification_report(rating, predicted, target_names=['true', 'mostly-true', 'half-true', 'mostly-false', 'fake', 'pants-on-fire']))

## Testing

In [None]:
# Test the model accuracy using the held-out test data.
# map_location=device remaps the checkpoint's tensors onto the device chosen
# earlier, so a checkpoint saved on a GPU still loads when the notebook falls
# back to CPU.
model.load_state_dict(torch.load('../input/bert6labelsmodel/model6.pt', map_location=device))
model.eval()

test_rating = list(test_data.label.values)
test_predicted = [infer(text) for text in tqdm(test_data.text.values)]

print(confusion_matrix(test_rating, test_predicted), '\n\n\n')
# NOTE(review): these display names differ from the raw label strings in the
# commented-out preprocessing ('barely-true', 'false', 'pants-fire' for ids
# 3-5) - confirm the intended wording.
print(classification_report(test_rating, test_predicted, target_names=['true', 'mostly-true', 'half-true', 'mostly-false', 'fake', 'pants-on-fire']))