In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers
!pip install torchmetrics

In [None]:
!pip install Dataset

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertModel, RobertaForSequenceClassification
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy
from datasets import Dataset
import datasets

# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Load the labelled training articles.
data = pd.read_csv('/kaggle/input/fake-news/train.csv')

# Fill missing values with a per-column placeholder before the columns are
# joined into a single string below.
for column, placeholder in (('author', 'Unknown'), ('title', 'Unknown'), ('text', 'Not Available')):
    data[column] = data[column].fillna(placeholder)

# Build one text field per article from its title, author and body.
title_part = 'Title:' + data['title']
author_part = '\nAuthor:' + data['author']
body_part = '\nBody:' + data['text']
data['comb_news'] = title_part + author_part + body_part

# Keep only the combined text and the target label for modelling.
data_in = data[['comb_news', 'label']]

In [None]:
#Define the tokenizer and the function used to tokenize the data
# model_name is the checkpoint identifier; it is used here for the tokenizer
# and again later when the classification model is loaded.
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(el):
    """Tokenize the ``comb_news`` texts of a batch into 128-token windows.

    Every text is padded or truncated to ``max_length=128``. Because
    ``return_overflowing_tokens=True`` is set, one input row can yield several
    output rows; ``overflow_to_sample_mapping`` records which input row each
    output row came from.

    Args:
        el: Mapping of column name to a list of values for the batch; must
            contain ``'comb_news'``.

    Returns:
        The tokenizer output with every column of ``el`` added, each value
        repeated so it lines up with the output rows.
    """
    encoded = tokenizer(
        el['comb_news'],
        truncation=True,
        max_length=128,
        padding='max_length',
        return_overflowing_tokens=True,
    )
    # Index of the source row for every produced window.
    source_rows = encoded.pop('overflow_to_sample_mapping')
    for column, values in el.items():
        encoded[column] = [values[row] for row in source_rows]

    return encoded

In [None]:
#Function used to tokenize the data and create data loaders
def create_loaders(data_in, test_size=0.25, batch_size=64, random_state=42):
    """Split, tokenize and wrap the articles into training/validation loaders.

    Args:
        data_in: DataFrame with a ``'comb_news'`` text column and a
            ``'label'`` column.
        test_size: Fraction of rows held out for validation (default 0.25).
        batch_size: Batch size used by both loaders (default 64).
        random_state: Seed passed to ``train_test_split`` (default 42).

    Returns:
        ``(train_loader, valid_loader)``. Each batch is a tuple
        ``(input_ids, attention_mask, label)``. The training loader shuffles;
        the validation loader keeps dataset order.
    """
    #Split relevant columns into training and validation datasets
    train_data, test_data, train_label, test_label = train_test_split(
        data_in['comb_news'],
        data_in['label'],
        test_size=test_size,
        random_state=random_state,
    )

    tr_data = datasets.DatasetDict({
        'train': Dataset.from_pandas(pd.concat([train_data, train_label], axis=1)),
        'valid': Dataset.from_pandas(pd.concat([test_data, test_label], axis=1)),
    })

    # The split happens before tokenization, so all windows produced from one
    # article end up in the same split.
    tok_data = tr_data.map(tokenize, batched=True)
    # '__index_level_0__' is the original DataFrame index carried over by
    # Dataset.from_pandas; it is not a model input.
    tok_data = tok_data.remove_columns(['comb_news', '__index_level_0__'])
    tok_data.set_format('pandas')

    def _to_tensor_dataset(split):
        # Materialize one split and pack ids, mask and label as tensors.
        frame = tok_data[split][:]
        return TensorDataset(
            torch.tensor(frame['input_ids']),
            torch.tensor(frame['attention_mask']),
            torch.tensor(frame['label']),
        )

    #Define data loaders for both the training and validation data sets
    train_loader = DataLoader(_to_tensor_dataset('train'), batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(_to_tensor_dataset('valid'), batch_size=batch_size, shuffle=False)

    return train_loader, valid_loader

In [None]:
# Split and tokenize data_in, producing the training and validation loaders.
train_loader,valid_loader = create_loaders(data_in)
# Bare expression: the notebook shows the validation loader's repr as the cell output.
valid_loader

In [None]:
#Define the model
# Load the checkpoint named by model_name with a sequence-classification head.
# NOTE(review): num_labels is not passed, so the head size comes from the
# checkpoint/library default - confirm it matches the two labels used in training.
model = RobertaForSequenceClassification.from_pretrained(model_name)

In [None]:
# Move the network onto the selected device.
model = model.to(device)

# A parameter stays trainable only if its name contains one of these
# fragments; every other parameter is frozen.
trainable_fragments = ('classifier', 'encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10')
for name, params in model.named_parameters():
    params.requires_grad = any(fragment in name for fragment in trainable_fragments)

# Report how many scalar weights will receive gradient updates.
total_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(total_params)

In [None]:
# Number of full passes over the training loader.
epochs = 2

# AdamW receives every model parameter, including those whose requires_grad
# was switched off earlier.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)
criterion = torch.nn.CrossEntropyLoss()

# Separate accuracy trackers for the training and validation passes, kept on
# the same device as the model outputs.
train_acc = Accuracy(task='binary', num_classes=2).to(device)
valid_acc = Accuracy(task='binary', num_classes=2).to(device)

In [None]:
# Fine-tune for `epochs` passes: each pass trains on train_loader, then scores
# valid_loader and prints the mean loss and accuracy for both.
for epoch in range(epochs):
    train_loss, valid_loss = list(),list()
    print(f'Epoch:{epoch}----------------------->')

    # Clear metric state so the accuracies printed below describe this epoch
    # only instead of accumulating over every epoch run so far.
    train_acc.reset()
    valid_acc.reset()

    model.train()
    for x_ids,x_mask,x_label in tqdm(train_loader):
        optimizer.zero_grad()
        x_ids, x_mask, x_label = x_ids.to(device), x_mask.to(device), x_label.to(device)
        preds = model(x_ids,attention_mask = x_mask)
        loss = criterion(preds.logits,x_label)
        train_loss.append(loss.item())
        train_acc.update(torch.argmax(preds.logits,dim=1),x_label)
        loss.backward()
        # Clip only after backward() has produced this step's gradients;
        # clipping before it acts on the just-cleared gradients and has no effect.
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
        optimizer.step()

    model.eval()
    # Validation only reads the model, so skip autograd bookkeeping.
    with torch.no_grad():
        for v_ids,v_mask,v_label in tqdm(valid_loader):
            v_ids, v_mask, v_label = v_ids.to(device), v_mask.to(device), v_label.to(device)
            preds = model(v_ids,attention_mask = v_mask)
            loss = criterion(preds.logits,v_label)
            valid_loss.append(loss.item())
            valid_acc.update(torch.argmax(preds.logits,dim=1),v_label)

    avg_train_loss, avg_valid_loss = sum(train_loss)/len(train_loss),sum(valid_loss)/len(valid_loss)
    print(f'Training loss:{avg_train_loss}\tValidation loss:{avg_valid_loss}')
    print(f'Training accuracy:{train_acc.compute().item()}\tValidation accuracy:{valid_acc.compute().item()}')

In [None]:
def flatten_data(data):
    """Join the ``tolist()`` output of every item in ``data`` into one list.

    Args:
        data: Iterable of objects exposing ``tolist()`` (for example the
            per-batch prediction tensors collected during inference).

    Returns:
        A single list holding the elements of each item's ``tolist()`` result,
        in iteration order.
    """
    return [value for chunk in data for value in chunk.tolist()]

def find_issue(d_out):
    """Count the ids that were given more than one distinct label.

    The previous flag expression ``a>0 & b>0`` parsed as ``a > (0 & b) > 0``
    because ``&`` binds tighter than ``>``; that is always False, so no
    conflict was ever reported. Counting distinct labels per id avoids the
    precedence trap and also works when only one class is present.

    Args:
        d_out: DataFrame with an ``'id'`` column and a ``'label'`` column;
            an id may appear on several rows.

    Returns:
        int: number of ids whose rows carry conflicting labels (0 when every
        id is labelled consistently or the frame is empty).
    """
    labels_per_id = d_out.groupby('id')['label'].nunique()
    return int((labels_per_id > 1).sum())

In [None]:
#Assess model performance on the test data
# Read the test split; it is handed to generate_result in a later cell.
d_test = pd.read_csv('/kaggle/input/fake-news/test.csv')

def generate_result(test):
    """Predict a label for every article in ``test``.

    The articles are tokenized with ``tokenize``, so one article can produce
    several rows and therefore several predictions sharing one ``id``.

    Args:
        test: DataFrame with ``'id'``, ``'title'``, ``'author'`` and
            ``'text'`` columns. It is modified in place: missing values are
            filled and a ``'comb_news'`` column is added.

    Returns:
        A DataFrame with columns ``'id'`` and ``'label'`` holding one row per
        id when ``find_issue`` reports 0, otherwise the sentinel ``-1``.
        Callers must check for the sentinel before using the result.
    """
    #Handle any missing values by filling placeholders
    test['author'] = test['author'].fillna('Unknown',axis=0)
    test['title'] = test['title'].fillna('Unknown',axis=0)
    test['text'] = test['text'].fillna('Not Available',axis=0)

    #Concatenate title, author and text body into a single input string
    test['comb_news'] = 'Title:'+test['title']+'\nAuthor:'+test['author']+'\nBody:'+test['text']
    # Read from the argument rather than the notebook-level d_test so the
    # function works on whichever frame it is given.
    d_test_in = test[['id','comb_news']]

    test_dset = Dataset.from_pandas(d_test_in)
    #Tokenize the test data with the Roberta tokenizer
    test_tokens = test_dset.map(tokenize,batched=True)
    test_tokens = test_tokens.remove_columns(['comb_news'])
    test_tokens.set_format('pandas')

    torch_t_data = TensorDataset(torch.tensor(test_tokens['input_ids']),torch.tensor(test_tokens['attention_mask']))
    # shuffle=False keeps predictions aligned with the row order of test_tokens.
    test_dataloader = DataLoader(torch_t_data,batch_size=64,shuffle=False)
    label_preds = list()

    #Determine the predictions from the model
    model.eval()
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for t_inputs,t_amask in tqdm(test_dataloader):
            t_inputs,t_amask = t_inputs.to(device),t_amask.to(device)
            preds = model(t_inputs,attention_mask = t_amask)
            p_label = torch.argmax(preds.logits,dim=1)
            label_preds.append(p_label)

    test_predictions = flatten_data(label_preds)
    output = pd.concat([test_tokens['id'],pd.DataFrame(test_predictions,columns=['label'])],axis='columns')

    if find_issue(output)==0:
        # Keeps the first row for each id.
        return output.drop_duplicates(subset=['id'])
    else:
        return -1

In [None]:
# Predict labels for the test split. Note that generate_result returns -1
# instead of a DataFrame when find_issue reports a non-zero count.
f_output = generate_result(d_test)

In [None]:
# generate_result signals a failed consistency check by returning -1 rather
# than a DataFrame; stop with a clear message instead of an AttributeError
# from calling .to_csv on an int.
if isinstance(f_output, pd.DataFrame):
    f_output.to_csv('submit.csv',index=False)
else:
    raise ValueError('generate_result did not return predictions; submit.csv was not written')
