# In [1]:
import os
import json
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

import transformers
from transformers import AutoModel, BertTokenizerFast
import torch
import torch.nn as nn

# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Default location of the "all-rnr-annotated-threads" directory of the corpus.
_PHEME_THREADS_DIR = "D:/Courses/VII Sem/Capstone/PHEME_veracity/PHEME_veracity/all-rnr-annotated-threads"


def _list_subdirs(parent):
    """Return the names of the immediate sub-directories of *parent*."""
    return [
        name for name in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, name))
    ]


def _read_json(json_path):
    """Parse the JSON file at *json_path* and close the file afterwards."""
    with open(json_path, encoding="utf-8") as handle:
        return json.load(handle)


def get_dataset(path=_PHEME_THREADS_DIR):
    """Load the rumour / non-rumour texts and split them 70/30.

    Every sub-directory of *path* is treated as an event. For each event the
    thread directories under ``rumours`` are labelled 1 and those under
    ``non-rumours`` are labelled 0.

    Args:
        path: Root directory holding one sub-directory per event. Defaults to
            the location used by the original script.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by a stratified
        ``train_test_split`` with ``test_size=0.3`` and ``random_state=2018``.

    Raises:
        OSError: If an expected directory or JSON file is missing.
        KeyError: If a JSON file lacks the field that is read from it.
    """
    texts, labels = [], []

    for event in _list_subdirs(path):
        rumours_dir = os.path.join(path, event, "rumours")
        for thread_id in _list_subdirs(rumours_dir):
            annotation = _read_json(
                os.path.join(rumours_dir, thread_id, "annotation.json")
            )
            # NOTE(review): rumours use the annotation's "category" field as
            # text while non-rumours use the source tweet's "text". This
            # asymmetry is kept as-is; confirm that it is intended.
            texts.append(annotation["category"])
            labels.append(1)

        non_rumours_dir = os.path.join(path, event, "non-rumours")
        for thread_id in _list_subdirs(non_rumours_dir):
            tweet = _read_json(
                os.path.join(
                    non_rumours_dir, thread_id, "source-tweets", thread_id + ".json"
                )
            )
            texts.append(tweet["text"])
            labels.append(0)

    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=0.3,
        random_state=2018,
        stratify=labels,
    )

    return X_train, X_test, y_train, y_test


def drop_emoji(x):
    """Return *x* with every non-ASCII character removed.

    Despite the name, this drops any character whose code point is 128 or
    above (emoji, accented letters, non-Latin scripts, ...), not only emoji.

    Args:
        x: The string to clean.

    Returns:
        A new string containing only the characters of *x* with
        ``ord(ch) < 128``, in their original order.
    """
    # Single pass; the previous list-membership test made this quadratic.
    return ''.join(ch for ch in x if ord(ch) < 128)

# get_dataset() returns a train split plus a hold-out split; the hold-out
# part is halved below into validation and test sets.
train_text, temp_text, train_labels, temp_labels = get_dataset()

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    random_state=2018,
    stratify=temp_labels,
)

# Strip non-ASCII characters from every text before tokenization.
train_text = [drop_emoji(text) for text in train_text]
val_text = [drop_emoji(text) for text in val_text]
test_text = [drop_emoji(text) for text in test_text]

# Pretrained encoder and its matching tokenizer.
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Whitespace-token count of each training text.
seq_len = [len(i.split()) for i in train_text]

# Every sequence is padded or truncated to exactly this many tokens.
max_seq_len = 25

# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text,
    max_length=max_seq_len,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text,
    max_length=max_seq_len,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text,
    max_length=max_seq_len,
    padding='max_length',
    truncation=True
)

#print(tokens_test)

# Token-id tensors for each split.
train_seq = torch.tensor(tokens_train['input_ids'])
val_seq = torch.tensor(tokens_val['input_ids'])
test_seq = torch.tensor(tokens_test['input_ids'])

# Attention masks for each split.
train_mask = torch.tensor(tokens_train['attention_mask'])
val_mask = torch.tensor(tokens_val['attention_mask'])
test_mask = torch.tensor(tokens_test['attention_mask'])

# Label tensors for each split.
train_y = torch.tensor(train_labels)
val_y = torch.tensor(val_labels)
test_y = torch.tensor(test_labels)

print(val_y)


from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# number of examples per mini-batch
batch_size = 32

# bundle (input ids, attention mask, label) per example
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# training batches are drawn in random order, validation batches in order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

# batch iterators for the training and validation sets
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)


# Freeze the pretrained encoder so that none of its weights receive gradients.
for param in bert.parameters():
    param.requires_grad_(False)

class BERT_Arch(nn.Module):
    """Two-class classification head on top of a BERT-style encoder.

    The encoder's second output is passed through
    ``Linear(768, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 2) -> LogSoftmax``.

    Args:
        bert: Encoder called as ``bert(ids, attention_mask=..., return_dict=False)``
            and returning a tuple whose second element has 768 features per
            example (for Hugging Face ``BertModel`` that is the pooled output).
    """

    def __init__(self, bert):
        super().__init__()

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax over the class dimension (pairs with nn.NLLLoss)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: Token ids, passed to the encoder unchanged.
            mask: Attention mask, passed to the encoder as ``attention_mask``.

        Returns:
            Tensor with 2 columns holding the log-softmax scores of each row.
        """
        # Request a plain tuple: recent transformers releases return a
        # dict-like ModelOutput by default, and unpacking that yields its
        # string keys instead of tensors.
        outputs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        cls_hs = outputs[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # convert the scores to log-probabilities
        x = self.softmax(x)

        return x


model = BERT_Arch(bert)

# push the model to GPU
# NOTE(review): left disabled, so the model stays on the CPU. Enabling it
# requires the batches and the loss class-weights to be on the same device.
# model = model.to(device)

# transformers' own AdamW is deprecated (and removed in newer releases);
# PyTorch's implementation is the supported replacement.
from torch.optim import AdamW

# define the optimizer
# eps and weight_decay are pinned to the defaults of the former
# transformers.AdamW so that the update rule stays the same.
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-6,
                  weight_decay=0.0)


from sklearn.utils.class_weight import compute_class_weight

# Weight each class inversely to its frequency in the training labels.
# The arguments are passed by keyword: current scikit-learn releases reject
# the positional form with a TypeError.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

print("Class Weights:",class_weights)

# NLLLoss expects a float tensor with one weight per class.
weights = torch.tensor(class_weights, dtype=torch.float)

# Negative log-likelihood loss; the model already outputs log-probabilities.
cross_entropy = nn.NLLLoss(weight=weights)

# number of passes over the training set
epochs = 2

def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer`` and ``cross_entropy``.

    Returns:
        A ``(avg_loss, total_preds)`` tuple: the mean per-batch loss and a
        NumPy array with the model outputs of all batches concatenated along
        axis 0, in the order the batches were drawn.
    """
    model.train()

    # Put every batch on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device

    total_loss = 0
    total_preds = []

    for step, batch in enumerate(train_dataloader):
        # progress update every 50 batches
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        sent_id, mask, labels = [t.to(model_device) for t in batch]

        # clear the gradients left over from the previous step
        model.zero_grad()

        preds = model(sent_id, mask)

        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # detach from the graph and convert to NumPy on the CPU
        preds = preds.detach().cpu().numpy()

        # append the model predictions
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # stack the per-batch arrays into (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds


def evaluate():
    """Compute the loss and predictions over ``val_dataloader``.

    Uses the module-level ``model`` and ``cross_entropy``. The model is put in
    eval mode and no gradients are tracked.

    Returns:
        A ``(avg_loss, total_preds)`` tuple: the mean per-batch loss and a
        NumPy array with the model outputs of all batches concatenated along
        axis 0.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    # Put every batch on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device

    total_loss = 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # Progress update every 50 batches. The former elapsed-time
        # computation was dropped: it called format_time/time/t0, which are
        # not defined in this file, and its result was never used.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        sent_id, mask, labels = [t.to(model_device) for t in batch]

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # stack the per-batch arrays into (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds


# lowest validation loss seen so far
best_valid_loss = float('inf')

# per-epoch history of training and validation loss
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass over the training set, then one over the validation set
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # record this epoch's losses
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

# Reload the best checkpoint and score it on the held-out test set.
path = 'saved_weights.pt'

# Use the device the model actually lives on. Sending the inputs to `device`
# while the model was never moved there fails on CUDA machines.
model_device = next(model.parameters()).device
model.load_state_dict(torch.load(path, map_location=model_device))

# make sure dropout is disabled for inference
model.eval()

with torch.no_grad():
    preds = model(test_seq.to(model_device), test_mask.to(model_device))
    preds = preds.detach().cpu().numpy()

# pick the class with the highest log-probability for each example
preds = np.argmax(preds, axis=1)
print(classification_report(test_y, preds))

# ModuleNotFoundError: ignored