# Pegasus

In [None]:
import torch
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW
import torch.nn.functional as F

## Loading processed data
# All three splits are plain CSV files read as UTF-8; the training split for
# this run comes from 'pegasus.csv'.
train_df, val_df, test_df = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('pegasus.csv', 'valnew.csv', 'testnew.csv')
)

In [None]:

classes = [
    'unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country',
    'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none',
]


def updatedf(dfold):
    """Return a copy of ``dfold`` extended with one 0/1 column per class.

    A class column is set to 1 on a row when that class name appears in the
    row's 'Label1', 'Label2' or 'Label3' cell; values that are not in
    ``classes`` (e.g. missing labels) are ignored. The resulting frame is
    printed and returned; the frame passed in is not modified.
    """
    # Start from all-zero indicator columns appended on the right.
    zero_block = pd.DataFrame(0, index=dfold.index, columns=classes)
    dfold = pd.concat([dfold, zero_block], axis=1)

    # Walk the rows and flag every recognised label.
    for row_key, record in dfold.iterrows():
        for label_column in ('Label1', 'Label2', 'Label3'):
            label = record[label_column]
            if label in classes:
                dfold.at[row_key, label] = 1

    print(dfold)
    return dfold


# The training frame is used as loaded (its conversion stays disabled).
# Validation and test frames get the one-hot label columns, after which the
# identifier and raw label columns are no longer needed.
val_df = updatedf(val_df)
val_df.drop(columns=['ID', 'Label1', 'Label2', 'Label3'], inplace=True)

test_df = updatedf(test_df)
test_df.drop(columns=['ID', 'Label1', 'Label2', 'Label3'], inplace=True)

In [None]:
# Preview the first rows of the training frame (only displayed when run as a notebook cell).
train_df.head()

In [None]:
# Label columns, in the order in which targets are read from the frames.
target_list = [
    'unnecessary',
    'mandatory',
    'pharma',
    'conspiracy',
    'political',
    'country',
    'rushed',
    'ingredients',
    'side-effect',
    'ineffective',
    'religious',
    'none',
]
# hyperparameters
MAX_LEN = 128                               # tokenizer max_length (pad / truncate)
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16    # DataLoader batch sizes
EPOCHS = 5
LEARNING_RATE = 1e-5

In [None]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import shutil
from torch.optim.lr_scheduler import ReduceLROnPlateau
from early_stopping import EarlyStopping

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset yielding tokenised tweets together with their multi-hot targets.

    df: frame with a 'Tweet' column and one column per entry of ``target_list``
    tokenizer: object exposing ``encode_plus`` (a transformers tokenizer)
    max_len: length every encoded sequence is padded / truncated to
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Tweet']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded tweet and float targets at position ``index``."""
        # Positional lookup: ``self.targets`` is a plain array addressed by
        # position, so the text has to be fetched by position too. A label
        # lookup (``self.title[index]``) fails or pairs the wrong rows as soon
        # as the frame's index is not exactly 0..n-1.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace into a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten
        # drops it so the DataLoader can stack the samples itself.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
import torch.cuda

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# Notebook shell escape (not valid in a plain .py file): print the CUDA compiler version.
!nvcc --version
# Bare expression: shows the installed PyTorch version when run as a notebook cell.
torch.__version__


In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location="cpu"):
    """Restore model and optimizer state from a checkpoint file.

    checkpoint_fpath: path of the checkpoint file to read
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    map_location: forwarded to ``torch.load``. Defaults to "cpu" so that a
        checkpoint written on a GPU machine can still be opened on a host
        without CUDA; ``model.load_state_dict`` copies the values into the
        model's existing parameters, so the model stays on its current device.

    Returns (model, optimizer, stored 'epoch' value, stored 'valid_loss_min').
    Raises KeyError if the file lacks one of the expected entries.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    # Copy the saved weights and optimizer state into the objects passed in.
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['valid_loss_min']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write ``state`` to disk and, for a best model, keep a second copy.

    state: checkpoint object to serialise with ``torch.save``
    is_best: when true, the written file is also copied to ``best_model_path``
    checkpoint_path: file the checkpoint is written to
    best_model_path: destination of the copy made for a best checkpoint
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Best model so far: duplicate the file that was just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and same-shaped float targets."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
# Default checkpoint locations handed to train_model.
ckpt_path, best_model_path = "curr_ckpt", "best_model.pt"
# Initial tokenizer; each training cell below assigns its own before use.
# (An earlier alternative was BertTokenizer with 'CovRelex-SE/CORD19-BERT'.)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def decode(predicted_values, pv):
    """Threshold raw logits into multi-label predictions and print test metrics.

    predicted_values: float array with one row per test tweet; its rows are
        overwritten in place with the 0/1 predictions.
    pv: raw model outputs (logits), one row per test tweet.

    A label is predicted when its sigmoid probability is >= 0.5. A row with no
    label above the threshold gets its single most probable label instead, so
    every tweet ends up with at least one label. Prints the classification
    report, the exact-match accuracy and the mean per-sample Jaccard score
    against the one-hot columns of the module-level ``test_df``.
    """
    n_rows = len(test_df)
    logits = np.ascontiguousarray(np.asarray(pv)[:n_rows])
    # One batched sigmoid on the CPU; the numbers match a per-row evaluation.
    probs = torch.sigmoid(torch.from_numpy(logits)).numpy()

    predictions = (probs >= 0.5).astype(float)
    # Rows without any positive label fall back to their arg-max label.
    unlabeled = np.flatnonzero(predictions.sum(axis=1) == 0)
    predictions[unlabeled, probs[unlabeled].argmax(axis=1)] = 1
    predicted_values[:n_rows] = predictions

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values, target_names=target_list))

    print("Accuracy score", accuracy_score(y_true, predicted_values))

    # Jaccard score of each sample's label set, averaged over the samples.
    sample_jaccard_scores = [
        metrics.jaccard_score(true_row, pred_row)
        for true_row, pred_row in zip(y_true, predicted_values)
    ]
    average_jaccard = np.mean(sample_jaccard_scores)
    print("Average Jaccard: {:.3f}".format(average_jaccard))

In [None]:
def LetsAssess(modelTBA):
    """Run ``modelTBA`` over every tweet of ``test_df`` and print test metrics.

    Relies on the module-level ``tokenizer``, ``MAX_LEN``, ``target_list`` and
    ``test_df``. The model is moved to the available device and switched to
    eval mode for the pass (so dropout is inactive); its previous train/eval
    mode is restored afterwards.

    A label is predicted when its sigmoid probability is >= 0.5; a tweet with
    no label above the threshold gets its single most probable label.

    Returns (predicted_values, predicted_raw): the 0/1 prediction matrix and
    the raw logits, each with one row per test tweet and one column per label.
    """
    n_labels = len(target_list)
    predicted_values = np.zeros((test_df.shape[0], n_labels))
    predicted_raw = np.zeros((test_df.shape[0], n_labels))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelTBA.to(device)

    was_training = modelTBA.training
    modelTBA.eval()
    try:
        with torch.no_grad():
            for i, text in enumerate(test_df['Tweet']):
                # Same text clean-up as CustomDataset: coerce to str and
                # collapse whitespace, so a non-string cell cannot crash
                # the tokenizer.
                text = " ".join(str(text).split())
                encodings = tokenizer.encode_plus(
                    text,
                    None,
                    add_special_tokens=True,
                    max_length=MAX_LEN,
                    padding='max_length',
                    return_token_type_ids=True,
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors='pt'
                )

                input_ids = encodings['input_ids'].to(device, dtype=torch.long)
                attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)

                logits = modelTBA(input_ids, attention_mask, token_type_ids).squeeze().cpu()

                # Sigmoid + threshold, with an arg-max fallback for empty rows.
                probs = torch.sigmoid(logits).numpy()
                predictions = (probs >= 0.5).astype(float)
                if not predictions.any():
                    predictions[probs.argmax()] = 1

                predicted_values[i] = predictions
                predicted_raw[i] = logits.numpy()
    finally:
        modelTBA.train(was_training)

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values, target_names=target_list))

    print("Accuracy score", accuracy_score(y_true, predicted_values))

    # Jaccard score of each sample's label set, averaged over the samples.
    sample_jaccard_scores = [
        metrics.jaccard_score(true_row, pred_row)
        for true_row, pred_row in zip(y_true, predicted_values)
    ]
    average_jaccard = np.mean(sample_jaccard_scores)
    print("Average Jaccard: {:.3f}".format(average_jaccard))
    return predicted_values, predicted_raw


In [None]:
from tqdm import tqdm
# Validation targets / sigmoid outputs collected by train_model (all epochs).
val_targets = []
val_outputs = []


def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, ID):
    """Fine-tune ``model`` and assess it on the test set after every epoch.

    n_epochs: number of passes over ``training_loader``
    training_loader, validation_loader: iterables of batches, each a dict with
        'input_ids', 'attention_mask', 'token_type_ids' and 'targets'
    model: called as ``model(ids, mask, token_type_ids)``; its output is fed
        to ``loss_fn`` together with the float targets
    optimizer: optimizer stepping the model's parameters
    checkpoint_path, best_model_path: accepted for checkpointing, which is
        currently disabled, so nothing is written to these paths
    ID: suffix of the per-epoch 'predicted_raw_epoch<k><ID>.npy' files, where
        k is the zero-based epoch number

    Side effects: appends the validation targets and sigmoid outputs of every
    epoch to the module-level ``val_targets`` / ``val_outputs`` lists, prints
    progress and metrics, calls ``LetsAssess`` once per epoch and saves the
    raw logits it returns with ``np.save``.

    Returns the model, left in eval mode by the last validation pass.
    """
    valid_loss_min = np.inf  # np.inf is the spelling that exists in every NumPy release
    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(tqdm(training_loader)):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after the last batch this already is the
            # average batch loss of the epoch, so no further division is needed.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()
        with torch.no_grad():
            for batch_idx, data in enumerate(tqdm(validation_loader)):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # Test-set assessment of the current weights; raw logits are kept on
        # disk per epoch.
        predicted_values, predicted_raw = LetsAssess(model)
        np.save('predicted_raw_epoch' + str(epoch - 1) + str(ID) + '.npy', predicted_raw)

        if valid_loss <= valid_loss_min:
            # NOTE(review): the message mentions saving, but no checkpoint is
            # written while checkpointing is disabled; only the running
            # minimum is updated here.
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))
    return model

In [None]:
# Run 'BU': fine-tune 'bert-large-uncased' and assess it on the test set.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification


class CovModel(nn.Module):
    """BERT encoder followed by dropout and a linear layer, one logit per label."""

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256)):
        # num_layers and output_sizes are accepted but not used by this head.
        # The default is a tuple so that no mutable object is shared between
        # instances.
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-large-uncased')

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        self.clf = nn.Linear(self.hidden_size, len(target_list))

    def forward(self, inputs, mask, labels):
        """Return the logits for a batch of token ids and attention masks.

        ``labels`` is the third positional slot; train_model and LetsAssess
        pass token_type_ids there and the value is not used.
        """
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First element is the per-token hidden states; keep the first token's.
        x = cls_hs[0][:, 0, :]
        x = self.dropout(x)
        x = self.clf(x)
        return x


best_model_path = "modelA.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# Epoch count and learning rate come from the hyperparameter constants.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6)

model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, 'BU')
predicted_valuesBU, predicted_rawBU = LetsAssess(model)




In [None]:
# Run 'seantw': fine-tune 'seantw/covid-19-vaccination-tweet-stance' and
# assess it on the test set.
tokenizer = BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification


class CovModel(nn.Module):
    """BERT encoder followed by dropout and a linear layer, one logit per label."""

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256)):
        # num_layers and output_sizes are accepted but not used by this head.
        # The default is a tuple so that no mutable object is shared between
        # instances.
        super().__init__()

        self.bert = BertModel.from_pretrained('seantw/covid-19-vaccination-tweet-stance')

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        self.clf = nn.Linear(self.hidden_size, len(target_list))

    def forward(self, inputs, mask, labels):
        """Return the logits for a batch of token ids and attention masks.

        ``labels`` is the third positional slot; train_model and LetsAssess
        pass token_type_ids there and the value is not used.
        """
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First element is the per-token hidden states; keep the first token's.
        x = cls_hs[0][:, 0, :]
        x = self.dropout(x)
        x = self.clf(x)
        return x


best_model_path = "modelB.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# Epoch count and learning rate come from the hyperparameter constants.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6)

model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, 'seantw')
predicted_valuesSN, predicted_rawSN = LetsAssess(model)




In [None]:
# Run 'DL': fine-tune 'digitalepidemiologylab/covid-twitter-bert' and assess
# it on the test set.
tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification


class CovModel(nn.Module):
    """BERT encoder followed by dropout and a linear layer, one logit per label."""

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256)):
        # num_layers and output_sizes are accepted but not used by this head.
        # The default is a tuple so that no mutable object is shared between
        # instances.
        super().__init__()

        self.bert = BertModel.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        self.clf = nn.Linear(self.hidden_size, len(target_list))

    def forward(self, inputs, mask, labels):
        """Return the logits for a batch of token ids and attention masks.

        ``labels`` is the third positional slot; train_model and LetsAssess
        pass token_type_ids there and the value is not used.
        """
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First element is the per-token hidden states; keep the first token's.
        x = cls_hs[0][:, 0, :]
        x = self.dropout(x)
        x = self.clf(x)
        return x


# Distinct file name per run so this model cannot collide with "modelA.pt"
# (first run) or "modelB.pt" (second run).
best_model_path = "modelC.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# Epoch count and learning rate come from the hyperparameter constants.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6)

model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, 'DL')
predicted_valuesDL, predicted_rawDL = LetsAssess(model)




# Random

In [33]:
import torch
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW
import torch.nn.functional as F

## Loading processed data
# All three splits are plain CSV files read as UTF-8; the training split for
# this run comes from 'rawoversample.csv'.
train_df, val_df, test_df = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('rawoversample.csv', 'valnew.csv', 'testnew.csv')
)

In [None]:

classes = [
    'unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country',
    'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none',
]


def updatedf(dfold):
    """Return a copy of ``dfold`` extended with one 0/1 column per class.

    A class column is set to 1 on a row when that class name appears in the
    row's 'Label1', 'Label2' or 'Label3' cell; values that are not in
    ``classes`` (e.g. missing labels) are ignored. The resulting frame is
    printed and returned; the frame passed in is not modified.
    """
    # Start from all-zero indicator columns appended on the right.
    zero_block = pd.DataFrame(0, index=dfold.index, columns=classes)
    dfold = pd.concat([dfold, zero_block], axis=1)

    # Walk the rows and flag every recognised label.
    for row_key, record in dfold.iterrows():
        for label_column in ('Label1', 'Label2', 'Label3'):
            label = record[label_column]
            if label in classes:
                dfold.at[row_key, label] = 1

    print(dfold)
    return dfold


# The training frame is used as loaded (its conversion stays disabled).
# Validation and test frames get the one-hot label columns, after which the
# identifier and raw label columns are no longer needed.
val_df = updatedf(val_df)
val_df.drop(columns=['ID', 'Label1', 'Label2', 'Label3'], inplace=True)

test_df = updatedf(test_df)
test_df.drop(columns=['ID', 'Label1', 'Label2', 'Label3'], inplace=True)

In [None]:
# Preview the first rows of the training frame (only displayed when run as a notebook cell).
train_df.head()

In [36]:
# Label columns, in the order in which targets are read from the frames.
target_list = [
    'unnecessary',
    'mandatory',
    'pharma',
    'conspiracy',
    'political',
    'country',
    'rushed',
    'ingredients',
    'side-effect',
    'ineffective',
    'religious',
    'none',
]
# hyperparameters
MAX_LEN = 128                               # tokenizer max_length (pad / truncate)
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16    # DataLoader batch sizes
EPOCHS = 5
LEARNING_RATE = 1e-5

In [37]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import shutil
from torch.optim.lr_scheduler import ReduceLROnPlateau
from early_stopping import EarlyStopping

In [38]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset yielding tokenised tweets together with their multi-hot targets.

    df: frame with a 'Tweet' column and one column per entry of ``target_list``
    tokenizer: object exposing ``encode_plus`` (a transformers tokenizer)
    max_len: length every encoded sequence is padded / truncated to
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Tweet']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded tweet and float targets at position ``index``."""
        # Positional lookup: ``self.targets`` is a plain array addressed by
        # position, so the text has to be fetched by position too. A label
        # lookup (``self.title[index]``) fails or pairs the wrong rows as soon
        # as the frame's index is not exactly 0..n-1.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace into a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten
        # drops it so the DataLoader can stack the samples itself.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
import torch.cuda

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# Notebook shell escape (not valid in a plain .py file): print the CUDA compiler version.
!nvcc --version
# Bare expression: shows the installed PyTorch version when run as a notebook cell.
torch.__version__


In [40]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location="cpu"):
    """Restore model and optimizer state from a checkpoint file.

    checkpoint_fpath: path of the checkpoint file to read
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    map_location: forwarded to ``torch.load``. Defaults to "cpu" so that a
        checkpoint written on a GPU machine can still be opened on a host
        without CUDA; ``model.load_state_dict`` copies the values into the
        model's existing parameters, so the model stays on its current device.

    Returns (model, optimizer, stored 'epoch' value, stored 'valid_loss_min').
    Raises KeyError if the file lacks one of the expected entries.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    # Copy the saved weights and optimizer state into the objects passed in.
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['valid_loss_min']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write ``state`` to disk and, for a best model, keep a second copy.

    state: checkpoint object to serialise with ``torch.save``
    is_best: when true, the written file is also copied to ``best_model_path``
    checkpoint_path: file the checkpoint is written to
    best_model_path: destination of the copy made for a best checkpoint
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Best model so far: duplicate the file that was just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [41]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and same-shaped float targets."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [42]:
# Default checkpoint locations for this run.
ckpt_path, best_model_path = "curr_ckpt", "best_model.pt"
# Tokenizer used unless a later cell assigns another one.
# (An earlier alternative was BertTokenizer with 'CovRelex-SE/CORD19-BERT'.)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [43]:
def decode(predicted_values, pv):
    """Threshold raw logits into multi-label predictions and print test metrics.

    predicted_values: float array with one row per test tweet; its rows are
        overwritten in place with the 0/1 predictions.
    pv: raw model outputs (logits), one row per test tweet.

    A label is predicted when its sigmoid probability is >= 0.5. A row with no
    label above the threshold gets its single most probable label instead, so
    every tweet ends up with at least one label. Prints the classification
    report, the exact-match accuracy and the mean per-sample Jaccard score
    against the one-hot columns of the module-level ``test_df``.
    """
    n_rows = len(test_df)
    logits = np.ascontiguousarray(np.asarray(pv)[:n_rows])
    # One batched sigmoid on the CPU; the numbers match a per-row evaluation.
    probs = torch.sigmoid(torch.from_numpy(logits)).numpy()

    predictions = (probs >= 0.5).astype(float)
    # Rows without any positive label fall back to their arg-max label.
    unlabeled = np.flatnonzero(predictions.sum(axis=1) == 0)
    predictions[unlabeled, probs[unlabeled].argmax(axis=1)] = 1
    predicted_values[:n_rows] = predictions

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values, target_names=target_list))

    print("Accuracy score", accuracy_score(y_true, predicted_values))

    # Jaccard score of each sample's label set, averaged over the samples.
    sample_jaccard_scores = [
        metrics.jaccard_score(true_row, pred_row)
        for true_row, pred_row in zip(y_true, predicted_values)
    ]
    average_jaccard = np.mean(sample_jaccard_scores)
    print("Average Jaccard: {:.3f}".format(average_jaccard))

In [44]:
def LetsAssess(modelTBA):
    """Run ``modelTBA`` over every tweet of ``test_df`` and print test metrics.

    Relies on the module-level ``tokenizer``, ``MAX_LEN``, ``target_list`` and
    ``test_df``. The model is moved to the available device and switched to
    eval mode for the pass (so dropout is inactive); its previous train/eval
    mode is restored afterwards.

    A label is predicted when its sigmoid probability is >= 0.5; a tweet with
    no label above the threshold gets its single most probable label.

    Returns (predicted_values, predicted_raw): the 0/1 prediction matrix and
    the raw logits, each with one row per test tweet and one column per label.
    """
    n_labels = len(target_list)
    predicted_values = np.zeros((test_df.shape[0], n_labels))
    predicted_raw = np.zeros((test_df.shape[0], n_labels))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelTBA.to(device)

    was_training = modelTBA.training
    modelTBA.eval()
    try:
        with torch.no_grad():
            for i, text in enumerate(test_df['Tweet']):
                # Same text clean-up as CustomDataset: coerce to str and
                # collapse whitespace, so a non-string cell cannot crash
                # the tokenizer.
                text = " ".join(str(text).split())
                encodings = tokenizer.encode_plus(
                    text,
                    None,
                    add_special_tokens=True,
                    max_length=MAX_LEN,
                    padding='max_length',
                    return_token_type_ids=True,
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors='pt'
                )

                input_ids = encodings['input_ids'].to(device, dtype=torch.long)
                attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)

                logits = modelTBA(input_ids, attention_mask, token_type_ids).squeeze().cpu()

                # Sigmoid + threshold, with an arg-max fallback for empty rows.
                probs = torch.sigmoid(logits).numpy()
                predictions = (probs >= 0.5).astype(float)
                if not predictions.any():
                    predictions[probs.argmax()] = 1

                predicted_values[i] = predictions
                predicted_raw[i] = logits.numpy()
    finally:
        modelTBA.train(was_training)

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values, target_names=target_list))

    print("Accuracy score", accuracy_score(y_true, predicted_values))

    # Jaccard score of each sample's label set, averaged over the samples.
    sample_jaccard_scores = [
        metrics.jaccard_score(true_row, pred_row)
        for true_row, pred_row in zip(y_true, predicted_values)
    ]
    average_jaccard = np.mean(sample_jaccard_scores)
    print("Average Jaccard: {:.3f}".format(average_jaccard))
    return predicted_values, predicted_raw


In [45]:
from tqdm import tqdm
val_targets=[]
val_outputs=[]

def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, ID):
    """Fine-tune ``model`` and evaluate it on the test set after every epoch.

    Each epoch runs one optimisation pass over ``training_loader``, one
    no-grad pass over ``validation_loader``, then calls ``LetsAssess(model)``
    and saves the raw test outputs to ``predicted_raw_epoch<epoch-1><ID>.npy``.

    Relies on the module-level ``device``, ``loss_fn``, ``LetsAssess``,
    ``val_targets`` and ``val_outputs``.

    Args:
        n_epochs: Number of epochs to run.
        training_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        validation_loader: Same batch format; used for the validation loss.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        checkpoint_path: Unused; checkpoint saving is currently disabled.
        best_model_path: Unused; checkpoint saving is currently disabled.
        ID: Tag appended to the per-epoch ``.npy`` file names.

    Returns:
        The same ``model`` object after training.
    """
    valid_loss_min = np.inf  # np.Inf was removed in NumPy 2.0
    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(tqdm(training_loader)):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after the loop this already IS the average
            # batch loss, so it must not be divided by len(loader) again.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()
        with torch.no_grad():
            for batch_idx, data in enumerate(tqdm(validation_loader)):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                # Module-level accumulators; they keep growing across epochs and runs.
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # Test-set evaluation after every epoch; raw outputs are kept on disk
        # (one file per epoch and model tag).
        predicted_values, predicted_raw = LetsAssess(model)
        np.save('predicted_raw_epoch' + str(epoch - 1) + str(ID) + '.npy', predicted_raw)

        # TODO: re-enable save_ckp(...) here to actually persist the best model.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).'.format(valid_loss_min, valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))
    return model

In [None]:
# Rebinds the module-level `tokenizer` to the bert-large-uncased vocabulary,
# matching the encoder loaded by this cell's CovModel.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# tokenizer=BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')
# tokenizer=BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Datasets are rebuilt per cell because tokenization depends on the tokenizer above.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
# NOTE: test_dataset is built but no DataLoader in this cell consumes it.
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
# Training batches are reshuffled every epoch.
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Validation order is kept fixed (shuffle=False).
val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder with a dropout + linear head that emits multi-label logits.

    Args:
        num_layers: Accepted for backward compatibility; not used.
        output_sizes: Accepted for backward compatibility; not used. The
            default is an immutable tuple to avoid a shared mutable default.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of output logits (one per target label).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 pretrained_name='bert-large-uncased', num_labels=12):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        # Head input width follows the encoder's hidden size.
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels):
        """Return raw logits for a batch.

        Args:
            inputs: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder.
            labels: Ignored. Callers in this file pass ``token_type_ids`` in
                this position; it is not forwarded to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; element 0 holds
        # the per-token hidden states.
        hidden_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        first_token = hidden_states[:, 0, :]  # representation of the first ([CLS]) position
        return self.clf(self.dropout(first_token))
# Build, train and evaluate the bert-large-uncased variant.
# best_model_path / ckpt_path are passed through, but train_model's save_ckp
# calls are commented out, so nothing is written to these paths.
best_model_path = "modelA.pt"    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
# Learning rate and epoch count below are literals rather than named constants.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

# 'BU' tags the per-epoch predicted_raw_epoch*.npy files written by train_model.
model = train_model(5, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'BU')
# Final test-set predictions (thresholded values and raw outputs) for this encoder.
predicted_valuesBU,predicted_rawBU=LetsAssess(model)




In [None]:
# Rebinds the module-level `tokenizer` to the seantw stance-model vocabulary,
# matching the encoder loaded by this cell's CovModel.
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
tokenizer=BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')
# tokenizer=BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Datasets are rebuilt per cell because tokenization depends on the tokenizer above.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
# NOTE: test_dataset is built but no DataLoader in this cell consumes it.
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
# Training batches are reshuffled every epoch.
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Validation order is kept fixed (shuffle=False).
val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder with a dropout + linear head that emits multi-label logits.

    Args:
        num_layers: Accepted for backward compatibility; not used.
        output_sizes: Accepted for backward compatibility; not used. The
            default is an immutable tuple to avoid a shared mutable default.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of output logits (one per target label).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 pretrained_name='seantw/covid-19-vaccination-tweet-stance',
                 num_labels=12):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        # Head input width follows the encoder's hidden size.
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels):
        """Return raw logits for a batch.

        Args:
            inputs: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder.
            labels: Ignored. Callers in this file pass ``token_type_ids`` in
                this position; it is not forwarded to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; element 0 holds
        # the per-token hidden states.
        hidden_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        first_token = hidden_states[:, 0, :]  # representation of the first ([CLS]) position
        return self.clf(self.dropout(first_token))
# Build, train and evaluate the seantw stance-model variant.
# best_model_path / ckpt_path are passed through, but train_model's save_ckp
# calls are commented out, so nothing is written to these paths.
best_model_path = "modelB.pt"    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
# Learning rate and epoch count below are literals rather than named constants.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

# 'seantw' tags the per-epoch predicted_raw_epoch*.npy files written by train_model.
model = train_model(5, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'seantw')
# Final test-set predictions (thresholded values and raw outputs) for this encoder.
predicted_valuesSN,predicted_rawSN=LetsAssess(model)




In [None]:
# Rebinds the module-level `tokenizer` to the covid-twitter-bert vocabulary,
# matching the encoder loaded by this cell's CovModel.
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# tokenizer=BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')
tokenizer=BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Datasets are rebuilt per cell because tokenization depends on the tokenizer above.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
# NOTE: test_dataset is built but no DataLoader in this cell consumes it.
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
# Training batches are reshuffled every epoch.
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Validation order is kept fixed (shuffle=False).
val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder with a dropout + linear head that emits multi-label logits.

    Args:
        num_layers: Accepted for backward compatibility; not used.
        output_sizes: Accepted for backward compatibility; not used. The
            default is an immutable tuple to avoid a shared mutable default.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of output logits (one per target label).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 pretrained_name='digitalepidemiologylab/covid-twitter-bert',
                 num_labels=12):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        # Head input width follows the encoder's hidden size.
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels):
        """Return raw logits for a batch.

        Args:
            inputs: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder.
            labels: Ignored. Callers in this file pass ``token_type_ids`` in
                this position; it is not forwarded to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; element 0 holds
        # the per-token hidden states.
        hidden_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        first_token = hidden_states[:, 0, :]  # representation of the first ([CLS]) position
        return self.clf(self.dropout(first_token))
# Build, train and evaluate the covid-twitter-bert variant.
# NOTE(review): "modelA.pt" is the same path the bert-large-uncased cell uses.
# It is harmless while train_model's save_ckp calls stay commented out, but it
# would overwrite that model if checkpointing is re-enabled — confirm the
# intended file name.
best_model_path = "modelA.pt"    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
# Learning rate and epoch count below are literals rather than named constants.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

# 'DL' tags the per-epoch predicted_raw_epoch*.npy files written by train_model.
model = train_model(5, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'DL')
# Final test-set predictions (thresholded values and raw outputs) for this encoder.
predicted_valuesDL,predicted_rawDL=LetsAssess(model)




# ChatGPT

In [17]:
import torch
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW
import torch.nn.functional as F

##Loading processed data

# train_df = pd.read_csv('trainingaugnew.csv', encoding='utf-8')
# train_df2 = pd.read_csv('trainaug.csv')
train_df=pd.read_csv('trainGptOversample.csv', encoding='utf-8')
# train_df= pd.concat([train_df, train_df2], ignore_index=True)
val_df = pd.read_csv('valnew.csv', encoding='utf-8')
test_df = pd.read_csv('testnew.csv', encoding='utf-8')

In [None]:

classes = ['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']


def updatedf(dfold):
    """Append one 0/1 indicator column per entry of ``classes``.

    A row gets 1 in column ``c`` when any of its 'Label1', 'Label2' or
    'Label3' values equals ``c``; values not in ``classes`` (including
    missing values) are ignored.

    The indicators are computed column-wise instead of with a row-by-row
    ``.at`` loop, so rows that share an index label no longer overwrite each
    other and large frames are processed without a Python-level row loop.

    Args:
        dfold: DataFrame containing 'Label1', 'Label2' and 'Label3' columns.

    Returns:
        A new DataFrame: the input columns followed by the indicator columns.
        The input frame is not modified. The result is also printed.
    """
    labels = dfold[['Label1', 'Label2', 'Label3']]
    # Plain arrays (not Series) so that no index alignment takes place.
    indicators = pd.DataFrame(
        {name: labels.eq(name).any(axis=1).to_numpy().astype('int64') for name in classes},
        index=dfold.index,
        columns=classes,
    )
    dfold = pd.concat([dfold, indicators], axis=1)

    # Print the updated DataFrame
    print(dfold)
    return dfold


# Only val/test are expanded into indicator columns here; the train_df lines
# are commented out. CustomDataset reads df[target_list], so the training CSV
# presumably already contains those columns — TODO confirm.
# train_df=updatedf(train_df)
val_df=updatedf(val_df)
test_df=updatedf(test_df)
# Drop the ID and raw label columns now that the indicator columns exist.
# train_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)
val_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)
test_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)

In [None]:
train_df.head()

In [20]:
# Label columns, in the order used for dataset targets and report target names.
target_list = ['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']
# hyperparameters
MAX_LEN = 128  # passed to the tokenizer as max_length (pad/truncate target)
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
# NOTE: the training cells below pass literal 5 / lr=1e-5 instead of reading
# EPOCHS and LEARNING_RATE, so changing these two has no effect there.
EPOCHS = 5
LEARNING_RATE = 1e-05

In [21]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import shutil
from torch.optim.lr_scheduler import ReduceLROnPlateau
from early_stopping import EarlyStopping

In [22]:
class CustomDataset(torch.utils.data.Dataset):
    """Tokenizes tweets on the fly and pairs them with multi-label targets.

    Args:
        df: DataFrame with a 'Tweet' column and one column per entry of the
            module-level ``target_list``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Tweet']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded tweet and float targets at positional ``index``."""
        # DataLoader samplers yield positions 0..len-1, so the lookup must be
        # positional (.iloc); a label lookup breaks on any non-default index
        # (for example after filtering or concatenating frames).
        title = str(self.title.iloc[index])
        title = " ".join(title.split())  # collapse runs of whitespace

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of 1; flatten() drops it.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
import torch.cuda

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
!nvcc --version
torch.__version__


In [24]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    The checkpoint is expected to be a dict with the keys 'state_dict',
    'optimizer', 'epoch' and 'valid_loss_min' (the layout built in
    ``train_model``).

    Args:
        checkpoint_fpath: Path of the checkpoint file to load.
        model: Model to load the saved parameters into.
        optimizer: Optimizer to load the saved optimizer state into.

    Returns:
        Tuple ``(model, optimizer, epoch, valid_loss_min)``.
    """
    # Load onto CPU first so a checkpoint written on a GPU machine can be
    # restored on a CPU-only one; load_state_dict then copies the values onto
    # whatever device the model's parameters live on.
    checkpoint = torch.load(checkpoint_fpath, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Persist a checkpoint and optionally mirror it as the best model.

    Args:
        state: Checkpoint object to serialise with ``torch.save``.
        is_best: When truthy, the written file is also copied to
            ``best_model_path``.
        checkpoint_path: Destination of the checkpoint file.
        best_model_path: Destination of the copy made when ``is_best``.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Best so far: duplicate the file that was just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [25]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``."""
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)

In [26]:
# Default checkpoint locations. Each training cell below rebinds
# best_model_path; ckpt_path is passed to train_model unchanged.
ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
# Initial tokenizer only; every training cell below replaces it with the
# BertTokenizer that matches the encoder it trains.
tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [27]:
def decode(predicted_values, pv):
    """Threshold raw logits into multi-label predictions and print test metrics.

    For each row of the module-level ``test_df``, labels whose logit is >= 0
    (i.e. sigmoid probability >= 0.5) are set to 1; if none qualifies, the
    single highest-scoring label is set instead, so every row gets at least
    one label.

    The decision is taken directly on the logits: sigmoid is monotonic, so
    this selects the same labels without moving every row to the compute
    device and back.

    Args:
        predicted_values: Array of shape (rows, labels), overwritten in place
            with the 0/1 predictions.
        pv: Raw logits, indexable by row; one row per ``test_df`` row.

    Returns:
        None. The classification report, subset accuracy and mean per-sample
        Jaccard score are printed.
    """
    for i in range(len(test_df)):
        logits = np.asarray(pv[i])
        predictions = (logits >= 0).astype(float)
        if not predictions.any():
            predictions[logits.argmax()] = 1
        predicted_values[i] = predictions

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values, target_names=target_list))

    print("Accuracy score", accuracy_score(y_true, predicted_values))

    # Per-sample Jaccard over the label vector, averaged across samples.
    sample_jaccard_scores = [
        metrics.jaccard_score(truth, pred)
        for truth, pred in zip(y_true, predicted_values)
    ]
    average_jaccard = np.mean(sample_jaccard_scores)
    print("Average Jaccard: {:.3f}".format(average_jaccard))

In [28]:
def LetsAssess(modelTBA):
    """Evaluate ``modelTBA`` on the module-level ``test_df`` and print metrics.

    Every tweet is tokenized with the module-level ``tokenizer`` (so it must
    match the model's encoder), scored, and thresholded at probability 0.5.
    A tweet with no label above the threshold gets its single most probable
    label, so every row has at least one prediction.

    Args:
        modelTBA: Module called as ``modelTBA(input_ids, attention_mask,
            token_type_ids)`` that returns one logit per entry of
            ``target_list``. It is moved to the available device and switched
            to eval mode.

    Returns:
        Tuple ``(predicted_values, predicted_raw)``: the 0/1 predictions and
        the raw logits, each of shape (len(test_df), len(target_list)).
    """
    y_true = test_df[target_list].to_numpy()
    num_labels = len(target_list)
    predicted_values = np.zeros((test_df.shape[0], num_labels))
    predicted_raw = np.zeros((test_df.shape[0], num_labels))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelTBA.to(device)
    # Dropout must be disabled for deterministic predictions, regardless of
    # the mode the caller left the model in.
    modelTBA.eval()

    for i, text in enumerate(test_df['Tweet']):
        encodings = tokenizer.encode_plus(
            str(text),  # same str() coercion as CustomDataset (non-string cells)
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)

        with torch.no_grad():
            output = modelTBA(input_ids, attention_mask, token_type_ids)

        # Sigmoid + 0.5 threshold, falling back to the arg-max label.
        logits = output.squeeze().cpu()
        probs = torch.sigmoid(logits).numpy()
        predictions = (probs >= 0.5).astype(float)
        if not predictions.any():
            predictions[probs.argmax()] = 1

        predicted_values[i] = predictions
        predicted_raw[i] = logits.numpy()

    print(classification_report(y_true, predicted_values, target_names=target_list))

    print("Accuracy score", accuracy_score(y_true, predicted_values))

    # Per-sample Jaccard over the label vector, averaged across samples.
    sample_jaccard_scores = [
        metrics.jaccard_score(truth, pred)
        for truth, pred in zip(y_true, predicted_values)
    ]
    average_jaccard = np.mean(sample_jaccard_scores)
    print("Average Jaccard: {:.3f}".format(average_jaccard))
    return predicted_values, predicted_raw


In [29]:
from tqdm import tqdm
val_targets=[]
val_outputs=[]

def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, ID):
    """Fine-tune ``model`` and evaluate it on the test set after every epoch.

    Each epoch runs one optimisation pass over ``training_loader``, one
    no-grad pass over ``validation_loader``, then calls ``LetsAssess(model)``
    and saves the raw test outputs to ``predicted_raw_epoch<epoch-1><ID>.npy``.

    Relies on the module-level ``device``, ``loss_fn``, ``LetsAssess``,
    ``val_targets`` and ``val_outputs``.

    Args:
        n_epochs: Number of epochs to run.
        training_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        validation_loader: Same batch format; used for the validation loss.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        checkpoint_path: Unused; checkpoint saving is currently disabled.
        best_model_path: Unused; checkpoint saving is currently disabled.
        ID: Tag appended to the per-epoch ``.npy`` file names.

    Returns:
        The same ``model`` object after training.
    """
    valid_loss_min = np.inf  # np.Inf was removed in NumPy 2.0
    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(tqdm(training_loader)):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after the loop this already IS the average
            # batch loss, so it must not be divided by len(loader) again.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()
        with torch.no_grad():
            for batch_idx, data in enumerate(tqdm(validation_loader)):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                # Module-level accumulators; they keep growing across epochs and runs.
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # Test-set evaluation after every epoch; raw outputs are kept on disk
        # (one file per epoch and model tag).
        predicted_values, predicted_raw = LetsAssess(model)
        np.save('predicted_raw_epoch' + str(epoch - 1) + str(ID) + '.npy', predicted_raw)

        # TODO: re-enable save_ckp(...) here to actually persist the best model.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).'.format(valid_loss_min, valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))
    return model

In [None]:
# Rebinds the module-level `tokenizer` to the bert-large-uncased vocabulary,
# matching the encoder loaded by this cell's CovModel. LetsAssess tokenizes
# the test tweets with this same global.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# tokenizer=BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')
# tokenizer=BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Datasets are rebuilt per cell because tokenization depends on the tokenizer above.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
# NOTE: test_dataset is built but no DataLoader in this cell consumes it;
# LetsAssess reads test_df directly.
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
# Training batches are reshuffled every epoch.
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Validation order is kept fixed (shuffle=False).
val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder with a dropout + linear head that emits multi-label logits.

    Args:
        num_layers: Accepted for backward compatibility; not used.
        output_sizes: Accepted for backward compatibility; not used. The
            default is an immutable tuple to avoid a shared mutable default.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of output logits (one per target label).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 pretrained_name='bert-large-uncased', num_labels=12):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        # Head input width follows the encoder's hidden size.
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels):
        """Return raw logits for a batch.

        Args:
            inputs: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder.
            labels: Ignored. Callers in this file pass ``token_type_ids`` in
                this position; it is not forwarded to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; element 0 holds
        # the per-token hidden states.
        hidden_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        first_token = hidden_states[:, 0, :]  # representation of the first ([CLS]) position
        return self.clf(self.dropout(first_token))
# Build, train and evaluate the bert-large-uncased variant.
# best_model_path / ckpt_path are passed through, but train_model's save_ckp
# calls are commented out, so nothing is written to these paths.
best_model_path = "modelA.pt"    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
# NOTE: lr and the epoch count below are literals; the EPOCHS / LEARNING_RATE
# constants defined earlier are not read here.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

# 'BU' tags the per-epoch predicted_raw_epoch*.npy files written by train_model.
model = train_model(5, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'BU')
# Final test-set predictions (thresholded values and raw logits) for this encoder.
predicted_valuesBU,predicted_rawBU=LetsAssess(model)




In [None]:
# Rebinds the module-level `tokenizer` to the seantw stance-model vocabulary,
# matching the encoder loaded by this cell's CovModel. LetsAssess tokenizes
# the test tweets with this same global.
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
tokenizer=BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')
# tokenizer=BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Datasets are rebuilt per cell because tokenization depends on the tokenizer above.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
# NOTE: test_dataset is built but no DataLoader in this cell consumes it;
# LetsAssess reads test_df directly.
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
# Training batches are reshuffled every epoch.
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Validation order is kept fixed (shuffle=False).
val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder with a dropout + linear head that emits multi-label logits.

    Args:
        num_layers: Accepted for backward compatibility; not used.
        output_sizes: Accepted for backward compatibility; not used. The
            default is an immutable tuple to avoid a shared mutable default.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of output logits (one per target label).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 pretrained_name='seantw/covid-19-vaccination-tweet-stance',
                 num_labels=12):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        # Head input width follows the encoder's hidden size.
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels):
        """Return raw logits for a batch.

        Args:
            inputs: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder.
            labels: Ignored. Callers in this file pass ``token_type_ids`` in
                this position; it is not forwarded to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; element 0 holds
        # the per-token hidden states.
        hidden_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        first_token = hidden_states[:, 0, :]  # representation of the first ([CLS]) position
        return self.clf(self.dropout(first_token))
# Build, train and evaluate the seantw stance-model variant.
# best_model_path / ckpt_path are passed through, but train_model's save_ckp
# calls are commented out, so nothing is written to these paths.
best_model_path = "modelB.pt"    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
# NOTE: lr and the epoch count below are literals; the EPOCHS / LEARNING_RATE
# constants defined earlier are not read here.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

# 'seantw' tags the per-epoch predicted_raw_epoch*.npy files written by train_model.
model = train_model(5, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'seantw')
# Final test-set predictions (thresholded values and raw logits) for this encoder.
predicted_valuesSN,predicted_rawSN=LetsAssess(model)




In [None]:
# Rebinds the module-level `tokenizer` to the covid-twitter-bert vocabulary,
# matching the encoder loaded by this cell's CovModel. LetsAssess tokenizes
# the test tweets with this same global.
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# tokenizer=BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')
tokenizer=BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Datasets are rebuilt per cell because tokenization depends on the tokenizer above.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
# NOTE: test_dataset is built but no DataLoader in this cell consumes it;
# LetsAssess reads test_df directly.
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
# Training batches are reshuffled every epoch.
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Validation order is kept fixed (shuffle=False).
val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder with a dropout + linear head that emits multi-label logits.

    Args:
        num_layers: Accepted for backward compatibility; not used.
        output_sizes: Accepted for backward compatibility; not used. The
            default is an immutable tuple to avoid a shared mutable default.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of output logits (one per target label).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 pretrained_name='digitalepidemiologylab/covid-twitter-bert',
                 num_labels=12):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        # Head input width follows the encoder's hidden size.
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels):
        """Return raw logits for a batch.

        Args:
            inputs: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder.
            labels: Ignored. Callers in this file pass ``token_type_ids`` in
                this position; it is not forwarded to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; element 0 holds
        # the per-token hidden states.
        hidden_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        first_token = hidden_states[:, 0, :]  # representation of the first ([CLS]) position
        return self.clf(self.dropout(first_token))
# Build, train and evaluate the covid-twitter-bert variant.
# NOTE(review): "modelA.pt" is the same path the bert-large-uncased cell uses.
# It is harmless while train_model's save_ckp calls stay commented out, but it
# would overwrite that model if checkpointing is re-enabled — confirm the
# intended file name.
best_model_path = "modelA.pt"    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
# NOTE: lr and the epoch count below are literals; the EPOCHS / LEARNING_RATE
# constants defined earlier are not read here.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

# 'DL' tags the per-epoch predicted_raw_epoch*.npy files written by train_model.
model = train_model(5, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'DL')
# Final test-set predictions (thresholded values and raw logits) for this encoder.
predicted_valuesDL,predicted_rawDL=LetsAssess(model)




# BART

In [49]:
import torch
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW
import torch.nn.functional as F

##Loading processed data

# train_df = pd.read_csv('trainingaugnew.csv', encoding='utf-8')
# train_df2 = pd.read_csv('trainaug.csv')
train_df=pd.read_csv('train_BART.csv', encoding='utf-8')
# train_df= pd.concat([train_df, train_df2], ignore_index=True)
val_df = pd.read_csv('valnew.csv', encoding='utf-8')
test_df = pd.read_csv('testnew.csv', encoding='utf-8')

In [None]:

classes = ['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']


def updatedf(dfold):
    """Append one 0/1 indicator column per entry of ``classes``.

    A row gets 1 in column ``c`` when any of its 'Label1', 'Label2' or
    'Label3' values equals ``c``; values not in ``classes`` (including
    missing values) are ignored.

    The indicators are computed column-wise instead of with a row-by-row
    ``.at`` loop, so rows that share an index label no longer overwrite each
    other and large frames are processed without a Python-level row loop.

    Args:
        dfold: DataFrame containing 'Label1', 'Label2' and 'Label3' columns.

    Returns:
        A new DataFrame: the input columns followed by the indicator columns.
        The input frame is not modified. The result is also printed.
    """
    labels = dfold[['Label1', 'Label2', 'Label3']]
    # Plain arrays (not Series) so that no index alignment takes place.
    indicators = pd.DataFrame(
        {name: labels.eq(name).any(axis=1).to_numpy().astype('int64') for name in classes},
        index=dfold.index,
        columns=classes,
    )
    dfold = pd.concat([dfold, indicators], axis=1)

    # Print the updated DataFrame
    print(dfold)
    return dfold


# train_df is used exactly as loaded: it is neither passed through updatedf nor trimmed.
# Validation and test frames first gain their 0/1 class columns, then lose the
# identifier and the raw label columns that are no longer needed.
val_df = updatedf(val_df).drop(columns=['ID', 'Label1', 'Label2', 'Label3'])
test_df = updatedf(test_df).drop(columns=['ID', 'Label1', 'Label2', 'Label3'])

In [None]:
train_df.head()

In [52]:
# Label columns, in the order used for the target vectors
target_list = [
    'unnecessary',
    'mandatory',
    'pharma',
    'conspiracy',
    'political',
    'country',
    'rushed',
    'ingredients',
    'side-effect',
    'ineffective',
    'religious',
    'none',
]

# hyperparameters
MAX_LEN = 128                               # tokenizer max_length (pad / truncate)
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16    # DataLoader batch sizes
EPOCHS = 5
LEARNING_RATE = 1e-5

In [53]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import shutil
from torch.optim.lr_scheduler import ReduceLROnPlateau
from early_stopping import EarlyStopping

In [54]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding a tokenized tweet together with its label vector.

    Args:
        df: DataFrame with a 'Tweet' column and one numeric column per label.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        max_len: length every encoded sequence is padded / truncated to.
        target_cols: names of the label columns; when omitted, the module-level
            ``target_list`` is used.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        if target_cols is None:
            target_cols = target_list
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Tweet']
        self.targets = self.df[target_cols].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return a dict of 1-D tensors: input_ids, attention_mask, token_type_ids, targets."""
        # Positional lookup: self.targets is a plain array indexed by position,
        # so the text must be fetched the same way or a DataFrame whose index
        # is not 0..n-1 would raise KeyError or pair a tweet with the wrong labels.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace to a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # encode_plus returns (1, max_len) tensors; flatten() drops the batch axis.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
import torch.cuda

# Select the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# IPython shell escape (not plain Python): runs `nvcc --version` in a subshell.
!nvcc --version
# Bare expression: displayed as the cell result when run in a notebook.
torch.__version__


In [56]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_fpath: path of a file written with ``torch.save`` that holds a
            dict with the keys 'state_dict', 'optimizer', 'epoch' and
            'valid_loss_min'.
        model: module whose parameters are overwritten from 'state_dict'.
        optimizer: optimizer whose state is overwritten from 'optimizer'.

    Returns:
        Tuple ``(model, optimizer, epoch, valid_loss_min)``; the first two are the
        objects passed in, the last two are read from the checkpoint.
    """
    # map_location='cpu' lets a checkpoint written during a GPU run be read on a
    # CPU-only machine; load_state_dict then copies the values into the existing
    # parameters on whatever device they already live on.
    checkpoint = torch.load(checkpoint_fpath, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['valid_loss_min']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write `state` to `checkpoint_path`; duplicate it to `best_model_path` if `is_best`.

    Args:
        state: object to serialize with ``torch.save`` (the checkpoint).
        is_best: truthy when this checkpoint should also become the best model.
        checkpoint_path: destination of the regular checkpoint file.
        best_model_path: destination of the copy made for the best model.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best model is a plain file copy of the checkpoint just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [57]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits (`outputs`) against `targets`, default reduction."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [58]:
# Checkpoint locations handed to train_model by the training cells below.
ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
# Initial tokenizer; every model cell further down rebinds `tokenizer`
# before it builds its datasets.
tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [59]:
def decode(predicted_values,pv):
    """Turn raw logits into 0/1 label decisions and print test-set metrics.

    A label is switched on when sigmoid(logit) >= 0.5; a row with no label over
    the threshold gets its single most probable label instead.

    Args:
        predicted_values: writable array with one row per row of the global
            `test_df`; its first ``len(test_df)`` rows are overwritten in place.
        pv: raw logits with the same layout (for example an array returned by
            LetsAssess, or an average of several such arrays).

    Returns:
        None.  The classification report, subset accuracy and sample-averaged
        Jaccard score against ``test_df[target_list]`` are printed.
    """
    n_rows = len(test_df)
    logits = np.asarray(pv)[:n_rows]

    # Whole-array sigmoid on the CPU: no per-row tensor, no dependency on the
    # global `device`.
    probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
    decoded = (probs >= 0.5).astype(float)

    # Rows where nothing passed the threshold fall back to the argmax label.
    empty_rows = np.flatnonzero(decoded.sum(axis=1) == 0)
    decoded[empty_rows, probs[empty_rows].argmax(axis=1)] = 1
    predicted_values[:n_rows] = decoded

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values,target_names=target_list))

    print("Accuracy score",accuracy_score(y_true, predicted_values))

    # average='samples' is the mean of the per-row Jaccard scores.
    average_jaccard = metrics.jaccard_score(y_true, predicted_values, average='samples')
    print("Average Jaccard: {:.3f}".format(average_jaccard))

In [60]:
def LetsAssess(modelTBA):
    """Predict labels for every tweet of the global `test_df` and print metrics.

    Each tweet is encoded with the module-level `tokenizer` (padded / truncated
    to `MAX_LEN`) and passed through `modelTBA` one at a time.  A label is
    switched on when sigmoid(logit) >= 0.5; if none qualifies, the most probable
    label is used.

    Args:
        modelTBA: module called as ``modelTBA(input_ids, attention_mask,
            token_type_ids)`` and returning one logit per entry of `target_list`.
            It is moved to the GPU when one is available.

    Returns:
        Tuple ``(predicted_values, predicted_raw)`` of float arrays shaped
        ``(len(test_df), len(target_list))``: the 0/1 decisions and the raw logits.
    """
    n_rows = len(test_df)
    n_labels = len(target_list)
    predicted_values = np.zeros((n_rows, n_labels))
    predicted_raw = np.zeros((n_rows, n_labels))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelTBA.to(device)

    # Dropout has to be inactive while predicting.  Remember the current mode
    # so the caller gets the model back in the state it passed in.
    was_training = modelTBA.training
    modelTBA.eval()

    for i, text in enumerate(test_df['Tweet']):
        # str(): same coercion CustomDataset applies, so a non-string cell
        # (e.g. NaN) does not break the tokenizer.
        encodings = tokenizer.encode_plus(
            str(text),
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Move the encodings to the device
        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)

        with torch.no_grad():
            output = modelTBA(input_ids, attention_mask, token_type_ids)

        # Apply sigmoid + threshold, falling back to the argmax label
        logits = output.squeeze().float().cpu()
        probs = torch.sigmoid(logits).numpy()
        predictions = (probs >= 0.5).astype(float)
        if not predictions.any():
            predictions[probs.argmax()] = 1

        predicted_values[i] = predictions
        predicted_raw[i] = logits.numpy()

    modelTBA.train(was_training)

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values,target_names=target_list))

    print("Accuracy score",accuracy_score(y_true, predicted_values))

    # average='samples' is the mean of the per-row Jaccard scores.
    average_jaccard = metrics.jaccard_score(y_true, predicted_values, average='samples')
    print("Average Jaccard: {:.3f}".format(average_jaccard))
    return predicted_values, predicted_raw


In [61]:
from tqdm import tqdm
# Validation targets and sigmoid outputs appended by train_model.
# NOTE(review): these lists are never cleared, so they keep growing across
# epochs and across successive training runs.
val_targets=[]
val_outputs=[]

def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path,ID):
    """Fine-tune `model` for `n_epochs`, validating and test-evaluating every epoch.

    After each epoch the model is scored with LetsAssess and the raw outputs are
    written to ``'predicted_raw_epoch<epoch-1><ID>.npy'`` in the working directory.

    Args:
        n_epochs: number of passes over `training_loader`.
        training_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask', 'token_type_ids', 'targets').
        validation_loader: same batch format, used for the validation loss.
        model: module called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer over the model's parameters.
        checkpoint_path: currently unused (checkpoint saving is disabled).
        best_model_path: currently unused (checkpoint saving is disabled).
        ID: tag appended to the names of the saved .npy files.

    Returns:
        The same `model` object after the last epoch.
    """
    # Batches follow the model, wherever the caller placed it.
    device = next(model.parameters()).device
    valid_loss_min = np.inf  # np.Inf was removed in NumPy 2.0

    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(tqdm(training_loader)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after this line train_loss is the average of
            # all batch losses seen so far in this epoch.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################
        # validate the model #
        ######################
        model.eval()

        with torch.no_grad():
            for batch_idx, data in enumerate(tqdm(validation_loader)):
                ids = data['input_ids'].to(device, dtype = torch.long)
                mask = data['attention_mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # train_loss and valid_loss are already per-batch means (see the running
        # average above); dividing by the loader length again would shrink them.
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

        # Score the test set after every epoch and keep the raw outputs on disk.
        _, predicted_raw = LetsAssess(model)
        np.save('predicted_raw_epoch'+str(epoch-1)+str(ID)+'.npy', predicted_raw)

        # Track the best validation loss.  No checkpoint is written here even
        # though the message below mentions saving.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))
    return model

In [None]:
# Tokenizer for the 'bert-large-uncased' encoder used by this cell's CovModel.
# Alternatives used by the other cells: 'seantw/covid-19-vaccination-tweet-stance',
# 'digitalepidemiologylab/covid-twitter-bert'.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# One dataset per split, all encoded to MAX_LEN tokens.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are shuffled, validation batches keep their order.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """'bert-large-uncased' encoder followed by dropout and a 12-logit linear layer.

    The constructor arguments `num_layers` and `output_sizes` are accepted but
    not used.
    """

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()

        # Pretrained encoder, looked up by name through transformers.
        self.bert = BertModel.from_pretrained('bert-large-uncased')

        # Head: encoder hidden size -> dropout(p=0.4) -> 12 logits.
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        self.clf = nn.Linear(self.hidden_size, 12)

    def forward(self, inputs, mask, labels):
        """Return the logits computed from the encoder output at sequence position 0.

        `labels` is accepted but never read.
        """
        encoder_out = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        first_token = encoder_out[0][:, 0, :]
        return self.clf(self.dropout(first_token))
# Training run for the bert-large-uncased model (run tag 'BU').
best_model_path = "modelA.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# Learning rate and epoch count come from the hyperparameter cell instead of
# being repeated here as literals.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6)

model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'BU')
# Final evaluation of the returned model; keeps its decisions and raw logits.
predicted_valuesBU,predicted_rawBU=LetsAssess(model)




In [None]:
# Tokenizer for the 'seantw/covid-19-vaccination-tweet-stance' encoder used by
# this cell's CovModel.
# Alternatives used by the other cells: 'bert-large-uncased',
# 'digitalepidemiologylab/covid-twitter-bert'.
tokenizer = BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')

# One dataset per split, all encoded to MAX_LEN tokens.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are shuffled, validation batches keep their order.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """'seantw/covid-19-vaccination-tweet-stance' encoder with a 12-logit linear head.

    The constructor arguments `num_layers` and `output_sizes` are accepted but
    not used.
    """

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()

        # Pretrained encoder, looked up by name through transformers.
        self.bert = BertModel.from_pretrained('seantw/covid-19-vaccination-tweet-stance')

        # Head: encoder hidden size -> dropout(p=0.4) -> 12 logits.
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        self.clf = nn.Linear(self.hidden_size, 12)

    def forward(self, inputs, mask, labels):
        """Return the logits computed from the encoder output at sequence position 0.

        `labels` is accepted but never read.
        """
        encoder_out = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        first_token = encoder_out[0][:, 0, :]
        return self.clf(self.dropout(first_token))
# Training run for the covid-19-vaccination-tweet-stance model (run tag 'seantw').
best_model_path = "modelB.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# Learning rate and epoch count come from the hyperparameter cell instead of
# being repeated here as literals.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6)

model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'seantw')
# Final evaluation of the returned model; keeps its decisions and raw logits.
predicted_valuesSN,predicted_rawSN=LetsAssess(model)




In [None]:
# Tokenizer for the 'digitalepidemiologylab/covid-twitter-bert' encoder used by
# this cell's CovModel.
# Alternatives used by the other cells: 'bert-large-uncased',
# 'seantw/covid-19-vaccination-tweet-stance'.
tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# One dataset per split, all encoded to MAX_LEN tokens.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are shuffled, validation batches keep their order.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """'digitalepidemiologylab/covid-twitter-bert' encoder with a 12-logit linear head.

    The constructor arguments `num_layers` and `output_sizes` are accepted but
    not used.
    """

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()

        # Pretrained encoder, looked up by name through transformers.
        self.bert = BertModel.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

        # Head: encoder hidden size -> dropout(p=0.4) -> 12 logits.
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        self.clf = nn.Linear(self.hidden_size, 12)

    def forward(self, inputs, mask, labels):
        """Return the logits computed from the encoder output at sequence position 0.

        `labels` is accepted but never read.
        """
        encoder_out = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        first_token = encoder_out[0][:, 0, :]
        return self.clf(self.dropout(first_token))
# Training run for the covid-twitter-bert model (run tag 'DL').
# Third model of the section: it gets its own file name rather than reusing
# "modelA.pt", which belongs to the bert-large-uncased run.
best_model_path = "modelC.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel()
model.to(device)
print(model)
# Learning rate and epoch count come from the hyperparameter cell instead of
# being repeated here as literals.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6)

model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path,'DL')
# Final evaluation of the returned model; keeps its decisions and raw logits.
predicted_valuesDL,predicted_rawDL=LetsAssess(model)




# Random

In [None]:
import torch
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW
import torch.nn.functional as F

## Load the pre-processed splits
# Training data for this section comes from 'rawoversample.csv'.
# (Earlier variant, kept for reference: 'trainingaugnew.csv' concatenated with
#  'trainaug.csv' via pd.concat(..., ignore_index=True).)
train_df, val_df, test_df = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('rawoversample.csv', 'valnew.csv', 'testnew.csv')
)

In [None]:

classes =['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']

def updatedf(dfold):
    """Return `dfold` extended with one 0/1 indicator column per entry of `classes`.

    A row gets 1 in column ``c`` when any of its 'Label1', 'Label2' or 'Label3'
    values equals ``c``; values that are missing or not in `classes` set nothing.
    The input frame is not modified.  The resulting frame is printed and returned.
    """
    label_columns = dfold[['Label1', 'Label2', 'Label3']]

    # Column-wise comparison instead of iterrows()/.at: one pass per class, and
    # rows are matched by position, so a non-unique index cannot cross-assign.
    indicators = pd.concat(
        [label_columns.eq(name).any(axis=1) for name in classes],
        axis=1,
        keys=classes,
    ).astype('int64')
    dfold = pd.concat([dfold, indicators], axis=1)

    # Print the updated DataFrame
    print(dfold)
    return dfold


# train_df is used exactly as loaded: it is neither passed through updatedf nor trimmed.
# Validation and test frames first gain their 0/1 class columns, then lose the
# identifier and the raw label columns that are no longer needed.
val_df = updatedf(val_df).drop(columns=['ID', 'Label1', 'Label2', 'Label3'])
test_df = updatedf(test_df).drop(columns=['ID', 'Label1', 'Label2', 'Label3'])

In [None]:
train_df.head()

In [None]:
# Label columns, in the order used for the target vectors
target_list = [
    'unnecessary',
    'mandatory',
    'pharma',
    'conspiracy',
    'political',
    'country',
    'rushed',
    'ingredients',
    'side-effect',
    'ineffective',
    'religious',
    'none',
]

# hyperparameters
MAX_LEN = 128                               # tokenizer max_length (pad / truncate)
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16    # DataLoader batch sizes
EPOCHS = 5
LEARNING_RATE = 1e-5

In [None]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import shutil
from torch.optim.lr_scheduler import ReduceLROnPlateau
from early_stopping import EarlyStopping

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding a tokenized tweet together with its label vector.

    Args:
        df: DataFrame with a 'Tweet' column and one numeric column per label.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        max_len: length every encoded sequence is padded / truncated to.
        target_cols: names of the label columns; when omitted, the module-level
            ``target_list`` is used.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        if target_cols is None:
            target_cols = target_list
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Tweet']
        self.targets = self.df[target_cols].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return a dict of 1-D tensors: input_ids, attention_mask, token_type_ids, targets."""
        # Positional lookup: self.targets is a plain array indexed by position,
        # so the text must be fetched the same way or a DataFrame whose index
        # is not 0..n-1 would raise KeyError or pair a tweet with the wrong labels.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace to a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # encode_plus returns (1, max_len) tensors; flatten() drops the batch axis.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
import torch.cuda

# Select the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# IPython shell escape (not plain Python): runs `nvcc --version` in a subshell.
!nvcc --version
# Bare expression: displayed as the cell result when run in a notebook.
torch.__version__


In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_fpath: path of a file written with ``torch.save`` that holds a
            dict with the keys 'state_dict', 'optimizer', 'epoch' and
            'valid_loss_min'.
        model: module whose parameters are overwritten from 'state_dict'.
        optimizer: optimizer whose state is overwritten from 'optimizer'.

    Returns:
        Tuple ``(model, optimizer, epoch, valid_loss_min)``; the first two are the
        objects passed in, the last two are read from the checkpoint.
    """
    # map_location='cpu' lets a checkpoint written during a GPU run be read on a
    # CPU-only machine; load_state_dict then copies the values into the existing
    # parameters on whatever device they already live on.
    checkpoint = torch.load(checkpoint_fpath, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['valid_loss_min']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write `state` to `checkpoint_path`; duplicate it to `best_model_path` if `is_best`.

    Args:
        state: object to serialize with ``torch.save`` (the checkpoint).
        is_best: truthy when this checkpoint should also become the best model.
        checkpoint_path: destination of the regular checkpoint file.
        best_model_path: destination of the copy made for the best model.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best model is a plain file copy of the checkpoint just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits (`outputs`) against `targets`, default reduction."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Checkpoint locations handed to train_model by the training cells below.
ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
# Initial tokenizer; every model cell further down rebinds `tokenizer`
# before it builds its datasets.
tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def decode(predicted_values,pv):
    """Turn raw logits into 0/1 label decisions and print test-set metrics.

    A label is switched on when sigmoid(logit) >= 0.5; a row with no label over
    the threshold gets its single most probable label instead.

    Args:
        predicted_values: writable array with one row per row of the global
            `test_df`; its first ``len(test_df)`` rows are overwritten in place.
        pv: raw logits with the same layout (for example an array returned by
            LetsAssess, or an average of several such arrays).

    Returns:
        None.  The classification report, subset accuracy and sample-averaged
        Jaccard score against ``test_df[target_list]`` are printed.
    """
    n_rows = len(test_df)
    logits = np.asarray(pv)[:n_rows]

    # Whole-array sigmoid on the CPU: no per-row tensor, no dependency on the
    # global `device`.
    probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
    decoded = (probs >= 0.5).astype(float)

    # Rows where nothing passed the threshold fall back to the argmax label.
    empty_rows = np.flatnonzero(decoded.sum(axis=1) == 0)
    decoded[empty_rows, probs[empty_rows].argmax(axis=1)] = 1
    predicted_values[:n_rows] = decoded

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values,target_names=target_list))

    print("Accuracy score",accuracy_score(y_true, predicted_values))

    # average='samples' is the mean of the per-row Jaccard scores.
    average_jaccard = metrics.jaccard_score(y_true, predicted_values, average='samples')
    print("Average Jaccard: {:.3f}".format(average_jaccard))

In [None]:
def LetsAssess(modelTBA):
    """Predict labels for every tweet of the global `test_df` and print metrics.

    Each tweet is encoded with the module-level `tokenizer` (padded / truncated
    to `MAX_LEN`) and passed through `modelTBA` one at a time.  A label is
    switched on when sigmoid(logit) >= 0.5; if none qualifies, the most probable
    label is used.

    Args:
        modelTBA: module called as ``modelTBA(input_ids, attention_mask,
            token_type_ids)`` and returning one logit per entry of `target_list`.
            It is moved to the GPU when one is available.

    Returns:
        Tuple ``(predicted_values, predicted_raw)`` of float arrays shaped
        ``(len(test_df), len(target_list))``: the 0/1 decisions and the raw logits.
    """
    n_rows = len(test_df)
    n_labels = len(target_list)
    predicted_values = np.zeros((n_rows, n_labels))
    predicted_raw = np.zeros((n_rows, n_labels))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelTBA.to(device)

    # Dropout has to be inactive while predicting.  Remember the current mode
    # so the caller gets the model back in the state it passed in.
    was_training = modelTBA.training
    modelTBA.eval()

    for i, text in enumerate(test_df['Tweet']):
        # str(): same coercion CustomDataset applies, so a non-string cell
        # (e.g. NaN) does not break the tokenizer.
        encodings = tokenizer.encode_plus(
            str(text),
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Move the encodings to the device
        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)

        with torch.no_grad():
            output = modelTBA(input_ids, attention_mask, token_type_ids)

        # Apply sigmoid + threshold, falling back to the argmax label
        logits = output.squeeze().float().cpu()
        probs = torch.sigmoid(logits).numpy()
        predictions = (probs >= 0.5).astype(float)
        if not predictions.any():
            predictions[probs.argmax()] = 1

        predicted_values[i] = predictions
        predicted_raw[i] = logits.numpy()

    modelTBA.train(was_training)

    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values,target_names=target_list))

    print("Accuracy score",accuracy_score(y_true, predicted_values))

    # average='samples' is the mean of the per-row Jaccard scores.
    average_jaccard = metrics.jaccard_score(y_true, predicted_values, average='samples')
    print("Average Jaccard: {:.3f}".format(average_jaccard))
    return predicted_values, predicted_raw


In [None]:
from tqdm import tqdm
# Validation targets and sigmoid outputs appended by train_model.
# NOTE(review): these lists are never cleared, so they keep growing across
# epochs and across successive training runs.
val_targets=[]
val_outputs=[]

def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path,ID):
    """Fine-tune `model` for `n_epochs`, validating and test-evaluating every epoch.

    After each epoch the model is scored with LetsAssess and the raw outputs are
    written to ``'predicted_raw_epoch<epoch-1><ID>.npy'`` in the working directory.

    Args:
        n_epochs: number of passes over `training_loader`.
        training_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask', 'token_type_ids', 'targets').
        validation_loader: same batch format, used for the validation loss.
        model: module called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer over the model's parameters.
        checkpoint_path: currently unused (checkpoint saving is disabled).
        best_model_path: currently unused (checkpoint saving is disabled).
        ID: tag appended to the names of the saved .npy files.

    Returns:
        The same `model` object after the last epoch.
    """
    # Batches follow the model, wherever the caller placed it.
    device = next(model.parameters()).device
    valid_loss_min = np.inf  # np.Inf was removed in NumPy 2.0

    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(tqdm(training_loader)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after this line train_loss is the average of
            # all batch losses seen so far in this epoch.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################
        # validate the model #
        ######################
        model.eval()

        with torch.no_grad():
            for batch_idx, data in enumerate(tqdm(validation_loader)):
                ids = data['input_ids'].to(device, dtype = torch.long)
                mask = data['attention_mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # train_loss and valid_loss are already per-batch means (see the running
        # average above); dividing by the loader length again would shrink them.
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

        # Score the test set after every epoch and keep the raw outputs on disk.
        _, predicted_raw = LetsAssess(model)
        np.save('predicted_raw_epoch'+str(epoch-1)+str(ID)+'.npy', predicted_raw)

        # Track the best validation loss.  No checkpoint is written here even
        # though the message below mentions saving.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))
    return model

In [None]:
# Tokenizer for the 'bert-large-uncased' encoder used by this cell's CovModel.
# Alternatives used by the other cells: 'seantw/covid-19-vaccination-tweet-stance',
# 'digitalepidemiologylab/covid-twitter-bert'.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# One dataset per split, all encoded to MAX_LEN tokens.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are shuffled, validation batches keep their order.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder followed by dropout and one linear classification head.

    Args:
        num_layers: Unused; kept so existing callers keep working.
        output_sizes: Unused; kept so existing callers keep working. The
            default is an immutable tuple so it can never be shared and
            mutated across instances.
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Other checkpoints tried in this
            notebook: 'seantw/covid-19-vaccination-tweet-stance' and
            'digitalepidemiologylab/covid-twitter-bert'.
        num_labels: Number of logits produced per example. The default of 12
            matches the length of the notebook's ``target_list``.
        dropout: Drop probability applied before the linear head.
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 model_name='bert-large-uncased', num_labels=12, dropout=0.4):
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)

        # Size the head from the encoder's own config so any BERT variant fits.
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels=None):
        """Return the raw logits for a batch (no activation is applied).

        Args:
            inputs: Token ids, passed to the encoder as ``input_ids``.
            mask: Attention mask, passed to the encoder as ``attention_mask``.
            labels: Ignored. Accepted (and now optional) only so callers that
                pass a third positional argument keep working.

        Returns:
            The output of the linear head applied to the dropped-out vector
            taken at position 0 of the encoder's first output.
        """
        encoder_out = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First tuple element, sequence position 0 for every batch row --
        # presumably the [CLS] token embedding; confirm against CustomDataset.
        x = encoder_out[0][:, 0, :]
        x = self.dropout(x)
        return self.clf(x)
# Best-model path forwarded to train_model for this run.
best_model_path = "modelA.pt"

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Module.to() returns the module itself, so build and place in one step.
model = CovModel().to(device)
print(model)

# An earlier variant used torch.optim.Adam with LEARNING_RATE instead.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

# First argument is presumably the epoch count and the trailing 'BU' the run
# tag used by train_model -- confirm against its signature.
model = train_model(
    5,
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
    'BU',
)
predicted_valuesBU, predicted_rawBU = LetsAssess(model)




In [None]:
# Tokenizer for this run, loaded under the same checkpoint name that the
# CovModel in this cell uses for its encoder.
# Alternative checkpoints tried in the other cells:
#   'bert-large-uncased'
#   'digitalepidemiologylab/covid-twitter-bert'
tokenizer = BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')

# Wrap every split with the tokenizer and the shared maximum length.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)

# Training batches are reshuffled each epoch; validation order is fixed.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder followed by dropout and one linear classification head.

    Args:
        num_layers: Unused; kept so existing callers keep working.
        output_sizes: Unused; kept so existing callers keep working. The
            default is an immutable tuple so it can never be shared and
            mutated across instances.
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Other checkpoints tried in this
            notebook: 'bert-large-uncased' and
            'digitalepidemiologylab/covid-twitter-bert'.
        num_labels: Number of logits produced per example. The default of 12
            matches the length of the notebook's ``target_list``.
        dropout: Drop probability applied before the linear head.
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 model_name='seantw/covid-19-vaccination-tweet-stance',
                 num_labels=12, dropout=0.4):
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)

        # Size the head from the encoder's own config so any BERT variant fits.
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels=None):
        """Return the raw logits for a batch (no activation is applied).

        Args:
            inputs: Token ids, passed to the encoder as ``input_ids``.
            mask: Attention mask, passed to the encoder as ``attention_mask``.
            labels: Ignored. Accepted (and now optional) only so callers that
                pass a third positional argument keep working.

        Returns:
            The output of the linear head applied to the dropped-out vector
            taken at position 0 of the encoder's first output.
        """
        encoder_out = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First tuple element, sequence position 0 for every batch row --
        # presumably the [CLS] token embedding; confirm against CustomDataset.
        x = encoder_out[0][:, 0, :]
        x = self.dropout(x)
        return self.clf(x)
# Best-model path forwarded to train_model for this run.
best_model_path = "modelB.pt"

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Module.to() returns the module itself, so build and place in one step.
model = CovModel().to(device)
print(model)

# An earlier variant used torch.optim.Adam with LEARNING_RATE instead.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

# First argument is presumably the epoch count and the trailing 'seantw' the
# run tag used by train_model -- confirm against its signature.
model = train_model(
    5,
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
    'seantw',
)
predicted_valuesSN, predicted_rawSN = LetsAssess(model)




In [None]:
# Tokenizer for this run, loaded under the same checkpoint name that the
# CovModel in this cell uses for its encoder.
# Alternative checkpoints tried in the other cells:
#   'bert-large-uncased'
#   'seantw/covid-19-vaccination-tweet-stance'
tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# One CustomDataset per split, built in train / validation / test order.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Only the training batches are shuffled; validation keeps its row order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

class CovModel(nn.Module):
    """BERT encoder followed by dropout and one linear classification head.

    Args:
        num_layers: Unused; kept so existing callers keep working.
        output_sizes: Unused; kept so existing callers keep working. The
            default is an immutable tuple so it can never be shared and
            mutated across instances.
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Other checkpoints tried in this
            notebook: 'bert-large-uncased' and
            'seantw/covid-19-vaccination-tweet-stance'.
        num_labels: Number of logits produced per example. The default of 12
            matches the length of the notebook's ``target_list``.
        dropout: Drop probability applied before the linear head.
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128, 256),
                 model_name='digitalepidemiologylab/covid-twitter-bert',
                 num_labels=12, dropout=0.4):
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)

        # Size the head from the encoder's own config so any BERT variant fits.
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.clf = nn.Linear(self.hidden_size, num_labels)

    def forward(self, inputs, mask, labels=None):
        """Return the raw logits for a batch (no activation is applied).

        Args:
            inputs: Token ids, passed to the encoder as ``input_ids``.
            mask: Attention mask, passed to the encoder as ``attention_mask``.
            labels: Ignored. Accepted (and now optional) only so callers that
                pass a third positional argument keep working.

        Returns:
            The output of the linear head applied to the dropped-out vector
            taken at position 0 of the encoder's first output.
        """
        encoder_out = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First tuple element, sequence position 0 for every batch row --
        # presumably the [CLS] token embedding; confirm against CustomDataset.
        x = encoder_out[0][:, 0, :]
        x = self.dropout(x)
        return self.clf(x)
# Best-model path forwarded to train_model for this run. This cell previously
# reused "modelA.pt", the path of the bert-large-uncased run; with checkpoint
# saving enabled, both runs would write to the same file. Each run now has its
# own path (A / B / C).
best_model_path = "modelC.pt"

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module.to() returns the module itself, so build and place in one step.
model = CovModel().to(device)
print(model)

# An earlier variant used torch.optim.Adam with LEARNING_RATE instead.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

# First argument is presumably the epoch count and the trailing 'DL' the run
# tag used by train_model -- confirm against its signature.
model = train_model(
    5,
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
    'DL',
)
predicted_valuesDL, predicted_rawDL = LetsAssess(model)


