In [1]:
import pandas as pd
train_df = pd.read_csv('rawoversample.csv', encoding='utf-8')


In [None]:
# Preview the first rows. `head` must be called; the bare attribute only echoes the bound method.
train_df.head()

In [None]:
pip install sentencepiece

In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
  """Paraphrase one text with the module-level Pegasus `tokenizer` and `model`.

  The text is tokenized as a single-item batch (truncated to 60 tokens), moved
  to `torch_device`, and decoded after beam-search generation.

  Args:
      input_text: The text to paraphrase.
      num_return_sequences: Number of generated sequences to return.
      num_beams: Beam width forwarded to `model.generate`.

  Returns:
      The decoded sequences as returned by `tokenizer.batch_decode`.
  """
  encoded = tokenizer(
      [input_text],
      truncation=True,
      padding='longest',
      max_length=60,
      return_tensors="pt",
  )
  encoded = encoded.to(torch_device)
  generated_ids = model.generate(
      **encoded,
      max_length=60,
      num_beams=num_beams,
      num_return_sequences=num_return_sequences,
      temperature=1.5,
  )
  return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

def rephrase_tweets(train_df, num_return_sequences=4, num_beams=4):
    """Rephrase every tweet in the 'Tweet' column with the Pegasus paraphrase model.

    The tokenizer and model ('tuner007/pegasus_paraphrase') are loaded on every
    call and placed on CUDA when available, otherwise on CPU. Tweets are
    paraphrased one at a time, each truncated to 120 tokens.

    Args:
        train_df (pd.DataFrame): DataFrame with a 'Tweet' column. It is not modified.
        num_return_sequences (int): The number of generated sequences per tweet.
        num_beams (int): The number of beams for beam search.

    Returns:
        pd.DataFrame: A copy of ``train_df`` with a new 'Rephrased_Tweet' column;
        each cell holds the list of decoded sequences generated for that row.
    """

    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    rephrased_tweets = []
    for tweet in train_df['Tweet']:
        batch = tokenizer([tweet], truncation=True, padding='longest', max_length=120, return_tensors="pt").to(torch_device)
        # NOTE(review): temperature presumably only matters when sampling is enabled - confirm
        # against the transformers generation docs; it is forwarded unchanged here.
        translated = model.generate(**batch, max_length=120, num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
        rephrased_tweets.append(tokenizer.batch_decode(translated, skip_special_tokens=True))

    # Attach the results to a copy so the caller's frame is left untouched,
    # which is what the documented "new DataFrame" contract promises.
    result = train_df.copy()
    result['Rephrased_Tweet'] = rephrased_tweets
    return result

# Example usage:
# train_df = pd.read_csv('your_data.csv')
rephrased_df = rephrase_tweets(train_df)

In [3]:
rephrased_df.to_csv('pegasus.csv', index=False)


In [1]:
import pandas as pd
train_df = pd.read_csv('rawoversample.csv', encoding='utf-8')


In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

def rephrase_tweets(train_df, num_return_sequences=4, num_beams=4):
    """Rephrase every tweet in the 'Tweet' column with a BART paraphrase model.

    The tokenizer and model ('eugenesiow/bart-paraphrase') are loaded on every
    call and placed on CUDA when available, otherwise on CPU. Tweets are
    paraphrased one at a time, each truncated to 120 tokens.

    Args:
        train_df (pd.DataFrame): DataFrame with a 'Tweet' column. It is not modified.
        num_return_sequences (int): The number of generated sequences per tweet.
        num_beams (int): The number of beams for beam search.

    Returns:
        pd.DataFrame: A copy of ``train_df`` with a new 'Rephrased_Tweet' column;
        each cell holds the list of decoded sequences generated for that row.
    """

    model_name = 'eugenesiow/bart-paraphrase'  # Replace with your desired BART model
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    rephrased_tweets = []
    for tweet in train_df['Tweet']:
        batch = tokenizer([tweet], truncation=True, padding='longest', max_length=120, return_tensors="pt").to(torch_device)
        # NOTE(review): temperature presumably only matters when sampling is enabled - confirm
        # against the transformers generation docs; it is forwarded unchanged here.
        translated = model.generate(**batch, max_length=120, num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
        rephrased_tweets.append(tokenizer.batch_decode(translated, skip_special_tokens=True))

    # Attach the results to a copy so the caller's frame is left untouched,
    # which is what the documented "new DataFrame" contract promises.
    result = train_df.copy()
    result['Rephrased_Tweet'] = rephrased_tweets
    return result

# Example usage:
# train_df = pd.read_csv('your_data.csv')
rephrased_df = rephrase_tweets(train_df)

In [4]:
# train_df.to_csv('train_BART.csv', index=False)
rephrased_df.to_csv('train_BART.csv', index=False)


# Data Loading

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys
from imblearn.over_sampling import RandomOverSampler


In [2]:
# train_path = "trainnew.csv"
train_df = pd.read_csv('pegasus.csv', encoding='utf-8')

val_path="valnew.csv"
test_path = "testnew.csv"

In [3]:
# train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

In [None]:

classes =['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']

def updatedf(dfold):
    """Add one 0/1 indicator column per entry of the global ``classes`` list.

    A row gets 1 in a class column when that class name appears in any of its
    'Label1', 'Label2' or 'Label3' cells, otherwise 0. Label values that are
    not in ``classes`` (including missing values) set nothing.

    The comparison is done column-wise, so it does not depend on the index
    being unique, and calling the function on its own output overwrites the
    indicator columns instead of appending duplicates.

    Args:
        dfold (pd.DataFrame): Frame with 'Label1', 'Label2' and 'Label3' columns.
            It is not modified.

    Returns:
        pd.DataFrame: A copy of ``dfold`` with the indicator columns added.
    """
    label_columns = ['Label1', 'Label2', 'Label3']
    dfnew = dfold.copy()
    labels = dfnew[label_columns]

    for class_name in classes:
        # True where any of the three label cells equals this class name.
        dfnew[class_name] = labels.eq(class_name).any(axis=1).astype('int64')

    # Print the updated DataFrame
    print(dfnew)
    return dfnew


# train_df=updatedf(train_df)
val_df=updatedf(val_df)
test_df=updatedf(test_df)

In [5]:
# dropping useless features/columns
# train_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)
val_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)
test_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)

In [None]:
# Preview the first rows. `head` must be called; the bare attribute only echoes the bound method.
val_df.head()

In [None]:
train_df.columns

In [9]:
# rearranging columns
train_df = train_df[['Tweet', 'unnecessary', 'mandatory', 'pharma', 'conspiracy',
       'political', 'country', 'rushed', 'ingredients', 'side-effect',
       'ineffective', 'religious', 'none']]

In [None]:
import torch
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import shutil
from torch.optim.lr_scheduler import ReduceLROnPlateau
from early_stopping import EarlyStopping
# Label columns; CustomDataset builds each target vector from these columns in this order.
target_list = ['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']
# hyperparameters
# MAX_LEN: token length every tweet is padded/truncated to by the tokenizer calls.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 5
# NOTE(review): the optimizers built in the training cells pass `lr` as a literal,
# so this constant looks unused - confirm before relying on it.
LEARNING_RATE = 1e-05
import torch.cuda

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# Environment report: CUDA compiler version (shell escape) and the PyTorch version.
!nvcc --version
torch.__version__
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """
    Restore model and optimizer state from a checkpoint file.

    checkpoint_fpath: path of the checkpoint file to load
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    map_location: optional device remapping forwarded to torch.load, e.g. 'cpu'
        to open a checkpoint that was saved on a GPU on a CPU-only machine;
        None (the default) keeps torch.load's own behaviour

    Returns (model, optimizer, epoch, valid_loss_min); the last two are read
    from the checkpoint's 'epoch' and 'valid_loss_min' entries.
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    # copy the stored weights and optimizer state into the objects passed in
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], checkpoint['valid_loss_min']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    Write a checkpoint to disk and optionally duplicate it as the best model.

    state: checkpoint object to serialize with torch.save
    is_best: when true, the saved file is also copied to best_model_path
    checkpoint_path: file the checkpoint is written to
    best_model_path: destination of the copy made when is_best is true
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best model is a plain file copy of the checkpoint that was just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [7]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes tweets and pairs them with multi-label targets.

    Args:
        df: DataFrame with a 'Tweet' column and one column per entry of the
            global ``target_list``.
        tokenizer: Object exposing ``encode_plus`` (called with ``return_tensors='pt'``).
        max_len: Length every encoded tweet is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Tweet']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded tweet and float target vector at position ``index``."""
        # Positional lookup: ``index`` runs over 0..len-1, which need not match the
        # frame's index labels (e.g. after rows were dropped without a reset), and
        # ``self.targets`` is positional as well.
        title = str(self.title.iloc[index])
        # Collapse runs of whitespace into single spaces.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # encode_plus returns a leading batch axis of size 1; flatten drops it.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [8]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and 0/1 ``targets``."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)


ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
# tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [9]:
def LetsAssess(modelTBA):
    """Evaluate ``modelTBA`` on the global ``test_df`` and print multi-label metrics.

    Each tweet is encoded with the global ``tokenizer`` (padded/truncated to
    ``MAX_LEN``) and passed through the model one at a time. Logits go through
    a sigmoid and are thresholded at 0.5; if no label passes the threshold, the
    highest-probability label is selected so every tweet gets at least one label.

    The model is switched to eval mode for the duration of the call (so dropout
    is inactive) and its previous train/eval mode is restored afterwards.

    Prints the classification report, the exact-match accuracy and the mean
    per-sample Jaccard score.

    Args:
        modelTBA: Module called as ``modelTBA(input_ids, attention_mask, token_type_ids)``
            and returning one logit per entry of ``target_list``.

    Returns:
        tuple: ``(predicted_values, predicted_raw)`` - two float arrays of shape
        ``(len(test_df), len(target_list))`` holding the 0/1 decisions and the
        raw logits respectively.
    """
    num_labels = len(target_list)
    y_true = test_df[target_list].to_numpy()

    predicted_values = np.zeros((test_df.shape[0], num_labels))
    predicted_raw = np.zeros((test_df.shape[0], num_labels))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelTBA.to(device)

    was_training = modelTBA.training
    modelTBA.eval()
    try:
        for i, text in enumerate(test_df['Tweet']):
            encodings = tokenizer.encode_plus(
                str(text),  # same str() coercion CustomDataset applies to training tweets
                None,
                add_special_tokens=True,
                max_length=MAX_LEN,
                padding='max_length',
                return_token_type_ids=True,
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

            # Move the encodings to the device
            input_ids = encodings['input_ids'].to(device, dtype=torch.long)
            attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)

            with torch.no_grad():
                output = modelTBA(input_ids, attention_mask, token_type_ids)

            # Apply sigmoid + threshold
            logits = output.squeeze().cpu()
            probs = torch.sigmoid(logits).numpy()
            predictions = (probs >= 0.5).astype(float)
            if predictions.sum() == 0:
                # Nothing cleared the threshold: fall back to the most likely label.
                predictions[probs.argmax()] = 1

            predicted_values[i] = predictions
            predicted_raw[i] = logits.numpy()
    finally:
        modelTBA.train(was_training)

    print(classification_report(y_true, predicted_values,target_names=target_list))

    print("Accuracy score",accuracy_score(y_true, predicted_values))

    # Calculate Jaccard score for each sample individually, then average.
    sample_jaccard_scores = [metrics.jaccard_score(y_true[i], predicted_values[i]) for i in range(len(y_true))]
    average_jaccard = np.mean(sample_jaccard_scores)
    print("Average Jaccard: {:.3f}".format(average_jaccard))
    return predicted_values, predicted_raw


In [10]:
from tqdm import tqdm
val_targets=[]
val_outputs=[]

def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    """Train ``model`` for ``n_epochs`` epochs, validating and assessing after each one.

    Every epoch runs one optimisation pass over ``training_loader`` and one
    gradient-free pass over ``validation_loader``, prints the mean batch loss
    of both passes, and calls ``LetsAssess(model)``. Validation targets and
    sigmoid outputs are appended to the module-level ``val_targets`` and
    ``val_outputs`` lists (they accumulate across epochs and calls).

    Args:
        n_epochs: Number of epochs to run.
        training_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets' tensors).
        validation_loader: Iterable of batches in the same format.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer over ``model``'s parameters.
        checkpoint_path: Currently unused - checkpoint saving is disabled.
        best_model_path: Currently unused - checkpoint saving is disabled.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    valid_loss_min = np.inf  # the np.Inf alias no longer exists in NumPy 2
    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(tqdm(training_loader)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Incremental mean: after the last batch this already is the average
            # batch loss, so it must not be divided by the loader length again.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################
        # validate the model #
        ######################
        model.eval()
        with torch.no_grad():
            for batch_idx, data in enumerate(tqdm(validation_loader)):
                ids = data['input_ids'].to(device, dtype = torch.long)
                mask = data['attention_mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # print training/validation statistics
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

        # Per-epoch assessment; the model is in eval mode at this point.
        LetsAssess(model)

        # NOTE: the message below is historical - saving via save_ckp is disabled,
        # so only the best validation loss seen so far is tracked.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))
    return model

# Models

In [11]:
# Tokenizer plus datasets/loaders for the 'seantw/covid-19-vaccination-tweet-stance' checkpoint.
# Alternatives tried previously:
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
tokenizer = BertTokenizer.from_pretrained('seantw/covid-19-vaccination-tweet-stance')

# One dataset per split, all encoded with the same tokenizer and MAX_LEN.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Only the training batches are shuffled; validation keeps the frame order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
class TransModel2(nn.Module):
    """BERT encoder ('seantw/covid-19-vaccination-tweet-stance') with a 12-way linear head.

    ``num_layers`` and ``output_sizes`` are accepted by the constructor but not
    used anywhere in the class.
    """

    def __init__(self, num_layers=4, output_sizes=[16, 32, 64, 128]):
        super().__init__()
        self.bert = BertModel.from_pretrained('seantw/covid-19-vaccination-tweet-stance')
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        # One output logit per label.
        self.clf = nn.Linear(self.hidden_size, 12)

    def forward(self, inputs, mask, labels):
        """Return the 12 logits for each sequence in the batch.

        ``labels`` is accepted for call compatibility but is not read.
        """
        encoder_outputs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First element of the encoder output, taken at sequence position 0.
        first_token_state = encoder_outputs[0][:, 0, :]
        return self.clf(self.dropout(first_token_state))
# Fine-tune TransModel2, then keep its test-set decisions and raw logits under the "D" suffix.
best_model_path = "modelA.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransModel2()
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6)
model = train_model(
    EPOCHS,
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
)
predicted_valuesD, predicted_rawD = LetsAssess(model)

In [34]:
# Tokenizer plus datasets/loaders for the 'digitalepidemiologylab/covid-twitter-bert' checkpoint.
# Alternatives tried previously:
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# One dataset per split, all encoded with the same tokenizer and MAX_LEN.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Only the training batches are shuffled; validation keeps the frame order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
!pip install openai==0.28

In [None]:
import pandas as pd
from transformers import pipeline

# Load the CSV file
file_path = 'trainRandomOversample.csv'
df = pd.read_csv(file_path)

# Load the text generation pipeline for rephrasing
rephraser = pipeline("text2text-generation", model="t5-base")

# Define a function to rephrase the tweets using the model
def advanced_rephrase(tweet):
    """Rephrase one tweet with the module-level ``rephraser`` pipeline.

    The tweet is wrapped in a "paraphrase: ... </s>" prompt and generated
    without sampling (max_length=128).

    Args:
        tweet: The text to rephrase.

    Returns:
        The pipeline's first 'generated_text', or the unchanged ``tweet`` when
        the pipeline raises an ordinary exception (a warning is emitted so the
        failure is not silent). KeyboardInterrupt and SystemExit propagate.
    """
    import warnings  # local import so the function does not depend on a later cell

    try:
        rephrased = rephraser(f"paraphrase: {tweet} </s>", max_length=128, do_sample=False)
        return rephrased[0]['generated_text']
    except Exception as exc:
        # Best effort: keep the original text, but say why the rephrasing was skipped.
        warnings.warn(f"Rephrasing failed ({exc!r}); keeping the original tweet.")
        return tweet

# Apply the rephrasing function to the 'Tweet' column
df['Rephrased_Tweet'] = df['Tweet'].apply(advanced_rephrase)



In [50]:
df.to_csv('paraph-t5-base.csv', index=False)


In [None]:
print(df['Tweet'][3])

In [None]:
class TransModel3(nn.Module):
    """BERT encoder ('digitalepidemiologylab/covid-twitter-bert') with a 12-way linear head.

    ``num_layers`` and ``output_sizes`` are accepted by the constructor but not
    used anywhere in the class.
    """

    def __init__(self,  num_layers=4, output_sizes=[16, 32, 64, 128]):
        super().__init__()
        self.bert = BertModel.from_pretrained('digitalepidemiologylab/covid-twitter-bert')
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        # One output logit per label.
        self.clf = nn.Linear(self.hidden_size, 12)

    def forward(self, inputs, mask, labels):
        """Return the 12 logits for each sequence in the batch.

        ``labels`` is accepted for call compatibility but is not read.
        """
        encoder_outputs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First element of the encoder output, taken at sequence position 0.
        first_token_state = encoder_outputs[0][:, 0, :]
        return self.clf(self.dropout(first_token_state))
# Fine-tune TransModel3, then keep its test-set decisions and raw logits under the "E" suffix.
best_model_path = "modelA.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransModel3()
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6)

model = train_model(
    EPOCHS,
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
)
predicted_valuesE, predicted_rawE = LetsAssess(model)


In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel4(nn.Module):
    """BERT encoder ('bert-base-uncased') with a 12-way linear head.

    ``num_layers`` and ``output_sizes`` are accepted by the constructor but not
    used anywhere in the class.
    """

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        # One output logit per label.
        self.clf = nn.Linear(self.hidden_size, 12)

    def forward(self, inputs, mask, labels):
        """Return the 12 logits for each sequence in the batch.

        ``labels`` is accepted for call compatibility but is not read.
        """
        encoder_outputs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First element of the encoder output, taken at sequence position 0.
        first_token_state = encoder_outputs[0][:, 0, :]
        return self.clf(self.dropout(first_token_state))
# Fine-tune CovModel4, then keep its test-set decisions and raw logits under the "G" suffix.
best_model_path = "modelA.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CovModel4()
model.to(device)
print(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    EPOCHS,
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
)
predicted_valuesG, predicted_rawG = LetsAssess(model)


In [None]:
# Turn one set of raw logits (`pv`) into multi-label decisions and score them against the test labels.
y_true = test_df[
    ['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country',
     'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']
].to_numpy()

predicted_values = np.zeros((test_df.shape[0], 12))

# Averaged-logit ensembles tried earlier:
# pv=(predicted_rawB+predicted_rawD+predicted_rawC+predicted_rawF+predicted_rawG)/6#+predicted_rawH)/7
# pv=(predicted_rawC+predicted_rawD)/2
# pv=(predicted_rawB+predicted_rawC+predicted_rawD+predicted_rawE+predicted_rawF)/5#+predicted_rawH)/7
# pv=(predicted_rawB+predicted_rawW)/2
# pv=(predicted_rawG+predicted_rawB+predicted_rawC)/3
pv = predicted_rawE

sigmoid = torch.nn.Sigmoid()
for i, text in enumerate(test_df['Tweet']):
    probs = sigmoid(torch.from_numpy(pv[i]).to(device))
    predictions = np.zeros(probs.shape)
    # Labels whose probability reaches the 0.5 threshold.
    predictions[probs.cpu().numpy() >= 0.5] = 1
    if not predictions.any():
        # Nothing reached the threshold: fall back to the most likely label.
        argmax_index = probs.argmax()
        predictions[argmax_index] = 1
    predicted_values[i] = predictions

print(classification_report(y_true, predicted_values,target_names=target_list))

print("Accuracy score",accuracy_score(y_true, predicted_values))

multilabel_confusion_matrix(y_true, predicted_values)

# Per-sample Jaccard scores, averaged over the test set.
sample_jaccard_scores = [
    metrics.jaccard_score(true_row, predicted_row)
    for true_row, predicted_row in zip(y_true, predicted_values)
]
average_jaccard = np.mean(sample_jaccard_scores)
print("Average Jaccard: {:.3f}".format(average_jaccard))



In [None]:
from itertools import combinations
from sklearn.metrics import accuracy_score
import torch
import numpy as np
# 'A1':predicted_rawA,
# Exhaustive search over subsets of the per-model raw logits: average the logits of
# each subset, threshold them, and keep the subset with the best exact-match accuracy
# against `y_true`.
#
# Only the logit arrays that exist in this session are used, so the cell still runs
# after a fresh "Run All" when some models (e.g. B, C, F, W) were not trained here.
# Other suffixes used in earlier experiments: A, H, T, R.
candidate_suffixes = ['B', 'C', 'D', 'E', 'F', 'G', 'W']
arrays = {}
for suffix in candidate_suffixes:
    variable_name = f'predicted_raw{suffix}'
    if variable_name in globals():
        arrays[f'{suffix}1'] = globals()[variable_name]
    else:
        print(f"Skipping {variable_name}: not defined in this session")

best_score = 0
best_combination = None

for r in range(1, len(arrays) + 1):
    for combination in combinations(arrays.items(), r):
        avg = np.mean([array for name, array in combination], axis=0)
        probs = torch.sigmoid(torch.from_numpy(avg)).numpy()
        predictions = (probs >= 0.5).astype(float)
        # Rows where nothing reached the threshold fall back to their most likely label.
        empty_rows = np.flatnonzero(predictions.sum(axis=1) == 0)
        predictions[empty_rows, probs[empty_rows].argmax(axis=1)] = 1

        # Score THIS combination's predictions. The previous per-iteration
        # classification_report was computed on the stale `predicted_values`
        # array and its result was never used, so it is gone.
        # Alternative metric:
        # score = classification_report(y_true, predictions, output_dict=True)['macro avg']['f1-score']
        score = accuracy_score(y_true, predictions)
        if score > best_score:
            best_score = score
            best_combination = [name for name, array in combination]

print("Best score:", best_score)
print("Best combination:", best_combination)

In [None]:
import torch
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW

##Loading processed data

# train_df = pd.read_csv('trainingaugnew.csv', encoding='utf-8')
# train_df2 = pd.read_csv('trainaug.csv')
train_df=pd.read_csv('trainnew.csv', encoding='utf-8')
# train_df= pd.concat([train_df, train_df2], ignore_index=True)
val_df = pd.read_csv('valnew.csv', encoding='utf-8')
test_df = pd.read_csv('testnew.csv', encoding='utf-8')

In [None]:

classes =['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']

def updatedf(dfold):
    """Add one 0/1 indicator column per entry of the global ``classes`` list.

    A row gets 1 in a class column when that class name appears in any of its
    'Label1', 'Label2' or 'Label3' cells, otherwise 0. Label values that are
    not in ``classes`` (including missing values) set nothing.

    The comparison is done column-wise, so it does not depend on the index
    being unique, and calling the function on its own output overwrites the
    indicator columns instead of appending duplicates.

    Args:
        dfold (pd.DataFrame): Frame with 'Label1', 'Label2' and 'Label3' columns.
            It is not modified.

    Returns:
        pd.DataFrame: A copy of ``dfold`` with the indicator columns added.
    """
    label_columns = ['Label1', 'Label2', 'Label3']
    dfnew = dfold.copy()
    labels = dfnew[label_columns]

    for class_name in classes:
        # True where any of the three label cells equals this class name.
        dfnew[class_name] = labels.eq(class_name).any(axis=1).astype('int64')

    # Print the updated DataFrame
    print(dfnew)
    return dfnew


train_df=updatedf(train_df)
val_df=updatedf(val_df)
test_df=updatedf(test_df)
# dropping useless features/columns
train_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)
val_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)
test_df.drop(labels=['ID','Label1','Label2', 'Label3'], axis=1, inplace=True)

In [None]:
train_df.head()

In [None]:
target_list = ['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients', 'side-effect', 'ineffective', 'religious', 'none']
# hyperparameters
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 1e-05

In [None]:
# Assuming 'target_list' and 'train_df' are defined
import numpy as np
# Positive-example count per label, plus the integer mean over all labels under 'avg'.
full_counts = {column: sum(train_df[column]) for column in target_list}
fsum = sum(full_counts.values())
full_counts['avg'] = int(fsum / len(target_list))

# One row per label (and 'avg'), indexed by label name.
counts = pd.DataFrame.from_dict(full_counts, orient='index', columns=['full_count'])
counts.index.name = 'label'
print(full_counts)

def set_sample_ratio(x):
    """Return the integer oversampling ratio for a label with ``x`` positive examples.

    The ratio is ``round(avg / x)`` where ``avg`` is the 'avg' row of the global
    ``counts`` frame. Labels at or above the average get 1. A label with no
    positive examples also gets 1: there is nothing to replicate, and the
    division would otherwise raise ZeroDivisionError.
    """
    avg = int(counts['full_count'].loc['avg'])
    x = int(x)
    if x <= 0 or x >= avg:
        return 1
    return int(np.round(avg / x))

counts['calculated_oversampling_ratio'] = counts['full_count'].apply(set_sample_ratio)
counts.T

In [None]:
X=train_df['Tweet']
y=train_df[target_list]
print(X)
print(y)

In [None]:
!pip install imbalanced-learn scikit-multilearn



In [None]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

# NLP
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
import re

# Warning
import warnings
warnings.filterwarnings('ignore')

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK data
nltk.download('punkt')

def tokenize_and_clean(text):
    """Lower-case ``text``, strip '@user' mentions, tokenize and stem.

    Tokens that do not start with a word character (e.g. pure punctuation) are
    dropped before stemming.

    Returns:
        list: The stemmed tokens, in order of appearance.
    """
    without_mentions = re.sub('@user', '', text.lower())
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in word_tokenize(without_mentions)
        if re.match(r'\w{1,}', token)
    ]

# Ensure the tokenizer function is compatible with TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_clean, stop_words='english', token_pattern=None)

# Example DataFrames (replace with your actual data)
import pandas as pd
# train_df = pd.DataFrame({'Tweet': ["I love this!", "This is terrible.", "Feeling great today!"]})
# test_df = pd.DataFrame({'Tweet': ["What a wonderful day!", "I hate this.", "So happy right now!"]})

# Transform the data
X_train_tweets_tfidf = tfidf_vectorizer.fit_transform(train_df['Tweet'])
X_test_tweets_tfidf = tfidf_vectorizer.transform(test_df['Tweet'])

print(X_train_tweets_tfidf.shape, X_test_tweets_tfidf.shape)

In [None]:
# Class Imbalance Check for the 'conspiracy' label
y_train = train_df['conspiracy']
# value_counts() orders by frequency, not by label value, so sort by label and
# build the wedge labels from the index instead of assuming that 0 comes first.
conspiracy_counts = y_train.value_counts().sort_index()
plt.pie(conspiracy_counts,
        labels=[f'conspiracy = {value}' for value in conspiracy_counts.index],
        autopct='%0.1f%%')
plt.axis('equal')
plt.title("Share of tweets per 'conspiracy' label value")
plt.show()
print(y_train)

In [None]:
print(np.shape(X_train_tweets_tfidf))

In [None]:
# Thin out tweets whose only positive label is 'side-effect': a reproducible random
# third of them (random_state=1) is dropped and the remaining rows are renumbered.
# The cell rebinds train_df, so running it again removes a further third.
other_columns = [
    'unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country',
    'rushed', 'ingredients', 'ineffective', 'religious', 'none',
]

side_effect_rows = train_df.loc[
    train_df['side-effect'].eq(1) & train_df[other_columns].sum(axis=1).eq(0)
]
rows_to_remove = side_effect_rows.sample(frac=1/3, random_state=1)
print(rows_to_remove.index)
train_df = train_df.drop(rows_to_remove.index).reset_index(drop=True)

In [None]:
# Assuming 'target_list' and 'train_df' are defined
import numpy as np
# Positive-example count per label, plus the integer mean over all labels under 'avg'.
full_counts = {column: sum(train_df[column]) for column in target_list}
fsum = sum(full_counts.values())
full_counts['avg'] = int(fsum / len(target_list))

# One row per label (and 'avg'), indexed by label name.
counts = pd.DataFrame.from_dict(full_counts, orient='index', columns=['full_count'])
counts.index.name = 'label'
print(full_counts)

def set_sample_ratio(x):
    """Return the integer oversampling ratio for a label with ``x`` positive examples.

    The ratio is ``round(avg / x)`` where ``avg`` is the 'avg' row of the global
    ``counts`` frame. Labels at or above the average get 1. A label with no
    positive examples also gets 1: there is nothing to replicate, and the
    division would otherwise raise ZeroDivisionError.
    """
    avg = int(counts['full_count'].loc['avg'])
    x = int(x)
    if x <= 0 or x >= avg:
        return 1
    return int(np.round(avg / x))

counts['calculated_oversampling_ratio'] = counts['full_count'].apply(set_sample_ratio)
counts.T

# Common Utilities for All Models

In [None]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import shutil
from torch.optim.lr_scheduler import ReduceLROnPlateau
from early_stopping import EarlyStopping

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized tweets with multi-label targets.

    Args:
        df: DataFrame with a 'Tweet' text column plus one column per label.
        tokenizer: Object exposing ``encode_plus``; it is called with the
            keyword arguments shown in ``__getitem__`` and must return
            'input_ids', 'attention_mask' and 'token_type_ids' tensors.
        max_len: Length every encoded tweet is padded or truncated to.
        target_columns: Label columns to read, in output order. Defaults to
            the notebook-level ``target_list``.
    """

    def __init__(self, df, tokenizer, max_len, target_columns=None):
        if target_columns is None:
            target_columns = target_list
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['Tweet']
        self.targets = self.df[list(target_columns)].values
        self.max_len = max_len

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded tweet and its float target vector at position ``index``."""
        # Positional lookup: ``self.targets`` is a positional array, so the
        # text must be fetched by position too. A label lookup only matches
        # when the frame's index happens to be 0..n-1.
        title = str(self.title.iloc[index])
        # Collapse any run of whitespace into a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns (1, max_len) tensors; flatten drops the
            # leading dimension so the DataLoader can stack samples.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
import torch.cuda

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
!nvcc --version
torch.__version__


In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location='cpu'):
    """Restore model and optimizer state from a checkpoint file.

    The file is expected to hold a dict with the keys 'state_dict',
    'optimizer', 'epoch' and 'valid_loss_min'.

    Args:
        checkpoint_fpath: Path of the checkpoint file to read.
        model: Model that receives the saved parameters (updated in place).
        optimizer: Optimizer that receives the saved state (updated in place).
        map_location: Forwarded to ``torch.load``. The default of 'cpu' lets a
            checkpoint written on a GPU machine be read on a CPU-only one;
            ``load_state_dict`` then copies the values into the existing
            parameters of ``model``.

    Returns:
        tuple: (model, optimizer, saved epoch value, saved minimum validation loss).
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint to disk and, when it is the best so far, duplicate it.

    Args:
        state: Checkpoint object to serialize with ``torch.save``.
        is_best: Truthy when this checkpoint should also become the best model.
        checkpoint_path: Destination file for the checkpoint.
        best_model_path: Destination file for the copy made when ``is_best``.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best model is a plain file copy of the checkpoint just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, using BCEWithLogitsLoss defaults.

    Args:
        outputs: Logits produced by the model (no sigmoid applied).
        targets: Float tensor of the same shape holding the 0/1 labels.

    Returns:
        The loss tensor returned by ``torch.nn.BCEWithLogitsLoss``.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Paths handed to train_model as checkpoint_path / best_model_path.
ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"
# Default tokenizer for the shared helpers below; later cells reassign it
# before building their datasets.
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def decode(predicted_values,pv):
    """Turn raw logits into multi-label predictions and print test-set metrics.

    Each row of ``pv`` is passed through a sigmoid and thresholded at 0.5.
    When no label reaches the threshold, the single most probable label is
    selected, so every row ends up with at least one positive label.

    Relies on notebook-level names: ``test_df``, ``target_list``,
    ``classification_report``, ``accuracy_score``,
    ``multilabel_confusion_matrix`` and ``metrics``.

    Args:
        predicted_values: Pre-allocated container with one row per test tweet;
            it is filled in place with the 0/1 predictions.
        pv: Per-tweet logits as NumPy rows, e.g. an ensemble average of the
            raw outputs returned by ``LetsAssess``.

    Returns:
        None. Results are written to ``predicted_values`` and printed.
    """
    for i in range(len(test_df)):
        # The sigmoid runs on the CPU: moving one small row to the GPU and
        # straight back bought nothing.
        probs = torch.sigmoid(torch.from_numpy(pv[i])).numpy()
        predictions = (probs >= 0.5).astype(float)
        if not predictions.any():
            predictions[probs.argmax()] = 1
        predicted_values[i] = predictions

    # Ground truth in target_list order: that is the order the training
    # targets (and therefore the model outputs) use.
    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values,target_names=target_list))

    print("Accuracy score",accuracy_score(y_true, predicted_values))

    # A bare expression inside a function is not displayed, so print it.
    print(multilabel_confusion_matrix(y_true, predicted_values))

    # Jaccard score computed per sample and then averaged.
    average_jaccard = metrics.jaccard_score(y_true, predicted_values, average='samples')
    print("Average Jaccard: {:.3f}".format(average_jaccard))

In [None]:
def LetsAssess(modelTBA):
    """Score ``modelTBA`` on ``test_df`` and print multi-label metrics.

    Tweets are tokenized and run through the model one at a time. Logits go
    through a sigmoid and are thresholded at 0.5; when no label reaches the
    threshold the single most probable label is selected, so every tweet gets
    at least one label.

    Relies on notebook-level names: ``test_df``, ``target_list``,
    ``tokenizer``, ``MAX_LEN``, ``classification_report``, ``accuracy_score``,
    ``multilabel_confusion_matrix`` and ``metrics``.

    Args:
        modelTBA: Module called as
            ``modelTBA(input_ids, attention_mask, token_type_ids)`` that
            returns one logit per entry of ``target_list``. It is moved to the
            GPU when one is available.

    Returns:
        tuple: ``(predicted_values, predicted_raw)``, two float arrays of
        shape ``(len(test_df), len(target_list))`` holding the 0/1
        predictions and the raw logits.
    """
    n_samples, n_labels = test_df.shape[0], len(target_list)
    predicted_values = np.zeros((n_samples, n_labels))
    predicted_raw = np.zeros((n_samples, n_labels))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelTBA.to(device)

    # Evaluate with dropout / batch-norm in inference mode whatever state the
    # caller left the model in, and restore that state afterwards.
    was_training = modelTBA.training
    modelTBA.eval()
    try:
        for i, text in enumerate(test_df['Tweet']):
            # Same text normalization as CustomDataset.__getitem__.
            text = " ".join(str(text).split())
            encodings = tokenizer.encode_plus(
                text,
                None,
                add_special_tokens=True,
                max_length=MAX_LEN,
                padding='max_length',
                return_token_type_ids=True,
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

            input_ids = encodings['input_ids'].to(device, dtype=torch.long)
            attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)

            with torch.no_grad():
                output = modelTBA(input_ids, attention_mask, token_type_ids)

            # Sigmoid + 0.5 threshold, with an arg-max fallback.
            logits = output.squeeze().cpu()
            probs = torch.sigmoid(logits).numpy()
            predictions = (probs >= 0.5).astype(float)
            if not predictions.any():
                predictions[probs.argmax()] = 1

            predicted_values[i] = predictions
            predicted_raw[i] = logits.numpy()
    finally:
        if was_training:
            modelTBA.train()

    # Ground truth in target_list order: that is the order the training
    # targets (and therefore the model outputs) use.
    y_true = test_df[target_list].to_numpy()

    print(classification_report(y_true, predicted_values,target_names=target_list))

    print("Accuracy score",accuracy_score(y_true, predicted_values))

    # A bare expression inside a function is not displayed, so print it.
    print(multilabel_confusion_matrix(y_true, predicted_values))

    # Jaccard score computed per sample and then averaged.
    average_jaccard = metrics.jaccard_score(y_true, predicted_values, average='samples')
    print("Average Jaccard: {:.3f}".format(average_jaccard))
    return predicted_values, predicted_raw


In [None]:
from tqdm import tqdm
# Validation targets and sigmoid outputs collected by train_model.
# NOTE(review): both lists are extended on every epoch of every train_model
# call and never cleared, so they mix epochs (and models) — confirm whether
# anything downstream relies on that before resetting them.
val_targets=[]
val_outputs=[]

def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    """Train ``model`` for ``n_epochs``, validating and test-scoring each epoch.

    Uses the notebook-level ``device``, ``loss_fn`` and ``LetsAssess``, and
    appends every validation batch to the module-level ``val_targets`` /
    ``val_outputs`` lists.

    Args:
        n_epochs: Number of passes over ``training_loader``.
        training_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        validation_loader: Same batch format, used for the validation loss.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer stepping the model's parameters.
        checkpoint_path: Accepted for interface compatibility; checkpoint
            writing is currently disabled, so it is unused.
        best_model_path: Accepted for interface compatibility; unused for the
            same reason.

    Returns:
        The same ``model`` object after training.
    """
    valid_loss_min = np.inf  # np.Inf was removed in NumPy 2.0
    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(tqdm(training_loader)):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after the last batch this already is the
            # average training loss of the epoch.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()
        with torch.no_grad():
            for batch_idx, data in enumerate(tqdm(validation_loader)):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # train_loss / valid_loss are running means already; dividing them by
        # the number of batches again would under-report both.
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # Test-set report for this epoch (the model is still in eval mode).
        LetsAssess(model)

        # Track the best validation loss. Nothing is written to disk here, so
        # the message no longer claims that a model is being saved.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).'.format(valid_loss_min, valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))
    return model

# Models

In [None]:
# The tokenizer class and the checkpoint must belong to the same model family:
# DistilBertTokenizer goes with 'distilbert-base-uncased', matching the
# DistilBERT models trained in the cells that follow.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Alternative BERT-family tokenizers tried earlier:
# tokenizer = BertTokenizer.from_pretrained('CovRelex-SE/CORD19-BERT')
# tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')


# One dataset per split, all encoded with the tokenizer above.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)

# Training batches are reshuffled every epoch; validation order is fixed.
train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel(nn.Module):
    """DistilBERT encoder with a dropout + linear head producing 12 logits.

    Args:
        num_layers: Accepted for signature parity with the other model
            variants; not used here.
        output_sizes: Accepted for signature parity; not used here.
    """

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()
        # DistilBertModel must be paired with a DistilBERT checkpoint; a BERT
        # checkpoint such as 'bert-base-uncased' belongs to a different
        # architecture.
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        # Alternative encoders tried earlier:
        # self.bert = BertModel.from_pretrained('CovRelex-SE/CORD19-BERT')
        # self.bert = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')
        # self.bert = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

        self.hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.4)
        self.clf = nn.Linear(self.hidden_size, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits of shape (batch, 12).

        Args:
            inputs: Token ids.
            mask: Attention mask matching ``inputs``.
            labels: Unused; callers pass token_type_ids in this position.
        """
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # First element of the tuple holds the per-token hidden states; keep
        # the vector of the first token of every sequence.
        x = cls_hs[0][:, 0, :]
        x = self.dropout(x)
        x = self.clf(x)
        return x
# Fine-tune the CovModel defined above for 5 epochs, then score it on test_df.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=5,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesR, predicted_rawR = LetsAssess(model)




In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel(nn.Module):
    """DistilBERT encoder with a dropout (p=0.5) + linear head producing 12 logits."""

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()
        # num_layers and output_sizes are accepted but not used by this variant.
        encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.bert = encoder
        self.hidden_size = encoder.config.hidden_size
        self.dropout = nn.Dropout(0.5)
        self.clf = nn.Linear(self.hidden_size, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits; ``labels`` is unused (callers pass token_type_ids there)."""
        token_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        # Classify from the hidden state of the first token of each sequence.
        first_token = token_states[:, 0, :]
        return self.clf(self.dropout(first_token))
# Fine-tune the DistilBERT CovModel for 5 epochs, then score it on test_df.
# NOTE(review): the results reuse the names predicted_valuesR / predicted_rawR
# from the previous training cell and overwrite them.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=5,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesR, predicted_rawR = LetsAssess(model)


In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel2(nn.Module):
    """DistilBERT encoder + one Transformer layer + parallel Conv1d branches + dense head.

    Args:
        num_layers: Accepted for signature parity with the other model
            variants; not used here.
        output_sizes: Number of output channels of each parallel Conv1d branch.
    """

    def __init__(self, num_layers=4, output_sizes=[64,128]):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.hidden_size = self.bert.config.hidden_size
        self.dropout1 = nn.Dropout(0.3)
        # batch_first=True: the encoder output fed in below is laid out as
        # (batch, seq, hidden). With the default (batch_first=False) the layer
        # would read dim 0 as the sequence and attend across the batch.
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.hidden_size, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.dropout2 = nn.Dropout(0.3)
        self.parallel_layers = torch.nn.ModuleList([torch.nn.Conv1d(self.hidden_size, output_size, kernel_size=5, stride=4) for output_size in output_sizes])
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        # 15 is the pooled sequence length each branch must produce.
        # NOTE(review): that matches an input length of 121-128 tokens,
        # presumably MAX_LEN = 128 — confirm.
        self.dense1 = nn.Linear(sum(output_sizes)*15, 256)
        self.dropout3 = nn.Dropout(0.3)
        self.clf = nn.Linear(256, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits of shape (batch, 12).

        Args:
            inputs: Token ids.
            mask: Attention mask matching ``inputs``.
            labels: Unused; callers pass token_type_ids in this position.
        """
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs[0]  # per-token hidden states, (batch, seq, hidden)
        x = self.dropout1(x)
        # NOTE(review): `mask` is only given to the pretrained encoder; this
        # layer and the convolutions below also see the padded positions.
        x = self.transformer(x)
        x = self.dropout2(x)
        x = x.transpose(1, 2)  # (batch, hidden, seq): Conv1d wants channels first
        parallel_outputs = [self.maxpool(layer(x)) for layer in self.parallel_layers]
        # Concatenate the branches along channels, then flatten per sample.
        x = torch.cat(parallel_outputs, dim=1).flatten(1)
        x = F.relu(self.dense1(x))
        x = self.dropout3(x)
        x = self.clf(x)
        return x
# Fine-tune CovModel2 for 5 epochs, then score it on test_df.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel2().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=5,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesA, predicted_rawA = LetsAssess(model)


In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel(nn.Module):
    """DistilBERT encoder with a dropout (p=0.5) + linear head producing 12 logits."""

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()
        # num_layers and output_sizes are accepted but not used by this variant.
        encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.bert = encoder
        self.hidden_size = encoder.config.hidden_size
        self.dropout = nn.Dropout(0.5)
        self.clf = nn.Linear(self.hidden_size, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits; ``labels`` is unused (callers pass token_type_ids there)."""
        token_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        # Classify from the hidden state of the first token of each sequence.
        first_token = token_states[:, 0, :]
        return self.clf(self.dropout(first_token))
# Fine-tune the DistilBERT CovModel for 5 epochs, then score it on test_df.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=5,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesB, predicted_rawB = LetsAssess(model)


In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel(nn.Module):
    """DistilBERT encoder followed by BatchNorm, dropout and two linear layers (12 logits)."""

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()
        # num_layers and output_sizes are accepted but not used by this variant.
        encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.bert = encoder
        self.hidden_size = encoder.config.hidden_size
        self.norm = nn.BatchNorm1d(encoder.config.hidden_size)
        self.dropout = nn.Dropout(0.3)
        self.clf2 = nn.Linear(self.hidden_size, 256)
        self.clf = nn.Linear(256, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits; ``labels`` is unused (callers pass token_type_ids there)."""
        token_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        # Normalize the hidden state of the first token, then regularize.
        features = self.dropout(self.norm(token_states[:, 0, :]))
        # NOTE(review): clf2 feeds clf with no activation in between, so the
        # two layers compose to a single linear map — confirm that is intended.
        return self.clf(self.clf2(features))
# Fine-tune the BatchNorm CovModel variant for 20 epochs, then score it on test_df.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=20,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesT, predicted_rawT = LetsAssess(model)


In [None]:
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
# Switch to the BERT tokenizer and rebuild every dataset / loader with it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One dataset per split, all encoded with the tokenizer above.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are reshuffled every epoch; validation order is fixed.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel3(nn.Module):
    """BERT ('bert-base-uncased') encoder with a dropout (p=0.5) + linear head producing 12 logits."""

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()
        # num_layers and output_sizes are accepted but not used by this variant.
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        self.hidden_size = encoder.config.hidden_size
        self.dropout = nn.Dropout(0.5)
        self.clf = nn.Linear(self.hidden_size, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits; ``labels`` is unused (callers pass token_type_ids there)."""
        token_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        # Classify from the hidden state of the first token of each sequence.
        first_token = token_states[:, 0, :]
        return self.clf(self.dropout(first_token))
# Fine-tune CovModel3 for 5 epochs, then score it on test_df.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel3().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=5,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesC, predicted_rawC = LetsAssess(model)


In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel4(nn.Module):
    """CORD19-BERT encoder + one Transformer layer + parallel Conv1d branches + dense head.

    Args:
        num_layers: Accepted for signature parity with the other model
            variants; not used here.
        output_sizes: Number of output channels of each parallel Conv1d branch.
    """

    def __init__(self, num_layers=4, output_sizes=[64,128]):
        super().__init__()
        self.bert = BertModel.from_pretrained('CovRelex-SE/CORD19-BERT')
        self.hidden_size = self.bert.config.hidden_size
        self.dropout1 = nn.Dropout(0.3)
        # batch_first=True: the encoder output fed in below is laid out as
        # (batch, seq, hidden). With the default (batch_first=False) the layer
        # would read dim 0 as the sequence and attend across the batch.
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.hidden_size, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.dropout2 = nn.Dropout(0.3)
        self.parallel_layers = torch.nn.ModuleList([torch.nn.Conv1d(self.hidden_size, output_size, kernel_size=5, stride=4) for output_size in output_sizes])
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        # 15 is the pooled sequence length each branch must produce.
        # NOTE(review): that matches an input length of 121-128 tokens,
        # presumably MAX_LEN = 128 — confirm.
        self.dense1 = nn.Linear(sum(output_sizes)*15, 256)
        self.dropout3 = nn.Dropout(0.3)
        self.clf = nn.Linear(256, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits of shape (batch, 12).

        Args:
            inputs: Token ids.
            mask: Attention mask matching ``inputs``.
            labels: Unused; callers pass token_type_ids in this position.
        """
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs[0]  # per-token hidden states, (batch, seq, hidden)
        x = self.dropout1(x)
        # NOTE(review): `mask` is only given to the pretrained encoder; this
        # layer and the convolutions below also see the padded positions.
        x = self.transformer(x)
        x = self.dropout2(x)
        x = x.transpose(1, 2)  # (batch, hidden, seq): Conv1d wants channels first
        parallel_outputs = [self.maxpool(layer(x)) for layer in self.parallel_layers]
        # Concatenate the branches along channels, then flatten per sample.
        x = torch.cat(parallel_outputs, dim=1).flatten(1)
        x = F.relu(self.dense1(x))
        x = self.dropout3(x)
        x = self.clf(x)
        return x
    
# Fine-tune CovModel4 for 5 epochs, then score it on test_df.
# NOTE(review): when cells run in order, the loaders in scope were built with
# the 'bert-base-uncased' tokenizer while CovModel4 loads
# 'CovRelex-SE/CORD19-BERT' — confirm the two share a vocabulary.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel4().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=5,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesD, predicted_rawD = LetsAssess(model)


In [None]:
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
# Switch to the SciBERT tokenizer and rebuild every dataset / loader with it.
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

# One dataset per split, all encoded with the tokenizer above.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are reshuffled every epoch; validation order is fixed.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel5(nn.Module):
    """SciBERT encoder with a dropout (p=0.5) + linear head producing 12 logits."""

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        super().__init__()
        # num_layers and output_sizes are accepted but not used by this variant.
        encoder = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')
        self.bert = encoder
        self.hidden_size = encoder.config.hidden_size
        self.dropout = nn.Dropout(0.5)
        self.clf = nn.Linear(self.hidden_size, 12)  # one logit per label

    def forward(self, inputs, mask, labels):
        """Return raw logits; ``labels`` is unused (callers pass token_type_ids there)."""
        token_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        # Classify from the hidden state of the first token of each sequence.
        first_token = token_states[:, 0, :]
        return self.clf(self.dropout(first_token))
# Fine-tune CovModel5 for 5 epochs, then score it on test_df.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = CovModel5().to(device)
print(model)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    n_epochs=5,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)
predicted_valuesE, predicted_rawE = LetsAssess(model)


In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel6(nn.Module):
    """SciBERT encoder + one transformer layer + parallel strided convolutions + dense head.

    ``forward`` returns 12 raw scores per example (no activation is applied).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128)):
        """Create all sub-modules.

        Args:
            num_layers: Not read by this class.
            output_sizes: Output channel count of each parallel ``Conv1d`` branch.
                Any iterable of ints works; the default is a tuple so that no
                mutable object is shared between instances.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.dropout1 = nn.Dropout(0.3)
        # batch_first=True is required: forward() feeds this layer the BERT
        # output laid out as (batch, seq, hidden). With the PyTorch default
        # (batch_first=False) dim 0 is treated as the sequence axis, so attention
        # would mix the examples of a batch instead of the tokens of a tweet.
        # Parameter names/shapes are unchanged, but weights trained with the
        # old layout should be retrained.
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.hidden_size, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.dropout2 = nn.Dropout(0.3)
        self.parallel_layers = torch.nn.ModuleList(
            [torch.nn.Conv1d(self.hidden_size, output_size, kernel_size=5, stride=4) for output_size in output_sizes]
        )
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        # 15 is the pooled length per branch: Conv1d(k=5, s=4) followed by
        # MaxPool1d(k=2, s=2) yields 15 positions for inputs of 121-128 tokens.
        self.dense1 = nn.Linear(sum(output_sizes) * 15, 256)
        self.dropout3 = nn.Dropout(0.3)
        self.clf = nn.Linear(256, 12)

    def forward(self, inputs, mask, labels):
        """Compute the 12 output scores for a batch.

        Args:
            inputs: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded to the encoder as ``attention_mask``.
            labels: Not read by this method.

        Returns:
            Tensor with one row of 12 scores per example.
        """
        token_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        x = self.dropout1(token_states)
        # NOTE(review): `mask` is not forwarded to self.transformer, so padded
        # positions take part in its attention - confirm whether a
        # src_key_padding_mask should be passed here.
        x = self.transformer(x)
        x = self.dropout2(x)
        x = x.transpose(1, 2)  # Conv1d expects (batch, channels, seq)
        branch_outputs = [self.maxpool(conv(x)) for conv in self.parallel_layers]
        # Stack the branches along the channel axis, then flatten per example.
        x = torch.cat(branch_outputs, dim=1).flatten(start_dim=1)
        x = F.relu(self.dense1(x))
        x = self.dropout3(x)
        return self.clf(x)
    
# Train CovModel6 via train_model, then evaluate it with LetsAssess; the
# results are stored under the "F" suffix.
# NOTE(review): the checkpoint path "modelA.pt" is shared with the other
# training cells in this part of the notebook - confirm that is intended.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = CovModel6()
model.to(device)
print(model)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    5,  # presumably the epoch count - verify against train_model
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
)
predicted_valuesF, predicted_rawF = LetsAssess(model)


In [None]:
# Rebuild the datasets and loaders with the BioBERT tokenizer checkpoint.
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

# One CustomDataset per split, all sharing the tokenizer and MAX_LEN.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation keeps the dataset order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel7(nn.Module):
    """BioBERT encoder with a dropout + linear head on the position-0 hidden state.

    ``forward`` returns 12 raw scores per example (no activation is applied).
    """

    def __init__(self, num_layers=4, output_sizes=[64,128,256]):
        """Load the pretrained encoder and create the head.

        Neither ``num_layers`` nor ``output_sizes`` is read by this class.
        """
        super().__init__()
        encoder = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
        self.bert = encoder
        self.hidden_size = encoder.config.hidden_size
        self.dropout = nn.Dropout(0.5)
        self.clf = nn.Linear(self.hidden_size, 12)

    def forward(self, inputs, mask, labels):
        """Score a batch from the encoder's position-0 hidden state.

        Args:
            inputs: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded to the encoder as ``attention_mask``.
            labels: Not read by this method.

        Returns:
            The output of ``self.clf`` for every example.
        """
        encoder_outputs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        # Element 0 of the output tuple holds the per-token states; index 0 along
        # dim 1 selects the first position (the [CLS] slot for BERT tokenizers).
        first_position = encoder_outputs[0][:, 0, :]
        return self.clf(self.dropout(first_position))
# Train CovModel7 via train_model, then evaluate it with LetsAssess; the
# results are stored under the "G" suffix.
# NOTE(review): the checkpoint path "modelA.pt" is shared with the other
# training cells in this part of the notebook - confirm that is intended.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = CovModel7()
model.to(device)
print(model)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    5,  # presumably the epoch count - verify against train_model
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
)
predicted_valuesG, predicted_rawG = LetsAssess(model)


In [None]:
# Rebuild the datasets and loaders with the bert-base-uncased tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One CustomDataset per split, all sharing the tokenizer and MAX_LEN.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation keeps the dataset order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

import torch.nn.functional as F
class CovModel8(nn.Module):
    """bert-base-uncased encoder + one transformer layer + parallel strided convolutions + dense head.

    ``forward`` returns 12 raw scores per example (no activation is applied).
    """

    def __init__(self, num_layers=4, output_sizes=(64, 128)):
        """Create all sub-modules.

        Args:
            num_layers: Not read by this class.
            output_sizes: Output channel count of each parallel ``Conv1d`` branch.
                Any iterable of ints works; the default is a tuple so that no
                mutable object is shared between instances.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.dropout1 = nn.Dropout(0.3)
        # batch_first=True is required: forward() feeds this layer the BERT
        # output laid out as (batch, seq, hidden). With the PyTorch default
        # (batch_first=False) dim 0 is treated as the sequence axis, so attention
        # would mix the examples of a batch instead of the tokens of a tweet.
        # Parameter names/shapes are unchanged, but weights trained with the
        # old layout should be retrained.
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.hidden_size, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.dropout2 = nn.Dropout(0.3)
        self.parallel_layers = torch.nn.ModuleList(
            [torch.nn.Conv1d(self.hidden_size, output_size, kernel_size=5, stride=4) for output_size in output_sizes]
        )
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        # 15 is the pooled length per branch: Conv1d(k=5, s=4) followed by
        # MaxPool1d(k=2, s=2) yields 15 positions for inputs of 121-128 tokens.
        self.dense1 = nn.Linear(sum(output_sizes) * 15, 256)
        self.dropout3 = nn.Dropout(0.3)
        self.clf = nn.Linear(256, 12)

    def forward(self, inputs, mask, labels):
        """Compute the 12 output scores for a batch.

        Args:
            inputs: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded to the encoder as ``attention_mask``.
            labels: Not read by this method.

        Returns:
            Tensor with one row of 12 scores per example.
        """
        token_states = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        x = self.dropout1(token_states)
        # NOTE(review): `mask` is not forwarded to self.transformer, so padded
        # positions take part in its attention - confirm whether a
        # src_key_padding_mask should be passed here.
        x = self.transformer(x)
        x = self.dropout2(x)
        x = x.transpose(1, 2)  # Conv1d expects (batch, channels, seq)
        branch_outputs = [self.maxpool(conv(x)) for conv in self.parallel_layers]
        # Stack the branches along the channel axis, then flatten per example.
        x = torch.cat(branch_outputs, dim=1).flatten(start_dim=1)
        x = F.relu(self.dense1(x))
        x = self.dropout3(x)
        return self.clf(x)
    
# Train CovModel8 via train_model, then evaluate it with LetsAssess; the
# results are stored under the "H" suffix.
# NOTE(review): the checkpoint path "modelA.pt" is shared with the other
# training cells in this part of the notebook - confirm that is intended.
best_model_path = "modelA.pt"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = CovModel8()
model.to(device)
print(model)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-6)

model = train_model(
    5,  # presumably the epoch count - verify against train_model
    train_data_loader,
    val_data_loader,
    model,
    optimizer,
    ckpt_path,
    best_model_path,
)
predicted_valuesH, predicted_rawH = LetsAssess(model)


# Testing 

In [None]:
# Ensemble evaluation: average the raw outputs of the selected models, turn
# them into multi-hot predictions and report multi-label metrics.
# Edit this list to try a different ensemble.
ensemble_members = [predicted_rawB, predicted_rawD, predicted_rawC, predicted_rawF, predicted_rawG]
# np.mean keeps the divisor equal to the number of members; a hand-written
# divisor drifts out of sync when members are added or removed.
pv = np.mean(ensemble_members, axis=0)

# Sigmoid over the whole matrix at once, then threshold at 0.5.
probs = torch.sigmoid(torch.from_numpy(pv)).numpy()
predicted_values = (probs >= 0.5).astype(float)

# Guarantee at least one label per row: rows with nothing above the threshold
# receive their single most probable class.
unlabeled_rows = np.flatnonzero(predicted_values.sum(axis=1) == 0)
predicted_values[unlabeled_rows, probs[unlabeled_rows].argmax(axis=1)] = 1

label_columns = ['unnecessary','mandatory','pharma','conspiracy','political','country','rushed','ingredients','side-effect','ineffective','religious','none']
y_true = test_df[label_columns].to_numpy()

print(classification_report(y_true, predicted_values, target_names=target_list))

print("Accuracy score", accuracy_score(y_true, predicted_values))

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so the matrix would never be shown.
print(multilabel_confusion_matrix(y_true, predicted_values))

# Jaccard score of every sample, then the mean over samples.
sample_jaccard_scores = [metrics.jaccard_score(y_true[i], predicted_values[i]) for i in range(len(y_true))]
average_jaccard = np.mean(sample_jaccard_scores)
print("Average Jaccard: {:.3f}".format(average_jaccard))



In [None]:
import numpy as np

# # Save the arrays to .npy files
# np.save('predicted_rawA.npy', predicted_rawA)
# np.save('predicted_rawB.npy', predicted_rawB)
# np.save('predicted_rawC.npy', predicted_rawC)

# Load the arrays from .npy files
# predicted_rawA = np.load('predicted_rawA.npy')
# predicted_rawB = np.load('predicted_rawB.npy')
# predicted_rawC = np.load('predicted_rawC.npy')

In [None]:
from itertools import combinations
from sklearn.metrics import accuracy_score
import torch
import numpy as np

# Exhaustive search over every non-empty subset of model outputs for the
# averaged ensemble with the highest accuracy_score against y_true.
# NOTE(review): y_true is not defined in this cell and must already be in
# scope; the subset is also selected on the same labels it is scored on -
# confirm that this is intended.
arrays = {'A1':predicted_rawA,'B1': predicted_rawB,'C1': predicted_rawC,'D1': predicted_rawD,'E1': predicted_rawE, 'F1':predicted_rawF,'G1': predicted_rawG, 'H1':predicted_rawH, 'T1':predicted_rawT,'R1':predicted_rawR}
best_score = 0
best_combination = None
sigmoid = torch.nn.Sigmoid()

for subset_size in range(1, len(arrays) + 1):
    for member_names in combinations(arrays, subset_size):
        avg = np.mean([arrays[member] for member in member_names], axis=0)
        probs = sigmoid(torch.from_numpy(avg).to(device)).cpu().numpy()
        predictions = (probs >= 0.5).astype(float)

        # Rows with no label above the threshold get their most probable class.
        unlabeled_rows = np.flatnonzero(~predictions.any(axis=1))
        predictions[unlabeled_rows, probs[unlabeled_rows].argmax(axis=1)] = 1

        score = accuracy_score(y_true, predictions)
        if score > best_score:
            best_score, best_combination = score, list(member_names)

print("Best score:", best_score)
print("Best combination:", best_combination)