In [1]:
import json
from json import loads,dumps
import ast

import pickle
import pandas as pd
import numpy as np
from PIL import Image

import os
import time
import math
from sklearn.metrics import roc_auc_score, confusion_matrix, matthews_corrcoef, roc_curve, auc, average_precision_score
from scipy.special import softmax
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss

from transformers import BeitForImageClassification, BeitFeatureExtractor, Trainer, TrainingArguments
from transformers import FlavaConfig, FlavaModel, FlavaForPreTraining
from transformers import VisionEncoderDecoderModel, AutoTokenizer
from transformers import CLIPProcessor, CLIPModel, CLIPFeatureExtractor
from transformers import TrOCRProcessor, DebertaV2ForSequenceClassification, DebertaV2Config
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor
from transformers import AutoProcessor, GitVisionModel, AutoModelForCausalLM
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import BertPreTrainedModel, RobertaConfig, RobertaTokenizerFast

from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from transformers.models.roberta.modeling_roberta import RobertaClassificationHead, RobertaConfig, RobertaModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.version.cuda)

11.8


device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
caption_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
caption_feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
caption_model.to(device)

max_length = 32
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
    images = []
    for image_path in image_paths:
        i_image = Image.open(image_path)
        if i_image.mode != "RGB":
            i_image = i_image.convert(mode="RGB")

        images.append(i_image)

    pixel_values = caption_feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = caption_model.generate(pixel_values, **gen_kwargs)

    preds = caption_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

predict_step(['hm_data/img/01236.png'])

processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")

model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

image = Image.open('hm_data/img/01236.png')

pixel_values = processor(images=image, return_tensors="pt").pixel_values

generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(generated_caption)

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer_sent = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model_sent = AutoModelForSequenceClassification.from_pretrained(MODEL)
# #model.save_pretrained(MODEL)
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)
# scores = output[0][0].detach().numpy()
# scores = softmax(scores)
# ranking = np.argsort(scores)
# ranking = ranking[::-1]
# for i in range(scores.shape[0]):
#     l = config.id2label[ranking[i]]
    
sentiment_task = pipeline("sentiment-analysis", model=model_sent, tokenizer=tokenizer_sent)


print(sentiment_task("a man with his head turned to the side of the road."))

img_dir = "hm_data/img"

with open("train.jsonl",encoding='utf8') as f:
    data = [json.loads(line) for line in f]

inc = 1

for img in data:
    img['text-sentiment'] = sentiment_task(img['text'])[0]['label']
    image = Image.open('hm_data/'+img['img'])
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    img['caption'] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    img['caption-sentiment'] = sentiment_task(img['caption'])[0]['label']
    inc += 1
    if inc%250 == 0:
        print(inc)
#     print(img)
with open('train_sentiment.jsonl','w',encoding='utf8') as f:
    for item in data:
        json.dump(item,f)
        print(file=f)

with open("train_dev_all.entity.jsonl",encoding='utf8') as m:
    datah = [json.loads(line) for line in m]

dfdata = pd.DataFrame(data)
dfdatah = pd.DataFrame(datah)

dfdata.head()

dfdata.shape

dfdatah.head()

dfmerge = dfdata.merge(dfdatah[['img', 'partition_description']], how = 'left', on='img')

dfmerge.shape

dfmerge.head()

# https://stackoverflow.com/questions/17864466/flatten-a-list-of-strings-and-lists-of-strings-and-lists-in-python
def flatten_to_strings(listOfLists):
    """Flatten a list of (lists of (lists of strings)) for any level 
    of nesting"""
    result = []

    for i in listOfLists:
        # Only append if i is a basestring (superclass of string)
        if isinstance(i, str):
            result.append(i)
        # Otherwise call this function recursively
        else:
            result.extend(flatten_to_strings(i))
    return result


dfmerge['partition_description'] = dfmerge['partition_description'].apply(flatten_to_strings)

dfmerge.head()

dfmerge['partition_description'] = ",".join(dfmerge['partition_description'][0])
dfmerge.head()

dfmerge['texty'] = dfmerge['text'] + "[SEP]" + dfmerge['text-sentiment'] + "[SEP]" + dfmerge['caption'] + "[SEP]" + dfmerge['caption-sentiment'] + "[SEP]" + dfmerge['partition_description']

dfmerge.head()

outs = dfmerge[['id','img','label','texty']].to_dict('records')

with open('train_text.jsonl', "w",encoding='utf8') as f:
    for i in range(len(outs)):
        f.write(str(outs[i])+"\n")

# print(outs)

img_dir = "hm_data/img"

with open("dev_seen.jsonl",encoding='utf8') as f:
    data = [json.loads(line) for line in f]

inc = 1

for img in data:
    img['text-sentiment'] = sentiment_task(img['text'])[0]['label']
    image = Image.open('hm_data/'+img['img'])
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    img['caption'] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    img['caption-sentiment'] = sentiment_task(img['caption'])[0]['label']
    inc += 1
    if inc%250 == 0:
        print(inc)
#     print(img)
with open('dev_seen_sentiment.jsonl','w',encoding='utf8') as f:
    for item in data:
        json.dump(item,f)
        print(file=f)

dfdata = pd.DataFrame(data)
dfmerge = dfdata.merge(dfdatah[['img', 'partition_description']], how = 'left', on='img')
dfmerge['partition_description'] = dfmerge['partition_description'].apply(flatten_to_strings)
dfmerge['partition_description'] = ",".join(dfmerge['partition_description'][0])
dfmerge['texty'] = dfmerge['text'] + "[SEP]" + dfmerge['text-sentiment'] + "[SEP]" + dfmerge['caption'] + "[SEP]" + dfmerge['caption-sentiment'] + "[SEP]" + dfmerge['partition_description']
outs = dfmerge[['id','img','label','texty']].to_dict('records')

with open('dev_seen_text.jsonl', "w",encoding='utf8') as f:
    for i in range(len(outs)):
        f.write(str(outs[i])+"\n")

img_dir = "hm_data/img"

with open("dev_unseen.jsonl",encoding='utf8') as f:
    data = [json.loads(line) for line in f]

inc = 1

for img in data:
    img['text-sentiment'] = sentiment_task(img['text'])[0]['label']
    image = Image.open('hm_data/'+img['img'])
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    img['caption'] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    img['caption-sentiment'] = sentiment_task(img['caption'])[0]['label']
    inc += 1
    if inc%250 == 0:
        print(inc)
#     print(img)
with open('dev_unseen_sentiment.jsonl','w',encoding='utf8') as f:
    for item in data:
        json.dump(item,f)
        print(file=f)

dfdata = pd.DataFrame(data)
dfmerge = dfdata.merge(dfdatah[['img', 'partition_description']], how = 'left', on='img')
dfmerge['partition_description'] = dfmerge['partition_description'].apply(flatten_to_strings)
dfmerge['partition_description'] = ",".join(dfmerge['partition_description'][0])
dfmerge['texty'] = dfmerge['text'] + "[SEP]" + dfmerge['text-sentiment'] + "[SEP]" + dfmerge['caption'] + "[SEP]" + dfmerge['caption-sentiment'] + "[SEP]" + dfmerge['partition_description']
outs = dfmerge[['id','img','label','texty']].to_dict('records')

with open('dev_unseen_text.jsonl', "w",encoding='utf8') as f:
    for i in range(len(outs)):
        f.write(str(outs[i])+"\n")

In [3]:
# https://github.com/pchanda/pchanda.github.io/blob/master/_posts/2021-04-15-Roberta-FineTuning-for-Classification.md

In [4]:
with open("train_text.jsonl", "r", encoding='utf8') as f:
    train_df = pd.DataFrame([ast.literal_eval(line) for line in f.readlines()])
train_df.head()

Unnamed: 0,id,img,label,texty
0,42953,img/42953.png,0,its their character not their color that matte...
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet[SEP]neutral[SEP]a gra...
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [5]:
def county(st):
    return st.count(' ')

In [6]:
max(train_df['texty'].apply(county))

88

In [7]:
with open("dev_seen_text.jsonl", "r", encoding='utf8') as f:
    test_df = pd.DataFrame([ast.literal_eval(line) for line in f.readlines()])
test_df.head()

Unnamed: 0,id,img,label,texty
0,8291,img/08291.png,1,white people is this a shooting range[SEP]nega...
1,46971,img/46971.png,1,bravery at its finest[SEP]negative[SEP]a man w...
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...
4,80243,img/80243.png,1,mississippi wind chime[SEP]neutral[SEP]a poste...


In [8]:
# model_name = 'pchanda/pretrained-smiles-pubchem10m'
model_name = "microsoft/deberta-large"
num_labels = 2
device = torch.device("cuda")

tokenizer_name = model_name

max_seq_length = 100 
train_batch_size = 8
test_batch_size = 8
warmup_ratio = 0.06
weight_decay= 2e-05#1e-04 # from .0
gradient_accumulation_steps = 1
num_train_epochs = 25
learning_rate = 1e-05 
adam_epsilon = 1e-05#1e-04 # from 1e-08

num_hidden_layers = 12
num_attention_heads = 12

In [9]:
class RobertaForSmilesClassification(BertPreTrainedModel):
    
    def __init__(self, config):
        super(RobertaForSmilesClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        
        
    def forward(self, input_ids, attention_mask, labels):
        outputs = self.roberta(input_ids,attention_mask=attention_mask)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]
        
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)


In [10]:
config_class = DebertaV2Config
# model_class = RobertaForSmilesClassification
model_class = DebertaV2ForSequenceClassification

# tokenizer_class = RobertaTokenizerFast
tokenizer_class = AutoTokenizer

config = config_class.from_pretrained(model_name, num_labels=num_labels)#, num_hidden_layers = num_hidden_layers,num_attention_heads=num_attention_heads)

model = model_class.from_pretrained(model_name, config=config)
print('Model=\n',model,'\n')

tokenizer = tokenizer_class.from_pretrained(tokenizer_name, do_lower_case=False)
print('Tokenizer=',tokenizer,'\n')


You are using a model of type deberta to instantiate a model of type deberta-v2. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaV2ForSequenceClassification: ['deberta.encoder.layer.14.attention.self.pos_proj.weight', 'deberta.encoder.layer.13.attention.self.pos_proj.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'deberta.encoder.layer.6.attention.self.v_bias', 'deberta.encoder.layer.16.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.14.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.6.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.11.attention.self.pos_q_proj.weight', 'lm_predictions.lm_head.dense.weight', 'deberta.encoder.layer.7.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.10.attention.self.pos_proj.weight', 'deberta.encoder.layer.8.attention.self.pos_proj.weight', 'deberta.encoder.layer.8.attention.self.pos_

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['deberta.encoder.layer.5.attention.self.pos_query_proj.weight', 'deberta.encoder.layer.14.attention.self.pos_key_proj.bias', 'deberta.encoder.layer.6.attention.self.value_proj.weight', 'deberta.encoder.layer.16.attention.self.query_proj.weight', 'deberta.encoder.layer.13.attention.self.value_proj.weight', 'deberta.encoder.layer.5.attention.self.key_proj.bias', 'deberta.encoder.layer.20.attention.self.query_proj.weight', 'deberta.encoder.layer.10.attention.self.value_proj.bias', 'deberta.encoder.layer.2.attention.self.pos_key_proj.bias', 'deberta.encoder.layer.0.attention.self.key_proj.bias', 'deberta.encoder.layer.14.attention.self.value_proj.weight', 'deberta.encoder.layer.11.attention.self.query_proj.weight', 'deberta.encoder.layer.21.attention.self.query_proj.weight', 'deberta.encoder.layer.13.attention.self.pos_query_proj.bias', 'd

Model=
 DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (pos_key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
      

In [11]:
class MyClassificationDataset(Dataset):
    
    def __init__(self, data, tokenizer):
        text, labels = data
        self.examples = tokenizer(text=text,text_pair=None,truncation=True,padding="max_length",
                                  max_length=max_seq_length,return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.long)
        

    def __len__(self):
        return len(self.examples["input_ids"])

    def __getitem__(self, index):
        return {key: self.examples[key][index] for key in self.examples}, self.labels[index]


train_examples = (train_df.iloc[:, 3].astype(str).replace('[SEP]',' [SEP] ').tolist(), train_df.iloc[:, 2].tolist())
train_dataset = MyClassificationDataset(train_examples,tokenizer)

test_examples = (test_df.iloc[:, 3].astype(str).replace('[SEP]',' [SEP] ').tolist(), test_df.iloc[:, 2].tolist())
test_dataset = MyClassificationDataset(test_examples,tokenizer)

In [12]:
def get_inputs_dict(batch):
    inputs = {key: value.squeeze(1).to(device) for key, value in batch[0].items()}
    inputs["labels"] = batch[1].to(device)
    return inputs

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,sampler=train_sampler,batch_size=train_batch_size)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=test_batch_size)

#Extract a batch as sanity-check
batch = get_inputs_dict(next(iter(train_dataloader)))
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)

print(batch)

{'input_ids': tensor([[    1,  2527,    47,  4161,     7, 11619,   930,    53,    47,   214,
         22169,    30,    22, 29126,    24,    18,  2569,   751,   113,     2,
         33407,     2,   102,   313,  1826,    10, 18896,    11,   760,     9,
            10, 18896,     4,     2, 12516,     2,   597, 27015,  2549,     6,
          3628,     6, 43457,  2943, 24876,     6,   397,     2,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    1,  8310,    89,    18,    10,  2173,     2, 12516,     2,   102,
          9843,  1580,     9,  2356,     9,  8676,  1311,    10,  1901,     4,
             2, 12516,     2,   597, 

In [13]:
t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
optimizer_grouped_parameters = []
custom_parameter_names = set()
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters.extend(
    [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if n not in custom_parameter_names and not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if n not in custom_parameter_names and any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
)

warmup_steps = math.ceil(t_total * warmup_ratio)
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)



In [14]:
def compute_metrics(preds, model_outputs, labels, eval_examples=None, multi_label=False):
    assert len(preds) == len(labels)
    mismatched = labels != preds
    wrong = [i for (i, v) in zip(eval_examples, mismatched) if v.any()]
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    scores = np.array([softmax(element)[1] for element in model_outputs])
    fpr, tpr, thresholds = roc_curve(labels, scores)
    auroc = auc(fpr, tpr)
    auprc = average_precision_score(labels, scores)
    return (
        {
            **{"mcc": mcc, "tp": tp, "tn": tn, "fp": fp, "fn": fn, "auroc": auroc, "auprc": auprc},
        },
        wrong,
    )

def print_confusion_matrix(result):
    print('confusion matrix:')
    print('            predicted    ')
    print('          0     |     1')
    print('    ----------------------')
    print('   0 | ',format(result['tn'],'5d'),' | ',format(result['fp'],'5d'))
    print('gt -----------------------')
    print('   1 | ',format(result['fn'],'5d'),' | ',format(result['tp'],'5d'))
    print('---------------------------------------------------')

In [None]:
model.to(device)

model.zero_grad()

for epoch in range(num_train_epochs):

    model.train()
    epoch_loss = []
    
    for batch in train_dataloader:
        batch = get_inputs_dict(batch)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        epoch_loss.append(loss.item())
        
    #evaluate model with test_df at the end of the epoch.
    eval_loss = 0.0
    nb_eval_steps = 0
    n_batches = len(test_dataloader)
    preds = np.empty((len(test_dataset), num_labels))
    out_label_ids = np.empty((len(test_dataset)))
    model.eval()
    
    for i,test_batch in enumerate(test_dataloader):
        with torch.no_grad():
            test_batch = get_inputs_dict(test_batch)
            input_ids = test_batch['input_ids'].to(device)
            attention_mask = test_batch['attention_mask'].to(device)
            labels = test_batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.item()
            
        nb_eval_steps += 1
        start_index = test_batch_size * i
        end_index = start_index + test_batch_size if i != (n_batches - 1) else len(test_dataset)
        preds[start_index:end_index] = logits.detach().cpu().numpy()
        out_label_ids[start_index:end_index] = test_batch["labels"].detach().cpu().numpy()
        
    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=1)
    result, wrong = compute_metrics(preds, model_outputs, out_label_ids, test_examples)
    
    print('epoch',epoch,'Training avg loss',np.mean(epoch_loss))
    print('epoch',epoch,'Testing  avg loss',eval_loss)
    print(result)
    print_confusion_matrix(result)
    print('---------------------------------------------------\n')

epoch 0 Training avg loss 0.6545800213839575
epoch 0 Testing  avg loss 0.7509521860924978
{'mcc': 0.0, 'tp': 0, 'tn': 253, 'fp': 0, 'fn': 247, 'auroc': 0.5011441647597255, 'auprc': 0.4924028614603228}
confusion matrix:
            predicted    
          0     |     1
    ----------------------
   0 |    253  |      0
gt -----------------------
   1 |    247  |      0
---------------------------------------------------
---------------------------------------------------

epoch 1 Training avg loss 0.6558560478384498
epoch 1 Testing  avg loss 0.7349526063790397
{'mcc': 0.0, 'tp': 0, 'tn': 253, 'fp': 0, 'fn': 247, 'auroc': 0.5087132547086781, 'auprc': 0.5098741806397499}
confusion matrix:
            predicted    
          0     |     1
    ----------------------
   0 |    253  |      0
gt -----------------------
   1 |    247  |      0
---------------------------------------------------
---------------------------------------------------

epoch 2 Training avg loss 0.6535120451102701
epo