# Data

In [None]:
from google.colab import files, drive
import pandas as pd
import re
from tqdm.auto import tqdm
import json


drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the tab-separated QA table from Google Drive into df.
# NOTE(review): the export cell in the "csv" section writes to './drive/My Drive/qa_data.csv'
# (no 'work/' folder) — confirm this is the intended file.
df = pd.read_csv('./drive/My Drive/work/qa_data.csv', sep='\t', encoding='utf-8')

In [None]:
# Parse the instruction records from the JSON file on Google Drive into js_frame.
# NOTE(review): the export cell in the "json" section writes the file directly under
# './drive/My Drive/' (no 'd/' folder) — confirm this is the intended file.
with open('./drive/My Drive/d/SeqDataSetAllData_TestDataExcluded_Flair_train.json', 'r', encoding='utf-8') as f:
  js_frame = json.load(f)

Prefix for few-shot learning:

In [None]:
# Row positions in df that serve as few-shot demonstrations.
# Alternative set used for the t5-large experiment: [0, 150, 317, 321, 499, 3, 539, 561, 562]
inds = [150, 321, 3, 0]

# Each demonstration is a question / objects / aspect block followed by a blank line;
# columns 0..3 of df are read positionally.
demo_blocks = []
for row_pos in inds:
  demo = (
      f"What are the objects and the aspect of comparison in the sentence '{df.iloc[row_pos, 0]}'\n"
      f"Objects: '{df.iloc[row_pos, 1]}', '{df.iloc[row_pos, 2]}'\n"
      f"Aspect: {df.iloc[row_pos, 3]}"
  )
  demo_blocks.append(demo + '\n\n')
full_text = ''.join(demo_blocks)


## Split into sentence, obj1, obj2, aspect

In [None]:
# Read the whole TSV; blank lines separate sentences, and each sentence
# becomes a list of its raw lines.
with open('./drive/My Drive/SeqDataSetAllData_TestDataExcluded_Flair_train.tsv', encoding='utf-8') as f:
  raw_tsv = f.read()

# The final chunk of the split is discarded unconditionally.
train = [chunk.split('\n') for chunk in raw_tsv.split('\n\n')][:-1]

In [None]:
# Parallel lists: one entry per processed sentence.
sentences, objects1, objects2, aspects = [], [], [], []

# Only the first 500 sentences of `train` are processed.
subset = train[:500]
for token_lines in tqdm(subset, total=len(subset)):
  words, first_obj, second_obj, aspect_words = [], [], [], []
  for line in token_lines:
    fields = line.split('\t')
    token, tag = fields[0], fields[1]
    words.append(token)
    # 'SHARED' tokens belong to both objects.
    if tag in ('OBJ-1', 'SHARED'):
      first_obj.append(token)
    if tag in ('OBJ-2', 'SHARED'):
      second_obj.append(token)
    if tag == 'ASPECT':
      aspect_words.append(token)

  # Tokens are re-joined with single spaces.
  sentences.append(' '.join(words))
  objects1.append(' '.join(first_obj))
  objects2.append(' '.join(second_obj))
  aspects.append(' '.join(aspect_words))
    


  0%|          | 0/500 [00:00<?, ?it/s]

## csv

In [None]:
# Assemble the four parallel lists into one table, one row per sentence.
df = pd.DataFrame({'sentence':sentences, 'obj-1':objects1, 'obj-2':objects2, 'aspect':aspects})

In [None]:
# Rows whose aspect is the empty string get '-' as a placeholder.
df.loc[df['aspect'] == '', 'aspect'] = '-'

In [None]:
# Export the table to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/qa_data.csv', sep='\t', encoding='utf-8', index=False)

## json

In [None]:
# Build one instruction/input/output record per parsed sentence.
# The 'input' field is always the empty string. No module-level variable named
# `input` is created, so the builtin input() stays usable for the rest of the session.
json_data = []
for sentence, first_obj, second_obj, aspect in zip(sentences, objects1, objects2, aspects):
  instruction = f"What are the objects and the aspect of comparison in the sentence '{sentence}'?"
  output = f"There are two objects: {first_obj} and {second_obj}, and the aspect: {aspect}."
  json_data.append({'instruction': instruction, 'input': '', 'output': output})

In [None]:
# Serialize the records to a JSON string.
# NOTE(review): this rebinds js_frame to a str, while the model loops index it as
# js_frame[i]['instruction']; re-run the json.load cell before running those loops.
js_frame = json.dumps(json_data)

In [None]:
# Write the serialized JSON string to Google Drive.
with open('./drive/My Drive/SeqDataSetAllData_TestDataExcluded_Flair_train.json', 'w', encoding='utf-8') as f:
  f.write(js_frame)

# LLaMA

In [None]:
!pip install bitsandbytes
!pip install -q datasets loralib sentencepiece
!pip install -q git+https://github.com/zphang/transformers@c3dc391
!pip install -q git+https://github.com/huggingface/peft.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bitsandbytes
  Downloading bitsandbytes-0.38.1-py3-none-any.whl (104.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.38.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m


In [None]:
# NOTE(review): PeftModel is imported but not used in the visible cells.
from peft import PeftModel
from transformers import LLaMATokenizer, LLaMAForCausalLM, GenerationConfig

# Load tokenizer and model from the "decapoda-research/llama-7b-hf" checkpoint.
# The model is requested with load_in_8bit=True and device_map="auto".
tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")
model = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)

In [None]:
def generate_prompt(instruction, input=None):
    """Build the instruction-following prompt sent to the model.

    Args:
        instruction: Text placed under the "### Instruction:" heading.
        input: Optional context. When truthy, an "### Input:" section is added
            and the preamble mentions the paired input; otherwise both are omitted.

    Returns:
        The prompt string, ending with the "### Response:" heading.
    """
    has_context = bool(input)
    if has_context:
        preamble = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
    else:
        preamble = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

    # Sections are separated by exactly one blank line.
    sections = [preamble, f"### Instruction:\n{instruction}"]
    if has_context:
        sections.append(f"### Input:\n{input}")
    sections.append("### Response:")
    return "\n\n".join(sections)

In [None]:
# Decoding settings shared by every evaluate() call.
# NOTE(review): no do_sample flag is set here; temperature and top_p are sampling
# parameters in Hugging Face generation — confirm they have any effect alongside num_beams=4.
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    num_beams=4,
)

def evaluate(instruction, input=None):
    """Generate a response for one instruction with the global model.

    The prompt is built by generate_prompt, tokenized with the global
    tokenizer, moved to CUDA and decoded with the global generation_config
    (at most 256 new tokens).

    Args:
        instruction: Instruction text for the prompt.
        input: Optional context forwarded to generate_prompt.

    Returns:
        The stripped text after the first "### Response:" marker of the first
        generated sequence, or None if no sequence was returned.
    """
    prompt_text = generate_prompt(instruction, input)
    encoded = tokenizer(prompt_text, return_tensors="pt")

    generation_output = model.generate(
        input_ids=encoded["input_ids"].cuda(),
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )

    # Only the first returned sequence is used.
    first_sequence = next(iter(generation_output.sequences), None)
    if first_sequence is None:
        return None
    decoded = tokenizer.decode(first_sequence)
    return decoded.split("### Response:")[1].strip()

In [None]:
# Zero-shot LLaMA run: one answer per instruction record.
answers = []
for record in tqdm(js_frame):
  raw_answer = evaluate(record['instruction'])
  # Keep only the text before a further "### Instruction:" block, then flatten newlines.
  trimmed = raw_answer.split('\n### Instruction:')[0]
  answers.append(trimmed.replace('\n', ' ').strip())

In [None]:
# Store the LLaMA zero-shot answers as a new df column.
df['llama_zero'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/llama_zeroshot.csv', sep='\t', encoding='utf-8', index=False)

# Dolly 

In [None]:
%pip install "accelerate>=0.16.0,<1" "transformers[torch]>=4.28.1,<5" "torch>=1.13.1,<2" -q


In [None]:
import torch
from transformers import pipeline

# Hugging Face pipeline for the databricks/dolly-v2-3b checkpoint, requested in bfloat16.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run — confirm the source is trusted.
# NOTE(review): the GPT-J section later defines a function that is also named generate_text.
generate_text = pipeline(model="databricks/dolly-v2-3b", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto") 


In [None]:
# Zero-shot Dolly run: the bare instruction is the prompt.
answers = []
for record in tqdm(js_frame):
  generations = generate_text(record['instruction'])
  answers.append(generations[0]['generated_text'])


In [None]:
# Store the Dolly zero-shot answers as a new df column.
df['dolly_zero'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/dolly_zeroshot.csv', sep='\t', index=False, encoding='utf-8')

In [None]:
# Few-shot Dolly run: the demonstration prefix is prepended to each instruction.
answers = []
for idx in tqdm(range(500)):  # hard-coded: first 500 records of js_frame
  few_shot_prompt = full_text + js_frame[idx]['instruction']
  answers.append(generate_text(few_shot_prompt)[0]['generated_text'])

In [None]:
# Store the Dolly few-shot answers as a new df column.
df['dolly_few'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/dolly_fewshot.csv', sep='\t', index=False, encoding='utf-8')

# GPT-Neo

In [None]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import pipeline
# Text-generation pipeline for EleutherAI/gpt-neo-1.3B with max_length=250.
# NOTE(review): the T5 section later defines a function that is also named generator.
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B', max_length=250)


In [None]:
# Zero-shot GPT-Neo run: one generation per instruction record.
# Iterate over the records themselves: range() needs an int, so range(js_frame)
# on the record collection raises TypeError before any text is generated.
answers = []
for sent in tqdm(js_frame):
  answer = generator(sent['instruction'])[0]['generated_text']
  answers.append(answer)

In [None]:
# Store the GPT-Neo zero-shot answers as a new df column.
df['neogpt_zero'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/neogpt_zeroshot.csv', sep='\t', index=False, encoding='utf-8')

In [None]:
# Few-shot GPT-Neo run: the demonstration prefix is prepended to each instruction.
answers = []
for idx in tqdm(range(500)):  # hard-coded: first 500 records of js_frame
  few_shot_prompt = full_text + js_frame[idx]['instruction']
  answers.append(generator(few_shot_prompt)[0]['generated_text'])

In [None]:
# Store the GPT-Neo few-shot answers as a new df column.
df['neogpt_few'] = answers

# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/neogpt_fewshot.csv', sep='\t', index=False, encoding='utf-8')

# GPT-J

In [None]:
!pip install transformers -q
!pip install accelerate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the EleutherAI/gpt-j-6B model and its tokenizer.
# NOTE(review): this rebinds the global `model` and `tokenizer` names that the LLaMA
# evaluate() helper also reads; re-run the LLaMA load cell before calling evaluate() again.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B") 
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")



Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [None]:
def generate_text(inp_text):
  """Sample a continuation of `inp_text` with the global model and tokenizer.

  Args:
    inp_text: Prompt string to tokenize and continue.

  Returns:
    The first decoded sequence returned by model.generate, as a string.
  """
  encoded = tokenizer(inp_text, return_tensors="pt")
  sampling_kwargs = dict(
    do_sample=True,
    temperature=0.9,
    # Original note here read "zero - 150" — presumably 150 was used for the zero-shot run; confirm.
    max_length=300,
  )
  gen_tokens = model.generate(encoded.input_ids, **sampling_kwargs)
  return tokenizer.batch_decode(gen_tokens)[0]

In [None]:
# Zero-shot GPT-J run: the bare instruction is the prompt.
answers = []
for record in tqdm(js_frame):
  answers.append(generate_text(record['instruction']))


In [None]:
# Store the GPT-J zero-shot answers as a new df column, then write df to
# Google Drive as tab-separated text without the index column.
df['gptj_zero'] = answers
df.to_csv('./drive/My Drive/gptj_zeroshot.csv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Few-shot GPT-J run: the demonstration prefix is prepended to each instruction.
answers = []
for idx in tqdm(range(500)):  # hard-coded: first 500 records of js_frame
  few_shot_prompt = full_text + js_frame[idx]['instruction']
  answers.append(generate_text(few_shot_prompt))


In [None]:
# Store the GPT-J few-shot answers as a new df column, then write df to
# Google Drive as tab-separated text without the index column.
df['gptj_few'] = answers
df.to_csv('./drive/My Drive/gptj_fewshot.csv', sep='\t', encoding='utf-8', index=False)

# GPT-2

In [None]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel 
import torch

# Load the "gpt2" checkpoint; this rebinds the global `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# Wrap the currently loaded global model and tokenizer in a text-generation pipeline.
text_generation = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [None]:
# Zero-shot GPT-2 run: greedy decoding (do_sample=False), max_length=300.
# Iterate over the records themselves: range() needs an int, so range(js_frame)
# on the record collection raises TypeError before any text is generated.
answers = []
for sent in tqdm(js_frame):
  answer = text_generation(sent['instruction'], max_length=300, do_sample=False)[0]['generated_text']
  answers.append(answer)

In [None]:
# Store the GPT-2 zero-shot answers as a new df column.
df['gpt2_zero'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/gpt2_zeroshot.csv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Few-shot GPT-2 run: demonstration prefix + instruction, greedy decoding.
answers = []
for idx in tqdm(range(500)):  # hard-coded: first 500 records of js_frame
  few_shot_prompt = full_text + js_frame[idx]['instruction']
  generations = text_generation(few_shot_prompt, max_length=300, do_sample=False)
  answers.append(generations[0]['generated_text'])


In [None]:
# Store the GPT-2 few-shot answers as a new df column.
df['gpt2_few'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/gpt2_fewshot.csv', sep='\t', encoding='utf-8', index=False)

# T5


In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    forced_encoding = "UTF-8"
    return forced_encoding


# Replace locale.getpreferredencoding with the stub above so callers always get "UTF-8".
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Load the "t5-large" checkpoint; this rebinds the global `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("t5-large")
model = T5ForConditionalGeneration.from_pretrained("t5-large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def generator(inp_text):
  """Generate text for `inp_text` with the global model and tokenizer.

  Args:
    inp_text: Prompt string to tokenize.

  Returns:
    The first generated sequence, decoded with special tokens removed.
  """
  encoded = tokenizer(inp_text, return_tensors="pt")
  generated = model.generate(encoded.input_ids, max_length=100)
  # Only the first sequence is returned to the caller.
  return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Zero-shot T5 run: one generation per instruction record.
# Iterate over the records themselves: range() needs an int, so range(js_frame)
# on the record collection raises TypeError before any text is generated.
answers = []
for sent in tqdm(js_frame):
  answer = generator(sent['instruction'])
  answers.append(answer)

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Store the T5 zero-shot answers as a new df column.
df['t5_zero'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/t5_zeroshot.csv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Few-shot T5 run: the demonstration prefix is prepended to each instruction.
answers = []
for idx in tqdm(range(500)):  # hard-coded: first 500 records of js_frame
  few_shot_prompt = full_text + js_frame[idx]['instruction']
  answers.append(generator(few_shot_prompt))

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Store the T5 few-shot answers as a new df column.
df['t5_few'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/t5_fewshot.csv', sep='\t', encoding='utf-8', index=False)

# FLAN-T5

In [None]:
!pip install transformers -q

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the "google/flan-t5-large" checkpoint (used by the zero-shot loop that follows);
# this rebinds the global `model` and `tokenizer` names.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large") 


Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Zero-shot FLAN-T5 run: the bare instruction is the prompt, max_length=100.
answers = []
for record in tqdm(js_frame):
  encoded = tokenizer(record['instruction'], return_tensors="pt")
  generated = model.generate(**encoded, max_length=100)
  # Keep the first decoded sequence, special tokens removed.
  answers.append(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

In [None]:
# Store the FLAN-T5 zero-shot answers as a new df column.
df['flant5_zero'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/flant5_zeroshot.csv', sep='\t', encoding='utf-8', index=False)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Switch to the larger "google/flan-t5-xl" checkpoint (used by the few-shot loop that
# follows); this replaces the flan-t5-large `model` and `tokenizer` loaded earlier.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") # xl 

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Few-shot FLAN-T5 run: demonstration prefix + instruction, max_length=100.
answers = []
for idx in tqdm(range(500)):  # hard-coded: first 500 records of js_frame
  few_shot_prompt = full_text + js_frame[idx]['instruction']
  encoded = tokenizer(few_shot_prompt, return_tensors="pt")
  generated = model.generate(**encoded, max_length=100)
  # Keep the first decoded sequence, special tokens removed.
  answers.append(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Store the FLAN-T5 few-shot answers as a new df column.
df['flant5_few'] = answers

In [None]:
# Write df to Google Drive as tab-separated text without the index column.
df.to_csv('./drive/My Drive/flant5xl_fewshot.csv', sep='\t', encoding='utf-8', index=False)