In [1]:
# Mount Google Drive at /content/drive so later cells can read and write files under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# main library required is transformer with pytorch backend
# no tensorflow

!pip install transformers

In [3]:
# import statements for stance tasks.
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import pandas as pd

In [10]:
# Load the final re-ranked file; the columns used below are the query (`title`, `title_id`),
# the passage (`content`, `doc_id`) and the ranking `score`.
stance_df = pd.read_csv("/content/drive/MyDrive/Touche/Final_Reranked_Duo.csv")

In [11]:
# Peek at the first rows to sanity-check what was loaded.
stance_df.head()

Unnamed: 0,title_id,title,New_title,doc_id,score,content
0,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop? computer",clueweb12-1804wb-90-15876___2,79.028076,Therefore hard drives in a laptop usually have...
1,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop? computer",clueweb12-0301wb-84-10147___2,85.443459,Many times consumers may opt to have both but ...
2,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop? computer",clueweb12-0608wb-66-03868___2,76.021535,Laptop computers are portable have the virtual...
3,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop? computer",clueweb12-0013wb-19-15392___6,74.094092,Keep in mind a laptop is one unit If you break...
4,2,"Which is better, a laptop or a desktop?","Which is better, a laptop or a desktop? computer",clueweb12-0907wb-60-03588___2,77.725021,There has been a turnaround in pricing trends ...


In [4]:
# Run on the GPU when torch can see one, otherwise on the CPU.
# NOTE(review): TIRA submissions are meant to be CPU-only; nothing here forces that,
# the expression simply resolves to 'cpu' on a machine without CUDA.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device "{device}"')

Using device "cuda"


In [5]:
# First step of the two-step stance pipeline: the separator model.
# `detect` reads its class index as 0 = no stance, 1 = neutral, and any other value as
# "needs the second model to decide between the first and the second object".
def object_separator(evidence, claim, tokenizer, model):
    """Classify one (claim, evidence) pair and return the predicted class index.

    Args:
        evidence: Passage text; given to the tokenizer as the second segment.
        claim: Query text; given to the tokenizer as the first segment.
        tokenizer: Callable that returns a mapping holding 'input_ids' and
            'attention_mask' tensors when called with ``return_tensors="pt"``.
        model: torch module whose forward output exposes ``.logits``.

    Returns:
        int: argmax over the logits of the first (and only) row.
    """
    model.eval()
    encoding = tokenizer(claim, evidence, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Send the inputs to wherever the model's weights live instead of relying on a
    # module-level `device` that may disagree with the model's actual placement.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        # The attention mask used to be computed and then dropped; forward it so padded
        # positions (if any) are ignored by the model.
        # NOTE(review): token_type_ids, if the tokenizer emits them, are still not
        # forwarded (same as before) - confirm against how the model was trained.
        logits = model(
            input_ids=encoding['input_ids'].to(target),
            attention_mask=encoding['attention_mask'].to(target),
        ).logits
    return logits.argmax(dim=1).tolist()[0]

In [6]:
# Second step of the two-step stance pipeline: the detector model.
# `detect` reads its class index as 0 = stance for the first object, 1 = stance for the
# second object; any other value is counted as an unexpected prediction.
def object_detector(evidence, claim, tokenizer, model):
    """Classify one (claim, evidence) pair and return the predicted class index.

    Args:
        evidence: Passage text; given to the tokenizer as the second segment.
        claim: Query text; given to the tokenizer as the first segment.
        tokenizer: Callable that returns a mapping holding 'input_ids' and
            'attention_mask' tensors when called with ``return_tensors="pt"``.
        model: torch module whose forward output exposes ``.logits``.

    Returns:
        int: argmax over the logits of the first (and only) row.
    """
    model.eval()
    encoding = tokenizer(claim, evidence, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Send the inputs to wherever the model's weights live instead of relying on a
    # module-level `device` that may disagree with the model's actual placement.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        # The attention mask used to be computed and then dropped; forward it so padded
        # positions (if any) are ignored by the model.
        # NOTE(review): token_type_ids, if the tokenizer emits them, are still not
        # forwarded (same as before) - confirm against how the model was trained.
        logits = model(
            input_ids=encoding['input_ids'].to(target),
            attention_mask=encoding['attention_mask'].to(target),
        ).logits
    return logits.argmax(dim=1).tolist()[0]

In [22]:
# Turns the raw class indices of the two models into one final label index.
# Number of second-step predictions that were neither 0 nor 1.
count_wrong = 0


def detect(evidence, claim, tokenizer_n, model_n, tokenizer_e, model_e):
    """Run the two-step stance prediction for one (claim, evidence) pair.

    Step 1 (`object_separator` with tokenizer_n / model_n):
        1 -> return 1, 0 -> return 0, anything else -> go to step 2.
    Step 2 (`object_detector` with tokenizer_e / model_e):
        0 -> return 2, 1 -> return 3, anything else -> bump the global
        `count_wrong` and fall back to 1.

    Returns:
        int: label index in the range 0..3.
    """
    global count_wrong

    # Step 1: is there a stance towards one of the objects at all?
    separator_class = object_separator(evidence, claim, tokenizer_n, model_n)
    if separator_class == 1:
        return 1
    if separator_class == 0:
        return 0

    # Step 2: which of the two objects is favoured?
    detector_class = object_detector(evidence, claim, tokenizer_e, model_e)
    if detector_class == 0:
        return 2
    if detector_class == 1:
        return 3

    # Unexpected class from the second model: count it and keep the default label.
    count_wrong = count_wrong + 1
    return 1

In [9]:
# Load Andreea's two fine-tuned checkpoints for the two-step pipeline:
# the 'n' pair is passed to `detect` as the first-step model, the 's' pair as the second-step model.
def _load_classifier(checkpoint_dir):
    """Return (tokenizer, config, model) for a checkpoint directory; the model is moved to `device`."""
    tok = AutoTokenizer.from_pretrained(checkpoint_dir)
    cfg = AutoConfig.from_pretrained(checkpoint_dir, num_labels=3)
    clf = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, config=cfg).to(device)
    return tok, cfg, clf


model_n_path = "/content/drive/MyDrive/Touche/first_model_classifier-epoch-17-f1-5888"
tokenizer_n, config_n, model_n = _load_classifier(model_n_path)

model_s_path = "/content/drive/MyDrive/Touche/OBJECT1_OBJECT2-CLASSIFIER"
tokenizer_s, config_s, model_s = _load_classifier(model_s_path)

In [12]:
# Stance label names, indexed by the integer that `detect` returns (0..3).
LABELS = ['NO', 'NEUTRAL', 'FIRST', 'SECOND']

In [13]:
import csv
# Run tags; the prediction loop writes one of them (selected by index) into every output row.
method = ['levirank_baseline', 'levirank_dense_initial_retrieval', 'levirank_dense_vote_initial_retrieval','levirank_psuedo_relevance_feedback']


In [14]:
# List the available columns before iterating over the frame.
stance_df.columns

Index(['title_id', 'title', 'New_title', 'doc_id', 'score', 'content'], dtype='object')

In [23]:
# Stance prediction: for every distinct query, take its passages by descending score
# (at most 1000), predict a stance label for each, and collect one output row per passage.
data = []
for query_title in tqdm(stance_df["title"].unique()):
    belongs_to_query = stance_df["title"] == query_title
    ranked = stance_df.loc[belongs_to_query].sort_values(by="score", ascending=False)
    top_passages = ranked.iloc[0:1000, :]

    fields = zip(
        top_passages['title_id'],
        top_passages['doc_id'],
        top_passages['title'],
        top_passages['content'],
        top_passages['score'],
    )
    # Ranks are 1-based and follow the descending-score order.
    for position, (query_id, doc_id, query_text, passage_text, score) in enumerate(fields, start=1):
        stance = LABELS[detect(passage_text, query_text, tokenizer_n, model_n, tokenizer_s, model_s)]
        # change here the method used for the model
        data.append([query_id, stance, doc_id, position, score, method[3]])


100%|██████████| 50/50 [1:15:56<00:00, 91.12s/it]


In [None]:
# Show only the first few rows: the list holds one row per ranked passage,
# so displaying all of it would flood the notebook output.
data[:5]

In [24]:
# Persist every collected row to the run CSV (no header row is written).
with open('/content/drive/MyDrive/Touche/captain_levi_psuedo_relevance_run_2022.csv', mode='w', encoding='UTF8', newline='') as run_csv:
    csv.writer(run_csv).writerows(data)

In [25]:
# Convert the comma-separated run file into a space-separated .txt file, one row per line.
csv_file = r'/content/drive/MyDrive/Touche/captain_levi_psuedo_relevance_run_2022.csv'
txt_file = r'/content/drive/MyDrive/Touche/captain_levi_psuedo_relevance_run_2022.txt'
# Read with encoding='UTF8' (the encoding the CSV is written with) rather than the platform
# default, and with newline='' as the csv module requires for files it parses.
# The input is opened first so a missing CSV does not leave a truncated .txt behind.
# Both files are closed by the `with` statement; no explicit close() is needed.
with open(csv_file, "r", encoding='UTF8', newline='') as my_input_file, \
        open(txt_file, "w", encoding='UTF8') as my_output_file:
    for row in csv.reader(my_input_file):
        my_output_file.write(" ".join(row) + '\n')