In [2]:
# Mount Google Drive so the notebook can read and write files under
# /content/drive (all data, model and output paths used later live there).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# main library required is transformer with pytorch backend
# no tensorflow

!pip install transformers

In [5]:
# import statements for stance tasks.
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import pandas as pd

In [6]:
# Load the final reranked file; each row pairs a query (`title`) with a
# retrieved passage (`content`) and its reranking `score`.
stance_df = pd.read_csv("/content/drive/MyDrive/Touche/duoT5-voting-reranked.csv")

In [7]:
# Preview the loaded frame (the notebook display elides the middle rows).
stance_df

Unnamed: 0,title_id,title,doc_id,score,content
0,2,Which is better a laptop or a desktop,clueweb12-1804wb-90-15876___2,78.821897,Therefore hard drives in a laptop usually have...
1,2,Which is better a laptop or a desktop,clueweb12-0301wb-84-10147___2,85.427993,Many times consumers may opt to have both but ...
2,2,Which is better a laptop or a desktop,clueweb12-0907wb-60-03588___2,79.229755,There has been a turnaround in pricing trends ...
3,2,Which is better a laptop or a desktop,clueweb12-0013wb-19-15392___6,72.569362,Keep in mind a laptop is one unit If you break...
4,2,Which is better a laptop or a desktop,clueweb12-0608wb-66-03868___2,71.743426,Laptop computers are portable have the virtual...
...,...,...,...,...,...
49995,100,Should I learn Python or R for data analysis,clueweb12-0204wb-38-15941___33,-11.528502,cells which cell setval ord char ord Attribute...
49996,100,Should I learn Python or R for data analysis,clueweb12-0909wb-33-12181___40,-11.529103,Lisp has been around a long time and has watch...
49997,100,Should I learn Python or R for data analysis,clueweb12-0311wb-17-16828___2,-11.530167,Its teaching should therefore be regarded as a...
49998,100,Should I learn Python or R for data analysis,clueweb12-0001wb-56-00448___192,-11.530614,After that we plot these N estimates on the y ...


In [8]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
# `device` is used below when the models are loaded and when inputs are moved.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device "{device}"')

Using device "cuda"


In [9]:
# Stage-1 model: separates passages with no stance / a neutral stance from
# passages that take the side of one of the two compared objects.
def object_separator(evidence, claim, tokenizer, model):
    """Classify one (claim, evidence) pair with the stage-1 stance model.

    Args:
        evidence: Passage text; given to the tokenizer as the second segment.
        claim: Query text; given to the tokenizer as the first segment.
        tokenizer: Callable tokenizer; invoked with ``return_tensors="pt"`` and
            expected to return ``input_ids`` and ``attention_mask``.
        model: Sequence-classification module whose output exposes ``.logits``.

    Returns:
        int: Index of the highest-scoring class for the pair. ``detect`` treats
        0 as "NO", 1 as "NEUTRAL" and anything else as "has an object stance".
    """
    model.eval()
    # Send the inputs to wherever the model's weights live instead of relying
    # on a notebook-level `device` variable being defined and in sync.
    target_device = next(model.parameters()).device

    encoding = tokenizer(claim, evidence, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        # The attention mask used to be computed and then dropped; pass it so
        # the call stays correct if padding is ever present in the encoding.
        # NOTE(review): token_type_ids (if the tokenizer emits them) are still
        # not forwarded, as before -- confirm this matches how the checkpoint
        # was trained.
        logits = model(
            encoding['input_ids'].to(target_device),
            attention_mask=encoding['attention_mask'].to(target_device),
        ).logits
    # One pair per call, so the prediction is the first (only) batch entry.
    return logits.argmax(dim=1).tolist()[0]

In [10]:
# Stage-2 model: for passages that do take a side, predicts which of the two
# compared objects (first or second) the passage favours.
def object_detector(evidence, claim, tokenizer, model):
    """Classify one (claim, evidence) pair with the stage-2 stance model.

    Args:
        evidence: Passage text; given to the tokenizer as the second segment.
        claim: Query text; given to the tokenizer as the first segment.
        tokenizer: Callable tokenizer; invoked with ``return_tensors="pt"`` and
            expected to return ``input_ids`` and ``attention_mask``.
        model: Sequence-classification module whose output exposes ``.logits``.

    Returns:
        int: Index of the highest-scoring class for the pair. ``detect`` maps
        0 to "FIRST" and 1 to "SECOND".
    """
    model.eval()
    # Send the inputs to wherever the model's weights live instead of relying
    # on a notebook-level `device` variable being defined and in sync.
    target_device = next(model.parameters()).device

    encoding = tokenizer(claim, evidence, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        # The attention mask used to be computed and then dropped; pass it so
        # the call stays correct if padding is ever present in the encoding.
        # NOTE(review): token_type_ids (if the tokenizer emits them) are still
        # not forwarded, as before -- confirm this matches how the checkpoint
        # was trained.
        logits = model(
            encoding['input_ids'].to(target_device),
            attention_mask=encoding['attention_mask'].to(target_device),
        ).logits
    # One pair per call, so the prediction is the first (only) batch entry.
    return logits.argmax(dim=1).tolist()[0]

In [11]:
# Two-step stance decision: turns the raw class indices of the two models
# into a single index into LABELS.

def detect(evidence, claim, tokenizer_n, model_n, tokenizer_e, model_e):
  """Return the stance label index for one (claim, evidence) pair.

  The first model is always consulted. Its classes 0 and 1 are returned
  unchanged. For any other result the second model is consulted and its
  class 0 maps to 2 and class 1 maps to 3; every other outcome falls back
  to 1.
  """
  stage_one = object_separator(evidence, claim, tokenizer_n, model_n)

  # Stage 1 already settled it: no object stance, so stage 2 is never run.
  if stage_one == 1:
      return 1
  if stage_one == 0:
      return 0

  # The passage takes a side; ask the second model which object it favours.
  stage_two = object_detector(evidence, claim, tokenizer_e, model_e)
  if stage_two == 0:
      return 2
  if stage_two == 1:
      return 3
  # Anything unexpected from stage 2 keeps the default index of 1.
  return 1

In [12]:
# Load the two fine-tuned checkpoints used by the two-step stance pipeline:
#   *_n -> first-step model (passed to detect() as tokenizer_n / model_n)
#   *_s -> second-step model (passed to detect() as tokenizer_e / model_e)
# Both models are moved to `device` right after loading.
model_n_path = "/content/drive/MyDrive/Touche/first_model_classifier-epoch-17-f1-5888"
tokenizer_n = AutoTokenizer.from_pretrained(model_n_path)
config_n = AutoConfig.from_pretrained(model_n_path, num_labels=3)
model_n = AutoModelForSequenceClassification.from_pretrained(model_n_path, config=config_n).to(device)

model_s_path = "/content/drive/MyDrive/Touche/OBJECT1_OBJECT2-CLASSIFIER"

tokenizer_s = AutoTokenizer.from_pretrained(model_s_path)
# NOTE(review): num_labels=3 is requested here as well, but detect() only maps
# outputs 0 and 1 of this model; an output of 2 falls back to label index 1.
# Confirm the checkpoint really has three classes.
config_s = AutoConfig.from_pretrained(model_s_path, num_labels=3)
model_s = AutoModelForSequenceClassification.from_pretrained(model_s_path, config=config_s).to(device)

In [13]:
# Stance label names, indexed by the integer that detect() returns:
# 0 = NO, 1 = NEUTRAL, 2 = FIRST, 3 = SECOND.
LABELS = ['NO', 'NEUTRAL', 'FIRST', 'SECOND']

In [14]:
import csv
# Candidate method names. The prediction loop selects one entry by index and
# writes it as the last field of every output row.
method = ['levirank_baseline', 'levirank_dense_initial_retrieval', 'levirank_dense_vote_initial_retrieval','levirank_psuedo_relevance_feedback',
          'levirank_voting_retrieval','levirank_psuedo_relevance_feedback_and_voting_retrieval']


In [15]:
# Sanity check: list the columns the prediction loop reads from.
stance_df.columns

Index(['title_id', 'title', 'doc_id', 'score', 'content'], dtype='object')

In [17]:
# Stance prediction: for every query, label its best-scoring passages and
# collect one output row per passage.
TOP_K = 750           # number of highest-scoring passages kept per query
RUN_TAG = method[4]   # change here the method used for the model

# Each row is [query id, stance label, doc id, rank, score, run tag].
data = []
for title in tqdm(stance_df["title"].unique()):
  passages = (
      stance_df.loc[stance_df["title"] == title]
      .sort_values(by="score", ascending=False)
      .iloc[:TOP_K]
  )
  columns = zip(passages['title_id'], passages['doc_id'], passages['title'],
                passages['content'], passages['score'])
  # Ranks are 1-based and follow the descending score order within the query.
  for rank, (qid_, did_, q_, d_, s_) in enumerate(columns, start=1):
      label = LABELS[detect(d_, q_, tokenizer_n, model_n, tokenizer_s, model_s)]
      data.append([qid_, label, did_, rank, s_, RUN_TAG])

100%|██████████| 50/50 [2:31:37<00:00, 181.95s/it]


In [None]:
# Show only the first few rows; the full list holds up to 750 rows per query
# and would flood the notebook output.
data[:5]

In [18]:
# Persist all collected rows as a comma-separated file (no header row).
with open('/content/drive/MyDrive/Touche/captain_levi__voting_run_2022.csv', 'w', encoding='UTF8', newline='') as run_csv:
    csv.writer(run_csv).writerows(data)

In [19]:
# Convert the comma-separated run file into a space-separated text file,
# one output line per CSV row.
csv_file = r'/content/drive/MyDrive/Touche/captain_levi__voting_run_2022.csv'
txt_file = r'/content/drive/MyDrive/Touche/captain_levi__voting_run_2022.txt'
# Read the CSV with the same encoding/newline settings it was written with.
# The input is opened first so a missing CSV cannot leave behind an empty,
# truncated .txt file. Both files are closed by the `with` statement.
with open(csv_file, "r", encoding='UTF8', newline='') as my_input_file, \
        open(txt_file, "w", encoding='UTF8') as my_output_file:
    for row in csv.reader(my_input_file):
        my_output_file.write(" ".join(row) + '\n')