In [None]:
import collections
import csv
import random

# Seed the module-level RNG with a fixed value so that the random.shuffle
# calls made later in this script give the same orderings on every run.
random.seed(89)

In [None]:
# Initials of the annotators; one annotation file is produced per entry.
ANNOTATORS = ["KK", "NK", "RZ"]

# Placeholder examples. Each top-level key is a paper ID; its "title" entry is
# that paper's own title and every other entry maps a system name to the title
# that system paired with the paper.
data = {
    "paper1": {
        "title": "title1",
        "sysA": "title2",
        "sysB": "title3",
        "sysC": "title4",
    },
    "paper5": {
        "title": "title5",
        "sysA": "title6",
        "sysB": "title7",
        "sysC": "title8",
    },
    "paper9": {
        "title": "title9",
        "sysA": "title10",
        "sysB": "title11",
        "sysC": "title12",
    },
}

# Column names of the annotation files, in output order.
FIELDS = ["id_2017", "system", "title_2017", "title_2018", "rating"]

# One row of an annotation file; its fields mirror FIELDS.
AnnotationRow = collections.namedtuple("AnnotationRow", FIELDS)

def get_spsm_data():
  """Build the SPSM system's 2017 -> 2018 paper pairs.

  Reads 'df_SPSM_results.csv' and groups its rows by the 'strata' column.
  Within each stratum, the row whose 'conf_year' is "2017" is paired with the
  row whose 'conf_year' is "2018".

  NOTE(review): the original author flagged this pairing logic as possibly
  incorrect. If a stratum holds several rows for the same year, only the last
  one read is kept -- confirm that this is the intended matching.

  Returns:
    A dict mapping each 2017 paper ID to an AnnotationRow holding that ID,
    the system name "spsm", the 2017 title, the 2018 title and an empty
    rating.

  Raises:
    FileNotFoundError: if 'df_SPSM_results.csv' does not exist.
    KeyError: if a stratum has no "2017" or no "2018" row, or if a row lacks
      one of the 'strata', 'conf_year', 'paper_id' or 'title' columns.
  """
  by_strata = collections.defaultdict(dict)
  # newline='' leaves newline handling to the csv module, so quoted fields
  # that contain line breaks are parsed correctly.
  with open('df_SPSM_results.csv', 'r', newline='') as f:
    for row in csv.DictReader(f):
      by_strata[row['strata']][row['conf_year']] = (
        row['paper_id'], row['title'])

  spsm_papers = {}
  for year_map in by_strata.values():
    paper_id, title_2017 = year_map["2017"]
    # Only the title of the 2018 paper is needed; its ID is discarded.
    _, title_2018 = year_map["2018"]
    spsm_papers[paper_id] = AnnotationRow(
      paper_id, "spsm", title_2017, title_2018, '')

  return spsm_papers

def get_cluster_data():
  """Load the cluster system's paper pairs (parsing not implemented yet).

  TODO: complete this function. The rows of
  'df_cluster_results_10_match_cosine_23.csv' still need to be turned into
  AnnotationRows keyed by 2017 paper ID; the column layout of that file is
  not known here, so no rows are interpreted yet.

  Returns:
    A dict meant to map 2017 paper IDs to AnnotationRows. It is currently
    always empty, but an actual dict is returned (instead of falling off the
    end and yielding None) so callers can already treat it as a mapping.

  Raises:
    FileNotFoundError: if the results CSV does not exist.
  """
  cluster_papers = {}
  # newline='' is what the csv module expects of the files it reads.
  with open('df_cluster_results_10_match_cosine_23.csv', 'r',
            newline='') as f:
    # TODO: fill cluster_papers from this reader once the columns are known.
    reader = csv.DictReader(f)

  return cluster_papers

def get_naive_title_data(cluster_data):
  """Load the 2018 titles used by the naive-title system.

  Reads 'naive_titles_2018.csv' and collects the title of every row, keyed by
  the row's 'paper_id'.

  TODO: complete this function. The end goal is a map from 2017 paper IDs to
  AnnotationRows; pairing these 2018 titles with 2017 papers is still to be
  done.

  Args:
    cluster_data: data from one of the other systems, meant to supply the
      2017 titles. It is not used yet.

  Returns:
    A dict mapping each paper ID found in the file to that paper's title.

  Raises:
    FileNotFoundError: if 'naive_titles_2018.csv' does not exist.
    ValueError: if a row's 'conf_year' is not "2018", or if its 'id' and
      'paper_id' columns differ.
  """
  naive_papers = {}
  # newline='' is what the csv module expects of the files it reads.
  with open('naive_titles_2018.csv', 'r', newline='') as f:
    for row in csv.DictReader(f):
      # Explicit checks rather than assert, so they still run under -O.
      if row["conf_year"] != "2018":
        raise ValueError(
          f"expected conf_year '2018', got {row['conf_year']!r}")
      if row['id'] != row['paper_id']:
        raise ValueError(
          f"id {row['id']!r} does not match paper_id {row['paper_id']!r}")
      # Key by paper ID: conf_year is "2018" on every row, so using it as the
      # key would make each row overwrite the previous one.
      naive_papers[row['paper_id']] = row['title']

  return naive_papers



def main():
  """Write one annotation TSV per annotator.

  Every annotator receives the same examples; only the order of the systems
  within each paper differs between files. Each output file
  'annotations_<annotator>.tsv' has the FIELDS columns with an empty rating,
  and a blank row after every third example.

  The per-system loaders are called, but their results are not yet used: the
  examples still come from the module-level placeholder `data`.
  """
  # First, we get information for paper pairs from each system.
  spsm_data = get_spsm_data()
  cluster_data = get_cluster_data()
  # Naive data will need one of the other data dicts as input, to know what the
  # 2017 titles should be.
  naive_data = get_naive_title_data(cluster_data)

  # The code below uses the format of the data variable declared above.
  # TODO: convert info from spsm_data, cluster_data, naive_data into this
  # format.

  # Each annotator is annotating the same set of examples, they are just going
  # to be ordered differently.
  examples = []
  for paper_id, related_paper_map in data.items():
    title = related_paper_map['title']
    for system, paper2_title in related_paper_map.items():
      if system == 'title':
        continue
      examples.append((paper_id, title, system, paper2_title))

  for annotator in ANNOTATORS:
    # One TSV file per annotator. newline='' lets the csv writer control the
    # line endings itself (otherwise extra '\r' characters can appear on
    # platforms that translate '\n' on write).
    with open(f"annotations_{annotator}.tsv", 'w', newline='') as f:
      writer = csv.DictWriter(f, fieldnames=FIELDS, delimiter='\t')
      writer.writeheader()
      random.shuffle(examples)
      # This is a stable sort, so it will only use the given key and leave
      # other things shuffled: rows end up grouped by paper ID, with the
      # systems of each paper in random order.
      resorted = sorted(examples, key=lambda x: x[0])
      for i, (p1, t1, s, t2) in enumerate(resorted):
        writer.writerow(AnnotationRow(p1, s, t1, t2, "")._asdict())
        # Just a fast way to add some spacing for visual clarity. The blank
        # row lines up with paper boundaries only while every paper has
        # exactly three systems, as in `data`.
        if i % 3 == 2:
          writer.writerow(AnnotationRow("", "", "", "", "")._asdict())


# Generate the annotation files only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  main()

