In [11]:
import torch
import numpy as np
import pandas as pd
import re
import pprint
# Display configuration: let pandas show up to 500 rows and 500 columns, and
# use a 1000-character display width.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# Shared pretty-printer for nested Python structures (2-space indent).
pp = pprint.PrettyPrinter(indent=2)

In [12]:
def orcas_to_raw_dataframe(path_file):
    """
    Read an ORCAS-style file into a DataFrame with one row per input line.

    Each non-blank line is expected to look like
    ``<q_id> <query> <url_id> <url>``, where the fields are separated by a
    single tab or space, ``q_id`` is a run of digits and ``url_id`` is ``D``
    followed by digits.

    Parameters
    ----------
    path_file : str or path-like
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``q_id``, ``q``, ``url_id`` and ``url`` (all strings), in file
        order. Leading/trailing tabs and spaces are removed from ``q`` and
        ``url``. An empty file yields an empty frame with these columns.

    Raises
    ------
    IndexError
        If a non-blank line does not match the expected layout.
    """
    columns = ['q_id', 'q', 'url_id', 'url']
    line_pattern = r'(\d+)[\t ](.*)[\t ](D\d+)[\t ](.*)'
    records = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path_file, 'r') as f:
        for line in f:
            # Blank / whitespace-only lines carry no record; skip them.
            if not line.strip():
                continue
            q_id, q, url_id, url = re.findall(line_pattern, line)[0]
            # str.strip returns a new string, so the result must be kept.
            records.append([q_id, q.strip("\t "), url_id, url.strip("\t ")])
    # Build the frame once; appending row by row via .loc is quadratic.
    return pd.DataFrame(records, columns=columns)


In [28]:
def orcas_to_agg_df(path):
    """
    Read an ORCAS-style file and group its URLs by query.

    Each non-blank line is expected to look like
    ``<q_id> <query> <url_id> <url>``, where the fields are separated by a
    single tab or space, ``q_id`` is a run of digits and ``url_id`` is ``D``
    followed by digits. Lines sharing the same ``(q_id, q)`` pair are merged
    into a single row.

    Parameters
    ----------
    path : str or path-like
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``q_id`` with columns ``q`` and ``[url_id, url]``; one row
        per distinct ``(q_id, q)`` pair, in order of first appearance. The
        ``[url_id, url]`` column holds the ``str()`` of the list of
        ``[url_id, url]`` pairs (a string, not a list object). Leading and
        trailing tabs and spaces are removed from ``q`` and ``url``.

    Raises
    ------
    IndexError
        If a non-blank line does not match the expected layout.
    """
    line_pattern = r'(\d+)[\t ](.*)[\t ](D\d+)[\t ](.*)'
    urls_by_query = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, 'r') as f:
        for line in f:
            # Blank / whitespace-only lines carry no record; skip them.
            if not line.strip():
                continue
            q_id, q, url_id, url = re.findall(line_pattern, line)[0]
            # str.strip returns a new string, so the result must be kept;
            # stripping before keying merges queries that differ only by
            # surrounding whitespace.
            key = (q_id, q.strip("\t "))
            urls_by_query.setdefault(key, []).append([url_id, url.strip("\t ")])

    # Build the frame once; appending row by row via .loc is quadratic.
    rows = [[q_id, q, str(pairs)] for (q_id, q), pairs in urls_by_query.items()]
    agg_df = pd.DataFrame(rows, columns=['q_id', 'q', '[url_id, url]'])
    return agg_df.set_index('q_id')

In [29]:
# Path to the ORCAS subset TSV, relative to the current working directory.
orcas_path = "./orcas_subset.tsv"

# Per-row loader, left disabled; only the aggregated frame is built below.
# raw_orcas_df = orcas_to_raw_dataframe(orcas_path)
# print(raw_orcas_df.head())

# Build the per-query aggregated frame and print its first rows as a sanity check.
orcas_df = orcas_to_agg_df(orcas_path)
print(orcas_df.head())

                q                                      [url_id, url]
q_id                                                                
9265503    github  [['D1265400', 'https://desktop.github.com/'], ...
6832981   youtube          [['D2923232', 'https://au.youtube.com/']]
9571352         !  [['D1238374', 'https://www.englishclub.com/wri...
4896888       ! c  [['D579719', 'https://en.wikipedia.org/wiki/C_...
10001890    ! c++  [['D1094795', 'https://en.wikipedia.org/wiki/O...
