In [1]:
import pandas as pd

# define some helper functions and classes to aid with data traversal

def print_markdown(md):
    """Render the string ``md`` as Markdown in the notebook output.

    NOTE(review): ``Markdown`` and ``display`` are not imported in the visible
    cells; presumably they are expected to come from ``IPython.display`` --
    confirm they are in scope before calling this helper.
    """
    rendered = Markdown(md)
    display(rendered)

class Topic:
    """Lightweight handle on one row of the id-indexed ``topics_orig`` frame.

    Only the topic id is stored on the instance. Every other attribute
    (``title``, ``description``, ...) is looked up lazily in ``topics_orig``
    via ``__getattr__``, and the tree navigation helpers (``parent``,
    ``ancestors``, ``children``, ``siblings``) are derived from the ``parent``
    column of that same frame.

    Instances compare equal (and hash equal) when their ids match.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the ``parent`` cell is missing."""
        parent_id = topics_orig.loc[self.id, "parent"]
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """Ancestors ordered from the direct parent up to the root."""
        chain = []
        node = self.parent
        while node is not None:
            chain.append(node)
            node = node.parent
        return chain

    @property
    def siblings(self):
        """Other children of this topic's parent (empty list for a root)."""
        parent = self.parent  # resolve once; each access is a frame lookup
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    def get_breadcrumbs(self, separator=" | ", include_self=True, include_root=True):
        """Join the titles on the path from the root down to this topic.

        Args:
            separator: String placed between consecutive titles.
            include_self: Whether this topic's own title ends the path.
            include_root: Whether the top-most entry of the path is kept.

        Returns:
            The titles, root first, joined by ``separator``.
        """
        chain = self.ancestors
        if include_self:
            chain = [self] + chain
        if not include_root:
            # The chain is ordered leaf -> root, so the root is the last item.
            chain = chain[:-1]
        return separator.join(node.title for node in reversed(chain))

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        # Mask and frame must be the same object: masking the mutable global
        # ``topics`` with a Series built from ``topics_orig`` breaks as soon
        # as ``topics`` is re-indexed.
        is_child = topics_orig["parent"] == self.id
        return [Topic(child_id) for child_id in topics_orig.loc[is_child].index]

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return NotImplemented
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same key that equality uses.
        return hash(self.id)

    def __getattr__(self, name):
        """Fall back to the matching column of this topic's row.

        Raises:
            AttributeError: If ``name`` is not a column of ``topics_orig``
                (keeps ``hasattr``/``getattr(..., default)`` working).
            KeyError: If this topic's id is not present in ``topics_orig``.
        """
        if name == "id":
            # ``id`` missing from the instance dict (e.g. created via __new__);
            # looking it up below would recurse forever.
            raise AttributeError(name)
        row = topics_orig.loc[self.id]
        try:
            return row[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


In [2]:
# Data directory prefix: the input CSVs are read from here and the generated
# training file is written back to the same place. Kept as a string with a
# trailing slash because paths are built via `ROOT + "<file>"`.
ROOT = "../data/"

In [3]:
# Load the three input tables from the data directory in one pass.
topics, contents, correlations = (
    pd.read_csv(ROOT + csv_name)
    for csv_name in ("topics.csv", "content.csv", "correlations.csv")
)

In [4]:
# Missing titles/descriptions become empty strings in both tables.
empty_text = {"title": "", "description": ""}
topics = topics.fillna(empty_text)
contents = contents.fillna(empty_text)

In [5]:
# Index both tables by "id" for label-based lookups; drop=False keeps "id"
# available as a regular column as well.
topics, contents = (
    frame.set_index("id", drop=False) for frame in (topics, contents)
)

In [6]:
# Snapshot of the id-indexed topics frame. The Topic helper class resolves
# parents and attribute lookups against this copy.
topics_orig = topics.copy()

In [7]:
# Each topic gets the " | "-joined titles of its ancestor path (root first,
# the topic itself last); contents get an empty breadcrumb.
def _topic_breadcrumb(topic_id):
    return Topic(topic_id).get_breadcrumbs()

topics["breadcrumb"] = topics["id"].transform(_topic_breadcrumb)
contents["breadcrumb"] = ""

In [8]:
# Tag every row with the table it came from, and give topics an empty "text"
# column.
contents = contents.assign(type="content")
topics = topics.assign(type="topic", text="")

In [9]:
# Expose the content "kind" under the column name "category".
contents["category"] = contents["kind"]

In [10]:
from sklearn.model_selection import GroupKFold, KFold, train_test_split
import numpy as np
# Assign every topic to one of 5 validation folds.
# The splitter yields positional row numbers, so switch to a default
# RangeIndex while writing the fold and restore the id index afterwards.
topics = topics.reset_index(drop=True)

fold_splitter = GroupKFold(n_splits=5)
row_positions = np.arange(len(topics))
fold_splits = fold_splitter.split(row_positions, groups=topics["id"])

for fold, (_, val_idx) in enumerate(fold_splits):
    # Only the validation part of each split is used to tag the rows.
    topics.loc[val_idx, "fold"] = fold

# The column is created by partial assignment, so cast it to int once filled.
topics["fold"] = topics["fold"].astype(int)
topics = topics.set_index("id", drop=False)

In [11]:
# Flatten the topic -> content correlations into one list of sample dicts.
# For every correlation row, emit the topic sample followed by one sample per
# linked content. All samples from the same correlation row share the same
# integer `label` and the topic's fold.
samples = []

label = 0
for row in correlations.itertuples():
    topic = topics.loc[row.topic_id]

    samples.append(
        {
            "type": "topic",
            "title": topic.title,
            "breadcrumb": topic.breadcrumb,
            "description": topic.description,
            "text": topic.text,
            "label": label,
            "topic_id": row.topic_id,
            "fold": topic.fold,
            "content_id": "",
            "category": topic.category,
            # Was `topic.category`, which wrote the category into the
            # language field; the content branch below uses `.language`.
            "language": topic.language,
        }
    )

    # `content_ids` holds the space-separated ids of the linked contents.
    for cid in row.content_ids.split():
        content = contents.loc[cid]

        samples.append(
            {
                "type": "content",
                "title": content.title,
                "breadcrumb": content.breadcrumb,
                "description": content.description,
                "text": content.text,
                "label": label,
                "topic_id": "",
                # Contents inherit the fold of the topic they are linked to.
                "fold": topic.fold,
                "content_id": cid,
                "category": content.category,
                "language": content.language,
            }
        )
    label += 1

In [12]:
# Turn the collected sample dicts into a DataFrame (one row per sample).
df = pd.DataFrame(samples)

In [13]:
# Write the samples into the data directory; index=False omits the row index.
df.to_csv(ROOT + "train_folded_v9.csv", index=False)