In [1]:
import pandas as pd

# define some helper functions and classes to aid with data traversal

def print_markdown(md):
    """Render the Markdown source string ``md`` as rich notebook output.

    ``Markdown`` and ``display`` are imported locally because no visible cell
    imports them; without this the call fails with ``NameError`` on
    ``Markdown`` in a fresh kernel.
    """
    from IPython.display import Markdown, display

    display(Markdown(md))

class Topic:
    """Thin view over one row of the module-level ``topics`` DataFrame.

    An instance stores only the topic id. Any other attribute (``title``,
    ``description``, ``has_content``, ...) is read on demand from the row of
    ``topics`` whose index label equals ``self.id``, so ``topics`` must be
    indexed by topic id before such an attribute is accessed.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    def _iter_ancestors(self):
        """Yield the parent, then the grandparent, ... up to the root, lazily."""
        node = self.parent
        while node is not None:
            yield node
            node = node.parent

    def _first_nonempty_ancestor_value(self, field):
        """Return ``field`` of the nearest ancestor whose value is not ``""``, else ``""``."""
        for ancestor in self._iter_ancestors():
            value = getattr(ancestor, field)
            if value != "":
                return value
        return ""

    def _breadcrumbs(self, field, separator, include_self, include_root):
        """Join ``field`` of this topic and its ancestors, nearest first, root last."""
        chain = self.ancestors
        if include_self:
            chain = [self] + chain
        if not include_root:
            # The root is the last element of the chain, so drop the tail.
            chain = chain[:-1]
        return separator.join(getattr(node, field) for node in chain)

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when this row's ``parent`` cell is missing."""
        parent_id = topics.loc[self.id].parent
        return None if pd.isna(parent_id) else Topic(parent_id)

    @property
    def parent_title(self):
        """Title of the parent topic, or ``""`` when there is no parent."""
        parent = self.parent
        return "" if parent is None else parent.title

    @property
    def parent_description(self):
        """Description of the parent topic, or ``""`` when there is no parent."""
        parent = self.parent
        return "" if parent is None else parent.description

    @property
    def ancestors(self):
        """List of ancestor topics ordered from the direct parent up to the root."""
        return list(self._iter_ancestors())

    def ancestors_title(self):
        """Title of the nearest ancestor with a non-empty title, or ``""``."""
        return self._first_nonempty_ancestor_value("title")

    def ancestors_description(self):
        """Description of the nearest ancestor with a non-empty description, or ``""``."""
        return self._first_nonempty_ancestor_value("description")

    @property
    def siblings(self):
        """Other children of this topic's parent; ``[]`` when there is no parent."""
        parent = self.parent
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    @property
    def content(self):
        """Content items correlated with this topic.

        NOTE(review): this reads a module-level ``correlations_df`` indexed by
        topic id and the ``ContentItem`` class. The visible cells load the
        table as ``correlations`` and do not index it by topic id -- confirm
        the name and index before relying on this property.
        """
        if self.id in correlations_df.index:
            content_ids = correlations_df.loc[self.id].content_ids.split()
            return [ContentItem(content_id) for content_id in content_ids]
        # No correlation row: an empty tuple when the topic row is flagged
        # ``has_content``, otherwise an empty list.
        return tuple([]) if self.has_content else []

    def get_breadcrumbs_title(self, separator=" | ", include_self=True, include_root=True):
        """Join titles from this topic up to the root with ``separator``.

        ``include_self`` prepends this topic's own title; ``include_root=False``
        drops the last (root-most) element of the chain.
        """
        return self._breadcrumbs("title", separator, include_self, include_root)

    def get_breadcrumbs_description(self, separator=" | ", include_self=True, include_root=True):
        """Join descriptions from this topic up to the root with ``separator``.

        Takes the same flags as ``get_breadcrumbs_title``.
        """
        return self._breadcrumbs("description", separator, include_self, include_root)

    def get_breadcrumbs(self, separator=" | ", include_self=True, include_root=True):
        """Alias of ``get_breadcrumbs_title`` for callers that use the shorter name."""
        return self.get_breadcrumbs_title(
            separator=separator, include_self=include_self, include_root=include_root
        )

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in topics[topics.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        """Render this topic's subtree as a nested Markdown bullet list.

        Child topics are listed first, then this topic's content items (as
        ``[Kind] title``); each level is indented by two spaces per ``depth``.
        """
        parts = ["  " * depth + "- " + self.title + "\n"]
        parts.extend(child.subtree_markdown(depth=depth + 1) for child in self.children)
        parts.extend(
            "  " * (depth + 1) + "- " + "[" + item.kind.title() + "] " + item.title + "\n"
            for item in self.content
        )
        return "".join(parts)

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; hash on the id so equal
        # topics can be used in sets and as dict keys.
        return hash(self.id)

    def __getattr__(self, name):
        """Fall back to the column ``name`` of this topic's row in ``topics``.

        Raises ``AttributeError`` for unknown columns, as the attribute
        protocol (``hasattr``, ``getattr`` with a default) expects.
        """
        # ``id`` reaching here means it was never set on the instance (for
        # example an object created via ``__new__``); looking it up through
        # ``self.id`` below would recurse forever. Dunder probes are never
        # column names either.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        row = topics.loc[self.id]
        try:
            return row[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    """Thin view over one row of the module-level ``content_df`` DataFrame.

    An instance stores only the content id; other attributes (``title``,
    ``kind``, ...) are read on demand from the row of ``content_df`` whose
    index label equals ``self.id``.

    NOTE(review): the class reads module-level ``content_df`` and
    ``correlations_df``. The visible cells load these tables as ``contents``
    and ``correlations`` -- confirm the names before using this class.
    """

    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        """Topics whose ``content_ids`` string in ``correlations_df`` contains this id."""
        # regex=False: the id is matched as a literal substring, not as a pattern.
        mentions_self = correlations_df.content_ids.str.contains(self.id, regex=False)
        topic_ids = topics.loc[correlations_df[mentions_self].index].index
        return [Topic(topic_id) for topic_id in topic_ids]

    def __getattr__(self, name):
        """Fall back to the column ``name`` of this item's row in ``content_df``.

        Raises ``AttributeError`` for unknown columns, as the attribute
        protocol (``hasattr``, ``getattr`` with a default) expects.
        """
        # ``id`` reaching here means it was never set on the instance; looking
        # it up through ``self.id`` below would recurse forever. Dunder probes
        # are never column names either.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        row = content_df.loc[self.id]
        try:
            return row[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; hash on the id so equal
        # items can be used in sets and as dict keys.
        return hash(self.id)

    def get_all_breadcrumbs(self, separator=" | ", include_root=True):
        """Return one breadcrumb string per topic this item belongs to.

        Each string is the topic's title breadcrumb followed by this item's
        title; when the topic breadcrumb is empty only the item title is used.
        """
        breadcrumbs = []
        for topic in self.topics:
            # ``Topic`` exposes ``get_breadcrumbs_title``; the previously used
            # name ``get_breadcrumbs`` is not defined on it.
            trail = topic.get_breadcrumbs_title(separator=separator, include_root=include_root)
            breadcrumbs.append(trail + separator + self.title if trail else self.title)
        return breadcrumbs

In [2]:
# Directory prefix (relative path, trailing slash included) that is prepended
# to every CSV file name this notebook reads or writes.
ROOT = "../data/"

In [3]:
# Read the three input tables from ROOT, in the order topics, contents, correlations.
topics, contents, correlations = (
    pd.read_csv(ROOT + csv_name)
    for csv_name in ("topics.csv", "content.csv", "correlations.csv")
)

In [4]:
# Index both tables by their "id" column while keeping "id" as a regular
# column too (drop=False), so rows can be looked up with .loc[<id>].
topics, contents = (
    frame.set_index("id", drop=False) for frame in (topics, contents)
)

In [5]:
# Replace missing values in the text columns with empty strings; all other
# columns are left untouched.
topics = topics.fillna(dict.fromkeys(("title", "description"), ""))
contents = contents.fillna(dict.fromkeys(("title", "description", "text"), ""))

In [6]:
# Replace each topic's title with its title breadcrumb: its own title followed
# by its ancestors' titles up to the root, joined with the default " | ".
# Topic reads the global `topics` while the new column is computed, and the
# result overwrites "title", so re-running this cell nests breadcrumbs again.
topics = topics.assign(
    title=topics["id"].transform(
        lambda topic_id: Topic(topic_id).get_breadcrumbs_title()
    )
)

In [7]:
# Replace each topic's description with its description breadcrumb: its own
# description followed by its ancestors' descriptions up to the root, joined
# with the default " | ".
# Topic reads the global `topics` while the new column is computed, and the
# result overwrites "description", so re-running this cell nests breadcrumbs again.
topics = topics.assign(
    description=topics["id"].transform(
        lambda topic_id: Topic(topic_id).get_breadcrumbs_description()
    )
)

In [8]:
# Append the "text" column to "description", separated by a single space.
# The result overwrites "description", so re-running this cell appends the
# text once more.
description_with_text = contents["description"] + " " + contents["text"]
contents = contents.assign(description=description_with_text)

In [9]:
from sklearn.model_selection import GroupKFold, KFold, train_test_split
import numpy as np
# Assign each topic row to one of five validation folds.
# The index is reset to 0..n-1 first so the positional indices returned by the
# splitter can be used directly as row labels in the .loc assignment.
topics = topics.reset_index(drop=True)

fold_splitter = GroupKFold(n_splits=5)
row_positions = np.arange(len(topics))
# Each row's own id is its group label.
for fold, (_, val_idx) in enumerate(fold_splitter.split(row_positions, groups=topics.id)):
    topics.loc[val_idx, "fold"] = fold

# Cast the fold column to int once every row has been assigned a fold.
topics["fold"] = topics["fold"].astype(int)

# Restore the id index (keeping "id" as a column) for the lookups that follow.
topics = topics.set_index("id", drop=False)

In [10]:
def _topic_sample(topic, topic_id, label):
    """Build the row for the topic side of one (topic, content) pair."""
    return {
        "type": "topic",
        "title": topic.title,
        "level": topic.level,
        "description": topic.description,
        "label": label,
        "topic_id": topic_id,
        "fold": -1,
        "content_id": "",
        "category": topic.category,
        "language": topic.language,
        "topic_category": topic.category,
    }


def _content_sample(content, content_id, topic_category, label):
    """Build the row for the content side of one (topic, content) pair."""
    return {
        "type": "content",
        "title": content.title,
        "level": "",
        "description": content.description,
        "label": label,
        "topic_id": "",
        "fold": -1,
        "content_id": content_id,
        "category": content.kind,
        "language": content.language,
        "topic_category": topic_category,
    }


# For every (topic, content) pair listed in `correlations`, emit a topic row
# immediately followed by a content row. Both rows of a pair carry the same
# label, and the label increases by one per pair.
samples = []
label2 = 1_000_000  # label given to the first pair

for row in correlations.itertuples():
    topic = topics.loc[row.topic_id]

    # split() with no argument tolerates repeated or surrounding whitespace,
    # which split(" ") would turn into empty content ids.
    for cid in row.content_ids.split():
        content = contents.loc[cid]
        samples.append(_topic_sample(topic, row.topic_id, label2))
        samples.append(_content_sample(content, cid, topic.category, label2))
        label2 += 1

In [11]:
# Turn the list of per-row dicts into a DataFrame: one row per sample, one
# column per dict key.
df = pd.DataFrame(samples)

In [None]:
# Write the samples next to the input data, without the DataFrame index column.
output_path = ROOT + "train_folded_v8_reverse.csv"
df.to_csv(output_path, index=False)
