In [4]:
import sys
sys.path.append('../../')

from text2kg.core import Pipeline, PipelineEngine, Task, MultiTask, LLMTask
from text2kg.task import (
    LoadFolder,
    ExtractTranscripts,
    SummarizeTranscripts,
    SplitTranscripts,
)
import logging
from typing import List

In [2]:
class SimplePipelineEngine(PipelineEngine):
    """Simple pipeline implementation using Vanilla Python"""

    def execute(self, tasks: List[Task]):
        data = None
        for task in tasks:
            data = task.process(data)
            print(f"Completed task {task}")
        
        return data

In [17]:
class SelectFew(Task):
    """
    Select few outputs for multi-output task.
    Useful for testing :D
    """

    def __init__(self, start_index=0, count=1):
        """
        Args:
        - start_index: index to start selection from multi-output
        - count: number of outputs to select
        """
        self.start_index = start_index
        self.count = count

    def process(self, data):
        logging.info(f"Selecting one data from={data}")
        result = data[self.start_index:self.start_index + self.count]
        logging.info(f"Selected result={result}")
        return result


In [71]:
class GroupByFile(Task):
    """
    Group array-like data by file
    - input: [{file_path, summary}]
    - output: [[{file_path1, summary}], [{file_path2, summary}]]
    """

    def process(self, data):
        logging.info(f"Creating buckets for data={data}")
        buckets = {}

        for single_data in data:
            file_path = single_data["file_path"]
            if file_path not in buckets:
                buckets[file_path] = []

            buckets[file_path].append(single_data)

        result = list(buckets.values())
        logging.info(f"Created {len(result)} buckets")
        logging.info(f"Buckets={result}")
        return result

In [108]:
class CreateKnowledgeGraphs(MultiTask, LLMTask):
    """
    Invoke an LLM to create knowledge graphs based on summaries
    - single_input: [{file_path, summary}]
    - single_output: [{file_path, kg, contributors: [str]}]
    """

    DEFAULT_SYSTEM_PROMPT = """
    Given the following key concepts extracted from a university lecture transcript, list the nodes and edges in JSON format that can form a knowledge graph based on the key topics.
    Only create nodes based on large concepts. Only create an edge if the source concept is a prequisite to understand the target concept.
    """

    DEFAULT_FORMAT_PROMPT = """
    # Output Format Instructions
    Produce only JSON output and nothing else like so: ```json\n{nodes: [{id: <name of concept>}...], edges: [{source: <id of source concept node>, target: <id of target concept node>}]}\n```
    # Key Concepts
    """

    DEFAULT_PROMPT = DEFAULT_SYSTEM_PROMPT + DEFAULT_FORMAT_PROMPT

    def __init__(self, prompt=DEFAULT_PROMPT, *args, **kwargs):
        super().__init__(prompt, *args, **kwargs)

    def process_single(self, single_data):
        file_path = single_data[0]["file_path"]

        logging.info(f"Creating KG on {len(single_data)} summaries")
        joint_summaries = list(map(lambda x: x["summary"], single_data))
        joint_summaries_text = "\n".join(joint_summaries)
        # transcript = single_data['summary']

        prompt = self.prompt.strip() + joint_summaries_text
        logging.info(f"Creating prompt of length={len(prompt)}")
        logging.info(f"Creating prompt={prompt}")
        # print(f"Creating prompt={prompt}")

        response = self._llm.generate(self.model, prompt)
        llm_response = response["response"]
        logging.info(f"Received LLM response={llm_response}")

        result = {
            "file_path": file_path,
            "kg": llm_response
        }
        return result

In [111]:
pipeline = Pipeline(
    tasks=[
        LoadFolder(folder_path="/home/student/course-materials"),
        # SelectFew(),
        # SelectFew(start_index=11),
        # ExtractTranscripts(),
        # SelectFew(),
        # SplitTranscripts(max_tokens=6000),
        # SelectFew(count=4),
        # SummarizeTranscripts(host="localhost:11434"),
        # GroupByFile(),
        # CreateKnowledgeGraphs(),
    ],
    pipeline_engine=SimplePipelineEngine(),
)
output = pipeline.run()


Completed task <text2kg.task.LoadFolder object at 0x7f722f469e10>


In [114]:
# print(output)
from functools import reduce

count = len(output)
print(count)
print(output)

files = set(map(lambda x: x["file_path"], output))
print(files)

key="summary"
sum_content_length = reduce(lambda x, y: x + len(y[key]), output, 0)
min_content_length = reduce(lambda x, y: min(x, len(y[key])), output, 10_000)
max_content_length = reduce(lambda x, y: max(x, len(y[key])), output, 0)
print(f'sum_content_length={sum_content_length}')
print(f'avg_content_length={sum_content_length/len(output)}')
print(f'min_content_length={min_content_length}')
print(f'max_content_length={max_content_length}')

88
['/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-dec2_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/(Supplementary) Data Representation Lecture Recording.mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov20_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov27_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov18_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-noc4_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-sept25_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405b-sept4_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov25_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-oct30

TypeError: string indices must be integers

In [49]:
# for x in output:
#     print(x["summary"])

In [109]:
task_create_kg = CreateKnowledgeGraphs(host="localhost:11434")
kg_output = task_create_kg.process(transcript_output)
print(kg_output)

[{'file_path': '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-sept30_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', 'kg': '## Knowledge Graph in JSON format:\n\n```json\n{\n  "nodes": [\n    { "id": "Loops" },\n    { "id": "Iteration Variable" },\n    { "id": "Range Command" },\n    { "id": "For Loop Structure" },\n    { "id": "Multiplication Example" },\n    { "id": "Complex For Loop" },\n    { "id": "Nested Loops" },\n    { "id": "Number Mod Three" },\n    { "id": "For Loop" },\n    { "id": "Nested Loops" },\n    { "id": "Combinations" },\n    { "id": "Time Calculations" },\n    { "id": "12-Hour Clock" }\n  ],\n  "edges": [\n    { "source": "Loops", "target": "For Loop Structure" },\n    { "source": "Iteration Variable", "target": "For Loop Structure" },\n    { "source": "Range Command", "target": "For Loop Structure" },\n    { "source": "For Loop Structure", "target": "Multiplication Example" },\n    { "source": "For Loop Structure", "target": "Complex For Loop" },\n 

In [110]:
print(len(kg_output[0]["kg"]))

1391


In [6]:
print(f"Number of summaries: {len(output)}")
joint_summaries = "\n".join([
    single_output['summary']
    for single_output in output
])
print(joint_summaries)

Number of summaries: 31


KeyError: 'summary'

In [15]:
system_instructions = """
Given the following key concepts extracted from a university lecture transcript, list the nodes and edges that can form a knowledge graph based on the key topics.
Only create nodes based on large concepts. Only create an edge if the source concept is a prequisite to understand the target concept.
"""

format_instructions = """
# Output Format Instructions
Produce only JSON output and nothing else like so: ```json\n{nodes: [{id: <name of concept>}...], edges: [{source: <id of source concept node>, target: <id of target concept node>}]}\n```
"""

summary_instructions = """# Key Concepts{summaries}\n""".format(summaries=joint_summaries)

prompt = system_instructions + format_instructions + summary_instructions

print(prompt)


Given the following key concepts extracted from a university lecture transcript, list the nodes and edges that can form a knowledge graph based on the key topics.
Only create nodes based on large concepts. Only create an edge if the source concept is a prequisite to understand the target concept.

# Output Format Instructions
Produce only JSON output and nothing else like so: ```json
{nodes: [{id: <name of concept>}...], edges: [{source: <id of source concept node>, target: <id of target concept node>}]}
```
# Key Concepts**Key Concepts:**

* **Graduate school:** Can be challenging but rewarding, with potential for failure and low salary.
* **Midterm exam:** Scheduled for this week, covering chapters 3 and 5.
* **Loops:** Important concepts in Python programming, including while loops, range commands, and for loops.
* **Computational thinking:** Principles such as identifying similarities and recognizing patterns are essential for solving complex problems.
* **Practice problems:** Com

In [18]:
import ollama

llm = ollama.Client(host="localhost:11434")

response = llm.generate(model="gemma:7b", prompt=prompt)
print(response["response"])

```json
{
  "nodes": [
    {
      "id": "Graduate school"
    },
    {
      "id": "Loops"
    },
    {
      "id": "Computational thinking"
    },
    {
      "id": "Practice problems"
    },
    {
      "id": "Range command"
    },
    {
      "id": "For loop"
    },
    {
      "id": "Iteration variable"
    },
    {
      "id": "Range operator"
    },
    {
      "id": "Iteration range"
    },
    {
      "id": "Body of the loop"
    },
    {
      "id": "Index number"
    },
    {
      "id": "List length"
    }
  ],
  "edges": [
    {
      "source": "Graduate school",
      "target": "Practice problems"
    },
    {
      "source": "Loops",
      "target": "Practice problems"
    },
    {
      "source": "Computational thinking",
      "target": "Practice problems"
    },
    {
      "source": "Range command",
      "target": "For loop"
    },
    {
      "source": "For loop",
      "target": "Iteration variable"
    },
    {
      "source": "For loop",
      "target": "Range o

In [1]:
kg_output = [{'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Search Lecture Recording.mp4.csv', 'kg': '## Knowledge Graph in JSON format:\n\n```json\n{\n "nodes": [\n {\n "id": "Document Indexing"\n },\n {\n "id": "Search Queries"\n },\n {\n "id": "Search Results"\n },\n {\n "id": "Elastic Lunar"\n }\n ],\n "edges": [\n {\n "source": "Document Indexing",\n "target": "Search Queries"\n },\n {\n "source": "Search Queries",\n "target": "Search Results"\n },\n {\n "source": "Search Results",\n "target": "Elastic Lunar"\n }\n ]\n}\n```\n\n**Notes:**\n\n* This knowledge graph includes only the key concepts extracted from the lecture transcript, not the details of each concept.\n* The edges represent the relationships between the concepts, where the source concept is a prerequisite to understand the target concept.\n* Additional nodes could be added for more specific concepts within each key concept category, such as specific indexing techniques or query operators.'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Page Rank Lecture Recording.mp4.csv', 'kg': '```json\n{\n "nodes": [\n {\n "id": "PageRank Algorithm"\n },\n {\n "id": "Content-Based Search"\n },\n {\n "id": "Keyword Stuffing"\n },\n {\n "id": "Structural Analysis"\n },\n {\n "id": "Incoming Links"\n },\n {\n "id": "Highly Cited Papers"\n },\n {\n "id": "Markov Chain"\n },\n {\n "id": "Memoryless"\n },\n {\n "id": "Discrete Time"\n },\n {\n "id": "Stochastic"\n },\n {\n "id": "Adjacency Matrix"\n },\n {\n "id": "Page Importance Calculation"\n },\n {\n "id": "Final Probability Matrix"\n },\n {\n "id": "Power Iteration"\n },\n {\n "id": "Steady-State Vector"\n },\n {\n "id": "PageRank Value"\n }\n ],\n "edges": [\n {\n "source": "PageRank Algorithm",\n "target": "Content-Based Search"\n },\n {\n "source": "Content-Based Search",\n "target": "Keyword Stuffing"\n },\n {\n "source": "Structural Analysis",\n "target": "Incoming Links"\n },\n {\n "source": "Incoming Links",\n "target": "Highly Cited Papers"\n },\n {\n "source": "Markov Chain",\n "target": "Memoryless"\n },\n {\n "source": "Memoryless",\n "target": "Discrete Time"\n },\n {\n "source": "Discrete Time",\n "target": "Stochastic"\n },\n {\n "source": "Stochastic",\n "target": "Adjacency Matrix"\n },\n {\n "source": "Adjacency Matrix",\n "target": "Page Importance Calculation"\n },\n {\n "source": "Page Importance Calculation",\n "target": "Final Probability Matrix"\n },\n {\n "source": "Final Probability Matrix",\n "target": "Power Iteration"\n },\n {\n "source": "Power Iteration",\n "target": "Steady-State Vector"\n },\n {\n "source": "Steady-State Vector",\n "target": "PageRank Value"\n }\n ]\n}\n```'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Intro to Recommender Systems Lecture Recording.mp4.csv', 'kg': '## Knowledge Graph in JSON format based on key concepts from the lecture transcript:\n\n```json\n{\n "nodes": [\n {\n "id": "Recommender Systems"\n },\n {\n "id": "Similar to Search"\n },\n {\n "id": "Personalization"\n },\n {\n "id": "Large Data Sets"\n },\n {\n "id": "Efficiency"\n },\n {\n "id": "Explicit vs. Implicit Data Collection"\n },\n {\n "id": "Implicit Ratings"\n },\n {\n "id": "User Profiling"\n },\n {\n "id": "Collaborative Recommendation"\n },\n {\n "id": "Community-Based Recommendations"\n },\n {\n "id": "Text Extraction"\n },\n {\n "id": "Data Matrix"\n },\n {\n "id": "Missing Ratings"\n },\n {\n "id": "Nearest Neighbor Recommendation"\n },\n {\n "id": "No One Reviews"\n }\n ],\n "edges": [\n {\n "source": "Recommender Systems",\n "target": "Similar to Search"\n },\n {\n "source": "Similar to Search",\n "target": "Personalization"\n },\n {\n "source": "Personalization",\n "target": "Large Data Sets"\n },\n {\n "source": "Large Data Sets",\n "target": "Efficiency"\n },\n {\n "source": "Explicit vs. Implicit Data Collection",\n "target": "Implicit Ratings"\n },\n {\n "source": "Implicit Ratings",\n "target": "User Profiling"\n },\n {\n "source": "User Profiling",\n "target": "Collaborative Recommendation"\n },\n {\n "source": "Collaborative Recommendation",\n "target": "Community-Based Recommendations"\n },\n {\n "source": "Community-Based Recommendations",\n "target": "Text Extraction"\n },\n {\n "source": "Text Extraction",\n "target": "Data Matrix"\n },\n {\n "source": "Data Matrix",\n "target": "Missing Ratings"\n },\n {\n "source": "Missing Ratings",\n "target": "Nearest Neighbor Recommendation"\n }\n ]\n}\n```\n\nThis knowledge graph represents the key concepts from the lecture transcript and their relationships to each other. The nodes represent the key concepts, and the edges represent the relationships between them.'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Evaluating Recommender Systems Lecture Recording.mp4.csv', 'kg': '## Knowledge Graph\n\n```json\n{\n "nodes": [\n {\n "id": "Recommender Systems Evaluation"\n },\n {\n "id": "Accuracy of Recommendations"\n },\n {\n "id": "Synthetic Data Sets"\n },\n {\n "id": "Advantages of Synthetic Data Sets"\n },\n {\n "id": "Drawbacks of Synthetic Data Sets"\n },\n {\n "id": "Real-World Applicability"\n }\n ],\n "edges": [\n {\n "source": "Recommender Systems Evaluation",\n "target": "Accuracy of Recommendations"\n },\n {\n "source": "Accuracy of Recommendations",\n "target": "Synthetic Data Sets"\n },\n {\n "source": "Synthetic Data Sets",\n "target": "Advantages of Synthetic Data Sets"\n },\n {\n "source": "Advantages of Synthetic Data Sets",\n "target": "Real-World Applicability"\n }\n ]\n}\n```\n\n**Notes:**\n\n* Only large concepts are represented as nodes.\n* An edge is drawn if the source concept is a prerequisite to understand the target concept.'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Web Scraping Lecture Recording.mp4.csv', 'kg': '```json\n{\n "nodes": {\n "Web Scraping": null,\n "Information Extraction": null,\n "Meta Tags": null,\n "Structured Data": null,\n "RESTful APIs": null,\n "Cheerio": {"id": "Cheerio"},\n "Paragraph Tags": null,\n "Crawl Your Own Web Page": null\n },\n "edges": {\n "Web Scraping" -> "Information Extraction",\n "Information Extraction" -> "Structured Data",\n "Structured Data" -> "RESTful APIs",\n "Cheerio" -> "Data Extraction",\n "Data Extraction" -> "Crawl Your Own Web Page"\n }\n}\n```'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Web Crawling Lecture Recording.mp4.csv', 'kg': '```json\n{\n"nodes": [{\n"id": "Web Crawling"\n}, {\n"id": "Information Extraction"\n}, {\n"id": "PageRank Algorithm"\n}, {\n"id": "Network Structure"\n}, {\n"id": "URL"\n}, {\n"id": "RESTful Systems"\n}, {\n"id": "Multithreaded Downloader"\n}, {\n"id": "Queue"\n}, {\n"id": "Scheduler"\n}, {\n"id": "Scalable Web Crawling"\n}, {\n"id": "RESTful Design"\n}, {\n"id": "Distributed Web Crawling"\n}, {\n"id": "Crawler Policies"\n}, {\n"id": "Page Importance"\n}, {\n"id": "Page Popularity"\n}, {\n"id": "Network Structure Analysis"\n}, {\n"id": "High-PageRank pages"\n}, {\n"id": "Outgoing links"\n}, {\n"id": "Link probability"\n}, {\n"id": "Freshness"\n}, {\n"id": "Time Decay Function"\n}, {\n"id": "Backlink Count"\n}, {\n"id": "Selection Policy"\n}, {\n"id": "Revisit Policy"\n}, {\n"id": "Politeness Policy"\n}, {\n"id": "Freshness Measure"\n}, {\n"id": "Age Measure"\n}, {\n"id": "Revisit Policy"\n}, {\n"id": "Dynamic Assignment"\n}, {\n"id": "Static Assignment"\n}, {\n"id": "Robot Exclusion Standard"\n}, {\n"id": "Server Overload"\n}, {\n"id": "Parallelization"\n}, {\n"id": "Distributed Shared File Systems"\n}, {\n"id": "MapReduce"\n}, {\n"id": "Blackboard Approaches"\n}\n],\n"edges": [{\n"source": "Web Crawling",\n"target": "Information Extraction"\n}, {\n"source": "Information Extraction",\n"target": "PageRank Algorithm"\n}, {\n"source": "PageRank Algorithm",\n"target": "Network Structure"\n}, {\n"source": "Network Structure",\n"target": "URL"\n}, {\n"source": "URL",\n"target": "RESTful Systems"\n}, {\n"source": "RESTful Systems",\n"target": "Multithreaded Downloader"\n}, {\n"source": "Multithreaded Downloader",\n"target": "Queue"\n}, {\n"source": "Queue",\n"target": "Scheduler"\n}, {\n"source": "Scalable Web Crawling",\n"target": "RESTful Design"\n}, {\n"source": "RESTful Design",\n"target": "Distributed Web Crawling"\n}, {\n"source": "Distributed Web Crawling",\n"target": "Crawler Policies"\n}, {\n"source": "Crawler Policies",\n"target": "Page Importance"\n}, {\n"source": "Page Importance",\n"target": "Page Popularity"\n}, {\n"source": "Page Popularity",\n"target": "Network Structure Analysis"\n}, {\n"source": "High-PageRank pages",\n"target": "Outgoing links"\n}, {\n"source": "Outgoing links",\n"target": "Link probability"\n}, {\n"source": "Freshness",\n"target": "Time Decay Function"\n}, {\n"source": "Time Decay Function",\n"target": "Backlink Count"\n}, {\n"source": "Backlink Count",\n"target": "Selection Policy"\n}, {\n"source": "Selection Policy",\n"target": "Revisit Policy"\n}, {\n"source": "Revisit Policy",\n"target": "Politeness Policy"\n}, {\n"source": "Politeness Policy",\n"target": "Freshness Measure"\n}, {\n"source": "Fresh\n},\n}'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Attacks on Recommender Systems Lecture Recording.mp4.csv', 'kg': '**JSON format:**\n\n```json\n{\n "nodes": [\n { "id": "Recommender Systems" },\n { "id": "Shilling Attacks" },\n { "id": "Push and Nuke Attacks" },\n { "id": "Cost, Algorithm Dependence, and Detectability" },\n { "id": "Review Bombing" },\n { "id": "Nearest Neighbor Recommendations" },\n { "id": "Naïve Attack" },\n { "id": "Account Security Checks" }\n ],\n "edges": [\n { "source": "Recommender Systems", "target": "Shilling Attacks" },\n { "source": "Shilling Attacks", "target": "Push and Nuke Attacks" },\n { "source": "Push and Nuke Attacks", "target": "Cost, Algorithm Dependence, and Detectability" },\n { "source": "Review Bombing", "target": "Nearest Neighbor Recommendations" },\n { "source": "Nearest Neighbor Recommendations", "target": "Naïve Attack" },\n { "source": "Naïve Attack", "target": "Account Security Checks" }\n ]\n}\n```\n\n**Key concepts:**\n\n* Recommender Systems\n* Shilling Attacks\n* Push and Nuke Attacks\n* Cost, Algorithm Dependence, and Detectability\n* Review Bombing\n* Nearest Neighbor Recommendations\n* Naïve Attack\n* Account Security Checks\n* Cost of the Attack\n* Random Attack\n* Average Attack\n* Bandwagon Attack\n* Segment Attack'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Trust-Aware Recommender Systems Lecture Recording.mp4.csv', 'kg': '## Knowledge Graph\n\n```json\n{\n "nodes": [\n {\n "id": "Trust"\n },\n {\n "id": "Local Trust"\n },\n {\n "id": "Global Trust"\n },\n {\n "id": "Trust Matrix"\n },\n {\n "id": "Ratings Matrix"\n },\n {\n "id": "Estimated Trust Matrix"\n },\n {\n "id": "Global Trust Based on Overall Trust Ratings"\n }\n ],\n "edges": [\n {\n "source": "Trust",\n "target": "Local Trust"\n },\n {\n "source": "Trust",\n "target": "Global Trust"\n },\n {\n "source": "Local Trust",\n "target": "Trust Matrix"\n },\n {\n "source": "Global Trust",\n "target": "Trust Matrix"\n },\n {\n "source": "Trust Matrix",\n "target": "Estimated Trust Matrix"\n }\n ]\n}\n```\n\n**Notes:**\n\n* This knowledge graph includes only the key concepts from the lecture transcript. Other concepts mentioned in the transcript may not be included.\n* The edges in the knowledge graph represent the relationships between the concepts.\n* The nodes in the knowledge graph represent the concepts themselves.'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Sparsity and the Cold Start Problem Lecture Recording.mp4.csv', 'kg': '```json\n{\n"nodes": [\n { "id": "Sparsity" },\n { "id": "Cold Start Problem" },\n { "id": "Network-Based Approaches" },\n { "id": "Hybrid Approaches" },\n { "id": "Associative Retrieval" },\n { "id": "Graph-Based Approach" },\n { "id": "Bipartite network" },\n { "id": "Spreading activation" },\n { "id": "Path length" },\n { "id": "Recommendation generation" },\n { "id": "Item ranking" },\n { "id": "Path traversal" },\n { "id": "Multi-step paths" },\n { "id": "Activation level" }\n],\n"edges": [\n { "source": "Sparsity", "target": "Cold Start Problem" },\n { "source": "Cold Start Problem", "target": "Network-Based Approaches" },\n { "source": "Network-Based Approaches", "target": "Hybrid Approaches" },\n { "source": "Hybrid Approaches", "target": "Associative Retrieval" },\n { "source": "Associative Retrieval", "target": "Graph-Based Approach" },\n { "source": "Graph-Based Approach", "target": "Bipartite network" },\n { "source": "Bipartite network", "target": "Spreading activation" },\n { "source": "Spreading activation", "target": "Path length" },\n { "source": "Path length", "target": "Recommendation generation" },\n { "source": "Recommendation generation", "target": "Item ranking" },\n { "source": "Item ranking", "target": "Path traversal" },\n { "source": "Path traversal", "target": "Multi-step paths" },\n { "source": "Multi-step paths", "target": "Activation level" }\n]\n}\n```'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Item-Based Recommendation Lecture Recording.mp4.csv', 'kg': '## Knowledge Graph in JSON format based on key concepts from lecture transcript:\n\n```json\n{\n"nodes": [\n { "id": "Item-Based Nearest Neighbor Recommendation" },\n { "id": "User-Based Recommendations" },\n { "id": "Pearson Correlation" },\n { "id": "All-Pairs Similarity" },\n { "id": "Computational Complexity" },\n { "id": "Threshold" },\n { "id": "Item Similarity" },\n { "id": "Cosine Similarity" },\n { "id": "Adjusted Cosine Similarity" },\n { "id": "Vector Similarity" },\n { "id": "Euclidean Length" },\n { "id": "Dot Product" }\n],\n"edges": [\n { "source": "Item-Based Nearest Neighbor Recommendation", "target": "Item-Based Similarity" },\n { "source": "User-Based Recommendations", "target": "User-Based Similarity" },\n { "source": "Pearson Correlation", "target": "All-Pairs Similarity" },\n { "source": "All-Pairs Similarity", "target": "Computational Complexity" },\n { "source": "Computational Complexity", "target": "Threshold" },\n { "source": "Item Similarity", "target": "Item-Based Recommendation" },\n { "source": "Item Similarity", "target": "User-Based Recommendations" }\n]\n}\n```\n\n**Notes:**\n\n* This knowledge graph includes only the key concepts discussed in the lecture transcript. It does not include other concepts that may be related to the topic.\n* The edges in the graph represent the relationships between the concepts. The source concept is a prerequisite to understanding the target concept.\n* The graph does not include any information about the order in which the concepts were discussed in the lecture.'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/Indexing Lecture Recording.mp4.csv', 'kg': '```json\n{\n "nodes": [\n {\n "id": "Indexing"\n },\n {\n "id": "Search Engine"\n },\n {\n "id": "Information Retrieval"\n },\n {\n "id": "Relevance"\n },\n {\n "id": "Scalability"\n },\n {\n "id": "Search Query"\n },\n {\n "id": "Substrings"\n },\n {\n "id": "Binary Search"\n },\n {\n "id": "Precision"\n },\n {\n "id": "Recall"\n },\n {\n "id": "Term-document incidence matrix"\n },\n {\n "id": "Inverted Index"\n },\n {\n "id": "Term List"\n },\n {\n "id": "Document Incidence Matrix"\n },\n {\n "id": "Document Ranking"\n },\n {\n "id": "Term Frequency"\n }\n ],\n "edges": [\n {\n "source": "Indexing",\n "target": "Search Engine"\n },\n {\n "source": "Search Engine",\n "target": "Information Retrieval"\n },\n {\n "source": "Information Retrieval",\n "target": "Relevance"\n },\n {\n "source": "Relevance",\n "target": "Scalability"\n },\n {\n "source": "Scalability",\n "target": "Search Query"\n },\n {\n "source": "Search Query",\n "target": "Substrings"\n },\n {\n "source": "Substrings",\n "target": "Binary Search"\n },\n {\n "source": "Inverted Index",\n "target": "Term List"\n },\n {\n "source": "Term List",\n "target": "Document Incidence Matrix"\n },\n {\n "source": "Document Incidence Matrix",\n "target": "Document Ranking"\n },\n {\n "source": "Document Ranking",\n "target": "Term Frequency"\n }\n ]\n}\n```'}, {'file_path': '/opt/course-materials/COMP4601-F23/Lecture Captions/RESTful Design Lecture Recording.mp4.csv', 'kg': '## Knowledge Graph\n\n```json\n{\n "nodes": [\n {\n "id": "RESTful Design",\n "description": "Architectural style for web services that follows principles of interlinking, scalability, and other qualities found on the web."\n },\n {\n "id": "Resources",\n "description": "Conceptual representations of objects or services that are identified by URIs."\n },\n {\n "id": "Client-Server Interactions",\n "description": "Exchange of resource representations between clients and servers."\n },\n {\n "id": "Stateless",\n "description": "Services do not maintain state on the server, each request is independent."\n },\n {\n "id": "Catchable",\n "description": "Resources are designed to be easily intercepted for debugging purposes."\n },\n {\n "id": "Uniform Interface",\n "description": "Resources are treated uniformly regardless of their underlying implementation."\n },\n {\n "id": "Layered",\n "description": "Services are organized into layers, each layer has its own responsibilities."\n },\n {\n "id": "Hypermedia",\n "description": "Provides links to other resources, allowing clients to navigate and make choices."\n }\n ],\n "edges": [\n {\n "source": "Client-Server Interactions",\n "target": "Stateless"\n },\n {\n "source": "Stateless",\n "target": "Catchable"\n },\n {\n "source": "Uniform Interface",\n "target": "Layered"\n },\n {\n "source": "Layered",\n "target": "Hypermedia"\n }\n ]\n}\n```\n\n**Notes:**\n\n* This knowledge graph includes only the key concepts from the transcript that are relevant to the class.\n* The graph structure represents the relationships between the concepts.\n* Each node has a unique identifier and a description.\n* Each edge represents a relationship between two concepts.'}]

In [15]:
kg_output[0]["kg"] += "{"

In [3]:
print(len(kg_output))

12


In [27]:
import json


class ExtractJSON(MultiTask):
    """
    Extract JSON content from knowledge graph response
    Failed extractions will be documented but excluded from task output
    - single_input: {file_path, kg}
    - single_output: {file_path, nodes, edges}
    """

    def process(self, data):
        results = super().process(data)

        # Exclude failures
        succesful_results = list(filter(lambda x: x != None, results))
        num_failures = len(results) - len(succesful_results)
        logging.info(f"Number of failed extractions={num_failures}")
        return succesful_results

    def process_single(self, single_data):
        file_path = single_data["file_path"]
        raw_kg = single_data["kg"]
        logging.info(f"Extracting JSON from kg for file={file_path}")

        start_delim = "{"
        start_index = raw_kg.find(start_delim)
        end_delim = "}"
        end_index = raw_kg.rfind(end_delim)
        json_like_kg = raw_kg[start_index:end_index + len(end_delim)]
        logging.info(f"KG after string processing={json_like_kg}")

        try:
            json_kg = json.loads(json_like_kg)
            logging.info(f"Succesfully parsed as JSON={json_kg}")
        except Exception as e:
            logging.info(f"Unable to parse as JSON error={e}")
            return

        result = {
            "file_path": file_path,
            "nodes": json_kg["nodes"],
            "edges": json_kg["edges"],
        }
        return result


task_extract_json = ExtractJSON()
kg_json_output = task_extract_json.process(kg_output)

In [28]:
print(len(kg_json_output))
print(list(filter(lambda x: x == None, kg_json_output)))

10
[]
