In [50]:
import sys
sys.path.append('../../')

from text2kg.core import Pipeline, PipelineEngine, Task, MultiTask, LLMTask
from text2kg.task import (
    LoadFolder,
    ExtractTranscripts,
    SummarizeTranscripts,
    SplitTranscripts,
)
import logging
from typing import List

In [2]:
class SimplePipelineEngine(PipelineEngine):
    """Simple pipeline implementation using Vanilla Python"""

    def execute(self, tasks: List[Task]):
        data = None
        for task in tasks:
            data = task.process(data)
            print(f"Completed task {task}")
        
        return data

In [17]:
class SelectFew(Task):
    """
    Select few outputs for multi-output task.
    Useful for testing :D
    """

    def __init__(self, start_index=0, count=1):
        """
        Args:
        - start_index: index to start selection from multi-output
        - count: number of outputs to select
        """
        self.start_index = start_index
        self.count = count

    def process(self, data):
        logging.info(f"Selecting one data from={data}")
        result = data[self.start_index:self.start_index + self.count]
        logging.info(f"Selected result={result}")
        return result


In [71]:
class GroupByFile(Task):
    """
    Group array-like data by file
    - input: [{file_path, summary}]
    - output: [[{file_path1, summary}], [{file_path2, summary}]]
    """

    def process(self, data):
        logging.info(f"Creating buckets for data={data}")
        buckets = {}

        for single_data in data:
            file_path = single_data["file_path"]
            if file_path not in buckets:
                buckets[file_path] = []

            buckets[file_path].append(single_data)

        result = list(buckets.values())
        logging.info(f"Created {len(result)} buckets")
        logging.info(f"Buckets={result}")
        return result

In [108]:
class CreateKnowledgeGraphs(MultiTask, LLMTask):
    """
    Invoke an LLM to create knowledge graphs based on summaries
    - single_input: [{file_path, summary}]
    - single_output: [{file_path, kg, contributors: [str]}]
    """

    DEFAULT_SYSTEM_PROMPT = """
    Given the following key concepts extracted from a university lecture transcript, list the nodes and edges in JSON format that can form a knowledge graph based on the key topics.
    Only create nodes based on large concepts. Only create an edge if the source concept is a prequisite to understand the target concept.
    """

    DEFAULT_FORMAT_PROMPT = """
    # Output Format Instructions
    Produce only JSON output and nothing else like so: ```json\n{nodes: [{id: <name of concept>}...], edges: [{source: <id of source concept node>, target: <id of target concept node>}]}\n```
    # Key Concepts
    """

    DEFAULT_PROMPT = DEFAULT_SYSTEM_PROMPT + DEFAULT_FORMAT_PROMPT

    def __init__(self, prompt=DEFAULT_PROMPT, *args, **kwargs):
        super().__init__(prompt, *args, **kwargs)

    def process_single(self, single_data):
        file_path = single_data[0]["file_path"]

        logging.info(f"Creating KG on {len(single_data)} summaries")
        joint_summaries = list(map(lambda x: x["summary"], single_data))
        joint_summaries_text = "\n".join(joint_summaries)
        # transcript = single_data['summary']

        prompt = self.prompt.strip() + joint_summaries_text
        logging.info(f"Creating prompt of length={len(prompt)}")
        logging.info(f"Creating prompt={prompt}")
        # print(f"Creating prompt={prompt}")

        response = self._llm.generate(self.model, prompt)
        llm_response = response["response"]
        logging.info(f"Received LLM response={llm_response}")

        result = {
            "file_path": file_path,
            "kg": llm_response
        }
        return result

In [111]:
pipeline = Pipeline(
    tasks=[
        LoadFolder(folder_path="/home/student/course-materials"),
        # SelectFew(),
        # SelectFew(start_index=11),
        # ExtractTranscripts(),
        # SelectFew(),
        # SplitTranscripts(max_tokens=6000),
        # SelectFew(count=4),
        # SummarizeTranscripts(host="localhost:11434"),
        # GroupByFile(),
        # CreateKnowledgeGraphs(),
    ],
    pipeline_engine=SimplePipelineEngine(),
)
output = pipeline.run()


Completed task <text2kg.task.LoadFolder object at 0x7f722f469e10>


In [114]:
# print(output)
from functools import reduce

count = len(output)
print(count)
print(output)

files = set(map(lambda x: x["file_path"], output))
print(files)

key="summary"
sum_content_length = reduce(lambda x, y: x + len(y[key]), output, 0)
min_content_length = reduce(lambda x, y: min(x, len(y[key])), output, 10_000)
max_content_length = reduce(lambda x, y: max(x, len(y[key])), output, 0)
print(f'sum_content_length={sum_content_length}')
print(f'avg_content_length={sum_content_length/len(output)}')
print(f'min_content_length={min_content_length}')
print(f'max_content_length={max_content_length}')

88
['/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-dec2_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/(Supplementary) Data Representation Lecture Recording.mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov20_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov27_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov18_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-noc4_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-sept25_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405b-sept4_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-nov25_(Source).mp4.csv', '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-oct30

TypeError: string indices must be integers

In [49]:
# for x in output:
#     print(x["summary"])

In [109]:
task_create_kg = CreateKnowledgeGraphs(host="localhost:11434")
kg_output = task_create_kg.process(transcript_output)
print(kg_output)

[{'file_path': '/home/student/course-materials/COMP1405-F19/LectureCaptions/1405-sept30_(SD_Large_-_WEB_MBL_(H264_1500)).mp4.csv', 'kg': '## Knowledge Graph in JSON format:\n\n```json\n{\n  "nodes": [\n    { "id": "Loops" },\n    { "id": "Iteration Variable" },\n    { "id": "Range Command" },\n    { "id": "For Loop Structure" },\n    { "id": "Multiplication Example" },\n    { "id": "Complex For Loop" },\n    { "id": "Nested Loops" },\n    { "id": "Number Mod Three" },\n    { "id": "For Loop" },\n    { "id": "Nested Loops" },\n    { "id": "Combinations" },\n    { "id": "Time Calculations" },\n    { "id": "12-Hour Clock" }\n  ],\n  "edges": [\n    { "source": "Loops", "target": "For Loop Structure" },\n    { "source": "Iteration Variable", "target": "For Loop Structure" },\n    { "source": "Range Command", "target": "For Loop Structure" },\n    { "source": "For Loop Structure", "target": "Multiplication Example" },\n    { "source": "For Loop Structure", "target": "Complex For Loop" },\n 

In [110]:
print(len(kg_output[0]["kg"]))

1391


In [6]:
print(f"Number of summaries: {len(output)}")
joint_summaries = "\n".join([
    single_output['summary']
    for single_output in output
])
print(joint_summaries)

Number of summaries: 31


KeyError: 'summary'

In [15]:
system_instructions = """
Given the following key concepts extracted from a university lecture transcript, list the nodes and edges that can form a knowledge graph based on the key topics.
Only create nodes based on large concepts. Only create an edge if the source concept is a prequisite to understand the target concept.
"""

format_instructions = """
# Output Format Instructions
Produce only JSON output and nothing else like so: ```json\n{nodes: [{id: <name of concept>}...], edges: [{source: <id of source concept node>, target: <id of target concept node>}]}\n```
"""

summary_instructions = """# Key Concepts{summaries}\n""".format(summaries=joint_summaries)

prompt = system_instructions + format_instructions + summary_instructions

print(prompt)


Given the following key concepts extracted from a university lecture transcript, list the nodes and edges that can form a knowledge graph based on the key topics.
Only create nodes based on large concepts. Only create an edge if the source concept is a prequisite to understand the target concept.

# Output Format Instructions
Produce only JSON output and nothing else like so: ```json
{nodes: [{id: <name of concept>}...], edges: [{source: <id of source concept node>, target: <id of target concept node>}]}
```
# Key Concepts**Key Concepts:**

* **Graduate school:** Can be challenging but rewarding, with potential for failure and low salary.
* **Midterm exam:** Scheduled for this week, covering chapters 3 and 5.
* **Loops:** Important concepts in Python programming, including while loops, range commands, and for loops.
* **Computational thinking:** Principles such as identifying similarities and recognizing patterns are essential for solving complex problems.
* **Practice problems:** Com

In [18]:
import ollama

llm = ollama.Client(host="localhost:11434")

response = llm.generate(model="gemma:7b", prompt=prompt)
print(response["response"])

```json
{
  "nodes": [
    {
      "id": "Graduate school"
    },
    {
      "id": "Loops"
    },
    {
      "id": "Computational thinking"
    },
    {
      "id": "Practice problems"
    },
    {
      "id": "Range command"
    },
    {
      "id": "For loop"
    },
    {
      "id": "Iteration variable"
    },
    {
      "id": "Range operator"
    },
    {
      "id": "Iteration range"
    },
    {
      "id": "Body of the loop"
    },
    {
      "id": "Index number"
    },
    {
      "id": "List length"
    }
  ],
  "edges": [
    {
      "source": "Graduate school",
      "target": "Practice problems"
    },
    {
      "source": "Loops",
      "target": "Practice problems"
    },
    {
      "source": "Computational thinking",
      "target": "Practice problems"
    },
    {
      "source": "Range command",
      "target": "For loop"
    },
    {
      "source": "For loop",
      "target": "Iteration variable"
    },
    {
      "source": "For loop",
      "target": "Range o