<a href="https://colab.research.google.com/github/polyexplorer/open-llm/blob/main/KG_Builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Dependencies
# ! pip install git+https://github.com/huggingface/transformers.git@72958fcd3c98a7afdc61f953aa58c544ebda2f79
# ! pip install optimum
# ! pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7
# ! pip install langchain
# ! pip install "unstructured[pdf]"

In [4]:
! pip install helpers

Collecting helpers
  Downloading helpers-0.2.0-py3-none-any.whl (2.3 kB)
Installing collected packages: helpers
Successfully installed helpers-0.2.0


In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Directory that holds the source documents to ingest
inputdirectory = Path("/content/drive/MyDrive/ai_songwriter/literature/pdfs")
## Destination directory for the generated output csv files
outputdirectory = Path("/content/drive/MyDrive/ai_songwriter/literature/graphs")

In [2]:
## Load every document found under the input directory.
## Alternatives: PyPDFDirectoryLoader(inputdirectory) for a directory,
## PyPDFLoader(<path to one pdf>) for a single file.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Chunk size and overlap are measured with len(), i.e. in characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. Chunk 3 is shown when it exists; the
## index is clamped so a small corpus does not abort the cell with an
## IndexError, and nothing is printed when there are no chunks at all.
preview_index = min(3, len(pages) - 1)
if preview_index >= 0:
    print(pages[preview_index].page_content)

100%|██████████| 6/6 [03:47<00:00, 37.92s/it]

Number of chunks =  841
Summary....................................................................................................................................................... 86 Chapter 3: The Foundations Scale-Steps and Scales ................................................... 87 Scales and Scale-Steps ................................................................................................................................. 88

Heptatonic Scales: The Major Scale, The Three Forms of the Minor Scale............................................ 91

Solfége Revisited ........................................................................................................................................ 102

Heptatonic Scales: Introduction to Modes .............................................................................................. 106

Other Commonly Used Scales .........................................................................................................




In [3]:
import uuid
import pandas as pd

def documents2Dataframe(documents) -> pd.DataFrame:
    """Flatten document chunks into a DataFrame with one row per chunk.

    Args:
        documents: Iterable of chunk objects exposing ``page_content`` (the
            chunk text) and ``metadata`` (a mapping whose keys become columns).

    Returns:
        A DataFrame with a ``text`` column, one column per metadata key, and a
        ``chunk_id`` column holding a freshly generated UUID4 hex string.
        A metadata key named ``text`` overrides the chunk text, whereas
        ``chunk_id`` always holds the generated id. An empty input yields an
        empty DataFrame.
    """
    # Build all records in a single pass; concatenating a new list per chunk
    # would copy the accumulated rows on every iteration.
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)

# Convert the split chunks into a DataFrame (one row per chunk), then show its
# shape and first rows as a quick sanity check.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(841, 3)


Unnamed: 0,text,source,chunk_id
0,Music Theory\n\nv. 1.0\n\nThis is the book Mus...,/content/drive/MyDrive/ai_songwriter/literatur...,9435620944ae4747b2b25025a31b9cf9
1,About the Author ................................,/content/drive/MyDrive/ai_songwriter/literatur...,7d8df3f0131d4ca283b45d75196f3880
2,Summary..........................................,/content/drive/MyDrive/ai_songwriter/literatur...,31a7cacca4dd4285b0725d377d0fff71
3,Summary..........................................,/content/drive/MyDrive/ai_songwriter/literatur...,d14a494367b146ea841905d785365ad1
4,The Cycle of Fifths as a Mnemonic Device ........,/content/drive/MyDrive/ai_songwriter/literatur...,1d255581ed954b77a6a45d06029d373b


# LLM

In [5]:
# Mistral Wrapper
from transformers import AutoModelForCausalLM, AutoTokenizer,GPTQConfig, pipeline,TextStreamer
import torch
from typing import Any, List, Mapping, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

class MistralModel:
    """Wrapper around a GPTQ-quantized Mistral checkpoint and its generation pipeline.

    Construction loads the model and tokenizer (see ``get_model``) and builds a
    sampling ``text-generation`` pipeline with a ``TextStreamer`` attached.
    ``ask`` is the high-level entry point; ``predict`` / ``_predict`` expose the
    lower-level steps.
    """

    # Instruction used whenever the caller does not provide one.
    DEFAULT_INSTRUCTION = 'Think carefully and answer the given question as truthfully as possible'

    def __init__(self):
        # Refresh CUDA Memory
        torch.cuda.empty_cache()
        self.model,self.tokenizer = self.get_model()
        streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=4092,
            do_sample=True,
            temperature=0.1,
            top_k=40,
            top_p=0.95,
            repetition_penalty=1.15,
            streamer=streamer,
        )


    def format_prompt(self,prompt):
        """Wrap ``prompt`` in the ``<s>[INST] ... [/INST]`` chat markers.

        NOTE(review): ``[INST]`` is the Mistral-Instruct style template, while
        ``get_model`` loads an OpenOrca checkpoint — confirm this is the
        template that checkpoint expects.
        """
        return f"""<s>[INST] {prompt} [/INST]"""

    def generate_instruction(
        self,
        prompt:str,
        instruction:str = DEFAULT_INSTRUCTION,
        llm_template = None
    ):
        """Build the ``### Instruction / ### Input / ### Response`` prompt text.

        Args:
            prompt: Text placed in the ``### Input`` section.
            instruction: Text placed in the ``### Instruction`` section.
                ``None`` selects ``DEFAULT_INSTRUCTION`` so the literal word
                "None" never ends up in the prompt.
            llm_template: Optional callable applied to the assembled text
                (for example ``self.format_prompt``).

        Returns:
            The assembled prompt, passed through ``llm_template`` when given.
        """
        if instruction is None:
            instruction = self.DEFAULT_INSTRUCTION
        # if not llm_template:
        #     llm_template = self.format_prompt
        instruction_format = f"""### Instruction: {instruction}:

    ### Input:
    {prompt}

    ### Response:
    """
        if llm_template:
            return llm_template(instruction_format)
        return instruction_format


    def get_model(self):
        """Load the quantized causal LM and its tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        # model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
        model_name_or_path = "TheBloke/Mistral-7B-OpenOrca-GPTQ"
        # To use a different branch, change revision
        # For example: revision="main"
        quantization_config_loading = GPTQConfig(bits=4, use_exllama = False)
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path,

                                                  quantization_config=quantization_config_loading,
                                                  device_map="cuda",
                                                  trust_remote_code=True,
                                                  revision="gptq-4bit-32g-actorder_True")

        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
        return model, tokenizer

    def _predict(self, prompt):
        """Run the pipeline on the chat-formatted prompt and return its ``generated_text``."""
        torch.cuda.empty_cache()
        response =  self.pipe(self.format_prompt(prompt))[0]['generated_text']
        return response

    def predict(self,prompt):
        """Return only the model's completion for ``prompt``, stripped of whitespace.

        The pipeline output is expected to be the formatted prompt followed by
        the completion, so the prompt is removed as an exact prefix. This keeps
        completions intact even when they contain the text ``INST]``
        themselves. If the output does not start with the formatted prompt,
        fall back to splitting on the last ``INST]`` marker.
        """
        generated = self._predict(prompt)
        formatted = self.format_prompt(prompt)
        if generated.startswith(formatted):
            return generated[len(formatted):].strip()
        return generated.split(r'INST]')[-1].strip()

    def ask(self,question,instruction = None):
        """Answer ``question`` under ``instruction`` (default instruction when ``None``)."""
        formatted_prompt = self.generate_instruction(prompt=question,instruction=instruction)
        return self.predict(formatted_prompt)

class MistralLLM(LLM):
    """LangChain ``LLM`` adapter that delegates generation to a ``MistralModel``."""

    # Wrapped model instance that performs the actual generation.
    mistral_model: MistralModel

    @property
    def _llm_type(self) -> str:
        """Identifier string reported for this LLM implementation."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text forwarded to ``MistralModel.ask``.
            stop: Optional stop sequences. The wrapped model does not take
                them, so they are applied afterwards: the completion is cut
                just before the earliest occurrence of any of them.
            run_manager: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The completion text, truncated at the first stop sequence if any.
        """
        completion = self.mistral_model.ask(prompt)
        if stop:
            # Empty markers are skipped: str.find("") is 0 and would wipe the text.
            hits = [completion.find(marker) for marker in stop if marker]
            hits = [position for position in hits if position != -1]
            if hits:
                completion = completion[:min(hits)]
        return completion

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"model": self.mistral_model}

In [6]:
# Instantiate the wrapper once; construction loads the quantized model and
# tokenizer and builds the generation pipeline reused by the cells below.
model = MistralModel()

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
# Smoke-test the model with a free-form question (no custom instruction).
answer = model.ask("What is the view of philosophy on giving importance to happiness?")

Philosophy has a diverse range of views when it comes to the importance of happiness. Some philosophers, like Epicurus and Aristotle, believed that happiness was the ultimate goal in life and should be pursued above all else. They argued that a happy life involves both physical and mental well-being, as well as engaging in activities that bring pleasure and satisfaction.

   Other philosophers, such as Socrates and Plato, focused more on knowledge and virtue than on personal happiness. They believed that true happiness could only be achieved through understanding and living according to moral principles. In their view, seeking happiness for its own sake might lead to selfishness or hedonism, which would ultimately harm society as a whole.

   Despite these differing perspectives, many philosophical traditions agree that happiness can be found through a balance between personal fulfillment and contributing positively to one's community. This idea is reflected in various ethical systems,

In [27]:
# Print the string returned by ask().
print(answer)

Philosophy has a diverse range of views when it comes to the importance of happiness. Some philosophers, like Epicurus and Aristotle, believed that happiness was the ultimate goal in life and should be pursued above all else. They argued that a happy life involves both physical and mental well-being, as well as engaging in activities that bring pleasure and satisfaction.

    Other philosophers, such as Socrates and Plato, focused more on knowledge and virtue than on personal happiness. They believed that true happiness could only be achieved through understanding and living according to moral principles. In their view, seeking happiness for its own sake might lead to selfishness or hedonism, which would ultimately harm society as a whole.

    Despite these differing perspectives, many philosophical traditions agree that happiness can be found through a balance between personal fulfillment and contributing positively to one's community. This idea is reflected in various ethical system

In [7]:
# Preview the raw text of the chunk at position 100.
print(df.iloc[100]['text'])

3.4 Heptatonic Scales: Introduction to Modes

107

Chapter 3 The Foundations Scale-Steps and Scales

by the use of musia ficta: composers routinely altered pitches to achieve the desired result. For example, the “softening” of the fourth scale degree in Lydian, or adding a Leading Tone to Dorian and Mixolydian.Because of its unique character, Phrygian was resistant to any alteration.

Figure 3.17 Modes and music ficta

Greater Modal System

In practical composition, the altered version of the mode became the version used. The resulting mixtures of mode and alteration in time yielded new scales, recognized as such by established practice. This was codified in the Greater Modal System.

Figure 3.18 The Greater Modal System (Abbreviated)

Audio 12

The Modes

(click to see video)

Note that Ionian is the Major scale and Aeolian is the Natural Minor scale. The other earlier modes (again by established practice) gradually polarized toward one or the other of these two forms. Due to the perc

In [22]:
# Prompt for the clean-up pass: asks the model to strip book/article metadata
# from a raw chunk and return a tidy version of the remaining text.
instruction = \
"""
You are a good text editor who reconstructs a given context.
You are provided a context chunk (delimited by ```). This context is an excerpt from a book/article. Your task is to Edit out unnecassary information (chapter metadata, book info, extra spaces, figures, audio etc)
Thought 1: While traversing through each sentence, If it is related to book/article metadata, delete it.
Thought 2: Think about the rest of the sentences and see if they have any errors
  Errors are Spelling Mistakes, Grammatical Errors
  Out of place special characters.
  Gibberish words/sentences
Thought 3: Create an almost verbatim version of the text (DON'T mention the author/chapters or any meta information) having all information that is present in the context. Fill out details if needed.
  Words, phrases that seem incomplete (Sic).

Make a clear consice version of the text
Remove any reference to the book/author/chapters.
"""

# Wrap the chunk in triple backticks, the delimiter the instruction refers to.
# NOTE(review): the preview cell above prints chunk 100 while this one processes chunk 101 — confirm the index.
question = f"```{df.iloc[101]['text']}```"

sample_processed = model.ask(question = question, instruction=instruction)





   Heptatonic Scales: Introduction to Modes

The Associative Method is a technique used for learning modes and understanding their characteristics. It involves classifying modes based on whether they share the same basic qualities as Major or Minor scales, and then identifying the differences between them.

Major sounding modes include Ionian, Lydian, Dorian, Mixolydian, and Locrian (with variations). On the other hand, minor sounding modes consist of Aeolian and Phrygian.

The Associative Method can be helpful in various situations, particularly when it comes to recognizing and singing modes. Some individuals suggest employing a comparable approach for this purpose.


In [23]:
from pprint import pprint
# Pretty-print the cleaned text so embedded newlines are visible as "\n".
pprint(sample_processed)

('Heptatonic Scales: Introduction to Modes\n'
 '\n'
 'The Associative Method is a technique used for learning modes and '
 'understanding their characteristics. It involves classifying modes based on '
 'whether they share the same basic qualities as Major or Minor scales, and '
 'then identifying the differences between them.\n'
 '\n'
 'Major sounding modes include Ionian, Lydian, Dorian, Mixolydian, and Locrian '
 '(with variations). On the other hand, minor sounding modes consist of '
 'Aeolian and Phrygian.\n'
 '\n'
 'The Associative Method can be helpful in various situations, particularly '
 'when it comes to recognizing and singing modes. Some individuals suggest '
 'employing a comparable approach for this purpose.')


In [30]:
# Prompt for the extraction pass: asks the model for a list of
# {node_1, node_2, edge, weight} records describing related terms in the text.
instruction = \
        """You are a network graph maker who extracts important terms and their relations from a given Summary.
        You are provided with a context chunk (delimited by ```) Your task is to extract the ontology
        of terms mentioned in the given context. These terms should represent the key concepts as per the context.
        Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.
            Terms may include object, entity, location, organization, person,
            condition, acronym, documents, service, concept, etc.
            Terms should be as atomistic as possible

        Thought 2: Think about how these terms can have one on one relation with other terms.
            Terms that are mentioned in the same sentence or the same paragraph are typically related to each other.
            Terms can be related to many other terms
        Thought 3: Find out the relation between each such related pair of terms.
        Format your output as a list of json. Each element of the list contains a pair of terms"
        and the relation between them, like the follwing:
        [
           {
               "node_1": "A concept from extracted ontology",
               "node_2": "A related and similar concept from extracted ontology",
               "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences",
               "weight":"weight of the relationship between 1 and 10"
           }, {...} at leest 5
        ]
    """
# Feed the cleaned text from the previous step, wrapped in the triple-backtick
# delimiter the instruction refers to.
question = f"```{sample_processed}```"
sample_concept = model.ask(question = question, instruction=instruction)



   [
      {
         "node_1": "Associative Method",
         "node_2": "learning modes",
         "edge": "The Associative Method is a technique used for learning modes and understanding their characteristics.",
         "weight": 10
      },
      {
         "node_1": "classification",
         "node_2": "modes",
         "edge": "It involves classifying modes based on whether they share the same basic qualities as Major or Minor scales, and then identifying the differences between them.",
         "weight": 9
      },
      {
         "node_1": "major sounding modes",
         "node_2": "minor sounding modes",
         "edge": "Major sounding modes include Ionian, Lydian, Dorian, Mixolydian, and Locrian (with variations), while minor sounding modes consist of Aeolian and Phrygian.",
         "weight": 8
      },
      {
         "node_1": "recognition",
         "node_2": "singing modes",
         "edge": "The Associative Method can be helpful in various situations, particularly 

In [32]:
import ast
# Prompt for pass 1: strip book/article metadata and tidy the raw chunk.
_TEXT_CLEANUP_INSTRUCTION = """
You are a good text editor who reconstructs a given context.
You are provided a context chunk (delimited by ```). This context is an excerpt from a book/article. Your task is to Edit out unnecassary information (chapter metadata, book info, extra spaces, figures, audio etc)
Thought 1: While traversing through each sentence, If it is related to book/article metadata, delete it.
Thought 2: Think about the rest of the sentences and see if they have any errors
  Errors are Spelling Mistakes, Grammatical Errors
  Out of place special characters.
  Gibberish words/sentences
Thought 3: Create an almost verbatim version of the text (DON'T mention the author/chapters or any meta information) having all information that is present in the context. Fill out details if needed.
  Words, phrases that seem incomplete (Sic).

Make a clear consice version of the text
Remove any reference to the book/author/chapters.
"""

# Prompt for pass 2: extract related term pairs as a list of json records.
_ONTOLOGY_INSTRUCTION = """You are a network graph maker who extracts important terms and their relations from a given Summary.
        You are provided with a context chunk (delimited by ```) Your task is to extract the ontology
        of terms mentioned in the given context. These terms should represent the key concepts as per the context.
        Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.
            Terms may include object, entity, location, organization, person,
            condition, acronym, documents, service, concept, etc.
            Terms should be as atomistic as possible

        Thought 2: Think about how these terms can have one on one relation with other terms.
            Terms that are mentioned in the same sentence or the same paragraph are typically related to each other.
            Terms can be related to many other terms
        Thought 3: Find out the relation between each such related pair of terms.
        Format your output as a list of json. Each element of the list contains a pair of terms"
        and the relation between them, like the follwing:
        [
           {
               "node_1": "A concept from extracted ontology",
               "node_2": "A related and similar concept from extracted ontology",
               "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences",
               "weight":"weight of the relationship between 1 and 10"
           }, {...} at leest 5
        ]
    """


def _parse_concept_list(raw_output):
    """Parse the model reply into a Python object; raise ValueError if nothing parses.

    The whole reply is tried first, then the span from the first ``[`` to the
    last ``]`` (covers replies that wrap the list in extra prose). Each
    candidate is tried as a Python literal and then as JSON, so JSON-only
    tokens such as ``true`` / ``false`` / ``null`` are accepted as well.
    """
    import json  # local import: only this helper needs it

    candidates = [raw_output]
    if isinstance(raw_output, str):
        start, end = raw_output.find("["), raw_output.rfind("]")
        if 0 <= start < end:
            bracketed = raw_output[start:end + 1]
            if bracketed != raw_output:
                candidates.append(bracketed)

    for candidate in candidates:
        for parse in (ast.literal_eval, json.loads):
            try:
                return parse(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
    raise ValueError("model output is not a parseable list of concepts")


def get_concepts_from_text(text: str, model: "MistralModel"):
    """Extract concept pairs from a raw text chunk with two model calls.

    Pass 1 asks the model to clean the chunk; pass 2 asks it to list related
    term pairs found in the cleaned text.

    Args:
        text: Raw chunk text.
        model: Object exposing ``ask(question=..., instruction=...)`` and
            returning the reply as a string.

    Returns:
        The parsed reply of pass 2 (the prompt requests a list of dicts with
        ``node_1``, ``node_2``, ``edge`` and ``weight``), or ``None`` when the
        reply cannot be parsed; the offending reply is printed in that case.
    """
    cleaned_text = model.ask(question=f"```{text}```", instruction=_TEXT_CLEANUP_INSTRUCTION)
    raw_concepts = model.ask(question=f"```{cleaned_text}```", instruction=_ONTOLOGY_INSTRUCTION)
    try:
        return _parse_concept_list(raw_concepts)
    except ValueError:
        # Best effort by design: report the unparseable reply and keep going.
        print("Buggy Output:", raw_concepts)
        return None


In [33]:
# Raw text of the chunk at position 101, used as the sample input below.
sample_text = df.iloc[101]['text']
print(sample_text)

3.4 Heptatonic Scales: Introduction to Modes

108

Chapter 3 The Foundations Scale-Steps and Scales

described in the treatise de Musica of the Spanish composer and theoretician Bartolomé Ramos de Pareja.

Associative Method

This sense of polarization toward either Major or Minor becomes one useful technique for learning modes and familiarization with their characteristics. The Associative Method22 classifies modes as having the same basic characteristics as either Major or Minor and then recognizes the variances.

Major Sounding Modes

Minor Sounding Modes

Ionian: Major

Aeolian: Natural Minor

Lydian: Major, raised 4

Dorian: Minor, raised 6

Mixolydian: Major, lowered 7 Phrygian: Minor, lowered 2

Locrian: Minor, lowered 2 & 5

(or Locrian: Phrygian, lowered 5)

Figure 3.19 The Associative Method for Modes

Audio 13

Associative Modes

22. Recognition of modes by association with either the Major or the Minor scale and observing the variances from these.

(click to see video)

Thi

In [34]:
# Run both passes (clean-up, then concept extraction) on the sample chunk.
concepts = get_concepts_from_text(sample_text,model)





   Heptatonic Scales: Introduction to Modes

The Associative Method is a useful technique for learning modes and familiarizing oneself with their characteristics. It involves classifying modes based on whether they share the same fundamental traits as Major or Minor scales, while also identifying differences between them.

In Major sounding modes, Ionian has a major quality, Aeolian is natural minor, Lydian features a raised 4, Dorian has a raised 6, Mixolydian has a lowered 7, and Phrygian has a lowered 2. For minor sounding modes, Aeolian is natural minor, Dorian has a minor quality with a raised 6, Lydian has a major quality with a raised 4, Mixolydian has a lowered 7, and Phrygian has a lowered 2 and 5.

The Associative Method can be visually represented in Figure 3.19. This approach proves beneficial for mode recognition and learning to hear and sing various modes. Some individuals suggest employing a comparable strategy for recognizing heptatonic scales.


   [
      {
        

In [35]:
# Inspect the parsed result; get_concepts_from_text returns None when the
# model reply could not be parsed.
print(type(concepts))
print("Concepts:",concepts)

<class 'list'>
Concepts: [{'node_1': 'Associative Method', 'node_2': 'learning modes', 'edge': 'The Associative Method is a useful technique for learning modes and familiarizing oneself with their characteristics.', 'weight': 8}, {'node_1': 'Major', 'node_2': 'minor', 'edge': 'Modes are classified based on whether they share the same fundamental traits as Major or Minor scales.', 'weight': 7}, {'node_1': 'Ionian', 'node_2': 'major quality', 'edge': 'Ionian has a major quality.', 'weight': 6}, {'node_1': 'Aeolian', 'node_2': 'natural minor', 'edge': 'Aeolian is natural minor.', 'weight': 6}, {'node_1': 'Lydian', 'node_2': 'raised 4', 'edge': 'Lydian features a raised 4.', 'weight': 6}, {'node_1': 'Dorian', 'node_2': 'raised 6', 'edge': 'Dorian has a raised 6.', 'weight': 6}, {'node_1': 'Mixolydian', 'node_2': 'lowered 7', 'edge': 'Mixolydian has a lowered 7.', 'weight': 6}, {'node_1': 'Phrygian', 'node_2': 'lowered 2 and 5', 'edge': 'Phrygian has a lowered 2 and 5.', 'weight': 6}]


In [36]:
# Restrict the run to the first 30 chunks.
sample_df = df.head(30)

In [38]:
# Show the column names of the sample frame.
sample_df.columns

Index(['text', 'source', 'chunk_id'], dtype='object')

In [42]:
def _df_concepts(row):
    """Attach extracted concepts to one DataFrame row and log them.

    Relies on the notebook-level ``model`` and ``get_concepts_from_text``.

    Args:
        row: A row with a ``text`` entry (as passed by ``DataFrame.apply``
            with ``axis=1``).

    Returns:
        The same row with a ``concepts`` entry added: the parsed concept list,
        or ``None`` when extraction failed.
    """
    import json  # local import: only needed to serialise the log line

    row['concepts'] = get_concepts_from_text(row['text'], model)
    # file.write() only accepts str, while concepts is a list (or None), so
    # append it as one JSON document per line.
    with open('logs.txt','a+') as f:
        f.write(json.dumps(row['concepts'], default=str) + "\n")
    return row


sample_df = sample_df.apply(_df_concepts, axis = 1)

"node_1":[INST] ### Instruction: 
You are a good text editor who reconstructs a given context. 
You are provided a context chunk (delimited by ```). This context is an excerpt from a book/article. Your task is to Edit out unnecassary information (chapter metadata, book info, extra spaces, figures, audio etc)
Thought 1: While traversing through each sentence, If it is related to book/article metadata, delete it. 
Thought 2: Think about the rest of the sentences and see if they have any errors
  Errors are Spelling Mistakes, Grammatical Errors
  Out of place special characters.
  Gibberish words/sentences
Thought 3: Create an almost verbatim version of the text (DON'T mention the author/chapters or any meta information) having all information that is present in the context. Fill out details if needed.
  Words, phrases that seem incomplete (Sic).

Make a clear consice version of the text
Remove any reference to the book/author/chapters.
:

    ### Input:
    ```Music Theory

v. 1.0

This 



[/INST]

   Music Theory

   This book is licensed under a Creative Commons by-nc-sa 3.0 license. See the license for more details, but that basically means you can share this book as long as you credit the author (but 

KeyboardInterrupt: ignored