In [1]:
import git
import os
# Upstream repository and the location of the local working copy.
repo_url = 'https://github.com/sahajsoft/Pravarthan'
expanded_path = os.path.expanduser('~/Pravarthan')

# Open the working copy when the path already exists; otherwise clone it first.
if os.path.exists(expanded_path):
    repo = git.Repo(expanded_path)
else:
    repo = git.Repo.clone_from(repo_url, expanded_path)


In [None]:
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.node_parser import SentenceSplitter
import chardet

# Accumulator for the decoded contents of the repository files read below.
text = ""

def read_file_contents(file_path):
    """Read a file and decode it to text using the encoding chardet detects.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text (an empty string for an empty file), or None when
        no encoding could be detected, the detected encoding has no Python
        codec, or the bytes fail to decode with it. A message is printed
        for each None case.
    """
    # Only the read needs the handle; release it before detection/decoding.
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    # An empty file has nothing to detect and decodes to the empty string.
    if not raw_data:
        return ""

    encoding = chardet.detect(raw_data)['encoding']
    if not encoding:
        print(f"Encoding detection failed for file '{file_path}'.")
        return None

    try:
        return raw_data.decode(encoding)
    except UnicodeDecodeError:
        print(f"UnicodeDecodeError: Unable to decode file '{file_path}' with detected encoding '{encoding}'.")
        return None
    except LookupError:
        # bytes.decode raises LookupError for an encoding name without a codec;
        # treat it like any other undecodable file instead of aborting the walk.
        print(f"LookupError: Detected encoding '{encoding}' for file '{file_path}' is not supported by Python.")
        return None
        
# Walk the checkout and gather the decoded text of every readable file.
# `.git` is pruned in place so os.walk never descends into git's internal
# files, which would otherwise end up in the corpus. Directory and file names
# are sorted so the concatenation order is the same on every run.
file_texts = []
for root, dirs, files in os.walk(expanded_path):
    dirs[:] = sorted(d for d in dirs if d != '.git')
    for file_name in sorted(files):
        file_path = os.path.join(root, file_name)
        # read_file_contents returns None for files it could not decode.
        file_contents = read_file_contents(file_path)
        if file_contents is not None:
            file_texts.append(file_contents + "\n")

# Build the corpus with one join instead of repeated string concatenation.
text = "".join(file_texts)



In [None]:
# Reset first, so `nodes` is an empty list if the split below raises.
nodes = []

# Split the concatenated repository text into chunks
# (chunk_size=512, chunk_overlap=64; units as defined by SentenceSplitter).
node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)
repo_document = Document(text=text)
nodes = node_parser.get_nodes_from_documents([repo_document], show_progress=True)

In [None]:
import weaviate
import weaviate.classes.config as wc
import weaviate.classes as wvc

from InstructorEmbedding import INSTRUCTOR
# Instantiate the INSTRUCTOR model once; `embeddings` below encodes with this instance.
INSTRUCTOR_MODEL_NAME = 'hkunlp/instructor-xl'
model = INSTRUCTOR(INSTRUCTOR_MODEL_NAME)

def embeddings(text):
    """Encode `text` with the module-level `model` and return the result of `model.encode`."""
    encoded = model.encode(text)
    return encoded


client = weaviate.connect_to_local()

# Create the collection only when it is missing, so re-runs reuse the existing one.
weaviate_collections = client.collections
if not weaviate_collections.exists(name='code_embeddings'):
    weaviate_collections.create(name='code_embeddings')

5679


In [None]:
from weaviate.classes.query import Filter
from weaviate.classes.config import Property, DataType

collection = client.collections.get('code_embeddings')

print('Preparing Weaviate docs...')
weaviate_docs = []

# Build one DataObject per node: the node text is stored under 'message'
# and its embedding is attached as the object's vector.
for position, chunk in enumerate(nodes):
    chunk_vector = list(embeddings(chunk.text))
    weaviate_docs.append(
        wvc.data.DataObject(properties={'message': chunk.text}, vector=chunk_vector)
    )
    # Progress indicator: index of the node just prepared.
    print(position)
        


In [None]:
# insert_many sends everything it is given in a single request, so upload the
# prepared objects in fixed-size slices to keep each request small, and inspect
# every result so per-object failures are reported rather than ignored.
INSERT_BATCH_SIZE = 200
failed_inserts = {}
for start in range(0, len(weaviate_docs), INSERT_BATCH_SIZE):
    batch_result = collection.data.insert_many(weaviate_docs[start:start + INSERT_BATCH_SIZE])
    if batch_result.has_errors:
        # Error keys are positions within the slice; shift them to positions in weaviate_docs.
        for offset, error in batch_result.errors.items():
            failed_inserts[start + offset] = error

print(f"Inserted {len(weaviate_docs) - len(failed_inserts)} of {len(weaviate_docs)} objects.")
if failed_inserts:
    first_index = min(failed_inserts)
    print(f"{len(failed_inserts)} objects failed; first failure at index {first_index}: {failed_inserts[first_index]}")

In [None]:
# Requirement text used as the semantic-search query.
query_text = '''When a user changes the work status, the data must be logged in the system. The data to be stored must include the following:

Who changed the status
When was the change made
What is the change made'''

# Embed the query, then ask for the 15 nearest stored objects (with certainty metadata).
query_vector = list(embeddings(query_text))
retrieved_docs = collection.query.near_vector(
    near_vector=query_vector,
    limit=15,
    return_metadata=wvc.query.MetadataQuery(certainty=True),
)

# Keep only the stored text of each hit.
retrieved_messages = [hit.properties['message'] for hit in retrieved_docs.objects]

print(retrieved_messages)