In [2]:
output_dir = 'us_election_json'
input_dir = 'us_election'
weaviate_url = "http://localhost:8080"
embedding_model_name = 'all-MiniLM-L6-v2' # sentence-transformers model 
device = 'cuda' # multi-processing server

In [3]:
import subprocess
import os
from typing import List, Dict
from userpaths import get_my_documents


# def process_local(output_dir: str, num_processes: int, input_path: str = get_my_documents()):
def process_local(output_dir: str, num_processes: int, input_path: str):
        command = [
          "unstructured-ingest",
          "local",
          "--input-path", input_path,
          "--output-dir", output_dir,
          "--num-processes", str(num_processes),
          "--recursive",
          "--verbose",
        ]

        # Run the command
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        output, error = process.communicate()

        # Print output
        if process.returncode == 0:
            print('Command executed successfully. Output:')
            print(output.decode())
        else:
            print('Command failed. Error:')
            print(error.decode())

def get_result_files(folder_path) -> List[Dict]:
    file_list = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                file_list.append(file_path)
    return file_list

In [4]:
process_local(output_dir=output_dir, num_processes=2, input_path=input_dir)
files = get_result_files(output_dir)

2023-11-30 18:10:35,441 MainProcess DEBUG    options: {'input_path': 'us_election', 'output_dir': 'us_election_json', 'num_processes': 2, 'recursive': True, 'verbose': True, 'file_glob': None, 'download_dir': None, 're_download': False, 'preserve_downloads': False, 'download_only': False, 'max_docs': None, 'pdf_infer_table_structure': False, 'strategy': 'auto', 'reprocess': False, 'ocr_languages': 'eng', 'encoding': None, 'fields_include': ['element_id', 'text', 'type', 'metadata'], 'flatten_metadata': False, 'metadata_include': [], 'metadata_exclude': [], 'partition_by_api': False, 'partition_endpoint': 'https://api.unstructured.io/general/v0/general', 'api_key': None}
2023-11-30 18:10:35,893 MainProcess INFO     Processing 1 docs
2023-11-30 18:10:41,906 SpawnPoolWorker-1 INFO     Processing us_election/2012_U.S._Presidential_Election_Results_by_State.pdf
2023-11-30 18:10:41,906 SpawnPoolWorker-1 DEBUG    Using local partition
The ocr_languages kwarg will be deprecated in a future ver

Command executed successfully. Output:



In [7]:
import uuid
import weaviate
from weaviate.util import get_valid_uuid

def create_local_weaviate_client(db_url: str):
    return weaviate.Client(
        url=db_url,
    )

def get_schema(vectorizer: str = "none"):
    return {
        "classes": [
            {
                "class": "Doc",
                "description": "A generic document class",
                "vectorizer": vectorizer,
                "properties": [
                    {
                        "name": "text",
                        "dataType": ["text"],
                        "description": "Text content for the document",
                    },
                    {
                        "name": "candidate",
                        "dataType": ["text"],
                        "description": "candidate related to the document",
                    },
                    {
                        "name": "state",
                        "dataType": ["text"],
                        "description": "state related to the document",
                    },
                ],
            },
        ],
    }

def upload_schema(my_schema, weaviate):
    weaviate.schema.delete_all()
    weaviate.schema.create(my_schema)

def count_documents(client: weaviate.Client) -> Dict:
    response = (
        client.query
        .aggregate("Doc")
        .with_meta_count()
        .do()
    )
    count = response
    return count

In [8]:
client = create_local_weaviate_client(db_url=weaviate_url)
my_schema = get_schema()
upload_schema(my_schema, weaviate=client)

In [9]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.json import partition_json
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(embedding_model_name, device=device)

def compute_embedding(chunk_text: List[str]):
    embeddings = embedding_model.encode(chunk_text, device=device)
    return embeddings
    

def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    for element in elements:
        if not type(element.metadata.data_source) is DataSourceMetadata:
            delattr(element.metadata, "data_source")

        if hasattr(element.metadata, "coordinates"):
            delattr(element.metadata, "coordinates")

    chunks = chunk_by_title(
        elements,
        combine_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars
    )

    for i in range(len(chunks)):
        chunks[i] = {"text": chunks[i].text}

    chunk_texts = [x['text'] for x in chunks]
    embeddings = compute_embedding(chunk_texts)
    return chunks, embeddings


def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    for filename in files:
        try:
            elements = partition_json(filename=filename)
            chunks, embeddings = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            print(e)
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        for i, chunk in enumerate(chunks):
            client.batch.add_data_object(
                data_object=chunk,
                class_name="doc",
                uuid=get_valid_uuid(uuid.uuid4()),
                vector=embeddings[i]
            )
        
    client.batch.flush()

/home/nvidia/anaconda3/envs/llamaRAG/lib/python3.9/site-packages/huggingface_hub/inference/_text_generation.py:121: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.3/migration/
  @validator("best_of")
/home/nvidia/anaconda3/envs/llamaRAG/lib/python3.9/site-packages/huggingface_hub/inference/_text_generation.py:140: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.3/migration/
  @validator("repetition_penalty")
/home/nvidia/anaconda3/envs/llamaRAG/lib/python3.9/site

In [10]:
add_data_to_weaviate(
    files=files,
    client=client,
    chunk_under_n_chars=250,
    chunk_new_after_n_chars=500
)

print(count_documents(client=client)['data']['Aggregate']['Doc'])

Uploading 4 chunks for us_election_json/2012_U.S._Presidential_Election_Results_by_State.pdf.json.
[{'meta': {'count': 4}}]


In [11]:
from langchain.llms import LlamaCpp
from langchain.vectorstores.weaviate import Weaviate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

In [12]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 100  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="model_files/llama-2-7b-chat.Q4_K_S.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True, # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from model_files/llama-2-7b-chat.Q4_K_S.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q5_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     

In [13]:
def question_answer(question: str, vectorstore: Weaviate):
    embedding = compute_embedding(question)
    similar_docs = vectorstore.max_marginal_relevance_search_by_vector(embedding)
    content = [x.page_content for x in similar_docs]
    prompt_template = PromptTemplate.from_template(
    """\
    Given context about the subject, answer the question based on the context provided to the best of your ability.
    Context: {context}
    Question:
    {question}
    Answer:
    """
    )
    prompt = prompt_template.format(context=content, question=question)
    answer = llm(prompt)
    return answer, similar_docs


In [14]:
client = weaviate.Client(weaviate_url)
vectorstore = Weaviate(client, "Doc", "text")

question = "Give a summary the election in US States"

answer, similar_docs = question_answer(question, vectorstore)

print("\n\n\n-------------------------")
print(f"QUERY: {question}")
print("\n\n\n-------------------------")
print(f"Answer: {answer}")
print("\n\n\n-------------------------")
for index, result in enumerate(similar_docs):
    print(f"\n\n-- RESULT {index+1}:\n")
    print(result)




 According to the provided context, the 2012 United States presidential election results were as follows:
* Democrats won 20 states, including California, Oregon, Washington, Colorado, and New Mexico.
* Republicans won 28 states, including Texas, Florida, Georgia, North Carolina, and South Carolina.
* The remaining two states, Hawaii and Vermont, were won by the Democratic candidate, Barack Obama.
In terms of electoral votes, Democrats received 332 votes, while Republicans received 206 votes. The District of Columbia also supported the Democratic candidate with over 55% of the vote.
Overall, the election results indicate a narrow victory for President Obama, who won 51% of the popular vote to Governor Romney's 47%.


-------------------------
QUERY: Give a summary the election in US States



-------------------------
Answer:  According to the provided context, the 2012 United States presidential election results were as follows:
* Democrats won 20 states, including California, Oregon,


llama_print_timings:        load time =    2829.06 ms
llama_print_timings:      sample time =     107.32 ms /   177 runs   (    0.61 ms per token,  1649.32 tokens per second)
llama_print_timings: prompt eval time =   20258.47 ms /   715 tokens (   28.33 ms per token,    35.29 tokens per second)
llama_print_timings:        eval time =   13333.66 ms /   176 runs   (   75.76 ms per token,    13.20 tokens per second)
llama_print_timings:       total time =   34325.24 ms
