# Create Local Vectorstore

Connect to the Google Docs API

In [None]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
import json
import re
import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents.base import Document

# Read-only access to Google Docs is the only scope requested.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
# Fill these in before running: the service-account file to load credentials
# from, and the ID of the document to fetch.
SERVICE_ACCOUNT_FILE = ''
DOCUMENT_ID = ''

creds = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)
service = build('docs', 'v1', credentials=creds)
# Fetch the document; the result is read below via document.get('body').
document = (
    service.documents()
    .get(documentId=DOCUMENT_ID)
    .execute()
)

Use regex patterns to extract question-and-answer data from the Google Doc

In [None]:
def extract_qa_from_doc(doc: dict) -> list:
    """Extract FAQ entries from a Google Docs document structure.

    The document is walked paragraph by paragraph. Recognised line prefixes:

    * ``FAQ #``   - starts a new FAQ entry (the whole line is kept as
      ``faq_number``).
    * ``Title:``, ``Tag:``, ``Author:`` - metadata for the current entry.
    * ``Question:`` / ``Problem:`` / ``Issue:`` (case-insensitive) - starts a
      question; following unlabelled lines are appended to it.
    * ``Answer:`` / ``Solution:`` (case-insensitive) - starts an answer;
      following unlabelled lines are appended until a paragraph containing
      italic text is reached.

    Args:
        doc: Mapping with a ``body`` -> ``content`` list of structural
            elements; only elements holding a ``paragraph`` are inspected.

    Returns:
        A list of dicts, one per complete question/answer pair. Each dict has
        ``question`` and ``answer`` (answer lines joined with newlines) plus
        whichever of ``faq_number``, ``title``, ``tag`` and ``author`` were
        seen for that entry. An empty or body-less document yields ``[]``.
    """
    # Tolerate a missing/empty body instead of failing on None.
    content = (doc.get('body') or {}).get('content') or []
    faq_list = []
    current_faq = {}
    collecting_answer = False
    collecting_question = False

    question_patterns = [
        re.compile(r'^(Question|Problem|Issue)\s*:\s*', re.IGNORECASE)
    ]
    answer_patterns = [
        re.compile(r'^(Answer|Solution)\s*:\s*', re.IGNORECASE)
    ]

    def match_patterns(text, patterns):
        """Return the first match of any pattern at the start of text, else None."""
        for pattern in patterns:
            found = pattern.match(text)
            if found:
                return found
        return None

    def strip_label(text, label):
        """Drop only the leading label, leaving later occurrences intact."""
        return text[len(label):]

    def is_complete(faq):
        return 'question' in faq and 'answer' in faq

    # Loop through every FAQ block and extract relevant information
    for element in content:
        if 'paragraph' not in element:
            continue

        paragraph = element.get('paragraph') or {}
        runs = paragraph.get('elements') or []
        text = ''.join(run.get('textRun', {}).get('content', '') for run in runs).strip()
        italic_found = any(
            run.get('textRun', {}).get('textStyle', {}).get('italic', False)
            for run in runs
        )

        # Match once per paragraph; the elif order below still decides
        # which interpretation of the line wins.
        question_match = match_patterns(text, question_patterns)
        answer_match = match_patterns(text, answer_patterns)

        if text.startswith("FAQ #"):
            if is_complete(current_faq):
                faq_list.append(current_faq)
            current_faq = {"faq_number": text}
            collecting_answer = False
            collecting_question = False
        elif text.startswith("Title:"):
            current_faq["title"] = strip_label(text, "Title:").strip()
        elif text.startswith("Tag:"):
            current_faq["tag"] = strip_label(text, "Tag:").replace('’', '\'').strip()
        elif text.startswith("Author:"):
            current_faq["author"] = strip_label(text, "Author:").strip()
        elif question_match:
            if is_complete(current_faq):
                # A further question under the same FAQ header: store the
                # finished pair and start a new entry that keeps only the
                # FAQ number (when one was seen at all).
                finished = current_faq
                faq_list.append(finished)
                current_faq = {}
                if "faq_number" in finished:
                    current_faq["faq_number"] = finished["faq_number"]
            current_faq["question"] = text[question_match.end():].strip()
            collecting_answer = False
            collecting_question = True
        elif answer_match:
            current_faq["answer"] = [text[answer_match.end():].strip()]
            collecting_answer = True
            collecting_question = False
        elif collecting_answer:
            if italic_found:
                # An italic paragraph marks the end of the answer text.
                collecting_answer = False
            elif text:
                current_faq["answer"].append(text)
        elif collecting_question and text:
            current_faq["question"] += " " + text

    # Add the last FAQ entry
    if is_complete(current_faq):
        faq_list.append(current_faq)

    # Convert answer lists to single strings
    for faq in faq_list:
        faq["answer"] = '\n'.join(faq["answer"])

    return faq_list

# Parse the fetched document into a list of FAQ dicts (question, answer, metadata).
qa_pairs = extract_qa_from_doc(document)

Convert the FAQs to Document format to store in the vectorstore

In [None]:
def transform_json_to_documents(json_data):
    """Convert FAQ dicts into ``Document`` objects for the vectorstore.

    Args:
        json_data: Iterable of dicts, each with ``question`` and ``answer``
            keys and optionally ``faq_number``, ``title``, ``tag`` and
            ``author``.

    Returns:
        A list with one ``Document`` per input dict. ``page_content`` is
        ``"Question: ...\\nAnswer: ..."``; the metadata holds ``document``
        (the FAQ number), ``title``, ``tag`` and ``author``, each defaulting
        to an empty string when the source dict lacks it.

    Raises:
        KeyError: If an item has no ``question`` or ``answer``.
    """
    documents = []
    for item in json_data:
        # Every metadata field is optional: an entry may lack a tag or FAQ
        # number, so all of them default to '' the same way.
        metadata = {
            'document': item.get('faq_number', ''),
            'title': item.get('title', ''),
            'tag': item.get('tag', ''),
            'author': item.get('author', ''),
        }
        page_content = f"Question: {item['question']}\nAnswer: {item['answer']}"
        documents.append(Document(metadata=metadata, page_content=page_content))
    return documents

# Wrap each extracted FAQ dict in a Document (page content plus metadata).
jsondocs = transform_json_to_documents(qa_pairs)

Read the incidents CSV file to get more data and convert it to Document format

In [None]:
# Load the incident log and drop rows that have no recorded solution.
incidents = pd.read_csv('fixedincidents.csv').dropna(subset=['Solution']).reset_index(drop=True)
# A missing 'Error Reported' would make the whole concatenated string NaN
# (str + NaN is NaN in pandas), so substitute an empty problem description.
content = (
    'Problem: ' +
    incidents['Error Reported'].fillna('') +
    ' \n\nSolution: ' +
    incidents['Solution']
)
# One Document per incident row; these carry no metadata.
csvdocs = [Document(page_content=text) for text in content]
n = len(content)
# Rebuilt from qa_pairs so that docs reflects the current FAQ extraction.
jsondocs = transform_json_to_documents(qa_pairs)
docs = jsondocs + csvdocs

Add all documents to a vectorstore so that queries can retrieve the documents with the most similar embeddings

In [None]:
# Embed every document with HuggingFaceEmbeddings (default settings) and
# store the "faqs" collection with ./chroma_db as the persist directory.
embedding_model = HuggingFaceEmbeddings()
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    collection_name="faqs",
    persist_directory='./chroma_db',
)