In [5]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install -r requirements.txt

In [20]:
import os
import re
import time

import numpy as np
import pandas as pd
import psycopg2
import textract
from dotenv import load_dotenv
from langchain import PromptTemplate, LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
load_dotenv()

True

In [8]:
def files(path):
    """
    Lazily yield the names of the entries directly inside ``path`` that are files.

    Directories are skipped. Only the bare names are produced (not joined
    paths), in whatever order ``os.listdir`` reports them.
    """
    yield from (
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    )

def create_data_packet(file_name, file_type, page_number, file_content):
    """Bundle one extracted piece of text with its metadata.

    Returns a new dict with the keys ``file_name``, ``file_type``,
    ``page_number`` and ``content`` (in that order), holding the arguments
    unchanged.
    """
    return {
        "file_name": file_name,
        "file_type": file_type,
        "page_number": page_number,
        "content": file_content,
    }

def data_load(data_dir="data/"):
    """Extract text from every file directly inside ``data_dir``.

    PDF files (extension exactly ``.pdf``) are read page by page with
    ``PdfReader``; each page that yields non-empty text becomes one packet
    carrying its 1-based page number. Every other file is passed to
    ``textract`` and becomes a single packet with ``page_number=None``.

    Args:
        data_dir: Directory to scan (not recursive). Defaults to ``"data/"``.

    Returns:
        A list of dicts built by ``create_data_packet``.
    """
    final_data = []

    # os.listdir order is arbitrary; sort so the packet order is reproducible.
    # This matters because get_embedding_df slices the packets by position.
    for file_name in sorted(files(data_dir)):
        path = os.path.join(data_dir, file_name)
        _, file_type = os.path.splitext(path)
        if file_type == ".pdf":
            # PDF: one packet per page, with the page number as metadata.
            reader = PdfReader(path)
            for page_number, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                if text:  # skip pages with no extractable text
                    final_data.append(
                        create_data_packet(
                            file_name,
                            file_type,
                            page_number=page_number,
                            file_content=text,
                        )
                    )
        else:
            # Any other file type: one packet for the whole document.
            text = textract.process(path).decode("utf-8")
            final_data.append(
                create_data_packet(
                    file_name, file_type, page_number=None, file_content=text
                )
            )
    return final_data

# Extract the text of every document under data/; `data` is the input to get_embedding_df.
data = data_load()

def _retry_with_backoff(func, *args, retry_delay=5, backoff_factor=2, max_attempts=10, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying with exponential backoff on any exception.

    The wait before retry number ``k`` (1-based) is
    ``retry_delay * backoff_factor**k`` seconds. When the last attempt also
    fails, its exception is re-raised instead of silently returning ``None``.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print(f"error: {e}")
            if attempt == max_attempts:
                # Out of attempts: surface the real error to the caller.
                raise
            wait = retry_delay * (backoff_factor**attempt)
            print(f"Retry after waiting for {wait} seconds...")
            time.sleep(wait)


def get_embedding_df(data, skip_packets=5, batch_size=5):
    """Clean, chunk and embed the loaded document packets.

    Steps:
      1. Drop the first ``skip_packets`` packets, strip a fixed header string
         and bare number lines from each remaining packet's text, and join
         everything into one string.
      2. Split that string into ~1000-character chunks (100 overlap) on newlines.
      3. Embed the chunks with Vertex AI in batches of ``batch_size``.

    Args:
        data: List of packet dicts as produced by ``data_load`` (each has a
            ``"content"`` string).
        skip_packets: Number of leading packets to ignore. Defaults to 5, the
            value previously hard-coded (presumably front matter such as a
            cover/table of contents -- confirm against the source document).
        batch_size: Number of chunks sent per embedding request.

    Returns:
        A DataFrame with columns ``cid`` (1-based chunk id), ``content`` and
        ``embedding``.

    Raises:
        Exception: whatever the embedding call raised, once all retries fail.
        ValueError: if the service returns a different number of vectors than
            chunks sent.
    """
    cleaned = []
    for packet in data[skip_packets:]:
        content = packet["content"]
        # Remove the literal header string (presumably a running page header).
        content = content.replace("취업규칙  \n","")
        # Remove lines consisting only of digits followed by a space
        # (presumably page numbers).
        content = re.sub("\n[0-9]* \n","", content)
        cleaned.append(content.strip())
    contents = ' '.join(cleaned)

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"],
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
    )
    splits = text_splitter.create_documents([contents])
    chunked = [
        {"cid": cid, "content": doc.page_content}
        for cid, doc in enumerate(splits, start=1)
    ]

    embeddings_service = VertexAIEmbeddings(model_name="textembedding-gecko-multilingual@latest")

    for start in range(0, len(chunked), batch_size):
        batch = chunked[start : start + batch_size]
        vectors = _retry_with_backoff(
            embeddings_service.embed_documents, [chunk["content"] for chunk in batch]
        )
        if len(vectors) != len(batch):
            # zip() would silently leave chunks without an embedding.
            raise ValueError(
                f"expected {len(batch)} embeddings, got {len(vectors)}"
            )
        # Store the retrieved vector embeddings for each chunk back.
        for chunk, vector in zip(batch, vectors):
            chunk["embedding"] = vector

    # Explicit columns keep the schema stable even when there are no chunks.
    return pd.DataFrame(chunked, columns=["cid", "content", "embedding"])

# Chunk and embed the loaded documents; this calls the Vertex AI embedding service.
embedding_df = get_embedding_df(data)

In [26]:
# Database connection factory
def get_connection():
    """Open a new psycopg2 connection to the vector database.

    Settings are read from environment variables so the target host can be
    switched without editing code (e.g. via the ``.env`` file loaded by
    ``load_dotenv()``). Each falls back to the previously hard-coded value:

        VECTOR_DB_HOST      (default "192.168.0.127"; "172.17.2.45" was the
                             alternate host formerly kept as a commented line)
        VECTOR_DB_NAME      (default "vector_db")
        VECTOR_DB_USER      (default "postgres")
        VECTOR_DB_PASSWORD  (default kept for backward compatibility)
        VECTOR_DB_PORT      (default 5432)

    Returns:
        An open connection. The caller is responsible for closing it; using
        the connection in a ``with`` block only ends the transaction.
    """
    return psycopg2.connect(
        host=os.environ.get("VECTOR_DB_HOST", "192.168.0.127"),
        dbname=os.environ.get("VECTOR_DB_NAME", "vector_db"),
        user=os.environ.get("VECTOR_DB_USER", "postgres"),
        # NOTE(review): a password literal should not live in the notebook.
        # Set VECTOR_DB_PASSWORD and remove this fallback once every
        # environment provides it.
        password=os.environ.get("VECTOR_DB_PASSWORD", "qwer1234"),
        port=int(os.environ.get("VECTOR_DB_PORT", "5432")),
    )

In [27]:
# Database connectivity check: print the server version string.
conn = get_connection()
try:
    # `with conn` commits or rolls back the transaction but does NOT close
    # the connection, hence the explicit close() in `finally`.
    with conn, conn.cursor() as cur:
        cur.execute("select version()")
        result_one = cur.fetchone()
        print(result_one[0])
finally:
    conn.close()

PostgreSQL 15.4 (Debian 15.4-1.pgdg120+1) on aarch64-unknown-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [13]:
# Table (re)initialisation
def create_table():
    """Drop and recreate the ``vntg_embeddings`` table.

    Ensures the ``vector`` extension exists, drops any existing
    ``vntg_embeddings`` table (CASCADE), and creates it again with the columns
    ``uuid SERIAL``, ``content TEXT`` and ``embedding vector(768)``.

    WARNING: destructive -- all previously stored rows are lost.
    """
    conn = get_connection()
    try:
        # `with conn` commits on success / rolls back on error, but leaves the
        # connection open; it is closed in `finally` so it does not leak.
        with conn, conn.cursor() as cur:
            cur.execute("""
                CREATE EXTENSION IF NOT EXISTS vector            
            """)
            cur.execute("""
                DROP TABLE IF EXISTS vntg_embeddings CASCADE            
            """)
            cur.execute("""
                CREATE TABLE vntg_embeddings (
                    uuid SERIAL,
                    content TEXT,
                    embedding vector(768)
                )
            """)
    finally:
        conn.close()
# Destructive: create_table() drops and recreates vntg_embeddings each time this cell runs.
create_table() 

In [14]:
# Insert the data (source text + embedding vectors)
def insert_embeddings(df=None):
    """Insert every chunk and its embedding into ``vntg_embeddings``.

    Args:
        df: DataFrame with ``content`` and ``embedding`` columns. Defaults to
            the notebook-level ``embedding_df`` so existing no-argument calls
            keep working.

    All rows are written in a single transaction, committed on success and
    rolled back on error.
    """
    if df is None:
        df = embedding_df
    if df.empty:
        # Nothing to write; also avoids a KeyError on a column-less frame.
        return

    rows = list(zip(df["content"], df["embedding"]))

    conn = get_connection()
    try:
        # `with conn` only ends the transaction; close explicitly in `finally`.
        with conn, conn.cursor() as cur:
            # Each embedding is passed as a Python list, exactly as before.
            cur.executemany(
                "INSERT INTO vntg_embeddings (content, embedding) VALUES (%s, %s)",
                rows,
            )
    finally:
        conn.close()

# Write the rows of embedding_df into vntg_embeddings.
insert_embeddings()   

In [18]:
# Retrieve the stored chunks most relevant to a question embedding
def get_embedding_matches(vector, num_matches = 4, similarity_threshold=0.1):
    """Return the stored chunks most similar to ``vector``.

    Similarity is computed in SQL as ``1 - (embedding <=> vector)``; rows at or
    below ``similarity_threshold`` are excluded and the remainder are ordered
    by descending similarity.

    Args:
        vector: Query embedding; passed as a SQL parameter and cast to
            ``vector`` in the query.
        num_matches: Maximum number of rows to return.
        similarity_threshold: Minimum (exclusive) similarity. Defaults to 0.1,
            the value previously hard-coded.

    Returns:
        A list of ``{"cid": <uuid column>, "content": <chunk text>}`` dicts,
        best match first. Empty if nothing passes the threshold.
    """
    conn = get_connection()
    try:
        # `with conn` only ends the transaction; close explicitly in `finally`
        # so each lookup does not leak a connection.
        with conn, conn.cursor() as cur:
            cur.execute("""
            WITH vector_matches AS (
              SELECT uuid, 1 - (embedding <=> %s::vector) AS similarity
              FROM vntg_embeddings
              WHERE 1 - (embedding <=> %s::vector) > %s 
              ORDER BY similarity DESC
              LIMIT %s
            )
            SELECT t1.uuid, t1.content, t2.similarity FROM vntg_embeddings t1, vector_matches t2 WHERE t1.uuid=t2.uuid
            ORDER BY t2.similarity DESC
            """,(vector, vector, similarity_threshold, num_matches))
            results = cur.fetchall()
    finally:
        conn.close()

    # Each row is (uuid, content, similarity); the similarity is not returned.
    return [{"cid": row[0], "content": row[1]} for row in results]

In [28]:
# Run the query end to end: embed the question, fetch matching chunks, ask the LLM.
# Uses the same embedding model name as get_embedding_df.
embeddings_service = VertexAIEmbeddings(model_name="textembedding-gecko-multilingual@latest")

# Korean question, roughly: "Tell me about cases of grounds for disqualification from hiring."
question = '채용결격사유에 대한 사례 알려줘'
# Prompt with two placeholders, {context} and {question}.
prompt_template = """Answer the questions correctly only within the context provided. If the answer is
                not contained in the context, say "answer not available in context" \n\n
                Context: \n {context}?\n
                Question: \n {question} \n
                Answer:
              """

# Embed the question.
qe = embeddings_service.embed_query(question)

# Look up the most similar stored chunks (default num_matches=4).
matches = get_embedding_matches(qe)

# Wrap each matched chunk's text in a LangChain Document for the QA chain.
docs = [Document(page_content=t["content"]) for t in matches]

llm = VertexAI(temperature=0, model_name="text-bison-32k", max_output_tokens=512)

prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# chain_type="stuff": LangChain's option that places all documents into a single prompt.
stuff_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

stuff_answer = stuff_chain(
    {"input_documents": docs, "question": question}, return_only_outputs=True
)

# The generated answer is under the "output_text" key.
print(stuff_answer["output_text"])

 채용결격사유에는 다음과 같은 사유가 있다. 
① 피성년후견인 또는 피한정후견인 
② 파산자로서 복권되지 않은 자 
③ 병역의무자로서 기피 중에 있는 자 
④ 신체 또는 정신상의 장애로 직무를 감당할 수 없다고 인정되는 자 
⑤ 금고이상의 형을 받고 그 집행이 종료되거나 집행을 받지 아니하기로 확정된 후 2년을 경과하지 않은 자 
⑥ 전 근무지에서 불법적 노동행위로 해고된 자 
⑦ 채용 시 허위사실이 있는 서류를 제출한 자 
⑧ 기타 위에 준하는 채용결격사유에 해당하는 자
