In [None]:
from langchain.document_loaders.github import GitHubIssuesLoader
import os

from dotenv import load_dotenv

# Run python-dotenv's loader before reading the token from the environment
# (presumably so a local .env file can supply it -- confirm).
load_dotenv()

# GitHub access token taken from the `GHA` environment variable.
# This is None when the variable is not set.
GHA_TOKEN = os.environ.get('GHA')


In [None]:
# Load the issues of the biojava/biojava repository; pull requests are
# excluded via include_prs=False.
loader = GitHubIssuesLoader(
    repo="biojava/biojava",
    access_token=GHA_TOKEN,
    include_prs=False,
)
docs = loader.load()

# Only the final expression of a cell is rendered by Jupyter; the two
# expressions before it are evaluated and their values discarded.
docs[0].to_json()
len(docs)
docs[0]

In [None]:
len(docs)  # evaluated but not displayed: it is not the cell's last expression
# Preview: the first 100 characters of every loaded issue, separated by rules.
summaries = list(map(lambda issue: issue.page_content[:100], docs))
print('\n----\n'.join(summaries))


In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
# Pass the loaded issues through LangChain's metadata filter before splitting.
# NOTE: the name `filtered` is reassigned in a later cell (to filtered chunks),
# so the value seen here depends on which cell ran last.
filtered = filter_complex_metadata(documents=docs)

In [None]:
# Splitter that is actually applied below.
r = RecursiveCharacterTextSplitter(chunk_overlap=40, chunk_size=240)
# Alternative splitter; constructed but not used in the visible cells.
c = CharacterTextSplitter(chunk_overlap=20, chunk_size=100)

# Break every filtered issue document into chunks.
all_chunks = r.split_documents(documents=filtered)

# Quick inspection; Jupyter renders only the last of these expressions.
all_chunks[0]
len(all_chunks)
all_chunks[0].metadata
    

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata


In [None]:
# Run the chunks through LangChain's metadata filter, then strip the
# `is_pull_request` flag from each chunk before it goes into the vector store.
filtered = filter_complex_metadata(all_chunks)
for f in filtered:
    # pop() with a default instead of `del`: a chunk that lacks the key
    # (e.g. when this cell is re-run on chunks that were already stripped)
    # no longer raises KeyError.
    f.metadata.pop('is_pull_request', None)

# Show the cleaned metadata of the first chunk as a sanity check.
filtered[0].metadata

In [None]:
# Embed the chunks with OpenAI embeddings and store them in a Chroma
# collection that uses ./chroma2 as its persist directory.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# into the persisted collection -- confirm and clear the collection if so.
v_store = Chroma.from_documents(
    documents=filtered,
    embedding=OpenAIEmbeddings(),
    persist_directory="./chroma2",
)

# Number of entries now held by the underlying collection.
v_store._collection.count()

In [None]:
# `similarity_search` has no `score_threshold` parameter; the threshold is only
# honoured by `similarity_search_with_relevance_scores`, which returns
# (document, relevance) pairs with relevance normalised to the 0..1 range.
# NOTE: with the threshold really applied, fewer than k documents (possibly
# none) can come back -- lower 0.8 if the result is unexpectedly empty.
scored_results = v_store.similarity_search_with_relevance_scores(
    query="alignment",
    k=4,
    score_threshold=0.8,
)

# Keep `results` as a plain list of documents, as before.
results = [doc for doc, _relevance in scored_results]
results

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo


In [None]:
# Metadata fields described to the self-query retriever:
# (field name, description, declared type).
att_info = [
    AttributeInfo(name=field, description=blurb, type=kind)
    for field, blurb, kind in (
        ("creator", "the creator of the Github issue", "string"),
        ("created_at", "the timestamp of issue creation in ISO-8601 format", "datetime.date"),
    )
]

# Self-query retriever over the Chroma store, driven by a default OpenAI LLM.
# NOTE(review): the `where_document` filter hard-codes the text "GANGSTA" for
# every search made through this retriever; this looks like a leftover
# experiment -- confirm it is intended.
sqretriever = SelfQueryRetriever.from_llm(
    llm=OpenAI(),
    vectorstore=v_store,
    document_contents="Bio java issues",
    metadata_field_info=att_info,
    search_kwargs={'k': 3, 'where_document': {"$contains": "GANGSTA"}},
    verbose=True,
)

In [None]:
# Natural-language query that mixes a topic ("marking deprecated code") with a
# metadata constraint (creator richarda23) for the self-query retriever.
results = sqretriever.get_relevant_documents(
    "marking deprecated code created by richarda23",
    verbose=True,
)
results


In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models  import ChatOpenAI
# Chat model used by the QA chain in the next cell.
llm = ChatOpenAI(model="gpt-4", temperature=0.1, verbose=True)

# Plain vector-store retriever configured with k=4 and the same hard-coded
# "GANGSTA" document filter as the self-query retriever.
retriever = v_store.as_retriever(
    search_kwargs=dict(k=4, where_document={"$contains": "GANGSTA"}),
)
vector_results = retriever.get_relevant_documents("What issues are about alignment")

vector_results

In [None]:
# "stuff" RetrievalQA chain. Note that it is wired to the self-query retriever
# (`sqretriever`), not to the plain `retriever` built in the previous cell.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=sqretriever,
    verbose=True,
)

# Ask a single question and display what the chain returns.
result = qa_chain(dict(query="what is the current test coverage in biojava-core  "))
result

In [None]:
# Re-display `result` from the QA cell above.
result

In [None]:
# Cleanup step, left disabled: uncommenting the next line calls
# delete_collection() on the vector store.
#v_store.delete_collection()