In [None]:
from langchain.document_loaders.github import GitHubIssuesLoader
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# GitHub access token handed to the issues loader in a later cell.
GHA_TOKEN = os.getenv('GHA')

# Show only WHETHER the OpenAI key is configured. A bare
# os.getenv('OPENAI_API_KEY') as the last expression of the cell would be
# echoed by Jupyter and the secret would end up in the saved notebook output.
bool(os.getenv('OPENAI_API_KEY'))

In [193]:
# Load issues from the biojava/biojava repository; include_prs=False leaves
# pull requests out of the result.
loader = GitHubIssuesLoader(
    repo="biojava/biojava",
    access_token=GHA_TOKEN,
    include_prs=False,
)
docs = loader.load()

# Preview the first loaded document as the cell output. (The discarded
# mid-cell expressions `docs[0].to_json()` and `len(docs)` were removed:
# only the last expression of a cell is displayed.)
docs[0]

Document(page_content='Is biojava ready to support java17? If not what is the plan to migrate it to java17?\r\n\r\nThanks for maintaining this awesome library. It is really helpful.', metadata={'url': 'https://github.com/biojava/biojava/issues/1071', 'title': 'New release based on Java 17', 'creator': 'rizwan-ishtiaq', 'created_at': '2023-07-27T11:03:12Z', 'comments': 1, 'state': 'open', 'labels': [], 'assignee': None, 'milestone': None, 'locked': False, 'number': 1071, 'is_pull_request': False})

In [194]:
# Print the first 100 characters of every issue body, separated by a
# '----' rule, to get a quick feel for the content.
summaries = [d.page_content[:100] for d in docs]
print('\n----\n'.join(summaries))


Is biojava ready to support java17? If not what is the plan to migrate it to java17?

Thanks for m
----

While analyzing your project using our Automatic bug fixing software, i noticed that sonar rule S2
----
Hi,

After suggesting that you correct this type of defect automatically (https://github.com/bioja
----
Hi!

We notice that you use the loop structure in your test cases.
For example, testQualityScores
----
I am using BioJava through Maven, using biojava 6.0.5 and biojava-ws 6.0.4 (because trying to use 6.
----
I think `Profile` was indended to be general enough to represent multiple sequence alignments. Howev
----
Given a transcript sequence derived from a GeneSequence defined on reverse strand of a chromosomal s
----
Hi,
 Been looking at the ontology module as it might be useful in my day job. 
 I tried parsing th
----
...while setting all the metadata consistently and handling ligands intuitively (i.e. keeping the ad
----
D-AminoAcids (https://proteopedia.org/wiki/index.php/Ami

In [195]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
# Strip metadata entries that are not simple scalar values. For docs[0] the
# `labels` list and the None-valued `assignee` / `milestone` fields disappear
# (compare the Document shown earlier with `filtered[0].metadata` below).
# Presumably needed so the documents can be stored in the vector store — confirm.
filtered = filter_complex_metadata(docs)

In [196]:
# Split the metadata-filtered issues into chunks of at most 240 characters
# with a 40-character overlap between neighbouring chunks.
# (An unused CharacterTextSplitter bound to `c` and a discarded
# `all_chunks[0]` expression were removed from this cell.)
r = RecursiveCharacterTextSplitter(chunk_size=240, chunk_overlap=40)
all_chunks = r.split_documents(filtered)

# Number of chunks produced, shown as the cell output.
len(all_chunks)
    

369

In [197]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata


In [198]:
# `filtered` was already computed from `docs` in an earlier cell (and the
# chunking cell depends on it), so it is not recomputed here; just inspect
# which metadata keys survive the filtering for the first issue.
filtered[0].metadata

{'url': 'https://github.com/biojava/biojava/issues/1071',
 'title': 'New release based on Java 17',
 'creator': 'rizwan-ishtiaq',
 'created_at': '2023-07-27T11:03:12Z',
 'comments': 1,
 'state': 'open',
 'locked': False,
 'number': 1071,
 'is_pull_request': False}

In [199]:
  from collections import Counter
  # Collect the creator of every issue. `authors` was previously referenced
  # without ever being defined, which raises NameError on a fresh kernel.
  authors = [f.metadata['creator'] for f in filtered]
  c = Counter(authors)
  # Keep the five most frequent creators instead of discarding the result.
  top_authors = c.most_common(5)
  # Issues opened by one specific creator; displayed as the cell output.
  mine = [f for f in filtered if f.metadata['creator'] == 'richarda23']
  mine
  

[Document(page_content="Given a transcript sequence derived from a GeneSequence defined on reverse strand of a chromosomal sequence, the method getCodingSequence doesn't return the reverse complement, but just the reverse.\r\n\r\nThis seems to be due to the complement being made twice:\r\n\r\n```java\r\nString sequence = this.getSequenceAsString(getBioBegin(), getBioEnd(), getStrand()); \r\n```\r\nin CDSSequence# getCodingSequence() \r\n\r\ngenerates the reverse complement, but subsequent code makes a complementary sequence again, so the end result is simply a reversed sequence.\r\n\r\nA simple fix might be just to remove this second complementation, but possibly it might be needed in some circumstances.\r\n\r\nThe bug is exposed by test TranscriptSequenceTest#getCDNASeqNegativeStrand() \r\n\r\n", metadata={'url': 'https://github.com/biojava/biojava/issues/1016', 'title': "o.b.n.core.sequence.CDSSequence doesn't reverse-complement correctly", 'creator': 'richarda23', 'created_at': '202

In [200]:
# Embed every chunk with OpenAI embeddings and store the vectors in a Chroma
# collection persisted under ./chroma.
# NOTE(review): because the collection is persisted on disk, re-running this
# cell may add the same chunks a second time — confirm, and clear the
# collection first if so.
v_store = Chroma.from_documents(
    documents=all_chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory="./chroma",
)

# Number of entries in the underlying collection (reads a private attribute).
v_store._collection.count()

369

In [201]:
# Fetch up to four chunks most similar to the single word "alignment".
# NOTE(review): `score_threshold` is passed as an extra keyword argument;
# confirm that this vector store's similarity_search actually honours it
# rather than silently ignoring it.
results = v_store.similarity_search(
    query="alignment",
    k=4,
    score_threshold=0.8,
)
results

[Document(page_content='method will align sequences using this object.', metadata={'comments': 0, 'created_at': '2016-08-04T14:46:54Z', 'creator': 'lafita', 'is_pull_request': False, 'locked': False, 'milestone': '6.1.0', 'number': 550, 'state': 'open', 'title': 'Create SequenceAligner objects without specifying query and target sequences', 'url': 'https://github.com/biojava/biojava/issues/550'}),
 Document(page_content='A missing feature in the alignment package is that of semiglobal alignments, useful for aligning a short query sequence against a large target database (e.g. a genome).', metadata={'assignee': 'josemduarte', 'comments': 4, 'created_at': '2015-01-26T14:06:08Z', 'creator': 'josemduarte', 'is_pull_request': False, 'locked': False, 'number': 243, 'state': 'open', 'title': 'Introduction of semiglobal alignments', 'url': 'https://github.com/biojava/biojava/issues/243'}),
 Document(page_content='See [this wikipedia paragraph](http://en.wikipedia.org/wiki/Sequence_alignment#Gl

In [202]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo


In [203]:
# Metadata fields the self-query retriever is allowed to filter on.
# NOTE(review): `created_at` is stored as an ISO-8601 string (see the metadata
# printed above) but is declared here as "datetime.date" — confirm that the
# vector store can evaluate the comparisons the LLM may generate for it.
att_info = [
    AttributeInfo(
        name="creator",
        description="the creator of the Github issue",
        type="string",
    ),
    AttributeInfo(
        name="created_at",
        description="the timestamp of issue creation in ISO-8601 format",
        type="datetime.date",
    ),
]

# Retriever that asks an LLM to translate a natural-language question into a
# search string plus a metadata filter over the fields declared above.
# temperature=0 keeps that translation deterministic; the default sampling
# temperature can yield a different filter for the same question on each run.
sqretriever = SelfQueryRetriever.from_llm(
    llm=OpenAI(temperature=0),
    vectorstore=v_store,
    metadata_field_info=att_info,
    document_contents="Bio java issues",
    verbose=True,
)

In [231]:
# Ask the self-query retriever for issues by a specific creator. The
# structured query it derives (search text + metadata filter) is echoed in
# the cell output. Note that this rebinds `results` from the earlier
# similarity-search cell.
results = sqretriever.get_relevant_documents(
    "created by richarda23",
    verbose=True,
)
results




query=' ' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='creator', value='richarda23') limit=None


[Document(page_content="without tests it's sometimes hard to be sure", metadata={'comments': 6, 'created_at': '2021-07-23T16:43:52Z', 'creator': 'richarda23', 'is_pull_request': False, 'locked': False, 'number': 944, 'state': 'open', 'title': 'Improve test coverage - currently 49.5% in bio-java core', 'url': 'https://github.com/biojava/biojava/issues/944'}),
 Document(page_content="I'd be happy to take a stab at some of these tasks unless there are strong contrary opinions.", metadata={'comments': 1, 'created_at': '2022-01-04T16:23:43Z', 'creator': 'richarda23', 'is_pull_request': False, 'locked': False, 'number': 1013, 'state': 'open', 'title': 'Maintenance  of  ontology module', 'url': 'https://github.com/biojava/biojava/issues/1013'}),
 Document(page_content='- adding a README to the project\r\n- updating code style/formatting and using more modern Java 7/8 idioms (once test coverage is much better)', metadata={'comments': 1, 'created_at': '2022-01-04T16:23:43Z', 'creator': 'richard

In [232]:
from langchain.chains import RetrievalQA
from langchain.chat_models  import ChatOpenAI
# Chat model that the question-answering chain in a later cell uses.
llm = ChatOpenAI(
    model="gpt-4",
    temperature=0.9,
    verbose=True,
)

# Plain vector-similarity retriever over the Chroma store, limited to the
# four closest chunks per query (no metadata filtering).
retriever = v_store.as_retriever(search_kwargs={'k': 4})

vector_results = retriever.get_relevant_documents(
    query="What issues are about alignment"
)
vector_results

[Document(page_content='A missing feature in the alignment package is that of semiglobal alignments, useful for aligning a short query sequence against a large target database (e.g. a genome).', metadata={'assignee': 'josemduarte', 'comments': 4, 'created_at': '2015-01-26T14:06:08Z', 'creator': 'josemduarte', 'is_pull_request': False, 'locked': False, 'number': 243, 'state': 'open', 'title': 'Introduction of semiglobal alignments', 'url': 'https://github.com/biojava/biojava/issues/243'}),
 Document(page_content='I was wondering what is the best solution, because another option could be to import or implement one of the already published non-sequential structure alignment algorithms: e.g. VAST, GANGSTA, SNAP, FlexSnap, SCALI, MICAN, etc', metadata={'comments': 3, 'created_at': '2015-07-31T14:45:05Z', 'creator': 'lafita', 'is_pull_request': False, 'locked': False, 'number': 307, 'state': 'open', 'title': 'New StructuralAlignment Algorithms', 'url': 'https://github.com/biojava/biojava/iss

In [242]:
# Question-answering chain of type 'stuff' that pulls its context from the
# self-query retriever and answers with the chat model `llm`.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type='stuff',
    retriever=sqretriever,
    verbose=True,
)

# Run a single question through the chain; the returned dict (query + result)
# is displayed as the cell output.
result = qa_chain(
    {"query": "what is the current test coverage in biojava-core  "}
)
result



[1m> Entering new RetrievalQA chain...[0m
query='test coverage in biojava-core' filter=None limit=None

[1m> Finished chain.[0m


{'query': 'what is the current test coverage in biojava-core  ',
 'result': 'The text does not provide information on the current test coverage in biojava-core.'}

In [226]:
# Display whatever `result` currently holds in the kernel.
# NOTE(review): the saved output below answers 'What issues are about
# alignment', not the query assigned to `result` in the RetrievalQA cell, and
# this cell's execution count is lower than that cell's — it appears to be
# left over from an earlier, out-of-order run. Confirm and re-run top to bottom.
result

{'query': 'What issues are about alignment',
 'result': 'The context provided talks about a missing feature in the alignment package - that of semiglobal alignments, which are useful when aligning a short query sequence against a large target database. The context also mentions the possibility of adopting a non-sequential structure alignment algorithm as an alternative option.'}

In [208]:
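# Cleanup (run manually, not as part of a normal top-to-bottom run):
# uncomment the line below to delete the Chroma collection held by `v_store`,
# e.g. before rebuilding the vector store from scratch.
#v_store.delete_collection()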