## Install dependencies

In [None]:
!pip install langchain
!pip install pinecone-client
!pip install openai
!pip install tiktoken
!pip install chromadb
!pip install cohere

## Import dependencies

In [None]:
import os
import openai
import pinecone
import json
import numpy as np
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone, Chroma
from langchain.llms import OpenAI, OpenAIChat
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

## Set API keys and environment

In [None]:
# Shared settings referenced by the later cells: the name of the Pinecone
# index and the cut-off used when reporting recall.
index_name = "citation-recommendation"
top_k = 10

# Credentials. Every value here is a placeholder string (it merely repeats
# its own name) -- presumably to be replaced with a real key before running.
# The OpenAI and Cohere keys are published through the process environment.
PINECONE_API_KEY = "PINECONE_API_KEY"
PINECONE_API_ENV = "PINECONE_API_ENV"
os.environ.update(
    OPENAI_API_KEY="OPENAI_API_KEY",
    COHERE_API_KEY="COHERE_API_KEY",
)

# Configure the Pinecone client with the key/environment pair above.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Splitter used to chunk documents: breaks on single spaces, chunk size 3000,
# no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=3000, chunk_overlap=0)

# Embedding model wrapper; the key is read back from the environment set above.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

# Chat model handle (gpt-3.5-turbo).
llms = OpenAIChat(model_name="gpt-3.5-turbo")

## Load data

In [None]:
def _load_json(path):
    """Parse the JSON file at *path* and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes, instead of being left for the garbage
    collector. The encoding is pinned to UTF-8 so the result does not depend
    on the platform default.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Locations of the dataset files.
papers_file_path = '/content/drive/MyDrive/data/custom/papers.json'
contexts_file_path = '/content/drive/MyDrive/data/custom/contexts.json'
test_file_path = '/content/drive/MyDrive/data/custom/test.json'
train_file_path = '/content/drive/MyDrive/data/custom/train.json'
val_file_path = '/content/drive/MyDrive/data/custom/val.json'

papers_data = _load_json(papers_file_path)
contexts_data = _load_json(contexts_file_path)
test_data = _load_json(test_file_path)
train_data = _load_json(train_file_path)
val_data = _load_json(val_file_path)

# Sanity check: show the masked text of the first validation example and the
# first paper id listed as a positive for it.
query = contexts_data[val_data[0]['context_id']]['masked_text']
print(f'Query: {query}')

true_id = val_data[0]['positive_ids'][0]
print(f'True id: {true_id}')

## Create the initial documents for docsearch using just the papers (title + abstract)

In [None]:
# Ids of every paper in papers_data, in iteration order.
papers_key = list(papers_data)

# One Document per paper: the text is the title and the abstract joined by a
# newline, and the paper id is stored under the 'source' metadata key.
papers = [
    Document(
        page_content='\n'.join((entry['title'], entry['abstract'])),
        metadata={'source': paper_id},
    )
    for paper_id, entry in papers_data.items()
]

# Chunk the documents with the shared splitter.
papers_texts = text_splitter.split_documents(papers)

## Create Pinecone index

In [None]:
# Settings for the new Pinecone index: cosine similarity over 1536-dimensional
# vectors on a single s1.x1 pod with one replica.
# NOTE(review): 1536 presumably matches the output size of the OpenAI
# embedding model configured earlier -- confirm if the model changes.
index_spec = dict(
    name=index_name,
    dimension=1536,
    metric="cosine",
    pods=1,
    replicas=1,
    pod_type="s1.x1",
)
pinecone.create_index(**index_spec)

## Retrain docsearch using train data and evaluate recall on test data

In [None]:
# --- Fold the training contexts into the document set -----------------------
# For each training example, the citing context (masked text) is attached to
# its first positive paper:
#   * paper id present in papers_key -> the context is appended to that
#     paper's existing page_content;
#   * otherwise -> the context becomes a new Document whose 'source' is the
#     positive id (one new Document per such example).
# A set/dict lookup replaces the per-example scan over every paper.
known_ids = set(papers_key)
paper_by_id = {
    doc.metadata['source']: doc
    for doc in papers
    if doc.metadata['source'] in known_ids
}

for example in train_data:
    context_id = example['context_id']
    positive_id = example['positive_ids'][0]
    context_text = contexts_data[context_id]['masked_text']
    target = paper_by_id.get(positive_id)
    if target is None:
        papers.append(
            Document(page_content=context_text, metadata={'source': positive_id})
        )
    else:
        target.page_content = target.page_content + '\n' + context_text

papers_texts = text_splitter.split_documents(papers)

# --- Index the chunks and build the retrieval pipeline ----------------------
# The vector store returns 100 candidates per query, which are then reranked.
docsearch = Pinecone.from_texts(
    texts=[t.page_content for t in papers_texts],
    embedding=embeddings,
    metadatas=[t.metadata for t in papers_texts],
    index_name=index_name,
).as_retriever(search_kwargs={"k": 100})

# Ask the reranker for top_k results explicitly. Left at its default, the
# reranker returns fewer documents than top_k, so the metric printed below
# would not really be recall@top_k.
compressor = CohereRerank(top_n=top_k)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=docsearch)

print('Trained docsearch')

# --- Evaluate recall@top_k on the first 100 test examples -------------------
hit_list = []

for example in test_data[:100]:
    context_id = example['context_id']
    positive_id = example['positive_ids'][0]
    query = contexts_data[context_id]['masked_text']
    compressed_docs = compression_retriever.get_relevant_documents(query)
    # Keep exactly the first top_k candidates; slicing already copes with
    # shorter result lists. (The previous [0:9] slice kept only nine.)
    candidate_ids = [doc.metadata['source'] for doc in compressed_docs[:top_k]]
    hit_list.append(positive_id in candidate_ids)


print("The average recall@%d: %.4f" % (top_k, np.mean(hit_list)))

## Delete Pinecone index

In [None]:
# Remove the Pinecone index named by index_name.
# NOTE(review): presumably run last to free the pod once evaluation is done.
pinecone.delete_index(index_name)