# Information Retrieval (IR)

In [None]:
from datasets import load_dataset, DatasetDict
import sentence_transformers
import sentence_transformers.cross_encoder.evaluation
from sentence_transformers import SentenceTransformer, CrossEncoder, InputExample # High-level sentence encoders.
import sentence_transformers.models as models
import sentence_transformers.losses as losses
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm # Enables progress bars
import pandas as pd
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
QUICK_RUN = False

In [None]:
queries = load_dataset("BeIR/scidocs", "queries", split="queries")
docs = load_dataset("BeIR/scidocs", "corpus", split="corpus")
qrels = load_dataset("BeIR/scidocs-qrels", delimiter="\t",split="test")
len(queries), len(docs), len(qrels), len(set(qrels["query-id"])),
len(set(qrels["corpus-id"]))

In [None]:
queries, docs, qrels

In [None]:
if QUICK_RUN:
	queries = queries.select(range(100))
	docs = docs.select(range(2500))
	qrels = qrels.filter(lambda x: x["query-id"] in queries["_id"] and x["corpus-id"] in docs["_id"])

In [None]:
train_testvalid = qrels.train_test_split(test_size=0.1, seed=1)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=1)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({'train': train_testvalid['train'],
'test': test_valid['test'],
'valid': test_valid['train']})
train_test_valid_dataset

In [None]:
def get_triple_for_example(example):
	q = queries[queries["_id"].index(example["query-id"])]["text"]
	d = docs[docs["_id"].index(example["corpus-id"])]["title"]
	r = example["score"]
	return q, d, r
ex0 = get_triple_for_example(train_test_valid_dataset["test"][0])
ex1 = get_triple_for_example(train_test_valid_dataset["test"][1])
ex0, ex1

In [None]:
from collections import Counter
from scipy import stats
# From Huggingface Evaluate
def label_dist(data):
	"""Returns the fraction of each label present in the data"""
	c = Counter(data)
	label_distribution = {"labels": [k for k in c.keys()], "fractions":[f / len(data) for f in c.values()]}
	if isinstance(data[0], str):
		label2id = {label: id for id, label in enumerate(label_distribution["labels"])}
		data = [label2id[d] for d in data]
	skew = stats.skew(data)
	return {"label_distribution": label_distribution, "label_skew": skew}
label_dist(data=train_test_valid_dataset["train"]["score"]),
label_dist(data=train_test_valid_dataset["valid"]["score"]),
label_dist(data=train_test_valid_dataset["test"]["score"])

In [None]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

In [None]:
docs.map(lambda x: {"title_text": x["title"] + ": " + x["text"]})["title_text"][:2]

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
import plotly

docs_for_analysis = docs.map(lambda x: {"title_text": x["title"] + ": "+ x["text"]})["title_text"]
topic_model = BERTopic(embedding_model=model_name,
ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True))
topic_model.fit(docs_for_analysis)
topic_model.get_topic_info().head()

In [None]:
topic_model.reduce_topics(docs_for_analysis, nr_topics=15)
fig = topic_model.visualize_documents(docs_for_analysis)
plotly.offline.plot(fig, filename='bertopic_doc_embeddings.html')

In [None]:
from IPython.display import IFrame
IFrame(src='bertopic_doc_embeddings.html', width=1200, height=800)

In [None]:
from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification

In [None]:
ex_tokenizer = AutoTokenizer.from_pretrained(model_name)
ex_model = AutoModel.from_pretrained(model_name)
ex_model_with_head = AutoModelForSequenceClassification.from_pretrained(model_name) # Needsfine-tuning, here for demonstration
test_sentences = ["This is the first sentence with complex tokens, such as SentenceTransformers.", "We can batch multiple sentences."]
ex_tokenized = ex_tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True) # Collates data with padding
ex_res = ex_model(**ex_tokenized)
ex_res_with_head = ex_model_with_head(**ex_tokenized)
print("\nTokenized text:") # Word Piece Tokenization
print(ex_tokenizer.tokenize(test_sentences))
print("\nToken IDs:")
print(ex_tokenized)
print("\nOutput Dictionary:")
print(ex_res.keys())
print("\nOutput Size:")
print(ex_res.last_hidden_state.size())
print("\nContextualized Token Embeddings (truncated):")
print(ex_res.last_hidden_state[:, :3, :7]) # First 3 tokens
print("\nPooled Embeddings (truncated):")
print(ex_res.pooler_output.shape, ex_res.pooler_output[:, :7])
print("\nPredicted Values (not fine-tuning)")
print(ex_res_with_head)

In [None]:
topic_model.embedding_model.embedding_model

In [None]:
topic_model.embedding_model.embedding_model[0]._modules["auto_model"]

In [None]:
from collections import defaultdict
class IRDataset(Dataset):
	def __init__(self, queries_ds, docs_ds, qrel_ds, mode="cross"):
		self.mode = mode
		qrels = defaultdict(set)
		def transform(x):
			q, d, r = x["query-id"], x["corpus-id"], x["score"]
			q_idx = queries_ds["_id"].index(q)
			x["query_text"] = queries_ds[q_idx]["text"]
			d_idx = docs_ds["_id"].index(d)
			x["doc_content"] = docs_ds[d_idx]["title"] + ": " +docs_ds[d_idx]["text"]
			x["label"] = float(r)
			if r:
				qrels[q].add(d)
			return x
		qrel_ds = qrel_ds.map(transform)
		self.q_ids = qrel_ds["query-id"]
		self.d_ids = qrel_ds["corpus-id"]
		self.qrels = qrels
		self.queries = qrel_ds["query_text"]
		self.docs = qrel_ds["doc_content"]
		self.labels = qrel_ds["label"]
	def __getitem__(self, idx):
		qs = self.queries[idx]
		ds = self.docs[idx]
		if self.mode == "rep":
			if type(idx) is int:
				text_list = [{"query": qs}, {"doc": ds}]
			else:
				text_list = [[{"query": q} for q in qs], [{"doc": d}for d in ds]]
				return InputExample(texts=text_list,label=self.labels[idx])
		return InputExample(texts=[qs, ds], label=self.labels[idx])
	def set_mode(self, mode):
		self.mode = mode
	def __len__(self):
		return len(self.labels)

In [None]:
train_ds = IRDataset(queries, docs, train_test_valid_dataset["train"])
valid_ds = IRDataset(queries, docs, train_test_valid_dataset["valid"])
train_ds[0].__dict__

In [None]:
monoBERT = CrossEncoder(
    model_name, 
	num_labels=1, # Perform binary classification
	device="mps", # Will use CUDA if available
)

In [None]:
monoBERT.predict([ex0[:2], ex1[:2]])

In [None]:
print(train_ds[0])

In [None]:
train_dl = DataLoader(train_ds, batch_size=32)
# We need sentence pairs format for the library here.
# valid_dl = DataLoader(valid_ds, batch_size=32)
sentence_pairs = list(zip(valid_ds.queries, valid_ds.docs))
labels = valid_ds.labels
len(train_dl)

In [None]:
monoBERT.__dict__.keys()

In [None]:
class_evaluator = sentence_transformers.cross_encoder.evaluation.CEBinaryClassificationEvaluator(sentence_pairs, labels, show_progress_bar=True)
monoBERT.fit(train_dataloader=train_dl,
	loss_fct=None, # uses nn.BCEWithLogitsLoss()
	evaluator=class_evaluator,
	epochs=10,
	optimizer_class=torch.optim.AdamW,
	show_progress_bar=True,
	save_best_model=True,
	output_path="./",
)

In [None]:
monoBERT.model

In [None]:
monoBERT.predict([ex0[:2], ex1[:2]])

File not provided for the below cell

In [None]:
df = pd.read_csv("CEBinaryClassificationEvaluator_results.csv")
df.tail(n=10)

In [None]:
df.set_index("epoch").drop(columns=["steps"]).plot()
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

In [None]:
repBased = SentenceTransformer(model_name)

In [None]:
qs, ds = repBased.encode([{"query": ex0[0]}, {"query": ex1[0]}]),
repBased.encode([{"doc": ex0[1]}, {"doc": ex1[0]}])
sentence_transformers.util.cos_sim(qs, ds)

In [None]:
train_ds.set_mode("rep")
valid_ds.set_mode("rep")
train_dl_repBased = DataLoader(train_ds, batch_size=32,
collate_fn=repBased.smart_batching_collate)
valid_dl_repBased = DataLoader(valid_ds, batch_size=32,
collate_fn=repBased.smart_batching_collate)
assert next(iter(train_dl_repBased))
queries_dict = dict(zip(valid_ds.q_ids, valid_ds.queries))
docs_dict = dict(zip(valid_ds.d_ids, valid_ds.docs))
qrels_dict = valid_ds.qrels

In [None]:
ir_evaluator = sentence_transformers.evaluation.InformationRetrievalEvaluator(queries_dict, docs_dict, qrels_dict, write_csv=True)
repBased.fit(
    train_objectives=[(train_dl_repBased,losses.CosineSimilarityLoss(repBased))],
	evaluator=ir_evaluator,
	epochs=10,
	optimizer_class=torch.optim.AdamW,
	show_progress_bar=True,
	save_best_model=True,
	output_path="./",
)

In [None]:
qs, ds = repBased.encode([{"query": ex0[0]}, {"query": ex1[0]}]),
repBased.encode([{"doc": ex0[1]}, {"doc": ex1[0]}])
sentence_transformers.util.cos_sim(qs, ds)

file not provided

In [None]:
df = pd.read_csv("eval/Information-Retrieval_evaluation_results.csv")
df.tail(n=10)

In [None]:
df.set_index("epoch").drop(columns=["steps"]).plot(legend=False)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=3)

# Prompt Engineering

In [18]:
import openai
import os
import IPython
from langchain.llms import OpenAI
from dotenv import load_dotenv

In [21]:
load_dotenv()
# API configuration
client = OpenAI(
	api_key=os.environ['OPENAI_API_KEY'],
)
# for LangChain
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["SERP_API_KEY"] = os.getenv("SERP_API_KEY")

In [19]:
def set_open_params(
	model="text-davinci-003",
	temperature=0.7,
	max_tokens=256,
	top_p=1,
	frequency_penalty=0,
	presence_penalty=0,
):
	""" set openai parameters"""
	openai_params = {}
	openai_params['model'] = model
	openai_params['temperature'] = temperature
	openai_params['max_tokens'] = max_tokens
	openai_params['top_p'] = top_p
	openai_params['frequency_penalty'] = frequency_penalty
	openai_params['presence_penalty'] = presence_penalty
	return openai_params

In [23]:
def get_completion(params, prompt):
	""" GET completion from openai api"""
	response = openai.Completion.create(
	engine = params['model'],
	prompt = prompt,
	temperature = params['temperature'],
	max_tokens = params['max_tokens'],
	top_p = params['top_p'],
	frequency_penalty = params['frequency_penalty'],
	presence_penalty = params['presence_penalty'],
	)
	return response

In [24]:
params = set_open_params()
prompt = "The sky is"
response = get_completion(params, prompt)

APIRemovedInV1: 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
