In [22]:
import torch
import numpy as np
import spacy

from transformers import AutoTokenizer, GPTNeoForCausalLM


class ConditionalEntropyRanker:
    """Score a passage by a causal LM's predictive entropy over it, conditioned on a query.

    The query and the passage are joined with a newline and run through the language
    model once. The Shannon entropy (in bits) of the next-token distribution is computed
    at every position, summed per word, and (in ``get_relevance_score``) averaged over
    the words of the passage.
    """

    def __init__(self, model=None, tokenizer=None, device=None):
        """Set up the model, tokenizer and device.

        Args:
            model: Causal language model to use. When ``None``, the
                ``EleutherAI/gpt-neo-1.3B`` checkpoint is loaded.
            tokenizer: Tokenizer matching ``model``. When ``None``, the
                ``EleutherAI/gpt-neo-1.3B`` tokenizer is loaded. The methods below call
                ``word_ids()`` / ``word_to_tokens()`` on its output and read its
                ``bos_token``.
            device: Device to run on. When falsy, CUDA is used if available, else CPU.
        """
        self.device = (
            device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        )
        self.tokenizer = (
            AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
            if tokenizer is None
            else tokenizer
        )
        self.model = (
            GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B") if model is None else model
        )
        self.model.to(self.device)
        # Scoring must be deterministic: make sure dropout is disabled even when the
        # caller hands in a model that was left in training mode.
        self.model.eval()
        # NOTE(review): ``self.nlp`` is not used by any method of this class; it is kept
        # only so that existing code reading the attribute keeps working.
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.max_length = 10000000

    def get_output_entropy(
        self,
        prompt: str,
        output: str,
        exclude_start_words_count=0,
    ):
        """Return ``(word, entropy)`` pairs for the words of ``output`` given ``prompt``.

        Args:
            prompt: Conditioning text (e.g. the query).
            output: Text whose words are scored (e.g. the passage).
            exclude_start_words_count: Forwarded to ``_get_text_entropy``, which
                currently ignores it; the value has no effect on the result.

        Returns:
            A list of ``(word, entropy_in_bits)`` tuples. A word's entropy is the sum of
            the entropies of its tokens; ``'s`` / ``'t`` are merged into the preceding
            word.
        """
        text = prompt + "\n" + output
        normalized_text = ConditionalEntropyRanker._normalize_text(text)
        normalized_prompt = ConditionalEntropyRanker._normalize_text(prompt)
        tokenized_text = self._tokenize_text(normalized_text)
        tokenized_prompt = self._tokenize_text(normalized_prompt)
        prompt_word_spans = self._map_words_to_token_indices(tokenized_prompt)

        words_to_tokens_indices = self._map_words_to_token_indices(tokenized_text)
        entropy, relevant_tokens, _ = self._get_tokens_entropy(tokenized_text)
        decoded = self.tokenizer.batch_decode(relevant_tokens)
        original_words = ConditionalEntropyRanker._get_original_words(
            decoded, words_to_tokens_indices
        )

        original_words, entropy = self._get_text_entropy(
            original_words=original_words,
            entropy=entropy,
            words_to_tokens_indices=words_to_tokens_indices,
            exclude_start_words_count=exclude_start_words_count,
        )

        # Skip the prompt's words so that only words of ``output`` are reported.
        # NOTE(review): the extra ``+ 1`` presumably skips the newline separator that
        # joins prompt and output - TODO confirm for the tokenizer in use.
        first_output_word = len(prompt_word_spans) + 1
        relevant_words = original_words[first_output_word:]
        relevant_entropy = entropy[first_output_word:]

        # Attach "'s" / "'t" to the word before it.
        (
            relevant_words,
            relevant_entropy,
        ) = ConditionalEntropyRanker._concat_apostrophes_to_previous_word(
            relevant_words, relevant_entropy
        )
        return list(zip(relevant_words, relevant_entropy))

    def get_relevance_score(self, query: str, passage: str, exclude_start_words_count=0):
        """Return the mean per-word entropy (bits) of ``passage`` conditioned on ``query``.

        Args:
            query: Conditioning text.
            passage: Text to score.
            exclude_start_words_count: Forwarded to ``get_output_entropy``; currently
                has no effect.

        Raises:
            ValueError: If no passage words are left to score (for example an empty
                passage, or an input so long that tokenizer truncation cut the passage
                off entirely).
        """
        output = self.get_output_entropy(
            prompt=query, output=passage, exclude_start_words_count=exclude_start_words_count
        )
        entropy = [word_entropy for _, word_entropy in output]
        if not entropy:
            # Previously this surfaced as an unexplained ZeroDivisionError.
            raise ValueError(
                "Cannot compute a relevance score: no passage words were left to score "
                "(empty passage, or the input was truncated before the passage)."
            )
        return sum(entropy) / len(entropy)

    def _get_text_entropy(
        self,
        original_words: list[str],
        entropy: torch.Tensor,
        words_to_tokens_indices: list[tuple[int, int]],
        exclude_start_words_count=0,
    ):
        """Aggregate per-token entropies into per-word entropies.

        ``exclude_start_words_count`` is accepted for interface compatibility but is not
        used.

        Returns:
            ``(original_words, per_word_entropy)`` where ``per_word_entropy`` is a list
            of floats aligned with ``words_to_tokens_indices``.
        """
        entropy = ConditionalEntropyRanker._get_entropy_of_original_words(
            original_words, entropy, words_to_tokens_indices
        )
        return original_words, entropy

    @staticmethod
    def _get_original_words(
        decoded: list[str], words_to_tokens_indices: list[tuple[int, int]]
    ) -> list[str]:
        """Rebuild each word by joining its decoded tokens and stripping whitespace."""
        return ["".join(decoded[start:end]).strip() for start, end in words_to_tokens_indices]

    @torch.no_grad()
    def _get_tokens_entropy(
        self, tokenized_text
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run the model and compute the entropy of each next-token distribution.

        Args:
            tokenized_text: Tokenizer output (a mapping of tensors, including
                ``"input_ids"``) as produced by ``_tokenize_text``.

        Returns:
            ``(entropy, relevant_tokens, logits)``: the per-position entropies, the input
            ids of the first sequence without its leading BOS token, and the raw logits.
        """
        # Move every input tensor to the model's device.
        tokenized_text = {k: v.to(self.device) for k, v in tokenized_text.items()}
        logits = self.model(**tokenized_text).logits

        # Drop the BOS token so that relevant_tokens[i] is the token predicted by the
        # distribution whose entropy is entropy[i].
        relevant_tokens = tokenized_text["input_ids"][0][1:]

        entropy = ConditionalEntropyRanker._calculate_entropy_from_logits(logits)

        return entropy, relevant_tokens, logits

    def _tokenize_text(self, text: str):
        """Tokenize ``text`` with the tokenizer's BOS token prepended.

        Truncation is enabled, so text beyond the tokenizer's maximum length is dropped.
        """
        return self.tokenizer(
            self.tokenizer.bos_token + text,
            return_tensors="pt",
            truncation=True,
        )

    @staticmethod
    def _concat_apostrophes_to_previous_word(
        token_strings: list[str], entropies: list[float]
    ) -> tuple[list[str], list[float]]:
        """Merge ``'s`` / ``'t`` entries into the preceding word, summing their entropies.

        If the two lists differ in length they are returned unchanged. A leading ``'s`` /
        ``'t`` has no preceding word and is kept as its own entry.
        """
        if len(token_strings) != len(entropies):
            return token_strings, entropies

        new_token_strings = []
        new_entropies = []
        for word, word_entropy in zip(token_strings, entropies):
            # Only merge when there is a previous word to merge into; indexing [-1] on
            # an empty list used to raise IndexError.
            if word in ("'s", "'t") and new_token_strings:
                new_token_strings[-1] += word
                new_entropies[-1] += word_entropy
            else:
                new_token_strings.append(word)
                new_entropies.append(word_entropy)

        return new_token_strings, new_entropies

    @staticmethod
    def _get_entropy_of_original_words(
        original_words: list[str],
        entropy: torch.Tensor,
        words_to_token_indices: list[tuple[int, int]],
    ) -> list[float]:
        """Sum the token entropies inside each ``(start, end)`` span.

        A length mismatch between the words and the spans is reported on stdout but does
        not stop the computation.
        """
        if len(original_words) != len(words_to_token_indices):
            print("Error in mapping entropy to original words")
        return [entropy[start:end].sum().item() for start, end in words_to_token_indices]

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Replace tab characters with single spaces."""
        return text.replace("\t", " ")

    @staticmethod
    def _calculate_entropy_from_logits(logits: torch.Tensor) -> torch.Tensor:
        """Return the Shannon entropy (bits) of each next-token distribution.

        Only the first batch element is used, and the last position is dropped because
        its prediction has no following token to be compared with.

        The entropy is computed from ``log_softmax`` rather than ``log2(softmax)``:
        a probability that underflows to exactly 0 would otherwise give
        ``0 * -inf = NaN`` and turn the whole position's entropy into NaN.
        """
        import math

        # Upcast so half-precision logits do not lose accuracy in the vocabulary sum.
        log_probs = torch.log_softmax(logits[0][:-1].float(), dim=-1)
        probs = log_probs.exp()
        # Entropy in nats, converted to bits.
        entropy = -(probs * log_probs).sum(dim=-1) / math.log(2.0)

        # Safety net: if the logits themselves were non-finite, fall back to 0.
        entropy[torch.isnan(entropy)] = 0

        return entropy

    @staticmethod
    def _map_words_to_token_indices(encoded) -> list[tuple[int, int]]:
        """Return one ``(start, end)`` token span per word, shifted for the dropped BOS.

        Args:
            encoded: Tokenizer output exposing ``word_ids()`` and ``word_to_tokens()``.

        Returns:
            Spans in order of appearance, each shifted left by one position, with the
            first span (the BOS token) removed.
        """
        desired_output = []
        for word_id in encoded.word_ids():
            if word_id is not None:
                start, end = encoded.word_to_tokens(word_id)
                # Shift by one because the BOS token is removed from the token list
                # that these spans index into.
                tokens = (start - 1, end - 1)
                # Every token of a word yields the same span; record it only once.
                if len(desired_output) == 0 or desired_output[-1] != tokens:
                    desired_output.append(tokens)

        # The first span belongs to the BOS token.
        return desired_output[1:]


In [80]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the phi-1.5 checkpoint and its tokenizer from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True lets the library run Python code shipped with
# the model repository - only keep it for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-1_5",
    trust_remote_code=True,
)

Downloading pytorch_model.bin: 100%|██████████| 2.84G/2.84G [00:53<00:00, 53.0MB/s]
Downloading (…)neration_config.json: 100%|██████████| 69.0/69.0 [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 100%|██████████| 237/237 [00:00<00:00, 236kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 1.50MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.43MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.18MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 1.08k/1.08k [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<?, ?B/s]


In [81]:
# Build the ranker around the already-loaded model/tokenizer pair instead of the
# class's default GPT-Neo checkpoint.
cer = ConditionalEntropyRanker(tokenizer=tokenizer, model=model)


In [83]:
# Show which device the ranker resolved to when it was constructed.
print(cer.device)

cuda


In [104]:
# Query that every candidate document is scored against.
query = "What is the main purpose of alchemy?"

In [117]:
# Candidate passages to score against `query`.
# Entries starting with '#' are disabled candidates. Each one is kept on a single physical
# line so that it stays a pure comment (a wrapped continuation line without '#' is a
# SyntaxError) and can be re-enabled by removing the leading '#'.
documents = [
#     "The Atlantic Ocean is the second largest ocean in the world, with an area of 31,830,000 sq mi (82,440,000 sq km) 12. It covers approximately 17% of Earth’s surface and about 24% of its water surface area 3. The Atlantic Ocean separates North and South America from Europe and Africa 1.",
#     "Various networking Companies use 192.168.0.1 IP address to access their admin page. We solve issues related to this IP address such as 192.168.0.1 login my account issue, 192.168.0.1 the page cannot be displayed etc. Feel free to contact us through live chat window.",
#     "It seems like their main problem is not hiring - clearly they've hired some bright technical people. It just seems like the iPhone and Android came along and management refused to admit that the new platforms were going to compete with the Blackberry and its ecosystem.",
#     "In order to do this, you would need to purchase an Amazon gift card from another online vendor that accepts PayPal and then use the Amazon gift card on the Amazon site. There are dozens (if not hundreds) of sites that sell Amazon gift cards online that accept PayPal.",
#     "So one approach would be purely mathematical: look at whichever has the higher interest rate and pay it first. Another approach is to ignore the math (since the interest savings difference between a mortgage and student loan is likely small anyways) and think about what your goals are. Do you like having a student loan payment? Would you prefer to get rid of it as quickly as possible? How would it feel to cut the balance in HALF in one shot? If it were me, I would pay the student loan as fast as possible. Student loans are not cancellable or bankruptable, and once you get it paid off you can put that payment amount toward your house to get it paid off.",
#     "I used to work for one of the three ratings agencies. Awhile ago. First: There are lots of different ratings. The bulk of ratings are for corporate debt and public finance. So senior debentures (fixed income) and General Obligations e.g. tax-free muni bonds, respectively. Ratings agencies are NOT paid by the investment banks, they are paid by the corporations or city/ state that is issuing debt. The investment banks are the syndicate that pulls the transaction together and brings it to market. For mortgage-backed securities, collateralized debt CDO-CLO's, all of which are fancy structured securitizations, well, that is a different matter! Those transactions are the ones where there is an inappropriately close tie between the investment bankers and ratings agencies. And those were the ratings that blew out and caused problems. Ratings agencies continued to do a decent job with what WAS their traditional business, corporate and municipal bond ratings, as far as I know. What khajja said was 100% correct: S&amp;P's fees were paid by investors, the people who were purchasing the bonds, until about 50 years ago. Around the same time that McGraw-Hill purchased S&amp;P, in 1966, they departed from that model, and started charging the bond issuers for ratings. I don't know if that decision was driven by McGraw-Hill or not, though. One more thing: Not all credit ratings agencies are paid by the issuers. One of the 10 NRSRO's (a designation given by the S.E.C.) is Egan-Jones. Their revenue comes from the investors, bond purchasers, not the companies issuing bonds, unlike the S&amp;P/ Fitch/ Moody's ""business model"". So there is an alternative, which I consider hopeful and reason not to totally despair. EDIT: What xcrunna19 mentions is also totally accurate. The part about Nouriel Roubini (who is a professor at N.Y.U. or Columbia or such and a sensible though slightly high strung sort) is consistent with my impression. As for whether it would require government action to implement the changes advocated by Roubini, yes, I guess it would, but I don't know if the government would do that. It would be better if the credit ratings agencies would find their own way to a different, less conflicted payment-incentive model. Keep in mind too that many of the provisions of Dodd-Frank have removed the existing regulatory requirements for credit ratings on bonds and other securities. This is the scary part though: There isn't anything to replace the credit ratings agencies, not at the moment, as far as I can tell! Eventually the government is supposed to come up with an alternative, but that hasn't happened yet. Which is better: Not requiring ratings at all, or the past situation of sometimes inflated ratings, which imparted a false sense of confidence? I don't know.",
#     "You will be filing the exact same form you've been filing until now (I hope...) which is called form 1040. Attached to it, you'll add a ""Schedule C"" form and ""Schedule SE"" form. Keep in mind the potential effect of the tax and totalization treaties the US has with the UK which may affect your filings. I suggest you talk to a licensed EA/CPA who works with expats in the UK and is familiar with all the issues. There are several prominent offices you can find by Googling.",
#     "First, to answer the question. The benefit of a 401k is that you don't have to pay income tax on the money contributed nor do you pay capital gains tax on the money that accumulates. You get that with the restriction that you can't willy nilly remove and contribute money to the account (and you are taxed on withdrawals, more severely if you do it before you are 65). Similar sorts of restrictions apply to all retirement accounts which give tax benefits. Now, for the 7000 not providing benefit. Assuming a very modest 4% growth, over 40 years 7000 becomes 34,671. Not something to sneeze at (inflation, risk reward, blah, blah, blah, it is less than it looks, but 4% is really pretty low, the stock market averages anywhere from 7-&gt;10% and IIRC the bond market is somewhere around 5%). Now, certainly, to avoid bankruptcy you should withdraw. However, if it is possible, you will be best served by keeping the money in your 401k account. The penalties and lost earning opportunities are pretty significant. /u/BeatArmy99 [has the numbers](http://www.reddit.com/r/finance/comments/2ct0qy/why_cant_i_access_my_401k_if_its_my_money/cjiorl7) for how much you lose by doing an early withdraw. Don't do this lightly and I would suggest avoiding cashing out the whole thing if you can.",
#     "I can only speak to natural gas but I imagine the answer for electricity is the same. In general, yes, it is better to lock into a fixed price contract as in the long run, natural gas prices increase over time. However, if you locked (signed a fixed price contract) in prior to the economic downturn, most likely you were better off not doing so but the key is long-term. http://en.wikipedia.org/wiki/Natural_gas_prices However, do your research as fixed priced contracts vary considerably from company to company. http://www.energyshop.com/ I think it's a good time to sign a fixed-term contract right now as I don't see prices coming down much further with global economies are now recovering from the downturn. HTH",
#     "Name one nation state that has survived more than 20 minutes without taxation. People won't pay if they don't have to, things don't get built if people don't pay. Take a holiday to Somalia if you want to see a libertarian paradise in action.",
# "An alphabet is a standard set of letters (basic written symbols or graphemes) that is used to write one or more languages based upon the general principle that the letters represent phonemes (basic significant sounds) of the spoken language. This is in contrast to other types of writing systems, such as syllabaries (in which each character represents a syllable) and logographies (in which each character represents a word, morpheme, or semantic unit).	",
    "Answer: An android is a humanoid robot or synthetic organism designed to look and act like a human, especially one with a body having a flesh-like resemblance. Historically, androids remained completely within the domain of science fiction where they are frequently seen in film and television. Only recently have advancements in robot technology allowed the design of functional and realistic humanoid robots.",
# """Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist. Einstein developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics).""",
    "Answer: It is a brief statement that contains the most important points of a long legal document or of several related legal papers.",
    """Answer: It aimed to purify, mature, and perfect certain objects. Common aims were chrysopoeia, the transmutation of "base metals" (e.g., lead) into "noble metals" (particularly gold); the creation of an elixir of immortality; the creation of panaceas able to cure any disease; and the development of an alkahest, a universal solvent. The perfection of the human body and soul was thought to permit or result from the alchemical magnum opus and, in the Hellenistic and western tradition, the achievement of gnosis. In Europe, the creation of a philosopher's stone was variously connected with all of these projects.""",
]

In [119]:

# Score every candidate document against the query; each score is the value returned
# by the ranker's get_relevance_score for that (query, document) pair.
scores = [cer.get_relevance_score(query=query, passage=doc) for doc in documents]

print(scores)


[3.010985343131636, 2.8700875712931158, 3.2118490908092623]


In [109]:
# NOTE(review): this cell's execution count (109) is lower than that of the `documents`
# cell (117) and the scoring cell (119), and its recorded output has 15 values -
# presumably from an earlier run with a longer document list. Re-run top to bottom.
print(scores)

[2.088653205374283, 3.5551322285085916, 4.341659794960703, 2.895300578394974, 3.4035164560708733, 4.161020574215866, 4.262144573032856, 3.958267181936525, 3.616004661907657, 3.68802069616504, 2.7226829525576357, 3.100113259080578, 1.9204128443812714, 3.5143199951752373, 3.2988936337987305]


In [108]:
from beir.reranking import Rerank

class YourCustomCEModel:
    """Template for a custom scoring model to hand to a reranker.

    Fill in ``__init__`` to load the model and ``predict`` to score query-document
    pairs.
    """

    def __init__(self, model_path=None, **kwargs):
        """Create the wrapper.

        Args:
            model_path: Location of the custom model; not used until loading is
                implemented.
            **kwargs: Accepted and ignored.
        """
        self.model = None  # ---> HERE Load your custom model

    # Write your own score function, which takes in query-document text pairs and
    # returns the similarity scores.
    def predict(self, sentences: list[tuple[str, str]], batch_size: int, **kwargs) -> list[float]:
        """Score each ``(query, document)`` pair.

        Args:
            sentences: Query-document text pairs to score.
            batch_size: Number of pairs to process per batch.
            **kwargs: Extra options, accepted for caller compatibility.

        Returns:
            One float score per input pair (only the list of scores).

        Raises:
            NotImplementedError: Always, until a real scoring function is written.
                Raising keeps the placeholder from silently returning ``None``.
        """
        raise NotImplementedError(
            "YourCustomCEModel.predict is a template: implement it to return one float "
            "score per (query, document) pair."
        )

# Wrap the custom model in the reranker, scoring 128 query-document pairs per batch.
reranker = Rerank(
    YourCustomCEModel(model_path="your-custom-model-path"),
    batch_size=128,
)

ModuleNotFoundError: No module named 'beir'