In [None]:
from datasets import load_dataset
from envyaml import EnvYAML

from common import get_vector_store

In [None]:
from langchain.llms import HuggingFaceHub

In [None]:
# Load settings from ../env.yaml, handing ../.env to EnvYAML as the env file.
env = EnvYAML(
    "../env.yaml",
    env_file="../.env",
)

# Build the vector store through the project helper in `common`.
vector_store = get_vector_store(env)

In [None]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_pinecone import PineconeEmbeddings

# Embedding client configured from env.yaml. Documents and queries are sent
# with different `input_type` values ("passage" vs. "query").
pinecone_embeddings = PineconeEmbeddings(
    model=env["pinecone.embeddings_model"],
    pinecone_api_key=env["pinecone.api_key"],
    document_params={"input_type": "passage"},
    query_params={"input_type": "query"},
    batch_size=1,
)

# Creating the vector store needs PINECONE_API_KEY in the process environment.
# setdefault leaves an already-exported value untouched.
os.environ.setdefault("PINECONE_API_KEY", env["pinecone.api_key"])

# NOTE(review): this rebinds `vector_store`, which the earlier
# get_vector_store(env) cell also assigns - confirm which one is intended.
vector_store = PineconeVectorStore.from_existing_index(
    index_name=env["pinecone.index_name"],
    embedding=pinecone_embeddings,
)

In [None]:
from langchain.llms import HuggingFaceEndpoint, HuggingFaceHub

# LLM client for the microsoft/Phi-3-mini-128k-instruct repo. The generation
# settings are passed as explicit keyword arguments.
llm = HuggingFaceEndpoint(
    repo_id="microsoft/Phi-3-mini-128k-instruct",
    huggingfacehub_api_token=env["hugging_face.api_key"],
    temperature=0.7,
    max_new_tokens=100,
)

# llm = HuggingFaceHub(
#     repo_id="microsoft/Phi-3-mini-128k-instruct",
#     model_kwargs={"temperature": 0.7, "max_new_tokens": 100},
#     huggingfacehub_api_token=env["hugging_face.api_key"],
# )

In [None]:
# Smoke test: send one prompt through `llm`; the returned generation result is
# the cell's displayed output.
llm.generate(["Who are you?"])

In [None]:
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableConfig
from langchain_core.documents import Document
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.combine_documents.reduce import ReduceDocumentsChain
from langchain_core.prompts import BasePromptTemplate
from langchain_core.language_models import BaseLanguageModel


def shorten_string(text: str, llm: BaseLanguageModel, max_tokens: int) -> str:
    """Cut ``text`` down to approximately ``max_tokens`` tokens.

    The cut position is an estimate: the text is counted once with
    ``llm.get_num_tokens`` and the character length is scaled by
    ``max_tokens / token_count``. The result can therefore land slightly
    above or below the requested budget.

    Args:
        text: The string to shorten.
        llm: Object whose ``get_num_tokens(text)`` returns the token count.
        max_tokens: Target token budget. Zero or negative yields ``""`` for
            any text that has at least one token.

    Returns:
        ``text`` unchanged when it already fits the budget (including empty
        text), otherwise a prefix of ``text``.
    """
    token_length = llm.get_num_tokens(text)
    # Nothing to trim. This also covers a token count of 0 (e.g. empty text),
    # which would otherwise divide by zero below.
    if token_length <= max(max_tokens, 0):
        return text
    char_length = len(text)
    new_length = int(char_length * max_tokens / token_length)
    # A negative budget must not become a negative slice bound, which would
    # keep most of the text instead of none of it.
    return text[: max(new_length, 0)]


class ShortenDocuments:
    """Callable step that trims every document under ``input["context"]``.

    Instances are called with the chain's input dict. Each document's
    ``page_content`` is overwritten in place with the result of
    ``shorten_string`` and the same dict object is returned, so callers that
    hold references to the documents see the shortened text too.

    Args:
        llm: Model used to count tokens when shortening.
        max_tokens: Token budget applied to each document separately.
    """

    def __init__(self, llm: BaseLanguageModel, max_tokens: int = 500):
        self.max_tokens = max_tokens
        self.llm = llm

    def __call__(self, input: dict):
        """Shorten all documents in ``input["context"]`` and return ``input``."""
        documents: List[Document] = input["context"]
        for document in documents:
            # Count tokens with the model bound to this instance, not with
            # whatever `llm` happens to exist at notebook scope.
            document.page_content = shorten_string(
                document.page_content, self.llm, max_tokens=self.max_tokens
            )
        input["context"] = documents
        return input


def create_stuff_shortened_documents_chain(llm: BaseLanguageModel, prompt: BasePromptTemplate, max_tokens: int):
    """Build a stuff-documents chain preceded by a document-shortening step.

    Args:
        llm: Model handed both to the shortening step and to the stuff chain.
        prompt: Prompt template forwarded to ``create_stuff_documents_chain``.
        max_tokens: Per-document token budget for ``ShortenDocuments``.

    Returns:
        The result of piping the ``ShortenDocuments`` instance into the stuff
        chain with ``|``.
    """
    # The left operand is a plain callable object, so the pipe is presumably
    # resolved by the right operand - confirm against the LangChain version in use.
    shortener = ShortenDocuments(llm, max_tokens=max_tokens)
    stuff_chain = create_stuff_documents_chain(llm, prompt)
    return shortener | stuff_chain


# System instruction. `{context}` is the placeholder the prompt template
# exposes for the retrieved documents.
system_prompt = "".join(
    (
        "Use the given context to answer the question about the book. ",
        "If you don't know the answer, say you don't know. ",
        "Use three sentence maximum and keep the answer concise. ",
        "Context: {context}",
    )
)

# Two-message chat prompt: the system instruction, then the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

query = "What is the theme of Pride and Prejudice?"

# k=1: ask the vector store for one document per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 1})

# Answering step with a 100-token budget per retrieved document.
answer_chain = create_stuff_shortened_documents_chain(
    llm,
    prompt,
    max_tokens=100,
)

# Retrieval step followed by the answering step.
chain = create_retrieval_chain(retriever, answer_chain)

# The last expression is the cell's displayed output: only the answer text.
chain.invoke({"input": query})["answer"]

In [None]:
print('Use the given context to answer the question. If you don\'t know the answer, say you don\'t know. Use three sentence maximum and keep the answer concise. Context: The Project Gutenberg eBook, Addison, by William John Courthope\n\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\n\n\n\nTitle: Addison\n\n\nAuthor: William John Courthope\n\n\n\nRelease Date: November 27, 2012  [eBook #41496]\n\nLanguage: English\n\nCharacter set encoding: ISO-8859-1\n\n\n***START OF THE PROJECT GUTENBERG EBOOK ADDISON***\n\n\nE-text prepared by the Online Distributed Proofreading Team\n(http://www.pgdp.net) from page images generously made available by\nInternet Archive (http://archive.org)\n\n\n\nNote: Images of the original pages are available through\n      Internet Archive. See\n      http://archive.org/details/addison_00cour\n\n\nTranscriber\'s note:\n\n      Text enclosed by underscores is in italics (_italics_).\n\n      Text enclosed by curly brackets is superscripted\n      (example: y{e}).\n\n\n\n\n\nEnglish Men of Letters\n\nEdited by John Morley\n\nADDISON\n\nby\n\nW. J. COURTHOPE\n\n\n\n\n\n\n\nHarper & Brothers Publishers\nNew York and London\n1902\n\n     *       *       *       *       *\n\nENGLISH MEN OF LETTERS.\n\nEDITED BY JOHN MORLEY.\n\n  JOHNSON                Leslie Stephen.\n  GIBBON                 J. C. Morison.\n  SCOTT                  R. H. Hutton.\n  SHELLEY                J. A. Symonds.\n  HUME                   T. H. Huxley.\n  GOLDSMITH              William Black.\n  DEFOE                  William Minto.\n  BURNS                  J. C. Shairp.\n  SPENSER                R. W. 
Church.\n  THACKERAY              Anthony Trollope.\n  BURKE                  John Morley.\n  MILTON                 Mark Pattison.\n  HAWTHORNE              Henry James, Jr.\n  SOUTHEY                E. Dowden.\n  CHAUCER         \n\nThe Project Gutenberg eBook, Addison, by William John Courthope\n\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\n\n\n\nTitle: Addison\n\n\nAuthor: William John Courthope\n\n\n\nRelease Date: November 27, 2012  [eBook #41496]\n\nLanguage: English\n\nCharacter set encoding: ISO-8859-1\n\n\n***START OF THE PROJECT GUTENBERG EBOOK ADDISON***\n\n\nE-text prepared by the Online Distributed Proofreading Team\n(http://www.pgdp.net) from page images generously made available by\nInternet Archive (http://archive.org)\n\n\n\nNote: Images of the original pages are available through\n      Internet Archive. See\n      http://archive.org/details/addison_00cour\n\n\nTranscriber\'s note:\n\n      Text enclosed by underscores is in italics (_italics_).\n\n      Text enclosed by curly brackets is superscripted\n      (example: y{e}).\n\n\n\n\n\nEnglish Men of Letters\n\nEdited by John Morley\n\nADDISON\n\nby\n\nW. J. COURTHOPE\n\n\n\n\n\n\n\nHarper & Brothers Publishers\nNew York and London\n1902\n\n     *       *       *       *       *\n\nENGLISH MEN OF LETTERS.\n\nEDITED BY JOHN MORLEY.\n\n  JOHNSON                Leslie Stephen.\n  GIBBON                 J. C. Morison.\n  SCOTT                  R. H. Hutton.\n  SHELLEY                J. A. Symonds.\n  HUME                   T. H. Huxley.\n  GOLDSMITH              William Black.\n  DEFOE                  William Minto.\n  BURNS                  J. C. Shairp.\n  SPENSER                R. W. 
Church.\n  THACKERAY              Anthony Trollope.\n  BURKE                  John Morley.\n  MILTON                 Mark Pattison.\n  HAWTHORNE              Henry James, Jr.\n  SOUTHEY                E. Dowden.\n  CHAUCER         \n\n\ufeffThe Project Gutenberg EBook of The Reverberator, by Henry James\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: The Reverberator\n\nAuthor: Henry James\n\nRelease Date: February, 2005 [EBook #7529]\nPosting Date: July 25, 2009\nLast Updated: September 18, 2016\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\n*** START OF THIS PROJECT GUTENBERG EBOOK THE REVERBERATOR ***\n\n\n\n\nProduced by Eve Sobol\n\n\n\n\n\nTHE REVERBERATOR\n\n\nBy Henry James\n\n\n\n\nI\n\n“I guess my daughter’s in here,” the old man said leading the way into\nthe little salon de lecture. He was not of the most advanced age, but\nthat is the way George Flack considered him, and indeed he looked older\nthan he was. George Flack had found him sitting in the court of the\nhotel--he sat a great deal in the court of the hotel--and had gone up to\nhim with characteristic directness and asked him for Miss Francina. Poor\nMr. Dosson had with the greatest docility disposed himself to wait\non the young man: he had as a matter of course risen and made his way\nacross the court to announce to his child that she had a visitor. He\nlooked submissive, almost servile, as he preceded the visitor, thrusting\nhis head forward in his quest; but it was not in Mr. Flack’s line to\nnotice that sort of thing. He accepted the old gentleman’s good offices\nas he would have accepted those of a waiter, conveying no hint of an\nattention paid also to himself. An observer of these two persons would\nhave assured himself that the degree to which Mr. 
Dosson thought it\nnatural any one should want to see his daughter was \n\nThe Project Gutenberg EBook of The Brothers-In-Law: A Tale Of The\nEquatorial Islands; and The Brass Gun Of The Buccaneers, by Louis Becke\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: The Brothers-In-Law: A Tale Of The Equatorial Islands; and The Brass Gun Of The Buccaneers\n       1901\n\nAuthor: Louis Becke\n\nRelease Date: April 12, 2008 [EBook #25056]\n\nLanguage: English\n\nCharacter set encoding: ISO-8859-1\n\n*** START OF THIS PROJECT GUTENBERG EBOOK THE BROTHERS-IN-LAW ***\n\n\n\n\nProduced by David Widger\n\n\n\n\n\nTHE BROTHERS-IN-LAW: A TALE OF THE EQUATORIAL ISLANDS, and THE BRASS GUN OF THE BUCCANEERS\n\nFrom "The Tapu Of Banderah and Other Stories"\n\nBy Louis Becke\n\nC. Arthur Pearson Ltd.\n\n1901\n\n\n\n\nTHE BROTHERS-IN-LAW: A TALE OF THE EQUATORIAL ISLANDS\n\n"There," said Tâvita the teacher, pointing with his paddle to a long,\nnarrow peninsula which stretched out into the shallow waters of the\nlagoon, "there, that is the place where the battle was fought. In those\ndays a village of thirty houses or more stood there; now no one liveth\nthere, and only sometimes do the people come here to gather cocoanuts."\n\nThe White Man nodded. "\'Tis a fair place to look upon. 
Let us land and\nrest awhile, for the sun is hot."\n\nThe native pastor swung the bow of the canoe round towards the shore,\nand presently the little craft glided gently upon the hard, white sand,\nand the two men got out, walked up to the grove of cocoa-palms, and sat\ndown under their shade to rest and smoke until the sun lost some of its\nfierce intensity and they could proceed on their journey homeward to the\nprincipal village.\n\nThe White Man was the one trader living in Peru,{*} the native was a\nSamoan, and one of the oldest and bra')

In [None]:
# IPython help lookup (trailing `?`) for create_retrieval_chain. This is
# notebook-only syntax and is not valid plain Python.
create_retrieval_chain?

In [None]:
# IPython help lookup (trailing `?`) for HuggingFaceEndpoint. This is
# notebook-only syntax and is not valid plain Python.
HuggingFaceEndpoint?

In [None]:
from langchain.llms import HuggingFaceEndpoint
# `temperature` and `max_new_tokens` are declared fields of
# HuggingFaceEndpoint, so they are passed as explicit keyword arguments.
# Supplying them through `model_kwargs` is rejected when the object is validated.
HuggingFaceEndpoint(
    repo_id="microsoft/Phi-3-mini-128k-instruct",
    temperature=0.7,
    max_new_tokens=100,
    huggingfacehub_api_token=env["hugging_face.api_key"],
)

In [None]:
# Fetch a document explicitly so this cell runs on a fresh kernel. No
# notebook-level `document` variable is defined anywhere else.
document = retriever.invoke(query)[0]
llm.get_token_ids(document.page_content)

In [None]:
from langchain_core.messages import HumanMessage

# get_num_tokens_from_messages takes a list of message objects. A bare
# ("human", "{}") tuple would be iterated as two plain strings.
llm.get_num_tokens_from_messages([HumanMessage(content="{}")])
# document.page_content = document.page_content[:100]
# document

In [None]:
# Run the retriever on its own with a default RunnableConfig to inspect which
# documents come back for `query`.
retriever.invoke(query, config=RunnableConfig())

In [None]:
# IPython help lookup (trailing `?`) for llm.get_num_tokens. This is
# notebook-only syntax and is not valid plain Python.
llm.get_num_tokens?