# projects 


In [59]:
# Read environment variables from the project's .env file; later cells look up
# PINECONE_API_KEY and PINECONE_ENV through os.environ.
import os

from dotenv import load_dotenv

dotenv_path = './Downloads/llmchatmaking/.env'

# override=True lets values from the file replace variables that are already set.
load_dotenv(dotenv_path, override=True)


True

In [19]:
pip install pypdf -q


Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install docx2txt -q

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [44]:
def load_document(file):
    """Load a document with the LangChain loader that matches its file extension.

    Supported extensions (matched case-insensitively): .pdf, .docx, .csv,
    .html, .json and .md.

    Args:
        file: Path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` method returns.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    import os

    # Extension -> name of the loader class inside ``langchain.document_loaders``.
    loader_names = {
        '.pdf': 'PyPDFLoader',
        '.docx': 'Docx2txtLoader',
        '.csv': 'CSVLoader',
        '.html': 'UnstructuredHTMLLoader',
        # NOTE(review): JSONLoader is still constructed with the path only, as
        # before; it may also require a jq_schema argument — confirm.
        '.json': 'JSONLoader',
        '.md': 'UnstructuredMarkdownLoader',
    }

    _, extension = os.path.splitext(file)
    extension = extension.lower()

    if extension not in loader_names:
        # Fail with a clear message instead of reaching ``loader.load()`` with
        # ``loader`` never assigned (which surfaced as UnboundLocalError).
        supported = ', '.join(sorted(loader_names))
        raise ValueError(
            f'Unsupported document format {extension!r} for {file!r}; '
            f'supported extensions: {supported}'
        )

    # Imported only once a supported extension is known, mirroring the lazy
    # per-branch imports this function used before.
    from langchain import document_loaders

    loader_cls = getattr(document_loaders, loader_names[extension])
    print(f'Loading{file}')
    loader = loader_cls(file)
    return loader.load()

# Loading from Wikipedia

def load_from_wikipedia(query, lang='en', laod_max_docs=1, load_max_docs=None):
    """Load Wikipedia pages matching a query through LangChain's WikipediaLoader.

    Args:
        query: Search text passed to the loader.
        lang: Wikipedia language code passed to the loader.
        laod_max_docs: Maximum number of documents to load. The misspelled name
            is kept so existing keyword callers keep working.
        load_max_docs: Correctly spelled alias; when given (not None) it takes
            precedence over ``laod_max_docs``.

    Returns:
        Whatever the loader's ``load()`` method returns.
    """
    from langchain.document_loaders import WikipediaLoader

    max_docs = laod_max_docs if load_max_docs is None else load_max_docs
    # The limit was previously accepted but never forwarded, so it had no effect.
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=max_docs)
    return loader.load()

# Chunking data

In [73]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, as accepted by the splitter's
            ``split_documents`` method.
        chunk_size: Passed to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``. Defaults to
            0, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

# Calculating Cost

In [69]:
def print_embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding. Defaults to the
            name that used to be hard-coded.
        usd_per_1k_tokens: Price in USD per 1000 tokens. Defaults to the value
            that used to be hard-coded — verify against current pricing.

    Returns:
        None. The totals are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000*usd_per_1k_tokens:.6f}')

# Embedding & Uploading to a Vector Database (Pinecone)

In [93]:
def insert_or_fetch_embedding(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is loaded as-is. Otherwise the index is
    created and populated with embeddings of ``chunks``.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the notebook-level variable ``chunks`` is used, which is
            what this function read implicitly before.

    Returns:
        The LangChain Pinecone vector store bound to the index.

    Raises:
        ValueError: If the index must be created but no chunks are available.
    """
    import os

    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('ok')
        return vector_store

    if chunks is None:
        # Backward compatibility for callers that pass only the index name.
        chunks = globals().get('chunks')
    if chunks is None:
        # Check before create_index so a missing input does not leave an empty
        # index behind.
        raise ValueError(
            f'Index {index_name!r} does not exist and no chunks were provided to create it.'
        )

    print(f'Creating index {index_name} and embeddings ...', end='')
    # 1536 is presumably the vector size of the default OpenAI embedding model — confirm.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('ok')
    return vector_store

In [71]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default 'all' deletes
            every index returned by ``pinecone.list_indexes()``.

    Returns:
        None. Progress is printed.
    """
    import pinecone

    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    # Single named index: delete it and stop.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('ok')
        return

    # 'all': list the indexes first, then delete each one.
    existing = pinecone.list_indexes()
    print('Deleting all indexes ...')
    for name in existing:
        pinecone.delete_index(name)
    print('ok')

# Asking & Getting Answers

In [98]:
def ask_get_answer(vector_store,q):
    """Answer a single question with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Object providing ``as_retriever``; a similarity retriever
            with k=3 is built from it.
        q: The question text passed to the chain.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    top_k_retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k':3},
    )
    # "stuff" is the chain type handed to RetrievalQA.from_chain_type.
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=top_k_retriever,
    )
    return qa_chain.run(q)

def askwithmemory(vector_store,question, chat_history=None):
    """Ask a question through a ConversationalRetrievalChain, keeping history.

    Args:
        vector_store: Object providing ``as_retriever``; a similarity retriever
            with k=3 is built from it.
        question: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            A list passed in is appended to in place. When omitted, a fresh
            list is started for this call.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result (the answer is
        under ``result['answer']``) and the updated history list.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    if chat_history is None:
        # A mutable default ([]) would be shared by every call that omits the
        # argument, leaking history between unrelated conversations.
        chat_history = []

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history

# Running the code

In [67]:
# Load the sample PDF, then inspect the element at index 1: its text, its
# metadata, the number of loaded elements and the length of that element's text.
# Indexing with [1] requires at least two loaded elements.
data=load_document('./Downloads/llmchatmaking/files/LegacyDeveloper.pdf')
print(data[1].page_content)
print(data[1].metadata)
print(f'You have {len(data)} pages in your document')
print(f'There are {len(data[1].page_content)} characters in the current page')

Loading./Downloads/llmchatmaking/files/LegacyDeveloper.pdf
3/16/23, 2:1 1 PM Legacy Developer Documentation
https://documentation.b2c.commercecloud.salesforce.com/DOC1/advanced/print.jsp?topic=/com.demandware.dochelp/LegacyDevDoc/LegacyDevDoc.html&cp=0 … 2/1582.8. Common Page Components
2.9. Development Components
2.10. Import Reference Application Data into a Sandbox
2.11. SGJC Forms
2.11.1. What Is a Form Deﬁnition
2.11.2. Object Binding with Forms
2.11.3. Extracting Form Field Parameters from Metadata
2.11.4. Form Element Nam ing Conventions
2.11.5. Cross Site Request Forgery Protection
2.11.6. Form Validation
2.11.7. Using API Form Classes
2.11.8. Form Deﬁnition Elements
2.11.8.1. Action Form Element
2.11.8.2. Field Form Element
2.11.8.3. Option Form Element
2.11.8.4. Options Form Element
2.11.8.5. Group Form Element
2.11.8.6. Include Form Element
2.11.8.7. List Form Element
2.11.9. Developing Forms with Pipelines
2.11.9.1. How Pipelines Process Forms
2.11.9.2. Salesforce B2C Comme

In [45]:
# Keep the Wikipedia result under its own name. Rebinding `data` here would
# replace the PDF pages that the chunking cell below splits.
wiki_data = load_from_wikipedia('GPT-4')
print(wiki_data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was released on March 14, 2023, and has been made publicly available in a limited form via the paid chatbot product ChatGPT Plus, and access to the GPT-4 via OpenAI's API is provided via a waitlist. As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4 is also capable of taking images as input, though this feature has not been made available since launch. Op

In [74]:
# Split the loaded documents into chunks (chunk_data's default chunk_size),
# show how many chunks were produced and the text of the chunk at index 10,
# then print the token count and estimated embedding cost.
# Indexing with [10] requires at least eleven chunks.
chunks=chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)
print_embedding_cost(chunks) 

2596
2.7.7.11.1. SiteGenesis Choice of Bonus Product Discount Implementation
2.7.7.11.2. SiteGenesis Coupons
2.7.7.11.3. SiteGenesis Source Codes
2.7.7.12. SiteGenesis Recommendations
2.7.7.12.1. SiteGenesis Recommendation Examples
Total Tokens: 112413
Embedding Cost in USD: 0.044965


In [89]:
# Called without an argument, so index_name keeps its default of 'all' and
# every index returned by pinecone.list_indexes() is deleted.
delete_pinecone_index()

Deleting all indexes ...
ok


In [92]:
# Load the vector store for this index, or create it when it does not exist.
# Creating it embeds the notebook-level `chunks`, so the chunking cell must
# have been run first.
index_name='asdfghjkl'
vector_store=insert_or_fetch_embedding(index_name)

Indexasdfghjkl already exists. Loading embeddings ...ok


In [94]:
# Ask one question against the vector store and print the chain's answer.
q='What is the whole document about?'
answer=ask_get_answer(vector_store,q)
print(answer)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 8.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 10.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and bi

RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [101]:
import time

# Interactive question loop: keeps asking until the user types Quit or Exit.
i=1
print('Write Quit or Exit to quit.')
while True:
    # Strip surrounding whitespace so input such as ' quit ' is still recognised.
    q=input(f'Question #{i}: ').strip()
    if not q:
        # Blank input: prompt again rather than spend an API call on an empty question.
        continue
    i=i+1
    if q.lower() in ['quit','exit']:
        print('Quitting... bye bye')
        time.sleep(2)
        break

    answer=ask_get_answer(vector_store,q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-"*50} \n')
    
    

Write Quit or Exit to quit.
Question #1: hello


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..


KeyboardInterrupt: 

In [100]:
# Asking with history: askwithmemory returns a (result, chat_history) tuple,
# so unpack it into two names.
chat_history=[]
question='How many amendments in the indian constitution?'
result, chat_history=askwithmemory(vector_store,question, chat_history)
print(result['answer'])
print(chat_history)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..


KeyboardInterrupt: 