<a href="https://colab.research.google.com/github/rajagopal17/langchain-tutorials/blob/main/v5_3_Pinecone_with_Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#install Pinecone, LangChain, openai & pypdf
!pip install pinecone-client
!pip -q install langchain
!pip install openai
!pip -q install pypdf unstructured[local-inference]

import os
import pinecone
import pandas as pd
import numpy as np

# Credentials are read from the environment rather than hardcoded, so real keys
# never end up in the saved/shared notebook. If a variable is not set, the user
# is prompted for it (getpass does not echo the typed value).
from getpass import getpass

#API keys for openai
OPEN_API_KEY                      = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ['OPENAI_API_KEY']      = OPEN_API_KEY

#API keys for Pinecone
PINE_API_KEY                      = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pinecone.init(api_key= PINE_API_KEY, environment="us-west1-gcp-free")

#import Langchain libraries
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client with the model name passed explicitly.
# NOTE(review): `embeddings` is not referenced by any later cell visible here;
# the upload and reload cells use `embeds` instead — confirm whether it is still needed.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")


#Import langchain document loader classes

In [2]:
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import PagedPDFSplitter
from langchain.document_loaders import UnstructuredPDFLoader


import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"


# Swap the stdlib implementation for the stub above so that every caller of
# locale.getpreferredencoding() receives "UTF-8".
locale.getpreferredencoding = getpreferredencoding

#Extract text data from all the pdf files in the folder

In [12]:
# Build the list of PDF files to load from the documents folder.
root_dir="/content/drive/MyDrive/Python"
pdf_folder_path=f'{root_dir}/SAPDOCS/'
# os.listdir() returns every entry in the folder (sub-folders, notes, hidden
# files, ...). Keep only names ending in ".pdf" so the PDF loader is never
# handed something that is not a PDF.
pdf_list=[name for name in os.listdir(pdf_folder_path) if name.lower().endswith('.pdf')]
pdf_path_list =[os.path.join(pdf_folder_path,i) for i in pdf_list]
pdf_path_list


['/content/drive/MyDrive/Python/SAPDOCS/S4F41_Cash Management.pdf',
 '/content/drive/MyDrive/Python/SAPDOCS/S4F50_Process in TRM.pdf',
 '/content/drive/MyDrive/Python/SAPDOCS/S4F51_Customizations in TRM.pdf',
 '/content/drive/MyDrive/Python/SAPDOCS/4 HANA - Practical Guide to SAP Business Partner .pdf',
 '/content/drive/MyDrive/Python/SAPDOCS/Copy of FixedAssets.pdf']

In [13]:
# Load each PDF; `loader` holds one list of documents per file.
loader = [UnstructuredPDFLoader(pdf_path).load() for pdf_path in pdf_path_list]
# Keep the first document returned for each file.
loader_summary = [docs[0] for docs in loader]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=800 and chunk_overlap=20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=20,
)
# The resulting chunks are the clean text that is uploaded and summarized later.
pdf_text = text_splitter.split_documents(loader_summary)


In [None]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
pdf_text[:3]

In [None]:
# Show the text of the first chunk as a quick check of the extraction result.
pdf_text[0].page_content

'S4F41\n\nImplementing Cash Management in SAP S/4HANA\n\n.\n\n.\n\n.\n\nPARTICIPANT HANDBOOK INSTRUCTOR\n\n\n\nLED TRAINING\n\nCourse Duration: 5 Day(s)\n\nSAP Copyrights, Trademarks and Disclaimers\n\n© 2020 SAP SE or an SAP affiliate company. All rights reserved.\n\nNo part of this publication may be reproduced or transmitted in any form or for any purpose without the express permission of SAP SE or an SAP affiliate company.\n\nSAP and other SAP products and services mentioned herein as well as their respective logos are trademarks or registered trademarks of SAP SE (or an SAP affiliate company) in Germany and other countries. Please see http://global12.sap.com/corporate-en/legal/copyright/index.epx for additional trademark information and notices.'

#Create Index in Pinecone 

In [3]:
# Make sure the target Pinecone index exists before anything is uploaded.
#
# Optional manual steps, kept for reference:
#   drop an existing index first:
#       pinecone.delete_index('reviews-index-ai')
#   create the new index unconditionally:
#       pinecone.create_index("sapdocs-hana",dimension=1536)
index_name = 'sapdocs-hana'
if index_name in pinecone.list_indexes():
    print("Index aleady exists")
else:
    # NOTE(review): dimension=1536 presumably matches the embedding model's vector size — confirm.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')


Index aleady exists


#Create embeddings using Langchain to upload to Pinecone

In [None]:
!pip install tiktoken
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
# Embedding client passed to both Pinecone.from_documents (upload) and
# Pinecone.from_existing_index (reload) in later cells.
embeds = OpenAIEmbeddings() #default model is ada-002
                                    

In [None]:
# Convert the chunks into embeddings (openai ada-002 model via `embeds`) and
# store them in the Pinecone index named by `index_name`.
file_embeds = Pinecone.from_documents(
    documents=pdf_text,
    embedding=embeds,
    index_name=index_name,
)

#Query Pinecone database using Langchain & openai LLM

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# OpenAI LLM (temperature=0) used to retrieve answers from the vector database.
fact_llm = OpenAI(temperature=0)

# Question-answering chain ("stuff" chain type) backed by the freshly uploaded
# vector store.
hana_retriever = RetrievalQA.from_chain_type(
    llm=fact_llm,
    chain_type="stuff",
    retriever=file_embeds.as_retriever(),
)
                                            
                                           

In [None]:
# Ask a question through the QA chain; the answer is shown as the cell output.
query = "what is money market in SAP"
hana_retriever.run(query)

' Money market in SAP is a type of financial instrument used to map money market transactions, which include different forms of interest calculations and repayments.'

#reload index for later use

In [5]:
# Handle to the existing "sapdocs-hana" index obtained from the pinecone client.
# NOTE(review): `reload_index` is not referenced by any later cell visible here — confirm its purpose.
reload_index = pinecone.Index("sapdocs-hana")
# LangChain vector store bound to the existing index; reload_docs can be used as retriever.
reload_docs = Pinecone.from_existing_index("sapdocs-hana",embedding=embeds)
                                           

In [6]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# OpenAI LLM (temperature=0) used to retrieve answers from the vector database.
fact_llm = OpenAI(temperature=0)

# Same QA setup as before, but backed by the vector store reloaded from the
# existing index.
get_retriever = RetrievalQA.from_chain_type(
    llm=fact_llm,
    chain_type="stuff",
    retriever=reload_docs.as_retriever(),
)

query = "what is process of inhouse cash in SAP"
get_retriever.run(query)

" In-House Cash is SAP's in-house bank functionality. An up to five-character house bank ID is created to represent the bank, and an up to five-character house bank account ID is created to represent each bank account. The house bank ID is assigned to a company code. The Receivables Line Item Matching service for SAP Cash Application consists of two tasks: training jobs and inference jobs. Training jobs can be scheduled as part of the configuration steps or in transaction SM36, and inference jobs can be scheduled using Name as RFEBKA_AUTO_REPRO. SAP Cash Application is a cloud-based solution that leverages machine learning technology to provide matching prediction result based on historical data."

In [7]:
# Query the reloaded vector store directly, asking for k=5 documents.
reload_docs.similarity_search(
    "intercompany transfer of asset",
    k=5,
)

[Document(page_content='Intracompany and Intercompany Asset Transfer In Asset Accounting, SAP differentiates between two types of transfer, intracompany and intercompany asset transfer:\n\n\n\n\n\nIntracompany asset transfer: a sending asset (or component of an asset) is transferred to a target asset within a single company code, for example, if the asset was created in the wrong asset class.\n\nIntercompany asset transfer: a sending asset (or component of an asset) is transferred to a target asset that is assigned to a different company code.\n\nYou can post both of these types of transfer either automatically (retirement and acquisition posted in one step) or manually (retirement and acquisition posted in two separate steps).', metadata={'source': '/content/drive/MyDrive/Python/SAPDOCS/Copy of FixedAssets.pdf'}),
 Document(page_content='An intercompany asset transfer within a corporate group may be necessary for one of the following reasons:\n\nThe physical location of the asset has 

In [8]:
# Second question, asked through the chain built on the reloaded index.
query1 = "what are the basic configurations for cash management"
get_retriever.run(query1)

' The basic configurations for cash management include switching off the business function FIN_FSCM_CLM and selecting Basic Scope in the Customizing activity Define Basic Settings. Additionally, the basic cash management capability offers the following basic functions: Manage banks and house banks (only a subset of fields that are needed for enabling the payment process), define bank accounts and house bank accounts (only common attributes are supported), display bank accounts in a list view, import and export bank accounts, monitor cash positions and liquidity forecasts, manage memo records using the app or transactions FF63 and FF65, executing and tracking bank transfers and approval of bank payments using Bank Communication Management, and Bank Communication Management can use simplified payment signature setup in Bank Account Management.'

#Summarize the split document chunks (pdf_text) with a map-reduce chain

In [9]:
from langchain.chains.summarize import load_summarize_chain
from langchain import PromptTemplate, LLMChain
import textwrap

In [None]:
# Prompt text; {text} is the template's single input variable.
prompt_template = """Write a concise bullet point summary regarding the cash management in SAP:

{text}

CONSCISE SUMMARY IN BULLET POINTS:"""

select_POINT_PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
)

# Map-reduce summarization chain; the same prompt is passed as both the map
# prompt and the combine prompt.
chain = load_summarize_chain(
    fact_llm,
    chain_type="map_reduce",
    map_prompt=select_POINT_PROMPT,
    combine_prompt=select_POINT_PROMPT,
)

# Run the chain over the split document chunks.
output_summary = chain.run(pdf_text)

# textwrap.fill() treats its whole input as a single paragraph. With
# replace_whitespace=False the newlines inside the summary are kept, but they
# land in the middle of wrapped lines and the result looks ragged.
# Wrap each line of the summary on its own and join them back together, so the
# existing line breaks (e.g. one bullet per line) survive intact.
wrapped_text = "\n".join(
    textwrap.fill(line, width=150, break_long_words=False, replace_whitespace=False)
    for line in output_summary.splitlines()
)

print(wrapped_text)