In [26]:
import openai
import pandas as pd
import datetime, os, yaml

# Authenticating: load the OpenAI API key from the local `api_key_openai` file.
with open('api_key_openai', 'r') as f:
    # Drop surrounding whitespace so a trailing newline in the key file does not
    # become part of the key that is handed to the client.
    openai.api_key = f.read().strip()
    
from langchain import OpenAI
from langchain.chains import LLMChain, ConversationChain
from langchain.chains.conversation.memory import (ConversationBufferMemory, 
                                                  ConversationSummaryMemory, 
                                                  ConversationBufferWindowMemory,
                                                  ConversationKGMemory)
from langchain.callbacks import get_openai_callback

# Single-prompt check: build the completion wrapper and call it directly on one string.
text = "What would be a good company name for a company that makes colorful socks?"
llm = OpenAI(openai_api_key=openai.api_key, temperature=0.1)
llm(text)

'\n\nBrightSox.'

In [41]:
from langchain.callbacks import get_openai_callback

def count_tokens(chain, query):
    """Run ``query`` through ``chain`` inside an OpenAI usage callback.

    Prints the callback's ``total_tokens`` figure and returns whatever
    ``chain.run(query)`` produced.
    """
    with get_openai_callback() as usage:
        answer = chain.run(query)
        print(f'Spent a total of {usage.total_tokens} tokens')
    return answer


In [28]:
from langchain import OpenAI, ConversationChain

# Completion model wrapped in a verbose conversation chain; with verbose=True the
# formatted prompt is echoed before the reply (see the cell output).
llm = OpenAI(openai_api_key=openai.api_key, temperature=0)
conversation = ConversationChain(verbose=True, llm=llm)

# Send one greeting and print the model's reply.
output = conversation.predict(input="Hi there!")
print(output)



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: Hi there!
AI:[0m

[1m> Finished chain.[0m
 Hi there! It's nice to meet you. How can I help you today?


### Conversation

In [29]:
from langchain import OpenAI
from langchain.chains import ConversationChain

# Build the language-model wrapper first; the conversation chain is created on top of it.
llm = OpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=openai.api_key,
    temperature=0,
)


The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}
Human: {input}
AI:




In [45]:
from langchain.chat_models import ChatOpenAI
# Rebind `llm` to the chat-model wrapper; the chains built in the following cells receive this instance.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=openai.api_key)

#### ConversationBufferMemory

In [46]:

# now initialize the conversation chain
# No memory argument is passed here, so the chain falls back to ConversationChain's own default.
conversation = ConversationChain(llm=llm)
# Print the chain's prompt template; the {history} and {input} placeholders are visible in the output.
print(conversation.prompt.template)

The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}
Human: {input}
AI:


In [47]:
# Conversation chain with an explicit ConversationBufferMemory attached.
conversation_buf = ConversationChain(
    llm=llm,
    memory=ConversationBufferMemory()
)
# Calling the chain directly returns a dict with 'input', 'history' and 'response' keys (see the cell output).
conversation_buf("Good morning AI!")



{'input': 'Good morning AI!',
 'history': '',
 'response': 'Good morning to you too! How can I assist you today?'}

In [48]:
# Second turn: the returned 'history' now contains the first exchange (see the cell output).
conversation_buf("My interest here is to explore the potential of integrating Large Language Models with external knowledge")

{'input': 'My interest here is to explore the potential of integrating Large Language Models with external knowledge',
 'history': 'Human: Good morning AI!\nAI: Good morning to you too! How can I assist you today?',
 'response': "That's a fascinating topic! Large Language Models, such as GPT-3, have shown incredible capabilities in generating human-like text. However, integrating external knowledge can help improve their accuracy and relevance in specific domains. Some approaches to integrating external knowledge include using knowledge graphs, ontologies, or pre-trained models that incorporate domain-specific knowledge. Would you like me to provide more details on any of these approaches?"}

In [49]:
# Print the transcript accumulated in the chain's memory buffer so far.
print(conversation_buf.memory.buffer)

Human: Good morning AI!
AI: Good morning to you too! How can I assist you today?
Human: My interest here is to explore the potential of integrating Large Language Models with external knowledge
AI: That's a fascinating topic! Large Language Models, such as GPT-3, have shown incredible capabilities in generating human-like text. However, integrating external knowledge can help improve their accuracy and relevance in specific domains. Some approaches to integrating external knowledge include using knowledge graphs, ontologies, or pre-trained models that incorporate domain-specific knowledge. Would you like me to provide more details on any of these approaches?


#### Conversation summary memory

#### Conversation summary memory

### Vector DB

In [52]:
from datasets import load_dataset

# Load the first 10,000 rows of the train split of the "20220301.simple" Wikipedia configuration.
data = load_dataset("wikipedia", "20220301.simple", split='train[:10000]')
# Display the dataset summary (features and row count).
data

Downloading and preparing dataset wikipedia/20220301.simple to /home/prabhu/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading: 100%|██████████| 1.66k/1.66k [00:00<00:00, 1.07MB/s]
Downloading: 100%|██████████| 235M/235M [00:19<00:00, 11.8MB/s] 


Dataset wikipedia downloaded and prepared to /home/prabhu/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [54]:
import tiktoken  

# Module-level tokenizer; tiktoken_len uses it to measure text length in tokens.
tokenizer = tiktoken.get_encoding('p50k_base')

def tiktoken_len(text):
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``."""
    # disallowed_special=() is forwarded unchanged to the encoder.
    return len(tokenizer.encode(text, disallowed_special=()))

# Quick check of the helper on a sample sentence (the two literals are concatenated into one string).
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

28

In [59]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter whose lengths are measured with tiktoken_len (passed as length_function), so
# chunk_size / chunk_overlap are presumably token counts rather than characters.
# NOTE(review): separator fallback order is defined by langchain — confirm against its docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)
# Split the text of record 6 (the Alan Turing article, per the cell output) into chunks.
chunks = text_splitter.split_text(data[6]['text'])
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [61]:
# Token counts of the first four chunks, displayed as a tuple.
tuple(tiktoken_len(chunks[i]) for i in range(4))

(397, 304, 399, 50)

In [64]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used below.
model_name = 'text-embedding-ada-002'

# Embedding client for the model named above, authenticated with the key loaded earlier.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=openai.api_key
)
# Two short sample strings to embed.
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

# Embed both strings and display (number of vectors, length of the first vector).
res = embed.embed_documents(texts)
len(res), len(res[0])


(2, 1536)

In [67]:
# Check the container type returned by embed_documents.
type(res)

list

In [71]:
from datetime import datetime
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import VectorStoreRetrieverMemory
from langchain.chains import ConversationChain
from langchain.prompts import PromptTemplate

import faiss

from langchain.docstore import InMemoryDocstore
from langchain.vectorstores import FAISS

embedding_size = 1536 # Dimensions of the OpenAIEmbeddings
# Flat L2 FAISS index sized to the embedding dimension, paired with an empty in-memory docstore.
index = faiss.IndexFlatL2(embedding_size)
embedding_fn = OpenAIEmbeddings(openai_api_key=openai.api_key).embed_query
vectorstore = FAISS(embedding_fn, index, InMemoryDocstore({}), {})

# In actual usage, you would set `k` to be a higher value, but we use k=1 to show that
# the vector lookup still returns the semantically relevant information
retriever = vectorstore.as_retriever(search_kwargs=dict(k=1))
memory = VectorStoreRetrieverMemory(retriever=retriever)

# When added to an agent, the memory object can save pertinent information from conversations or used tools
memory.save_context({"input": "My favorite food is pizza"}, {"output": "thats good to know"})
memory.save_context({"input": "My favorite sport is soccer"}, {"output": "..."})
memory.save_context({"input": "I don't the Celtics"}, {"output": "ok"}) # NOTE(review): sentence looks like it is missing a verb (e.g. "like") — confirm
# Query the memory with a sport-related prompt. With k=1 only the single closest saved
# exchange is returned — presumably the soccer one; this cell has no recorded output to confirm.
print(memory.load_memory_variables({"prompt": "what sport should i watch?"})["history"])

In [72]:
# Display the memory object's repr (retriever settings, memory_key, input_key, ...).
memory

VectorStoreRetrieverMemory(retriever=VectorStoreRetriever(vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x7f618eaaf0d0>, search_type='similarity', search_kwargs={'k': 1}), memory_key='history', input_key=None, return_docs=False)