<a href="https://colab.research.google.com/github/sherry-ger/elastic-workshop/blob/main/GenAI/Summarize_Docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages and libraries we will need for the workshop

In [None]:
# install packages
!pip install -qU langchain elasticsearch tiktoken
!pip install transformers
!pip install accelerate bitsandbytes
!pip install sentencepiece

# import modules
from getpass import getpass
from langchain.vectorstores import ElasticsearchStore
from urllib.request import urlopen
from langchain.text_splitter import CharacterTextSplitter
import json
import torch

# Initial Set Up

In [None]:
# Connection settings for the Elastic Cloud deployment.
# The Cloud ID and the password are read with getpass, so they are typed in
# at run time instead of being stored in the notebook.
CLOUD_USERNAME = "elastic"

CLOUD_ID = getpass("Elastic deployment Cloud ID")
CLOUD_PASSWORD = getpass("Elastic deployment Password")

# Data

Here is the [JSON file](https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/workplace-search/data/data.json)

In [3]:
url = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/workplace-search/data/data.json"

# Download the sample data set and parse it as JSON. The `with` block closes
# the HTTP response as soon as the payload has been read.
with urlopen(url) as response:
    workplace_docs = json.loads(response.read())

# Data and Chunking with LangChain Text Splitters

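Here is the LangChain text splitter [documentation](https://js.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/token). Note that this link points to the LangChain JS docs, while this notebook uses the Python `CharacterTextSplitter`.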



In [None]:
# Parallel lists: content[i] is the text of a document and metadata[i] is the
# metadata attached to every chunk cut from that document.
content = []
metadata = []
metadata_fields = ("name", "summary", "rolePermissions")

for doc in workplace_docs:
  content.append(doc["content"])
  metadata.append({field: doc[field] for field in metadata_fields})

# Fixed-size chunking: chunk_size=384 with chunk_overlap=20 between
# neighbouring chunks (units are whatever the splitter's length function
# measures - presumably characters; TODO confirm).
text_splitter = CharacterTextSplitter(chunk_size=384, chunk_overlap=20)
docs = text_splitter.create_documents(content, metadatas=metadata)

# Print every chunk followed by the metadata values it carries.
for documents in docs:
  print(documents.page_content)
  for field in metadata_fields:
    print(documents.metadata[field])


# Connect to Elasticsearch using Python client

In [None]:
from elasticsearch import Elasticsearch

# Connect with the credentials collected in the set-up cell. CLOUD_USERNAME is
# used instead of repeating the "elastic" literal so the two cannot drift apart.
client = Elasticsearch(
    cloud_id=CLOUD_ID,
    basic_auth=(CLOUD_USERNAME, CLOUD_PASSWORD),
    request_timeout=30,
)

# Successful response!
client.info()

# Download and Start ELSER model

In [None]:
# Download / Load ELSER
# `field_names` is documented as an array of input field names, so it is
# passed as a list rather than a bare string.
client.ml.put_trained_model(
    model_id=".elser_model_1",
    input={"field_names": ["text_field"]},
)


In [None]:
# Start ELSER
import time

elser_model_id = ".elser_model_1"

# The model definition is downloaded in the background after the model has
# been created, and a deployment cannot be started until it is complete.
# Poll until Elasticsearch reports the model as fully defined, giving up after
# ten minutes so the cell cannot hang forever.
deadline = time.monotonic() + 600
while True:
    status = client.ml.get_trained_models(
        model_id=elser_model_id, include="definition_status"
    )
    if status["trained_model_configs"][0]["fully_defined"]:
        break
    if time.monotonic() > deadline:
        raise TimeoutError(
            f"Model {elser_model_id} was not fully downloaded within 10 minutes"
        )
    time.sleep(5)

client.ml.start_trained_model_deployment(
    model_id=elser_model_id
)

# Ingest data using LangChain

In [12]:
# LangChain vector-store wrapper bound to the workplace index.
vector_store = ElasticsearchStore(index_name="workplace_index", es_connection=client)

# Ingest using ELSER - sparse encoder strategy.
# The object returned by from_documents is what the query cell later calls
# similarity_search on.
sparse_strategy = ElasticsearchStore.SparseVectorRetrievalStrategy()
documents = vector_store.from_documents(
    docs,
    es_connection=client,
    index_name="workplace_index",
    strategy=sparse_strategy,
)

# Query with LangChain

In [14]:
# Helper to print search results
def showResults(output):
  """Print how many results there are, then each result on its own line.

  Args:
    output: a sized, iterable collection of results (e.g. a list).
  """
  print("Total results: ", len(output))
  for item in output:
    print(item)


In [None]:
# Run a similarity search through the LangChain store and print every hit.
question = "How does the compensation work?"
results = documents.similarity_search(question)
showResults(results)

# Text Expansion query with ELSER

Since we have split each document into multiple chunks, we want to group the chunks that belong to the same document into a single result by using the [Elasticsearch query results collapse](https://www.elastic.co/guide/en/elasticsearch/reference/current/collapse-search-results.html).

In [None]:
query_text = input("Enter a question :")
print('\n')

# Example question: How does compensation work
# ELSER text-expansion query on the vector.tokens field.
query = {
    "text_expansion": {
        "vector.tokens": {
            "model_id": ".elser_model_1",
            "model_text": query_text
        }
    }
}

# We don't want to repeatedly see the different sections of the same document.
# Collapse the hits on the document name and fetch the chunks of each document
# as inner hits - well, at least the first 10 chunks.
collapse = {
    "field": "metadata.name.keyword",
    "inner_hits": [{
        "name": "text",
        # Boolean False disables _source for the inner hits. The string "false"
        # is not a boolean: a string value is read as a source field pattern.
        "_source": False,
        "fields": ["text"],
        "size": 10
    }]
}

resp = client.search(
    index="workplace_index",
    query=query,
    collapse=collapse,
    source=["metadata.name", "metadata.summary", "text"],
)

# Fail with a clear message instead of an IndexError when nothing matched.
hits = resp['hits']['hits']
if not hits:
    raise ValueError(f"No documents matched the question: {query_text!r}")

# Show the first doc. Default sort order is by _score
hit = hits[0]
doc_name = hit['_source']['metadata']['name']
print(f"Document Name: {doc_name}\n")

# Compile the inner hits (at max we have 10) into the summarization prompt.
# Chunks are joined with a space so that the last word of one chunk does not
# fuse with the first word of the next.
chunks = []
for inner_hit in hit['inner_hits']['text']['hits']['hits']:
    chunks.extend(inner_hit['fields']['text'])

doc_text = "summarize: " + ' '.join(chunks)

print(doc_text)

# Summarization

Use a [T5 model](https://colab.research.google.com/drive/18sGI2hylGPUIW3mRrTPjH1F6vxBEfxp-#scrollTo=jRKq4f2H4AA8&line=3&uniqifier=1) to perform summarization.

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# flan-t5-large is used because the bigger google/flan-t5-xl did not fit:
#   - loading it as fp16 (device_map="auto", torch_dtype=torch.float16)
#     failed for lack of memory;
#   - loading it with device_map="auto" alone ran out of memory after
#     two questions.
checkpoint = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
# Tokenize the prompt built in the search cell into PyTorch tensors and keep
# only the token ids.
encoded_prompt = tokenizer(doc_text, return_tensors="pt")
input_ids = encoded_prompt.input_ids

# Generate the summary with min_length=20 and at most 600 new tokens.
outputs = model.generate(input_ids, min_length=20, max_new_tokens=600)

# Decode the first generated sequence, leaving out special tokens.
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(summary)

# Clean up

In [None]:
# Clean up: stop the ELSER model deployment (the stop request is sent with
# force=True).
client.ml.stop_trained_model_deployment(model_id=".elser_model_1", force=True)