In [None]:
!pip -q install langchain tiktoken chromadb pypdf InstructorEmbedding
!pip install transformers==4.30
!pip -q install accelerate bitsandbytes kaleido openai cohere python-multipart


In [None]:
!pip show langchain

## QA Retrieval No Open AI - Flan-T5-XL




In [None]:
!pip install torch
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer for the "google/flan-t5-xl" checkpoint (the same checkpoint
# name is used when the model weights are loaded).
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")



In [None]:
# del tokenizer,model

In [None]:
# Load the Flan-T5-XL seq2seq model.
# - load_in_8bit=True requests 8-bit weights (presumably backed by the
#   bitsandbytes/accelerate packages installed in the first cell — confirm).
# - device_map='auto' leaves device placement to the library.
# - torch_dtype is left commented out, so no explicit dtype is passed here.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl",
                                              load_in_8bit=True,
                                              device_map='auto',
                                              # torch_dtype=torch.float16,
                                              low_cpu_mem_usage=True,

                                              )

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

# Wrap the already-loaded model and tokenizer in a Hugging Face
# text2text-generation pipeline, then expose it to LangChain as an LLM.
#
# Decoding is deterministic (greedy). The previous settings passed
# `temperature=0` and `top_p=0.95`, which are sampling parameters: they have no
# effect unless sampling is enabled, and a temperature of 0 is not a valid
# sampling temperature. Stating `do_sample=False` makes the intent explicit
# and keeps the same greedy behaviour.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,          # upper bound on the generated sequence length
    do_sample=False,         # greedy decoding: reproducible answers
    repetition_penalty=1.15  # discourage the model from repeating itself
)

# LangChain-compatible LLM backed by the local pipeline above.
local_llm = HuggingFacePipeline(pipeline=pipe)

#### Check the LLM is working

In [None]:
# Smoke test: send a single prompt directly to the wrapped LLM and print the reply.
print(local_llm('What is the capital of Great Britain?'))

# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files - PDFs
- ChromaDB
- Local LLM
- Instructor Embeddings


## Setting up LangChain


In [None]:
import os
!pip install sentence_transformers

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

## Load multiple and process documents

In [None]:
# Load the PDF files from the dataset directory: every file matching the glob
# is read with PyPDFLoader.
# Single-file alternative, kept for reference:
# loader = TextLoader('single_text_file.pdf')
loader = DirectoryLoader('/kaggle/input/dataset', glob="./*.pdf", loader_cls=PyPDFLoader)

# Run the loader; `documents` holds whatever Document objects it returns.
documents = loader.load()

In [None]:
# Display how many documents the loader returned.
len(documents)

In [None]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) before embedding them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [None]:
# Display how many chunks the splitter produced.
len(texts)

## HF Instructor Embeddings

In [None]:

from langchain.embeddings import HuggingFaceInstructEmbeddings

# Embedding model: "hkunlp/instructor-xl" via LangChain's Instructor wrapper.
# The device is hard-coded to "cuda", so this cell needs a GPU runtime.
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})


## create the DB

In [None]:
!pip install faiss-gpu

In [None]:
# Embed the chunks and store them in a vector index.
# NOTE: persist_directory only belongs to the commented-out Chroma variant at
# the bottom of this cell; the FAISS call below does not receive it.
# persist_directory = '/kaggle/input/database'

# Use the Instructor embeddings as the embedding function for the index
embedding = instructor_embeddings



from langchain.vectorstores import FAISS
# Build the FAISS index from the chunked documents and the embedding function.
db = FAISS.from_documents(texts, embedding)
# Earlier Chroma-based variant, kept for reference:
# vectordb = Chroma.from_documents(texts,
#                                  embedding,
#                                  persist_directory=persist_directory)




## Make a retriever

In [None]:
# Expose the FAISS index as a retriever. search_kwargs passes k=2 to the
# search (presumably the number of chunks returned per query — confirm).
retriever = db.as_retriever(search_kwargs={"k": 2})

## Make a chain

In [None]:
# Create the question-answering chain: a RetrievalQA chain of type "stuff"
# built from the local LLM and the retriever.
# return_source_documents=True is requested so that responses carry a
# "source_documents" entry, which process_llm_response reads.
qa_chain = RetrievalQA.from_chain_type(llm=local_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    The text is cut at every newline, each piece is wrapped independently with
    `textwrap.fill`, and the pieces are glued back together with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to `textwrap.fill` (default 110).

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print a chain response: the wrapped answer, then one source per line.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            iterable whose items expose `metadata['source']`.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    # One call emits the answer and the "Sources:" header on separate lines.
    print(answer, '\n\nSources:', sep='\n')
    for document in llm_response["source_documents"]:
        origin = document.metadata['source']
        print(origin)

In [None]:
!pip install pandas openpyxl


In [None]:
import os
import pandas as pd

In [70]:

# Interactive evaluation loop.
#
# The user types (query, standard answer) pairs. Pairs are queued and sent
# through `qa_chain` either when the queue reaches `batch_size`, when the user
# types 'process', or when the user types 'exit' (which also writes every
# collected row to an Excel workbook and stops the loop).

# Define the directory where you want to save the file
save_directory = '/kaggle/working/'

# List to store query-response pairs
data = []

queries = []
standard_answers = []
batch_size = 19  # Adjust the batch size as needed


def _process_pending(pending_queries, pending_answers, records):
    """Run the queued queries through `qa_chain` and record the results.

    Args:
        pending_queries: Queued query strings; emptied in place on success.
        pending_answers: Standard answers aligned with `pending_queries`;
            emptied in place on success.
        records: List that receives one dict per query with the keys
            "Query", "Response" and "Standard Answer".
    """
    # Run the whole batch first so a failure part-way through leaves the queue
    # untouched and nothing is recorded twice on a retry.
    llm_responses = [qa_chain(q) for q in pending_queries]
    for question, response, reference in zip(pending_queries, llm_responses, pending_answers):
        # Extract only the 'result' part of the response
        response_text = response.get('result', 'No response found')
        print(f"Query: {question}\nProcessed response: {response_text}\nStandard Answer: {reference}\n")
        records.append({"Query": question, "Response": response_text, "Standard Answer": reference})
    pending_queries.clear()
    pending_answers.clear()


while True:
    query = input("Enter your query (or type 'process' to process the queries): ")
    # Compare commands on the stripped text so "exit " still exits.
    command = query.strip()
    if command == 'process':
        if queries:
            _process_pending(queries, standard_answers, data)
        else:
            print("No queries to process.")
    elif command == 'exit':
        # Do not lose pairs that were entered but not yet processed.
        if queries:
            print(f"Processing {len(queries)} pending queries before exiting...")
            _process_pending(queries, standard_answers, data)
        # Create DataFrame and write to Excel file before exiting.
        # Explicit columns keep the sheet layout stable even with no rows.
        df = pd.DataFrame(data, columns=["Query", "Response", "Standard Answer"])
        file_path = os.path.join(save_directory, "queries_and_responses.xlsx")

        df.to_excel(file_path, index=False)
        print(f"Excel file saved at {file_path}. Exiting.")
        break
    elif not command:
        # A blank line is neither a command nor a meaningful query.
        print("Empty query ignored.")
    else:
        queries.append(query)
        standard_answer = input("Enter the standard answer for this query: ")
        standard_answers.append(standard_answer)
        # Process as soon as the batch is full. Previously the batch was only
        # processed on the *next* prompt, and the query typed at that prompt
        # was discarded without ever being queued.
        if len(queries) >= batch_size:
            print(f"Processing {batch_size} queries...")
            _process_pending(queries, standard_answers, data)


Enter your query (or type 'process' to process the queries):  What is the purpose of the Indian Network of Climate Change Assessment (INCCA)?
Enter the standard answer for this query:  To assess the drivers and implications of climate change through scientific research and prepare climate change assessments.
Enter your query (or type 'process' to process the queries):  How does the IPCC characterize the impact of human activities on climate?
Enter the standard answer for this query:  The impact of human activities on climate is unequivocal (IPCC, 2007).
Enter your query (or type 'process' to process the queries):  Which social and economic sectors in India were assessed for climate change impacts in the 2050s and 2080s?
Enter the standard answer for this query:  Water resources, agriculture, natural ecosystems and forestry, human health, infrastructure, and energy.
Enter your query (or type 'process' to process the queries):  What regions in India are considered climate-sensitive in th

Processing 19 queries...




Query: What is the purpose of the Indian Network of Climate Change Assessment (INCCA)?
Processed response: to assess the drivers and implications of climate change through scientifi c research
Standard Answer: To assess the drivers and implications of climate change through scientific research and prepare climate change assessments.

Query: How does the IPCC characterize the impact of human activities on climate?
Processed response: unequivocal
Standard Answer: The impact of human activities on climate is unequivocal (IPCC, 2007).

Query: Which social and economic sectors in India were assessed for climate change impacts in the 2050s and 2080s?
Processed response: Agriculture, Water, Natural Ecosystems and Biodiversity and Human Health
Standard Answer: Water resources, agriculture, natural ecosystems and forestry, human health, infrastructure, and energy.

Query: What regions in India are considered climate-sensitive in the assessment?
Processed response: the Himalayan region, the Nort

Enter your query (or type 'process' to process the queries):  How does the report address the impacts of climate variability in India? 
Enter the standard answer for this query:  The report reviews the impacts of climate variability in the four major climate-sensitive regions
Enter your query (or type 'process' to process the queries):  What is the resolution of the regional climate change model PRECIS used in the assessment? 
Enter the standard answer for this query:  The resolution is 50km x 50km.
Enter your query (or type 'process' to process the queries):  How is the transmission of malaria assessed in the 2030s? 
Enter the standard answer for this query:  : Transmission windows are defined in terms of temperature and relative humidity
Enter your query (or type 'process' to process the queries):  Why is sea-level rise expected to continue even if GHG emissions are halted today?
Enter the standard answer for this query:  The ocean has a significant thermal inertia, leading to a dela

Processing 19 queries...




Query: How does the report address the impacts of climate variability in India? 
Processed response: It presents an assessment of the impacts of climate change in the 2030s on four key sectors of the economy that are climate dependent
Standard Answer: The report reviews the impacts of climate variability in the four major climate-sensitive regions

Query: What is the resolution of the regional climate change model PRECIS used in the assessment? 
Processed response: 50km x 50km
Standard Answer: The resolution is 50km x 50km.

Query: How is the transmission of malaria assessed in the 2030s? 
Processed response: based on temperature
Standard Answer: : Transmission windows are defined in terms of temperature and relative humidity

Query: Why is sea-level rise expected to continue even if GHG emissions are halted today?
Processed response: Because the ocean has an enormous thermal inertia
Standard Answer: The ocean has a significant thermal inertia, leading to a delayed adjustment in sea le

Enter your query (or type 'process' to process the queries):  What challenges are associated with the SWAT model used for assessing water yields in various regions?
Enter the standard answer for this query:  Challenges include obtaining information from global sources, assuming static man-made changes, and the need for scenario projections reflecting realistic trends.
Enter your query (or type 'process' to process the queries):  How is the entire Indian region climatologically divided in the assessment? 
Enter the standard answer for this query:  It is divided into the western Himalayas, north-west, north-east, northern-central region, eastern coast, western coast, and the interior plateau.
Enter your query (or type 'process' to process the queries):  What is the projected rise in annual mean surface air temperature in the 2030s?
Enter the standard answer for this query:  The annual mean surface air temperature is projected to rise by 1.7°C to 2.0°C in the 2030s
Enter your query (or ty

Processing 19 queries...


Token indices sequence length is longer than the specified maximum sequence length for this model (623 > 512). Running this sequence through the model will result in indexing errors


Query: What challenges are associated with the SWAT model used for assessing water yields in various regions?
Processed response: not enough information
Standard Answer: Challenges include obtaining information from global sources, assuming static man-made changes, and the need for scenario projections reflecting realistic trends.

Query: How is the entire Indian region climatologically divided in the assessment? 
Processed response: western Himalayas, north-west, north-east, northern-central region, eastern coast, western coast, and the interior plateau
Standard Answer: It is divided into the western Himalayas, north-west, north-east, northern-central region, eastern coast, western coast, and the interior plateau.

Query: What is the projected rise in annual mean surface air temperature in the 2030s?
Processed response: 1.7°C and 2.0°C
Standard Answer: The annual mean surface air temperature is projected to rise by 1.7°C to 2.0°C in the 2030s

Query: How does temperature variability d

Enter your query (or type 'process' to process the queries):  How does the reduction in water yield in the western coastal region compare to the increase in specific areas?
Enter the standard answer for this query:  While the western coastal region may experience a general reduction, Karnataka may see an increase of 10%–20% in water yield.
Enter your query (or type 'process' to process the queries):  Why is the validation of regional climate models important for accurate impact assessments in India? 
Enter the standard answer for this query:  Validation ensures that regional climate models accurately simulate observed climate conditions, reducing uncertainties in impact assessments
Enter your query (or type 'process' to process the queries):  What steps can be taken to bridge the significant data gap in interdisciplinary climate change research? 
Enter the standard answer for this query:  Establishing an effective mechanism for sharing and accessing diverse data sets is crucial to brid

Processing 19 queries...




Query: How does the reduction in water yield in the western coastal region compare to the increase in specific areas?
Processed response: There is a general reduction in water yield in the eastern coastal region of West
Standard Answer: While the western coastal region may experience a general reduction, Karnataka may see an increase of 10%–20% in water yield.

Query: Why is the validation of regional climate models important for accurate impact assessments in India? 
Processed response: It can reduce the uncertainty of our estimates to an extent.
Standard Answer: Validation ensures that regional climate models accurately simulate observed climate conditions, reducing uncertainties in impact assessments

Query: What steps can be taken to bridge the significant data gap in interdisciplinary climate change research? 
Processed response: Various agencies in India are presently collecting such data on a regular basis.
Standard Answer: Establishing an effective mechanism for sharing and acc

Enter your query (or type 'process' to process the queries):  How can systematic observations be improved, especially concerning forest vegetation types and soil characteristics? 
Enter the standard answer for this query:  New long-term systematic observations are essential for gathering data on forest vegetation types and soil characteristics
Enter your query (or type 'process' to process the queries):  What is the recommended approach for addressing data gaps in climate change research? 
Enter the standard answer for this query:  Efforts are needed to establish an effective mechanism for sharing and accessing climate, ecosystem, water, agriculture, and socio-economic data.
Enter your query (or type 'process' to process the queries):  How are floods projected using the SWAT model, and what regions show a significant change in flood magnitudes? 
Enter the standard answer for this query:  Floods, exceeding 99th percentile flow, may increase from 10% to over 30% in various regions, impac

Processing 19 queries...




Query: How can systematic observations be improved, especially concerning forest vegetation types and soil characteristics? 
Processed response: Observing for a long period of time
Standard Answer: New long-term systematic observations are essential for gathering data on forest vegetation types and soil characteristics

Query: What is the recommended approach for addressing data gaps in climate change research? 
Processed response: a mechanism for sharing and accessing this data in formats that can be easily deciphered
Standard Answer: Efforts are needed to establish an effective mechanism for sharing and accessing climate, ecosystem, water, agriculture, and socio-economic data.

Query: How are floods projected using the SWAT model, and what regions show a significant change in flood magnitudes? 
Processed response: daily outfl ow discharge in each sub-basin
Standard Answer: Floods, exceeding 99th percentile flow, may increase from 10% to over 30% in various regions, impacting existing

Enter your query (or type 'process' to process the queries):  How does the intensity of extreme precipitation events change in the Coastal region, and what areas might experience an increase in rainfall?
Enter the standard answer for this query:  Rainy days may decrease by 1–5 days, but intensity is likely to increase between 1mm/day and 4mm/day, with slight increases along the Orissa coast.
Enter your query (or type 'process' to process the queries):  What is the expected change in extreme temperatures in the Coastal region, and how do minimum and maximum temperatures differ
Enter the standard answer for this query:  Minimum temperatures may rise by 2.0°C to 4.5°C, and maximum temperatures may increase by 1°C to 3.5°C in the Coastal region.
Enter your query (or type 'process' to process the queries):  How is the number of rainy days and intensity expected to change in the Western Ghats, especially in the Karnataka region? 
Enter the standard answer for this query:  In the Western Ghat



Query: How does the intensity of extreme precipitation events change in the Coastal region, and what areas might experience an increase in rainfall?
Processed response: The intensity of rainfall is likely to increase between 1mm/day and 4mm/day
Standard Answer: Rainy days may decrease by 1–5 days, but intensity is likely to increase between 1mm/day and 4mm/day, with slight increases along the Orissa coast.

Query: What is the expected change in extreme temperatures in the Coastal region, and how do minimum and maximum temperatures differ
Processed response: The warming in night temperatures is more over the south peninsula and central and northern India, whereas daytime warming is more in central and northern India
Standard Answer: Minimum temperatures may rise by 2.0°C to 4.5°C, and maximum temperatures may increase by 1°C to 3.5°C in the Coastal region.

Query: How is the number of rainy days and intensity expected to change in the Western Ghats, especially in the Karnataka region? 


Enter your query (or type 'process' to process the queries):  exit


Excel file saved at /kaggle/working/queries_and_responses.xlsx. Exiting.


In [71]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _coerce_text(value):
    """Return `value` as a stripped string; None and float NaN become ''."""
    # Empty spreadsheet cells come back from pandas as float NaN (NaN != NaN).
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


# Define a function to calculate cosine similarity between two texts
def calculate_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    A TF-IDF vectorizer is fitted on just the two texts and the cosine
    similarity of the two resulting vectors is returned.

    Args:
        text1: First text. None / NaN are treated as empty; other non-string
            values are converted with `str`.
        text2: Second text, handled the same way.

    Returns:
        The similarity as a float. 0.0 is returned when either text is empty
        or when neither text yields a token.
    """
    first, second = _coerce_text(text1), _coerce_text(text2)
    # An empty text has an all-zero vector, so its similarity to anything is 0.
    if not first or not second:
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([first, second])
    except ValueError:
        # Raised for an empty vocabulary, i.e. neither text contains a token
        # the vectorizer keeps (for example punctuation-only answers).
        return 0.0
    return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

# Load the workbook holding the queries, model responses and standard answers
file_path = '/kaggle/input/queries-and-responses/queries_and_responses.xlsx'  # Update the path accordingly
df = pd.read_excel(file_path)


def _score_row(row):
    """Similarity between one row's model response and its standard answer."""
    return calculate_cosine_similarity(row['Response'], row['Standard Answer'])


# Score every row and keep the result in a new 'Similarity' column
df['Similarity'] = df.apply(_score_row, axis=1)

# Show the compared texts next to their score (optional)
print(df.loc[:, ['Response', 'Standard Answer', 'Similarity']])

# Write the scored table to a new workbook in the working directory
output_file_path = '/kaggle/working/queries_and_responses_with_similarity.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Updated Excel file saved at {output_file_path}.")


                                              Response  \
0    to assess the drivers and implications of clim...   
1                                          unequivocal   
2    Agriculture, Water, Natural Ecosystems and Bio...   
3    the Himalayan region, the North-Eastern region...   
4    MoEF , Ministry of Earth Sciences, Ministry of...   
..                                                 ...   
99             To overcome the limitation of the model   
100                                                b).   
101                    lack of proper scientifi c data   
102  Because of the limitations of the physical cli...   
103       capturing the orography of the Indian region   

                                       Standard Answer  Similarity  
0    To assess the drivers and implications of clim...    0.763310  
1    The impact of human activities on climate is u...    0.219511  
2    Water resources, agriculture, natural ecosyste...    0.682989  
3    Himalayan region, Nort