In [30]:
pip install ctransformers

Note: you may need to restart the kernel to use updated packages.


In [31]:
  !pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain==0.0.240 \
  xformers==0.0.20 \
  bitsandbytes==0.41.0\
  InstructorEmbedding

In [32]:
pip install langdetect googletrans==4.0.0-rc1

Note: you may need to restart the kernel to use updated packages.


In [71]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain.chat_models import ChatOpenAI
from transformers import AutoTokenizer
from dotenv import load_dotenv
import os
import timeit
import sys
import pinecone
from langdetect import detect
from googletrans import Translator

In [72]:
load_dotenv()
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY','')
PINECONE_API_ENV=os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [85]:
file_path = 'french.txt'

In [86]:
documents = []

# Open the file for reading
with open(file_path, 'r', encoding='latin-1') as file:
    # Read each line (document) from the file
    for line in file:
        # Append the line (document) to the list of documents, removing leading/trailing whitespace
        documents.append(line.strip())

In [87]:
input_text = "\n".join(documents)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Truncate the input text to fit within the allowed token limit
max_tokens = 512
tokenized_text = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_tokens, padding="max_length")
input_ids = tokenized_text["input_ids"]
input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

In [88]:
# Assuming 'documents' is a list of strings from the text file

# Initialize a list to store the split documents
split_docs = []

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

# Split each line/document into smaller chunks
for document in documents:
    split_chunks = text_splitter.split_text(input_text)
    split_docs.extend(split_chunks)

# Now, 'split_docs' contains the smaller text chunks


In [89]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [90]:
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name="railway-chatbot"

In [91]:
docsearch = Pinecone.from_texts(split_docs, embeddings, index_name=index_name)

In [92]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that I don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [93]:
chain_type_kwargs={"prompt": PROMPT}

llm=CTransformers(model="llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [94]:
qa=RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={'k': 2}),return_source_documents=True, chain_type_kwargs=chain_type_kwargs)

In [95]:
translator = Translator()

In [96]:
while True:
    user_input = input("\nInput Prompt: ")
    if user_input.lower() == 'exit':
        print('Exiting')
        sys.exit()
    if user_input == '':
        continue

    # Detect the language of the user input
    query_language = detect(user_input)

    # Detect the language of the documents (assuming all documents are in the same language)
    doc_language = detect(documents[0]) if documents else 'en'

    # Translate the user query to the document language if they differ
    if query_language != doc_language:
        translated_query = translator.translate(user_input, src=query_language, dest=doc_language).text
    else:
        translated_query = user_input

    # Get the result from the QA system
    result = qa({"query": translated_query})
    
    #print("\nResponse in doc language: ", result["result"])

    # Translate the result back to the user's language if needed
    if query_language != doc_language:
        translated_result = translator.translate(result["result"], src=doc_language, dest=query_language).text
    else:
        translated_result = result["result"]
    
    
    print("\nFinal Response: ", translated_result)


Input Prompt: Auswirkungen des Verlusts der Artenvielfalt?

Final Response:  Der Verlust der biologischen Vielfalt kann mehrere negative Auswirkungen auf Ökosysteme haben, darunter:

1. Verlust von Ökosystemleistungen: Die Biodiversität bietet eine Reihe wichtiger Ökosystemleistungen wie Bestäubung, Schädlingsbekämpfung und Nährstoffkreislauf.Ohne die biologische Vielfalt können diese Dienste in der Qualität verloren gehen oder reduziert werden.
2. Reduzierte Widerstandsfähigkeit gegenüber Umweltveränderungen: Wenn ein Ökosystem unterschiedlich ist, kann es mit anderen Umweltveränderungen wie Entwürfen oder extremen Wetterereignissen in der Lage sind.Der Verlust der biologischen Vielfalt kann die Widerstandsfähigkeit eines Ökosystems verringern und es für diese Änderungen motiviert werden.
3. Verringerte Ernährungssicherheit: Viele Pflanzen und Viehzucht stützen sich auf die Biodiversität für Bestäubung, Schädlingsbekämpfung und andere Ökosystemdienste.Der Verlust der biologischen Vie

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
