In [1]:

import warnings
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
import requests
import gradio as gr

import os
from flask import Flask, request, jsonify

from langchain.schema import Document

In [2]:

# Filesystem layout used by this notebook (all paths are relative)
MODEL_DIR = 'models'
DATA_DIR = 'data'
UTILS_DIR = 'utils'
EMBEDDINGS_DIR = os.path.join(MODEL_DIR, 'embeddings')
QA_MODEL_DIR = os.path.join(MODEL_DIR, 'qa')
WP_POSTS_DIR = os.path.join(DATA_DIR, 'wordpress_posts')

# Make sure every working directory is present; existing ones are left untouched
for _required_dir in (EMBEDDINGS_DIR, QA_MODEL_DIR, WP_POSTS_DIR, UTILS_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Dataset, model identifiers and the site whose posts get indexed
DATASET_NAME = "databricks/databricks-dolly-15k"
PAGE_CONTENT_COLUMN = "context"  # dataset column used as each document's text
EMBEDDING_MODEL_PATH = "sentence-transformers/all-MiniLM-l6-v2"
QA_MODEL_NAME = "Intel/dynamic_tinybert"
BASE_URL = "https://techcrunch.com"



In [3]:

# Flask application object that the route decorators below attach to
app = Flask(import_name=__name__)

In [4]:

# Load the HuggingFace dataset, using PAGE_CONTENT_COLUMN as each document's text
loader = HuggingFaceDatasetLoader(
    path=DATASET_NAME,
    page_content_column=PAGE_CONTENT_COLUMN,
)
data = loader.load()



In [5]:

# Split the loaded documents into chunks of at most 1000 characters,
# with 150 characters of overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1000,
)
docs = text_splitter.split_documents(documents=data)


In [6]:

# Embedding model (run on CPU, vectors not normalized) and the FAISS index built from the chunks
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_PATH,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)
db = FAISS.from_documents(documents=docs, embedding=embeddings)


  warn_deprecated(


In [10]:

# Load tokenizer and weights for QA_MODEL_NAME and combine them into a question-answering pipeline
tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_NAME)
question_answerer = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    return_tensors='pt',
)

# Wrap the pipeline so it can be handed to a LangChain chain as `llm`.
# NOTE(review): LangChain documents HuggingFacePipeline for text-generation-style
# tasks; confirm it accepts a question-answering pipeline before invoking a chain
# built on `llm`. The Flask handlers in this notebook call `question_answerer` directly.
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)


In [8]:

# Retriever over the vector store (k=4 results per query) and a "refine" RetrievalQA chain on top of it
retriever = db.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=retriever,
    return_source_documents=False,
)


In [11]:
# Define function to fetch posts from a WordPress site
def fetch_wordpress_posts(base_url, count=15, post_type='posts', timeout=10):
    """Fetch items from a WordPress site's REST API.

    Args:
        base_url: Site root, e.g. ``"https://example.com"``. A trailing slash
            is tolerated and removed before the endpoint path is appended.
        count: Value sent as the ``per_page`` query parameter.
        post_type: Path segment appended after ``/wp-json/wp/v2/``.
        timeout: Seconds handed to ``requests.get``. Without a timeout an
            unresponsive host would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Normalize the root so "https://site/" does not yield "https://site//wp-json/..."
    url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{post_type}"
    response = requests.get(url, params={'per_page': count}, timeout=timeout)
    response.raise_for_status()
    return response.json()

# Pull posts from BASE_URL, wrap each post's rendered content in a Document,
# chunk them with the shared splitter and append the chunks to the vector store.
# NOTE(review): `content.rendered` is presumably HTML markup — confirm whether tags should be stripped before indexing.
wp_data = fetch_wordpress_posts(BASE_URL)
wp_docs = [
    Document(page_content=post['content']['rendered'])
    for post in wp_data
]
wp_docs_split = text_splitter.split_documents(wp_docs)
db.add_documents(wp_docs_split)


['def1769c-7ce8-4f4c-9509-7ad1bd57a4a6',
 '4c910a86-3df5-434d-98ef-271db56b5260',
 'e4836dca-319c-4b41-8a36-868ee1557522',
 '0291d74e-f241-4ea0-bc9e-5c15308df9b0',
 '688a2109-8184-40ce-8374-e973adfac560',
 '88e99fb4-da21-436c-a08d-2952912ea4ba',
 '69c9f52a-845d-4e0a-964b-043aa2a0dc92',
 'e832ce41-766a-47ef-92d0-edce522d5aed',
 '1af58cbd-1125-40c3-a7bf-c07b2a5e6340',
 '90255f01-224d-4d8e-97e5-af8156c7e754',
 '58ae9dc3-a9e0-48f9-9495-a69c88cd7b05',
 '5c50b24f-a9c3-4116-b8c5-fd65664025dd',
 '4436ca60-6bc6-445c-ac15-b73be3c8e339',
 '8fea0c7f-3acb-408e-8389-6dc4cb39b36e',
 'd9b43823-c1d9-4fdb-b8b7-bca87bb408fa',
 'c28997ad-5382-498f-b45a-ab3e1b9f2146',
 '2162fd47-3452-4237-9108-ad23c433c03c',
 '6618fa64-0c7a-41a7-b018-40ae9ae5e7a9',
 '118622fc-beea-434a-8c43-e98a6bea231a',
 '835e747c-859b-49ef-a660-cbf629ef524c',
 '49667368-f267-4a54-bc18-d21a6af9a283',
 '1f118db2-dc37-4d6c-b0e8-b3ce5f181d54',
 'c17334a8-c112-4fdf-9c31-c3e0bce705aa',
 '1f83bffa-d968-4bb0-99a3-7bbf99e9f536',
 'df5db8c5-badd-

In [12]:

# Define function to process user queries
def answer_question(final_question):
    """Answer a question from the text of the most similar stored chunks.

    The chunks returned by ``db.similarity_search`` are joined with single
    spaces into one context string, which is passed together with the
    question to ``question_answerer``.

    Args:
        final_question: The question text to search for and answer.

    Returns:
        The ``'answer'`` entry of the pipeline's result.
    """
    matches = db.similarity_search(final_question)
    passages = (match.page_content for match in matches)
    result = question_answerer(
        {"question": final_question, "context": " ".join(passages)}
    )
    return result['answer']


In [13]:

@app.route('/chat', methods=['POST'])
def chat():
    """Answer the question sent in a JSON body of the form ``{"query": "..."}``.

    Returns:
        ``{"response": <answer>}`` on success, or ``{"error": ...}`` with
        HTTP 400 when the body is not a JSON object or ``query`` is missing,
        not a string, or blank.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON body
    payload = request.get_json(silent=True)
    user_query = payload.get('query') if isinstance(payload, dict) else None
    # Reject unusable input here so it never reaches the embedding / QA models
    if not isinstance(user_query, str) or not user_query.strip():
        return jsonify(error="Request body must be JSON with a non-empty 'query' string."), 400
    answer = answer_question(user_query)
    return jsonify(response=answer)


In [18]:
# '/retrieve' is registered alongside the original '/retrievee' path: the server
# log captured in this notebook shows clients POSTing to '/retrieve', which would
# otherwise return 404 on a fresh kernel.
@app.route('/retrieve', methods=['POST'])
@app.route('/retrievee', methods=['POST'])
def retrievee():
    """Return the stored chunks most similar to the ``query`` in the JSON body.

    Returns:
        ``{"docs": [{"content": <chunk text>}, ...]}`` on success, or
        ``{"error": ...}`` with HTTP 400 when the body is not a JSON object or
        ``query`` is missing, not a string, or blank.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON body
    payload = request.get_json(silent=True)
    query = payload.get('query') if isinstance(payload, dict) else None
    # Reject unusable input here so it never reaches the vector store
    if not isinstance(query, str) or not query.strip():
        return jsonify(error="Request body must be JSON with a non-empty 'query' string."), 400
    wp_search_docs = db.similarity_search(query)
    retrieved_docs = [{"content": doc.page_content} for doc in wp_search_docs]
    return jsonify(docs=retrieved_docs)

In [19]:

if __name__ == '__main__':
    # Start Flask's built-in server on port 5000. Host 0.0.0.0 listens on every
    # network interface, so the endpoints are reachable from other machines.
    app.run(port=5000, host='0.0.0.0')


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.145.208:5000
Press CTRL+C to quit
127.0.0.1 - - [02/Jul/2024 16:25:43] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [02/Jul/2024 16:25:44] "HEAD / HTTP/1.1" 404 -
127.0.0.1 - - [02/Jul/2024 16:25:44] "HEAD / HTTP/1.1" 404 -
127.0.0.1 - - [02/Jul/2024 16:25:44] "HEAD / HTTP/1.1" 404 -
127.0.0.1 - - [02/Jul/2024 16:25:45] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [02/Jul/2024 16:25:45] "HEAD / HTTP/1.1" 404 -
127.0.0.1 - - [02/Jul/2024 16:25:45] "HEAD / HTTP/1.1" 404 -
127.0.0.1 - - [02/Jul/2024 16:26:35] "OPTIONS /chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Jul/2024 16:26:58] "OPTIONS /chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Jul/2024 16:29:09] "OPTIONS /chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Jul/2024 16:33:26] "POST /chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Jul/2024 16:33:28] "POST /retrieve HTTP/1.1" 200 -
127.0.0.1 - - [02/Jul/2024 17:05:22] "POST /chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Jul/20