In [37]:
import os
from collections import defaultdict
from bs4 import BeautifulSoup
from typing import List
import json

HTML_FOLDER_PATH = os.path.join(os.getcwd(), 'DATA')

distinct_tags_in_ai_intent_1 = set()
distinct_tags_in_ai_intent_2 = set()

distinct_tags = {
    "1": distinct_tags_in_ai_intent_1,
    "2": distinct_tags_in_ai_intent_2
}

def fetch_tags(tag: BeautifulSoup, all_tags: List):
    if tag.has_attr('ai-intent'):
        if tag.decode_contents().strip() != "":
            ai_intent_value = tag['ai-intent']
            tag_name = tag.name
            distinct_tags.get(ai_intent_value, set()).add(tag_name)
            tag_content = {
                'tag': tag_name,
                'content': tag.text.strip(),
                'ai-intent': ai_intent_value
            }
            all_tags.append(tag_content)
    for child in tag.find_all(recursive=False):
        fetch_tags(child, all_tags)

def extract_content_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        all_tags = []
        fetch_tags(soup, all_tags)
        
        grouped_data = defaultdict(list)
        for item in all_tags:
            tag = item['tag']
            content = item['content']
            content_with_ai_intent = {
                "content": content,
                "ai-intent": item['ai-intent']
            }
            grouped_data[tag].append(content_with_ai_intent)

        return grouped_data
    
def create_ai_intent_files(grouped_data, file_name):
    filtered_data_1 = {"file_name": file_name, "data": {}}
    filtered_data_2 = {"file_name": file_name, "data": {}}
    
    for tag, values in grouped_data.items():
        filtered_data_1["data"][tag] = [item["content"] for item in values if item["ai-intent"] == "1"]
        filtered_data_2["data"][tag] = [item["content"] for item in values if item["ai-intent"] == "2"]
    
    filtered_data_1["data"] = {tag: items for tag, items in filtered_data_1["data"].items() if items}
    filtered_data_2["data"] = {tag: items for tag, items in filtered_data_2["data"].items() if items}
    
    return filtered_data_1, filtered_data_2

def process_html_files_recursively(folder_path):
    ai_intent_1 = []
    ai_intent_2 = []
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            if file_name.endswith('.html'):
                file_path = os.path.join(root, file_name)
                grouped_data = extract_content_from_html(file_path)
                filtered_data_1, filtered_data_2 = create_ai_intent_files(grouped_data, file_name)
                ai_intent_1.append(filtered_data_1)
                ai_intent_2.append(filtered_data_2)
                
    with open('ai-intent_1.json', 'w', encoding='utf-8') as json_file:
        json.dump(ai_intent_1, json_file, indent=4, ensure_ascii=False)
        
    with open('ai-intent_2.json', 'w', encoding='utf-8') as json_file:
        json.dump(ai_intent_2, json_file, indent=4, ensure_ascii=False)
                
process_html_files_recursively(HTML_FOLDER_PATH)

In [40]:
distinct_tags['2']

{'div', 'li', 'ol', 'p', 'section', 'span', 'table', 'tbody', 'td', 'th', 'ul'}

In [69]:
with open("ai-intent_1.json", "r") as file:
    ai_intent_1_file = json.load(file)
    
with open("ai-intent_2.json", "r") as file:
    ai_intent_2_file = json.load(file)

def get_contexts():           
    priorities = {
        "1": ["h1", "h2", "h3", "ul", "ol", "li"],
        "2": ["p", "table", "note"]
    }     
    
    total_contents = []
    for ai_intent_value, tag_list in priorities.items():
        for tag in tag_list:
            if tag in distinct_tags[ai_intent_value]:
                content = read_tag_contents(tag, ai_intent_value)
                total_contents.extend(content)
                
    grouped_contents = defaultdict(list)
    
    for obj in total_contents:
        file_name, tag, content = list(obj.values())
        grouped_contents[file_name].append({"tag": tag, "content": content})
    
    result = [{"file_name": file_name, "total_contents": contents} for file_name, contents in grouped_contents.items()]
    
    return result

def read_tag_contents(tag_name, ai_intent_value):
    ai_intent_file_name = ai_intent_1_file if ai_intent_value == "1" else ai_intent_2_file
    
    return [{"file_name": obj['file_name'], "tag": tag, "content": content} 
            for obj in ai_intent_file_name 
            for tag, content_list in obj['data'].items() 
            if tag == tag_name 
            for content in content_list
            ]

AVAILABLE_CONTEXTS = get_contexts()
AVAILABLE_CONTEXTS

[{'file_name': 'Changing_Password.html',
  'total_contents': [{'tag': 'h1', 'content': 'Changing Password'}]},
 {'file_name': 'Retrieving_Password.html',
  'total_contents': [{'tag': 'h1', 'content': 'Retrieving Password'}]},
 {'file_name': 'Modifying_User_Preferences.html',
  'total_contents': [{'tag': 'h1', 'content': 'Modifying User Preferences'}]},
 {'file_name': 'Engage_Home.html',
  'total_contents': [{'tag': 'h1', 'content': 'Engage Home'},
   {'tag': 'p',
    'content': 'The Home page displays the\n                    list of all available campaigns with their statuses and enables the agency users to view a summary\n                    of the campaign details. The project campaigns have the following statuses:'},
   {'tag': 'p',
    'content': 'The Search campaigns field\n                    enables you to search for appropriate campaigns and view the details of projects associated with\n                    them.'},
   {'tag': 'p',
    'content': 'The Sort by drop-down list\n  

In [37]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from functools import wraps

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_filtered_keywords(user_query):
    query_embedding = model.encode([user_query])
    keywords_embeddings = model.encode([keyword['content'] for keyword in AVAILABLE_CONTEXTS])

    similarities = cosine_similarity(query_embedding, keywords_embeddings)

    similarity_pairs = list(zip(AVAILABLE_CONTEXTS, similarities[0]))

    sorted_keywords = sorted(similarity_pairs, key=lambda x: x[1], reverse=True)
    
    threshold = 0.5
    filtered_keywords = [keyword for keyword, score in sorted_keywords if score > threshold]
    
    return filtered_keywords

def clean_text(text):
    text = text.replace('\n', ' ').strip()
    text = ' '.join(text.split())
    return text

def clean_text_decorator(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        return clean_text(result)
    return wrapper

@clean_text_decorator
def get_relevent_context_from_files(contexts):
    data = []
    file_names = [context['file_name']  for context in contexts]
    
    body_contents = "/n/n".join([obj['data']['body'][0] for obj in ai_intent_1_file if obj['file_name'] in file_names])
    return body_contents

def get_contexts_wrapper(user_query):
    filtered_keywords = get_filtered_keywords(user_query)
    print(filtered_keywords)
    contexts = get_relevent_context_from_files(filtered_keywords)
    return contexts
    

In [38]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate

repo_id="mistralai/Mistral-7B-Instruct-v0.3"

prompt = """Answer the question based on the following context. If you don't know the answer, just say that you don't know, don't try to make up an answer. Ensure the answer is concise and to the point.

###Context: {context}
###Question: {question}

###Helpful Answer:"""
prompt = PromptTemplate(template=prompt, input_variables=["context", "question"])

llm = HuggingFaceEndpoint(repo_id=repo_id,temperature=0.01,huggingfacehub_api_token='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')

user_query = "user preference"
contexts = get_contexts_wrapper(user_query)

formatted_prompt = prompt.format(context=contexts, question=user_query)

response = llm(formatted_prompt)
response

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/metapercept/.cache/huggingface/token
Login successful
[{'file_name': 'Modifying_User_Preferences.html', 'content': 'Modifying User Preferences'}, {'file_name': 'User_Management.html', 'content': 'User Management'}]


'\n\nIn the given context, User Preferences is a page where you can update personal information of your account such as First Name, Last Name, Department, and Designation. You cannot change the email address.'