In [252]:

import os
from collections import defaultdict
from bs4 import BeautifulSoup
from typing import List
import json

HTML_FOLDER_PATH = os.path.join(os.getcwd(), 'DATA')

distinct_tags_in_ai_intent_1 = set()
distinct_tags_in_ai_intent_2 = set()

distinct_tags = {
    "1": distinct_tags_in_ai_intent_1,
    "2": distinct_tags_in_ai_intent_2
}

priorities = {
        "1": [["h1"], ["h2", "h3", "ul", "ol", "li"]],
        "2": [["p", "table", "note"]]
}    

def fetch_tags(tag: BeautifulSoup, all_tags: List):
    if tag.has_attr('ai-intent'):
        if tag.decode_contents().strip() != "":
            ai_intent_value = tag['ai-intent']
            tag_name = tag.name
            distinct_tags.get(ai_intent_value, set()).add(tag_name)
            tag_content = {
                'tag': tag_name,
                'content': tag.text.strip(),
                'ai-intent': ai_intent_value
            }
            all_tags.append(tag_content)
    for child in tag.find_all(recursive=False):
        fetch_tags(child, all_tags)

def extract_content_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        all_tags = []
        fetch_tags(soup, all_tags)
        
        grouped_data = defaultdict(list)
        for item in all_tags:
            tag = item['tag']
            content = item['content']
            content_with_ai_intent = {
                "content": content,
                "ai-intent": item['ai-intent']
            }
            grouped_data[tag].append(content_with_ai_intent)

        return grouped_data
    
def create_ai_intent_files(grouped_data, file_name):
    filtered_data_1 = {"file_name": file_name, "data": {}}
    filtered_data_2 = {"file_name": file_name, "data": {}}
    
    for tag, values in grouped_data.items():
        filtered_data_1["data"][tag] = [item["content"] for item in values if item["ai-intent"] == "1"]
        filtered_data_2["data"][tag] = [item["content"] for item in values if item["ai-intent"] == "2"]
    
    filtered_data_1["data"] = {tag: items for tag, items in filtered_data_1["data"].items() if items}
    filtered_data_2["data"] = {tag: items for tag, items in filtered_data_2["data"].items() if items}
    
    return filtered_data_1, filtered_data_2

def process_html_files_recursively(folder_path):
    ai_intent_1 = []
    ai_intent_2 = []
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            if file_name.endswith('.html'):
                file_path = os.path.join(root, file_name)
                grouped_data = extract_content_from_html(file_path)
                filtered_data_1, filtered_data_2 = create_ai_intent_files(grouped_data, file_name)
                ai_intent_1.append(filtered_data_1)
                ai_intent_2.append(filtered_data_2)
                
    with open('ai-intent_1.json', 'w', encoding='utf-8') as json_file:
        json.dump(ai_intent_1, json_file, indent=4, ensure_ascii=False)
        
    with open('ai-intent_2.json', 'w', encoding='utf-8') as json_file:
        json.dump(ai_intent_2, json_file, indent=4, ensure_ascii=False)
                
process_html_files_recursively(HTML_FOLDER_PATH)

In [253]:
ALL_DISTINCT_TAGS = list(distinct_tags_in_ai_intent_1) + list(distinct_tags_in_ai_intent_2)
ALL_DISTINCT_TAGS

['div',
 'body',
 'h1',
 'article',
 'td',
 'li',
 'section',
 'tbody',
 'ol',
 'span',
 'th',
 'ul',
 'p',
 'table',
 'div']

In [254]:
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

def encode_text(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

cosine_sim_score = lambda embedding1, embedding2: torch.nn.functional.cosine_similarity(embedding1, embedding2).item()

with open("ai-intent_1.json", "r") as file:
    ai_intent_1_file = json.load(file)
    
with open("ai-intent_2.json", "r") as file:
    ai_intent_2_file = json.load(file)
    
def get_contexts():           
    total_contents = []
    for _, tag_list in priorities.items():
        for tag_l in tag_list:
            for tag in tag_l:
                if tag in ALL_DISTINCT_TAGS:
                    ai_intent_file_name = ai_intent_1_file if tag in distinct_tags_in_ai_intent_1 else ai_intent_2_file
                    content = read_tag_contents(tag, ai_intent_file_name)
                    total_contents.extend(content)
                
    grouped_contents = defaultdict(list)

    for obj in total_contents:
        tag, file_name, content = list(obj.values())
        grouped_contents[tag].append({"file_name": file_name, "content": content, "encoding": encode_text(content)})
        
    result = [{"tag": tag, "total_contents": contents} for tag, contents in grouped_contents.items()]
    
    return result

def read_tag_contents(tag_name, ai_intent_file_name):
    return [{"tag": tag,"file_name": obj['file_name'], "content": content} 
            for obj in ai_intent_file_name 
            for tag, content_list in obj['data'].items() 
            if tag == tag_name 
            for content in content_list
            ]

AVAILABLE_CONTEXTS = get_contexts()
# AVAILABLE_CONTEXTS



In [255]:
from functools import wraps

def get_filtered_file_names(user_query: str) -> List:
    query_embedding = encode_text(user_query)

    threshold = 0.6
    
    for _, tags in priorities.items():
        for tag_list in tags:
            for tag in tag_list:
                relevant_contexts = [ctx for ctx in AVAILABLE_CONTEXTS if ctx['tag'] == tag]
                if relevant_contexts:
                    for keyword in relevant_contexts:
                        for context_obj in keyword['total_contents']:
                            file_name = [context_obj['file_name']]
                            encoding = context_obj['encoding']
                            
                            similarities = cosine_sim_score(query_embedding, encoding)
                            similarity_pairs = list(zip(file_name, [similarities]))

                            for file_name, score in similarity_pairs:
                                if score >= threshold:
                                    return file_name, 
                            print("*"*50)

    return []

def clean_text(text):
    text = text.replace('\n', ' ').strip()
    text = ' '.join(text.split())
    return text

def clean_text_decorator(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        return clean_text(result)
    return wrapper

@clean_text_decorator
def get_relevent_context_from_files(file_names):
    body_contents = "/n/n".join([obj['data']['body'][0] for obj in ai_intent_1_file if obj['file_name'] in file_names])
    return body_contents

def get_contexts_wrapper(user_query):
    filtered_file_names = get_filtered_file_names(user_query)
    print(filtered_file_names)
    contexts = get_relevent_context_from_files(filtered_file_names)
    return contexts

In [256]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate

repo_id="mistralai/Mistral-7B-Instruct-v0.3"

prompt = """Answer the question based on the following context. If you don't know the answer, just say that you don't know, don't try to make up an answer. Ensure the answer is concise and to the point.

###Context: {context}
###Question: {question}

###Helpful Answer:"""
prompt = PromptTemplate(template=prompt, input_variables=["context", "question"])

llm = HuggingFaceEndpoint(repo_id=repo_id,temperature=0.01,huggingfacehub_api_token='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')

user_query = ["initials of your username", "change password", "draft status"]
contexts = get_contexts_wrapper(user_query[2])

if contexts.strip() == "":
    print("No relevant context found.")
else:
    formatted_prompt = prompt.format(context=contexts, question=user_query[2])

    response = llm(formatted_prompt)
    print(response)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/metapercept/.cache/huggingface/token
Login successful
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
**************************************************
******************************

In [228]:
short_phrase = "username initials"
long_text = """In the Home page, click the username initials of your username at the top
                right corner of the page.
From the drop-down list,
                click User Preferences.
The User Preferences page is
                displayed.

Click Change Password.
The Change Password dialog box
                is displayed.

In the OLD PASSWORD field, enter the current password.
In the NEW PASSWORD field, enter a new password.
In the CONFIRM PASSWORD field, enter the same password you entered
                in the NEW PASSWORD field.
Click Save."""
same_phrase = "username initials"
similar_text = "initials of your username"
not_similar = "yoyo preference"

# cosine_similarity(model.encode([short_phrase]), model.encode([similar_text]))

array([[0.88722336]], dtype=float32)

In [229]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([short_phrase, not_similar])
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(cosine_sim)

[[0.]]


In [249]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model1 = AutoModel.from_pretrained("bert-base-uncased")

def encode_text(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model1(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

embedding1 = encode_text('draft status')
embedding2 = encode_text('Changing Password')
cosine_sim = torch.nn.functional.cosine_similarity(embedding1, embedding2)
print(cosine_sim.item())



0.6138250231742859


In [218]:
from rank_bm25 import BM25Okapi

corpus = short_phrase.splitlines()
bm25 = BM25Okapi([doc.split() for doc in corpus])
scores = bm25.get_scores(similar_text.split())
best_match_index = scores.argmax()
best_match_score = scores[best_match_index]
print(best_match_score)
corpus[best_match_index]


-0.5493061443340548


'username initials'