The following code is a demo code where I have used T5 transformers to create an answer generation layer post retrieval but it didnt seem much effective in the local computer. I am noting down the methods and explanations of the functions being used in the code.  

In [12]:
#Demo Code
import json
import csv
import nltk
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# Initialize stop words globally
STOP_WORDS = set(stopwords.words('english'))

#Preprocessing text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalnum() and t not in STOP_WORDS]
    return " ".join(tokens)

#Loading data of the json file
def load_data(csv_path, json_path):
    try:
        # Load CSV file
        with open(csv_path, 'r') as f:
            reader = csv.reader(f)
            questions_answers = list(reader)
        
        # Load JSON knowledge base
        with open(json_path, 'r') as f:
            docs = json.load(f)
        
        # Process documents
        documents = []
        urls = []
        for url, content in docs.items():
            text = content['text']
            title = content['title']
            documents.append(preprocess_text(title + ' ' + text))
            urls.append(url)
        
        return questions_answers, documents, urls
    except FileNotFoundError as e:
        print(f"Error: Could not find file - {e}")
        raise
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {e}")
        raise
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
        raise

#Models Initialisation
def initialize_models():
    try:
        # Use smaller embedding model
        retriever_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        
        # Use smaller generation model
        model_name = "bigscience/bloom-560m"  # Much smaller than Dolly
        generator = pipeline(
            "text-generation",
            model=model_name,
            device=-1  # Use CPU
        )
        
        return retriever_model, generator
    except Exception as e:
        print(f"Error initializing models: {e}")
        raise

#Search Engine Function
def retrieve(query, retriever_model, documents, urls, top_n=5):  
    query_embedding = retriever_model.encode([query])
    doc_embeddings = retriever_model.encode(documents)
    
    similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    
    results = []
    for idx in top_indices:
        url = urls[idx]
        score = similarities[idx]
        doc_text = documents[idx][:1000]  # Limit text length
        results.append({
            'url': url,
            'score': score,
            'text': doc_text
        })
    
    return results

#Post Retrieval Answer Generation
def generate_answer(query, retrieved_docs, generator):
    # Creating a shorter context
    context = "\n".join([
        f"Doc {i+1}: {doc['text'][:200]}..."  # Limit context size
        for i, doc in enumerate(retrieved_docs)
    ])
    
    prompt = f"""Question: {query}
    Context: {context}
    Answer:"""
    
    try:
        response = generator(
            prompt,
            max_length=250,  # Reduced length
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )
        
        return response[0]['generated_text']
    except Exception as e:
        return f"Error generating response: {str(e)}"

#Processing of the Query
def process_query(query, retriever_model, generator, documents, urls):
    print(f"Processing query: {query}\n")
    
    try:
        retrieved_docs = retrieve(query, retriever_model, documents, urls)
        
        print("Retrieved Sources:")
        for i, doc in enumerate(retrieved_docs, 1):
            print(f"{i}. {doc['url']} (Score: {doc['score']:.4f})")
        
        answer = generate_answer(query, retrieved_docs, generator)
        
        print("\nGenerated Answer:")
        print(answer)
        
        return {'retrieved_docs': retrieved_docs, 'answer': answer}
    except Exception as e:
        print(f"Error processing query: {e}")
        return None

#Main Execution
if __name__ == "__main__":
    try:
        print("Loading data...")
        csv_path = 'clearfeed_qa_pairs.csv'
        json_path = 'Clearfeed_kb.json'
        
        questions_answers, documents, urls = load_data(csv_path, json_path)
        print("Data loaded successfully")
        
        print("Initializing models...")
        retriever_model, generator = initialize_models()
        print("Models initialized successfully")
        
        query = "How can I integrate ClearFeed with Jira?"
        result = process_query(query, retriever_model, generator, documents, urls)
        
    except Exception as e:
        print(f"Error running the program: {e}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading data...
Data loaded successfully
Initializing models...




Models initialized successfully
Processing query: How can I integrate ClearFeed with Jira?



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Retrieved Sources:
1. https://docs.clearfeed.ai/clearfeed-help-center/integrations/jira-service-management (Score: 0.5828)
2. https://docs.clearfeed.ai/clearfeed-help-center/integrations/jira (Score: 0.5694)
3. https://docs.clearfeed.ai/clearfeed-help-center/answers/indexing-knowledge-sources/other-supported-ks (Score: 0.4308)
4. https://docs.clearfeed.ai/clearfeed-help-center/getting-started/for-internal-support (Score: 0.4244)
5. https://docs.clearfeed.ai/clearfeed-help-center/getting-started/using-clearfeed-with-microsoft-teams/installing-clearfeed-on-teams (Score: 0.4236)

Generated Answer:
Question: How can I integrate ClearFeed with Jira?
    Context: Doc 1: jira service management page integrate atlassian enable ticketing functionalities sync settings tickets individual tickets message sync mode status sync mode helpful edit github https jira service man...
Doc 2: jira page integrate atlassian enable ticketing emoji rules jira integration create tickets create tickets manually u

However, The use of Groq API Key and execution of the code under this method helped generate the accurate answer using the mistral open source model. Also, i went with gemini ai api key process but it was not giving me accurate results and some restrictions too were faced. The following code gives you the top results based on the confidence score and also provides a concrete answer that we can see in the output. 

In [23]:
import json
import csv
import nltk
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
from dotenv import load_dotenv
from typing import List, Dict, Any

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
STOP_WORDS = set(stopwords.words('english'))

load_dotenv()  # Load environment variables from .env


class EnhancedQASystem:
    def __init__(self, groq_api_key: str):
        """Initialize the QA system with Groq API access."""
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.groq_client = Groq(api_key=groq_api_key)
        self.qa_pairs = []
        self.qa_embeddings = None
        self.docs_data = {}

    def load_data(self, json_path: str, csv_path: str) -> None:
        """Load documentation and QA training data."""
        # Load JSON documentation
        with open(json_path, 'r') as f:
            self.docs_data = json.load(f)

        # Load CSV QA pairs
        with open(csv_path, 'r') as f:
            reader = csv.DictReader(f)
            self.qa_pairs = list(reader)

        # Create embeddings for QA pairs
        questions = [pair['question'] for pair in self.qa_pairs]
        self.qa_embeddings = self.embedding_model.encode(questions)

    def preprocess_text(self, text: str) -> List[str]:
        """Preprocess text by lowercasing, tokenizing, and removing stopwords."""
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t.isalnum() and t not in STOP_WORDS]
        return tokens

    def calculate_word_overlap_score(self, query_tokens: List[str], doc_tokens: List[str]) -> float:
        """Calculate word overlap score between query and document."""
        query_counter = Counter(query_tokens)
        doc_counter = Counter(doc_tokens)
        overlap = sum((query_counter & doc_counter).values())
        total = sum(query_counter.values())
        return overlap / total if total > 0 else 0

    def get_semantic_similarity(self, query: str, text: str) -> float:
        """Calculate semantic similarity using sentence embeddings."""
        embeddings = self.embedding_model.encode([query, text])
        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    def rank_documents(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Rank documents based on both word overlap and semantic similarity."""
        query_tokens = self.preprocess_text(query)
        ranked_docs = []

        for url, content in self.docs_data.items():
            # Combine title and text for better matching
            full_content = f"{content['title']} {content['text']}"
            doc_tokens = self.preprocess_text(full_content)

            # Calculate scores
            word_overlap_score = self.calculate_word_overlap_score(query_tokens, doc_tokens)
            semantic_score = self.get_semantic_similarity(query, full_content)

            # Combine scores with adjusted weights
            final_score = (0.3 * word_overlap_score) + (0.7 * semantic_score)

            ranked_docs.append({
                'url': url,
                'title': content['title'],
                'text': content['text'],
                'score': final_score
            })

        ranked_docs.sort(key=lambda x: x['score'], reverse=True)
        return ranked_docs[:top_k]

    def prepare_context(self, ranked_docs: List[Dict[str, Any]], query: str) -> str:
        """Prepare context from top-ranked documents."""
        top_doc = ranked_docs[0]
        context = f"""
Document Title: {top_doc['title']}
Content:
{top_doc['text']}
"""
        prompt = f"""You are a knowledgeable assistant for ClearFeed. Based on the following document, answer the question:

Question: {query}

Document Context:
{context}

Instructions:
1. Provide a clear and concise answer to the question based on the document context.
2. If step-by-step instructions are relevant, number them clearly.
3. Avoid introducing any content not found in the document.

Answer:"""
        return prompt

    def strip_markdown(self, text: str) -> str:
        """Remove Markdown formatting from text."""
        import re
        # Remove headings (e.g., ## Heading)
        text = re.sub(r'^#+\s', '', text, flags=re.MULTILINE)
        # Remove bullet points (e.g., -, *, etc.)
        text = re.sub(r'^[-*]\s', '', text, flags=re.MULTILINE)
        return text.strip()

    def generate_response(self, prompt: str) -> str:
        """Generate response using Groq API with enhanced formatting instructions."""
        try:
            completion = self.groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": """You are a ClearFeed documentation expert. When responding:
                        - Write answers in plain text only, without Markdown or special formatting
                        - Use natural numbering for steps if needed, e.g., 1, 2, 3
                        - Only include information found in the provided context
                        - Clearly state if the information is missing"""
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                model="mixtral-8x7b-32768",
                temperature=0.4,
                max_tokens=2048,
                top_p=0.9
            )
            raw_response = completion.choices[0].message.content.strip()
            return self.strip_markdown(raw_response)
        except Exception as e:
            return f"Error generating response: {str(e)}"

    def process_query(self, query: str) -> Dict[str, Any]:
        """Process a query and return an answer based on the top-ranked document."""
        ranked_docs = self.rank_documents(query, top_k=5)
        if not ranked_docs:
            return {"answer": "No relevant documents found.", "sources": []}

        context = self.prepare_context(ranked_docs, query)
        answer = self.generate_response(context)

        return {
            'answer': answer,
            'sources': [
                {'url': doc['url'], 'title': doc['title'], 'confidence': doc['score']}
                for doc in ranked_docs
            ]
        }


def main():
    groq_api_key = os.getenv('GROQ_API_KEY')
    if not groq_api_key:
        raise ValueError("Please set the GROQ_API_KEY environment variable")
    qa_system = EnhancedQASystem(groq_api_key)
    qa_system.load_data('Clearfeed_kb.json', 'clearfeed_qa_pairs.csv')

    query = "How can I integrate my Confluence account with ClearFeed?"
    result = qa_system.process_query(query)

    print(f"Query: {query}\n")
    print("Top Sources:")
    for i, source in enumerate(result['sources'], 1):
        print(f"{i}. {source['title']}")
        print(f"   URL: {source['url']}")
        print(f"   Confidence: {source['confidence']:.4f}\n")

    print("Answer:")
    print(result['answer'])


if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Query: How can I integrate my Confluence account with ClearFeed?

Top Sources:
1. Confluence
   URL: https://docs.clearfeed.ai/clearfeed-help-center/answers/indexing-knowledge-sources/confluence
   Confidence: 0.8723

2. Jira Service Management
   URL: https://docs.clearfeed.ai/clearfeed-help-center/integrations/jira-service-management
   Confidence: 0.6038

3. Clickup
   URL: https://docs.clearfeed.ai/clearfeed-help-center/integrations/clickup
   Confidence: 0.5926

4. Integrations
   URL: https://docs.clearfeed.ai/clearfeed-help-center/pricing-and-billing/billing/integrations
   Confidence: 0.5898

5. Jira
   URL: https://docs.clearfeed.ai/clearfeed-help-center/integrations/jira
   Confidence: 0.5882

Answer:
To integrate your Confluence account with ClearFeed, follow these steps:

1. Go to the ClearFeed web app and navigate to `Settings` from the nav-bar. Then, proceed to the `Integrations` tab.
2. Within the list of available integrations, locate and select **Confluence**.
3. Press

SO, this was my whole thought process behind the code and I have also considered a lot of other methods which I am unable to include because it will take a whole lot of space. So, please consider this assignment completed and give me your thoughts on it. I have mentioned the requirements in the requirements.txt folder which you can go through. 
