<a href="https://colab.research.google.com/github/saifullahAnsari0001/assignment/blob/main/citations_from_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Identifying Citations from API Responses**

In [8]:
import json
import requests
from sentence_transformers import SentenceTransformer, util

#### **Begin by testing the code on a sample dataset to verify its correctness. Once the results are confirmed to be satisfactory, extend the logic to process the entire dataset. This incremental approach ensures accuracy and reliability throughout the process.**

In [9]:
def load_model():
    """Instantiate the sentence-embedding model used for citation matching.

    Returns:
        The ``SentenceTransformer`` built from the 'paraphrase-MiniLM-L6-v2'
        checkpoint, or ``None`` if construction raised; in that case the
        error is printed rather than propagated.
    """
    checkpoint_name = 'paraphrase-MiniLM-L6-v2'
    try:
        return SentenceTransformer(checkpoint_name)
    except Exception as load_error:
        # Report the failure and fall through to the None sentinel so the
        # caller can decide to stop.
        print("Error loading the model:", load_error)
    return None

def identify_citations(response_text, sources, model, threshold):
    """Return the sources whose context is semantically close to the response.

    Args:
        response_text: The response string to attribute.
        sources: Sequence of dicts, each providing 'id', 'context' and
            'link' keys. May be empty or None.
        model: Object whose ``encode(..., convert_to_tensor=True)`` yields
            embeddings accepted by ``util.pytorch_cos_sim``.
        threshold: A source is cited only when its cosine similarity to the
            response is strictly greater than this value.

    Returns:
        A list of ``{'id': ..., 'link': ...}`` dicts, in the same order as
        ``sources``; empty when no source clears the threshold or when there
        are no sources at all.
    """
    # Without sources there is nothing to encode or score, so return early
    # instead of handing the model an empty batch.
    if not sources:
        return []

    # Embed the response once and all source contexts in a single batch.
    response_embedding = model.encode(response_text, convert_to_tensor=True)
    context_embeddings = model.encode(
        [source['context'] for source in sources], convert_to_tensor=True
    )

    # The response is a single text, so only row 0 of the similarity matrix
    # is read; its entries line up with ``sources``.
    similarities = util.pytorch_cos_sim(response_embedding, context_embeddings)

    return [
        {'id': source['id'], 'link': source['link']}
        for source, score in zip(sources, similarities[0].tolist())
        if score > threshold
    ]

# Example usage
def main():
    """Run the citation matcher on one hand-written response and three sources.

    Prints the matched citations; prints nothing further and returns early if
    the model could not be loaded.
    """
    model = load_model()
    if model is None:
        return

    # Sample response to attribute.
    response_text = "Yes, we offer online delivery services through major platforms like Swiggy and Zomato. You can also reserve a table directly from our website if you are planning to have breakfast!"

    # Candidate sources the response may have drawn from.
    sources = [
        dict(
            id="71",
            context="Order online Thank you for your trust in us! We are available on all major platforms like zomato, swiggy. You can also order directly from our website",
            link="https://orders.brikoven.com",
        ),
        dict(
            id="75",
            context="Do you give franchise if the brand No, we currently don't offer franchise opportunities for BrikOven! Although do feel free to drop in an email at theteam@brikoven.com so we can get in touch with you at a later stage if we do decide to give out franchisees'",
            link="",
        ),
        dict(
            id="8",
            context="Breakfast Reservations\r For Breakfast, we recommend making reservations in advance. Reservation is only available through our website",
            link="https://www.brikoven.com/reservations",
        ),
    ]

    # Minimum cosine similarity a source must exceed to be cited.
    similarity_cutoff = 0.5

    print("Citations:", identify_citations(response_text, sources, model, similarity_cutoff))


# Run the demo when this cell is executed.
if __name__ == "__main__":
    main()


Citations: [{'id': '71', 'link': 'https://orders.brikoven.com'}, {'id': '8', 'link': 'https://www.brikoven.com/reservations'}]


#### ***After confirming the effectiveness of the code on a sample dataset, extend the same logic to process the entire dataset.***

#### **Begin by fetching data from the specified API endpoints and storing it locally. This initial step sets the foundation for further processing and analysis.**

In [10]:
def fetch_data(api_url, timeout=30):
    """Fetch every page of results from a paginated API endpoint.

    Pages are requested sequentially with a ``page`` query parameter starting
    at 1. Each response body is expected to be JSON shaped like
    ``{"data": {"data": [...], "next_page_url": ...}}``; the inner list is
    appended to the result and paging continues while ``next_page_url`` is
    truthy.

    Args:
        api_url: Endpoint URL to query.
        timeout: Seconds to wait on each request, passed to ``requests.get``.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        A list with the items from all pages. An empty list is returned if
        any request fails, comes back with an error status, or carries a body
        that is not valid JSON; the error is printed and items from earlier
        pages are discarded.
    """
    all_data = []
    page = 1

    try:
        while True:
            response = requests.get(api_url, params={'page': page}, timeout=timeout)
            response.raise_for_status()
            payload = response.json()

            # Stop as soon as the body no longer has the expected nesting;
            # the isinstance checks keep an unexpected null/list payload from
            # raising TypeError on the membership test.
            page_info = payload.get('data') if isinstance(payload, dict) else None
            if not isinstance(page_info, dict) or 'data' not in page_info:
                break

            all_data.extend(page_info['data'])

            # A missing or empty next_page_url marks the last page.
            if not page_info.get('next_page_url'):
                break
            page += 1
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print("Error fetching data:", e)
        return []

    return all_data


# Endpoint queried page by page for messages and their candidate sources.
api_endpoint = "https://devapi.beyondchats.com/api/get_message_with_sources"

# Download everything up front, then report how much came back.
all_data = fetch_data(api_endpoint)
print("Total number of items fetched:", len(all_data))

# Keep a local copy so later cells can read the data back from disk.
with open('fetched_data.json', 'w') as f:
    f.write(json.dumps(all_data, indent=2))
print("Data has been saved to fetched_data.json")

Total number of items fetched: 124
Data has been saved to fetched_data.json


In [11]:
# Read the saved copy back from disk and pretty-print it for inspection.
with open('fetched_data.json', 'r') as f:
    saved_data = json.loads(f.read())

saved_data_pretty = json.dumps(saved_data, indent=2)
print(saved_data_pretty)

[
  {
    "id": 1,
    "response": "Yes, we offer online delivery services through major platforms like Swiggy and Zomato. You can also order directly from our website!",
    "source": [
      {
        "id": "71",
        "context": "Order online Thank you for your trust in us! We are available on all major platforms: [Order online Order directly from our website](https://orders.brikoven.com), [Order from Swiggy](https://www.swiggy.com/direct/brand/7389?source=swiggy-direct&subSource=generic), [Order from zomato](https://www.zomato.com/bangalore/delivery?chain=18224650)",
        "link": ""
      },
      {
        "id": "8",
        "context": "Breakfast Reservations\r For Breakfast, we recommend making reservations in advance. \r For walk-ins, we only seat parties on a first come, first served basis. \r Your reservation is confirmed upon you filling the form. \r Done reserving? Check out the menu below! \r https://www.brikoven.com/breakfastmenu \r \r Did you know you can order your 

#### **Now, aim to determine if each response corresponds to any of the provided sources across the entire dataset. This step involves comparing response texts with the context of each source to identify potential matches.**

In [12]:
def load_model():
    """Instantiate the sentence-embedding model used for citation matching.

    Returns:
        The ``SentenceTransformer`` built from the 'paraphrase-MiniLM-L6-v2'
        checkpoint, or ``None`` if construction raised; in that case the
        error is printed rather than propagated.
    """
    checkpoint_name = 'paraphrase-MiniLM-L6-v2'
    try:
        return SentenceTransformer(checkpoint_name)
    except Exception as load_error:
        # Report the failure and fall through to the None sentinel so the
        # caller can decide to stop.
        print("Error loading the model:", load_error)
    return None

def identify_citations(response_text, sources, model, threshold=0.5):
    """Return the sources whose context is semantically close to the response.

    Args:
        response_text: The response string to attribute.
        sources: Sequence of dicts, each providing 'id', 'context' and
            'link' keys. May be empty or None.
        model: Object whose ``encode(..., convert_to_tensor=True)`` yields
            embeddings accepted by ``util.pytorch_cos_sim``.
        threshold: A source is cited only when its cosine similarity to the
            response is strictly greater than this value. Defaults to 0.5.

    Returns:
        A list of ``{'id': ..., 'link': ...}`` dicts, in the same order as
        ``sources``; empty when no source clears the threshold or when there
        are no sources at all.
    """
    # Without sources there is nothing to encode or score, so return early
    # instead of handing the model an empty batch.
    if not sources:
        return []

    # Embed the response once and all source contexts in a single batch.
    response_embedding = model.encode(response_text, convert_to_tensor=True)
    context_embeddings = model.encode(
        [source['context'] for source in sources], convert_to_tensor=True
    )

    # The response is a single text, so only row 0 of the similarity matrix
    # is read; its entries line up with ``sources``.
    similarities = util.pytorch_cos_sim(response_embedding, context_embeddings)

    return [
        {'id': source['id'], 'link': source['link']}
        for source, score in zip(sources, similarities[0].tolist())
        if score > threshold
    ]

def process_data(data, model, threshold=0.6):
    """Identify citations for every response in a dataset.

    Args:
        data: Iterable of dicts; each is read for 'id', 'response' and
            'source' keys, all of which may be absent.
        model: Embedding model forwarded to ``identify_citations``.
        threshold: Similarity cut-off forwarded to ``identify_citations``.
            NOTE(review): this default (0.6) differs from the 0.5 default on
            ``identify_citations`` -- confirm which one is intended.

    Returns:
        A list with one ``{'id': ..., 'citations': [...]}`` dict per input
        item, in input order. Items without sources get an empty citation
        list.
    """
    processed_data = []

    for item in data:
        # ``or`` also normalizes explicit nulls from the JSON, not just
        # missing keys.
        response_text = item.get('response') or ''
        sources = item.get('source') or []

        # An item without sources has nothing to cite; skip the model call so
        # one such item cannot abort the whole run.
        if sources:
            citations = identify_citations(response_text, sources, model, threshold)
        else:
            citations = []

        processed_data.append({'id': item.get('id'), 'citations': citations})

    return processed_data

def main():
    """Match citations for every saved item and print one JSON object each.

    Reads 'fetched_data.json' from the working directory. Returns early,
    after printing the error, if that file does not exist, and returns early
    if the model could not be loaded.
    """
    # Read back the dataset saved to disk.
    try:
        with open('fetched_data.json', 'r') as json_file:
            records = json.load(json_file)
    except FileNotFoundError as missing_file:
        print("Error loading the JSON file:", missing_file)
        return

    encoder = load_model()
    if encoder is None:
        return

    # Minimum cosine similarity a source must exceed to be cited.
    similarity_cutoff = 0.5

    # process_data returns the full result list before anything is printed.
    for result in process_data(records, encoder, similarity_cutoff):
        summary = {'id': result['id'], 'citations': result['citations']}
        print(json.dumps(summary, indent=4))


# Run the full pipeline when this cell is executed.
if __name__ == "__main__":
    main()


{
    "id": 1,
    "citations": [
        {
            "id": "71",
            "link": ""
        },
        {
            "id": "73",
            "link": ""
        },
        {
            "id": "57",
            "link": ""
        },
        {
            "id": "75",
            "link": ""
        }
    ]
}
{
    "id": 2,
    "citations": [
        {
            "id": "7",
            "link": ""
        }
    ]
}
{
    "id": 3,
    "citations": [
        {
            "id": "4682",
            "link": ""
        },
        {
            "id": "4701",
            "link": ""
        },
        {
            "id": "4721",
            "link": ""
        },
        {
            "id": "4729",
            "link": ""
        },
        {
            "id": "4736",
            "link": ""
        },
        {
            "id": "4714",
            "link": "https://www.drmalpani.com/knowledge-center/articles/cost-of-ivf"
        },
        {
            "id": "5531",
            "link": ""
   