## Retrieving the info of the restaurants that the Instagram pages suggest:

Using a locally hosted `llama3` model (served through Ollama) together with `langchain`, I collect the following information from each post caption that I saved in the previous [notebook](./caption_scraper.ipynb):

```json
{
    "restaurant_name": "",
    "address": "",
    "instagram": "",
    "famous_for": "",
    "location_tag": [],
    "cuisine": []
}
```

The information is extracted via a series of prompts in the following logical order:

1. We loop over each post for each Instagram page and ask the model to determine if the caption is talking about a food or drinks place.
2. If the caption is about a restaurant, we extract the restaurant name, address, Instagram handle, a one-line description of what the place is known for, and the location tags that specify which LA neighborhood it is in.
3. If the restaurant address is not available, then we leave it empty.
4. If the address was left empty, we then ask the model to guess the restaurant's address from the restaurant name and location tags.
5. Whenever the reply from step 2 cannot be parsed as JSON, we ask the model to repair it into a valid JSON object. (**For some reason the model does not always return a valid JSON object.**)
6. Finally, we ask the model to determine the type of cuisine the restaurant serves based on the caption.
7. We save the extracted information in a JSON file.


In [None]:
import json
import re
import os
import gc

from tqdm import tqdm
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.chains import LLMChain
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate

def response_to_json(response: str) -> Optional[Dict[str, Any]]:
    """
    Extract the first JSON object embedded in the LLM response.

    Decoding starts at the first ``{`` and stops where that object ends, so
    any text after the object (even text that contains braces) is ignored
    instead of being fed to the JSON parser.

    Parameters:
    - response: str: The response from the LLM.

    Returns:
    - Optional[Dict[str, Any]]: The extracted JSON part, if found; otherwise, None.
    """
    start = response.find("{")
    # An object needs an opening brace and some closing brace after it
    if start == -1 or response.find("}", start) == -1:
        print("No JSON object found in the response.")
        return None

    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates trailing characters, unlike json.loads
        parsed, _ = json.JSONDecoder().raw_decode(response, start)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

    # The value begins with "{", so a successful decode is always a dict
    return parsed

def response_to_json_llm(response: str) -> Optional[Dict[str, Any]]:
    """
    Ask the JSON-fixing chain to repair the LLM response and parse the result.

    The repaired text is decoded starting at its first ``{``, so a reply that
    is wrapped in code fences or extra sentences still parses, and a reply
    that holds no JSON object at all (a bare list, string, number, ...) yields
    None rather than a non-dict value.

    Uses the module-level ``fix_json_llm_chain``.

    Parameters:
    - response: str: The response from the LLM.

    Returns:
    - Optional[Dict[str, Any]]: The corrected JSON dictionary, if possible; otherwise, None.
    """
    fixed_response = fix_json_llm_chain.run({"response": response})

    start = fixed_response.find("{")
    if start == -1:
        return None

    try:
        # Parse a single JSON object and ignore whatever follows it
        parsed, _ = json.JSONDecoder().raw_decode(fixed_response, start)
    except json.JSONDecodeError:
        # The repair attempt itself produced invalid JSON; give up quietly
        return None

    return parsed

def load_captions_from_file(filename: str) -> List[str]:
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Parameters:
    - filename: str: Path of the JSON file to read.

    Returns:
    - List[str]: The decoded JSON value (expected to be a list of captions;
      the content is returned as parsed, without validation).
    """
    with open(filename, mode='r', encoding='utf-8') as caption_file:
        raw_text = caption_file.read()
    return json.loads(raw_text)

def analyze_caption_with_llm(caption: str) -> Optional[Dict[str, Any]]:
    """
    Run the analysis chain on a caption and parse its reply into a dictionary.

    The reply is first parsed directly with ``response_to_json``; only when
    that fails is the reply sent to ``response_to_json_llm`` for repair.
    Uses the module-level ``analyze_llm_chain``.

    Parameters:
    - caption: str: The caption to analyze.

    Returns:
    - Optional[Dict[str, Any]]: The parsed analysis, or None when neither the
      direct parse nor the repair attempt produced a JSON object.
    """
    response = analyze_llm_chain.run({"caption": caption})

    analysis = response_to_json(response)
    if analysis is not None:
        return analysis

    # Direct parsing failed: fall back to the LLM-based JSON repair
    return response_to_json_llm(response)


def guess_address_llm(restaurant_name: str, location_tag: list) -> Optional[str]:
    """
    Ask the address chain where a restaurant is located.

    Uses the module-level ``address_llm_chain``.

    Parameters:
    - restaurant_name: str: The name of the restaurant.
    - location_tag: list: The location tags extracted from the caption.

    Returns:
    - Optional[str]: The chain's reply with surrounding whitespace removed.
      The reply is not validated, so it is whatever text the model produced.
    """
    prompt_inputs = {
        "restaurant_name": restaurant_name,
        "location_tag": location_tag,
    }
    raw_answer = address_llm_chain.run(prompt_inputs)
    return raw_answer.strip()

def get_cuisine_llm(restaurant_name: str, caption: str) -> Optional[str]:
    """
    Ask the cuisine chain which cuisine a restaurant serves.

    Uses the module-level ``cuisine_llm_chain``.

    Parameters:
    - restaurant_name: str: The name of the restaurant.
    - caption: str: The caption containing additional information.

    Returns:
    - Optional[str]: The chain's reply with surrounding whitespace removed.
      The reply is returned as raw text; it is not parsed into a list here.
    """
    prompt_inputs = {
        "restaurant_name": restaurant_name,
        "caption": caption,
    }
    raw_answer = cuisine_llm_chain.run(prompt_inputs)
    return raw_answer.strip()


def process_single_caption(caption: str) -> Dict[str, Any]:
    """
    Process a single caption to analyze if it is about a restaurant and extract relevant information.

    When the analysis marks the caption as a restaurant and a name was
    extracted, a missing address is filled in via ``guess_address_llm`` and
    the cuisine is added via ``get_cuisine_llm``. The internal
    ``is_about_restaurant`` flag is removed before returning.

    Parameters:
    - caption: str: The caption to analyze.

    Returns:
    - Dict[str, Any]: The processed result containing analysis details for the
      caption, or an empty dict when the model reply could not be turned into
      a JSON object.
    """
    analysis = analyze_caption_with_llm(caption)

    # The analysis step yields None when no JSON could be recovered; calling
    # .get on it would raise inside the worker and abort the whole run.
    if not isinstance(analysis, dict):
        return {}

    restaurant_name = analysis.get('restaurant_name')
    if analysis.get('is_about_restaurant') and restaurant_name:
        if not analysis.get('address'):
            analysis['address'] = guess_address_llm(restaurant_name, analysis.get('location_tag', []))
        analysis['cuisine'] = get_cuisine_llm(restaurant_name, caption)

    # The flag only steers the logic above and is not part of the saved record
    analysis.pop('is_about_restaurant', None)

    return analysis

def extract_restaurant_info(captions: List[str], insta_account: str):
    """
    Process the captions to analyze if they are about a restaurant and extract relevant information to save it to a file.

    Captions are processed concurrently with ``process_single_caption`` and
    each non-empty result is appended to a JSON array as soon as it finishes,
    so the output order follows completion order, not caption order.

    The output path is ``results/`` followed by ``insta_account`` with
    ``posts/`` replaced by ``recs_by_``; the directory is created if missing.

    Parameters:
    - captions: List[str]: The list of captions to analyze.
    - insta_account: str: The Instagram account name.

    Returns:
    - None
    """
    filename = f"results/{insta_account.replace('posts/', 'recs_by_')}"
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Open the output before queueing any work, so a bad path fails
    # immediately instead of after every caption has been sent to the model.
    with open(filename, 'w', encoding='utf-8') as f, ThreadPoolExecutor() as executor:
        f.write('[')
        futures = [executor.submit(process_single_caption, caption) for caption in captions]

        first_result = True
        try:
            for future in tqdm(as_completed(futures), total=len(captions)):
                result = future.result()

                if not result:
                    continue  # Skip None or empty results

                if not first_result:
                    f.write(',\n')
                json.dump(result, f, ensure_ascii=False, indent=4)
                first_result = False
        finally:
            # If a worker raised or the run was interrupted, drop the jobs
            # that have not started yet (cancel is a no-op for finished ones)
            # so leaving the executor does not wait on the whole backlog.
            for pending in futures:
                pending.cancel()
            # Always close the array so the results written so far remain a
            # valid JSON file.
            f.write(']')

            
def save_analysis_to_file(analysis: List[Dict[str, Any]], filename: str) -> None:
    """
    Write the analysis results to a UTF-8 file as indented JSON.

    Non-ASCII characters are written as-is rather than escaped, and nested
    levels are indented by four spaces. An existing file is overwritten.

    Parameters:
    - analysis: List[Dict[str, Any]]: The analysis results to save.
    - filename: str: The name of the file to save the results to.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(analysis, out_file, **dump_options)

In [None]:
# Initialize the Ollama model.
# This single `llm` object is passed to every LLMChain built in this notebook.
# NOTE(review): the tag presumably has to be pulled into the local Ollama
# install beforehand — confirm with `ollama list`.
llm = Ollama(model="llama3:8b-instruct-q5_0")

In [None]:
# Define the prompt template for the first pass over a caption.
# It asks the model whether the caption is about a food/drinks place and, if
# so, to return the extracted fields as a JSON object.
# The example object below is kept comment-free and shows "location_tag" as a
# list, so that a model copying the example produces parseable JSON in the
# shape the rest of the code reads. Doubled braces are literal braces in the
# rendered prompt; {caption} is the only input variable.
prompt_template = PromptTemplate(
    input_variables=["caption"],
    template="""
    Analyze the following caption and determine if it's talking about a food or drinks place!
    If it is, extract the restaurant name, address, instagram handle and the relevant location tags that SPECIFY which neighborhood they are in LA (don't include vague tags like losangeles, lafoodie, california, etc).
    Also, include a one line description of what they are known for.
    If the restaurant address is not available, then leave it empty.
    Return the result in the following JSON format and format ONLY! Say absolutely nothing else! Ensure that both the JSON braces are present.
    "location_tag" must be a JSON list of strings (use [] when there are no specific tags). Do not put any comments inside the JSON.
    {{
        "is_about_restaurant": <true/false>,
        "restaurant_name": "<restaurant_name>",
        "address": "<address>",
        "instagram": "<instagram_handle>",
        "famous_for": "<famous_for>",
        "location_tag": ["<location_tag>", "<location_tag>"]
    }}
    Caption: {caption}
    """
)


# Define the prompt template for guessing the address.
# Inputs: {restaurant_name} and {location_tag}; both are interpolated as-is
# into the text, so a list of tags is rendered using its Python repr.
# The model is told to answer with the full address only.
address_prompt_template = PromptTemplate(
    input_variables=["restaurant_name","location_tag"],
    template="""
    You are a YellowPages book operator and you know all the locations of resturants in '{location_tag}'. Where is the restaurant '{restaurant_name}' located? 
    Respond only and ONLY with the full address of the restaurant. Absolutely, do not say anything else or you may lose your job!
    """
)

# Define the prompt template for fixing the JSON response.
# Input: {response}, the raw text of an earlier model reply that could not be
# parsed. The model is asked to return only the corrected JSON object.
fix_json_prompt_template = PromptTemplate(
    input_variables=["response",],
    template="""
    The following text contains a JSON object, but it might have some formatting issues or additional text. Extract and correct the JSON object so that it is a valid JSON dictionary:
    Response: {response}
    Corrected JSON:
    Ensure that both the JSON braces are present. Say absolutely nothing else! Your response should only contain the corrected JSON object ONLY!!!
    """
)

# Define the prompt for assigning the cuisine type.
# Inputs: {restaurant_name} and the full {caption}. The model is asked to
# answer with a list-formatted reply such as ["Chinese", "Cantonese"].
cuisine_prompt_template = PromptTemplate(
    input_variables=["restaurant_name","caption"],
    template="""
    You are a food critic and you know all the cuisines of the world. Based on this article - `{caption}`, what type of cuisine does the restaurant '{restaurant_name}' serve?
    Respond only and ONLY with the type of cuisine in a list format (e.g. ["Chinese"] or ["Chinese", "Cantonese"] are valid responses). Absolutely, do not say anything else or you may lose your job!
    """
)

In [None]:
# Create the LLM chains: one chain per prompt, all sharing the same `llm`.
# The helper functions defined earlier look these names up at module level,
# so this cell must run before any caption is processed.
analyze_llm_chain = LLMChain(llm=llm, prompt=prompt_template)  # caption -> restaurant JSON
address_llm_chain = LLMChain(llm=llm, prompt=address_prompt_template)  # name + tags -> address text
fix_json_llm_chain = LLMChain(llm=llm, prompt=fix_json_prompt_template)  # broken reply -> repaired JSON
cuisine_llm_chain = LLMChain(llm=llm, prompt=cuisine_prompt_template)  # name + caption -> cuisine text

In [None]:
# Instagram accounts to process; each name is expanded to `posts/<name>.json`
# when the caption files are loaded.
account_names = ["dinertheory","infatuation_la","kcrwgoodfood","la.ethnic.eats","lacoffeelist","ricklox","thelacountdown"]

In [None]:
# Build the caption-file path for every account, then run the extraction
# account by account: load its captions and write its results file.
insta_accounts = [f"posts/{name}.json" for name in account_names]
for insta_account in insta_accounts:
    captions = load_captions_from_file(insta_account)
    extract_restaurant_info(captions, insta_account)

## To Do:

- [ ] continue from entry 526 from infatuation

- [ ] continue from entry 111 from ricklox 

- [ ] process kcrwgoodfood
