# Post-Processing of the LLM-Generated Data

After [running the LLM model](./restaurant_extractor.ipynb) to generate restaurant recommendations, the data is post-processed to clean and filter the results. The post-processing steps are outlined below.

## Steps

1. **Load JSON Files**:
    - Reads all JSON files from the `results` directory, which follow the naming pattern `recs_*.json`.
    - Combines the data from these files into a single list.

2. **Remove Duplicates**:
    - Identifies and removes duplicate entries based on the `restaurant_name` field to ensure each restaurant is unique.

3. **Clean Data**:
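    - Removes entries with a missing `restaurant_name` or `address`, as well as entries whose `address` looks like a placeholder (starts with `123`).
    - Filters out restaurants whose `restaurant_name` starts with "Infatuation".
    - Converts `cuisine` values that were stored as JSON-encoded strings back into proper JSON values.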

4. **Filter by Southern California**:
    - Uses the Ollama API to determine if the restaurant's `address` is located in Southern California.
    - Removes entries that are not located in Southern California.

5. **Update Location Tags**:
    - Analyzes the `location_tag` field and removes tags that are neighborhoods in Southern California.
    - Uses the `address` of each entry to determine the neighborhood it is located in.
    - Adds that neighborhood name to `location_tag` if it is not already present.

6. **Save Cleaned Data**:
    - Writes the processed and cleaned data to a new JSON file `combined_recs.json` in the `results` directory.


In [None]:
import json
import glob
import os

from langchain.chains import LLMChain
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate

def load_json_files(directory):
    """Load and concatenate every ``recs_*.json`` file found in *directory*.

    Each matching file is expected to contain a JSON array; the arrays are
    concatenated into one flat list.

    Args:
        directory: Path of the folder to search (not recursive).

    Returns:
        A list holding the items of every matching file, in file-name order.
        The list is empty when no file matches.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
    """
    # glob returns paths in arbitrary, filesystem-dependent order. Sorting
    # makes the merged order reproducible, which matters for any later
    # "first occurrence wins" de-duplication.
    files = sorted(glob.glob(os.path.join(directory, 'recs_*.json')))
    data = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data.extend(json.load(f))
    return data

# De-duplicate entries by restaurant name, keeping the first occurrence
def remove_duplicates(data):
    """Return the entries of *data* with at most one entry per restaurant name.

    Names are compared after stripping surrounding whitespace. The first
    entry seen for a given name is kept; entries whose name is missing,
    falsy or blank are dropped. Kept entries are returned unmodified, in
    first-seen order.
    """
    first_seen = {}
    for record in data:
        raw_name = record.get('restaurant_name', '')
        if not raw_name:
            continue
        key = raw_name.strip()
        if key:
            # setdefault only stores when the key is new, so earlier entries win
            first_seen.setdefault(key, record)
    return [*first_seen.values()]

# Function to clean data: drop entries without a usable restaurant name or address, and decode JSON-encoded cuisine strings
def clean_data(data):
    """Drop unusable entries and normalise the ``cuisine`` field.

    An entry is discarded when:
      * ``restaurant_name`` is missing/blank or starts with "Infatuation";
      * ``address`` is missing/blank or starts with "123".

    For every kept entry, a ``cuisine`` value that is a string is decoded
    with ``json.loads`` when it is valid JSON; otherwise it is left as is.
    The decoding happens in place, so the dictionaries in *data* are mutated.

    Args:
        data: Iterable of restaurant dictionaries.

    Returns:
        A new list containing only the kept entries, in their original order.
    """
    cleaned_data = []
    for entry in data:
        # `or ''` covers both an absent key and an explicit null value
        restaurant_name = (entry.get('restaurant_name') or '').strip()
        address = (entry.get('address') or '').strip()
        if not restaurant_name or restaurant_name.startswith("Infatuation"):
            continue
        # An entry without an address cannot be located, so it is dropped.
        # NOTE(review): the "123" prefix presumably targets placeholder
        # addresses such as "123 Main St"; it also drops genuine addresses
        # that begin with 123 - confirm this is intended.
        if not address or address.startswith("123"):
            continue
        if 'cuisine' in entry and isinstance(entry['cuisine'], str):
            try:
                entry['cuisine'] = json.loads(entry['cuisine'])
            except json.JSONDecodeError:
                # Deliberate best effort: a plain (non-JSON) string is kept
                pass
        cleaned_data.append(entry)
    return cleaned_data

# Function to filter data based on Southern California
def filter_southern_california(data):
    """Keep only the entries whose address is judged to be in Southern California.

    The decision for each address is delegated to
    ``is_in_southern_california``. Entries with a missing, null or blank
    address are dropped without querying it.

    Args:
        data: Iterable of restaurant dictionaries.

    Returns:
        A new list with the entries that passed the check, in original order.
    """
    filtered_data = []
    for entry in data:
        # `address` may be absent or explicitly null; `or ''` avoids calling
        # .strip() on None.
        address = (entry.get('address') or '').strip()
        # Without an address there is nothing to verify, so skip the lookup
        if address and is_in_southern_california(address):
            filtered_data.append(entry)
    return filtered_data

# Function to determine if an address is in Southern California
def is_in_southern_california(address):
    """Ask the LLM whether *address* is located in Southern California.

    Uses the module-level ``ollama`` model, which must be initialised before
    this function is called.

    Args:
        address: Address text to embed in the prompt.

    Returns:
        True when the model's reply contains "yes" in any letter case,
        otherwise False.
    """
    prompt = f"Is the following address located in Southern California?\n\nAddress: {address}\n\nAnswer with 'Yes' or 'No'."
    # invoke() is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = ollama.invoke(prompt)
    # Compare case-insensitively so replies such as "yes" or "YES" count too
    return "yes" in response.lower()
    
# Function to determine if a tag is a neighborhood in Southern California
def is_neighborhood_in_socal(tag):
    """Ask the LLM whether *tag* names a neighborhood in Southern California.

    Uses the module-level ``ollama`` model, which must be initialised before
    this function is called.

    Args:
        tag: Location tag to embed in the prompt.

    Returns:
        True when the model's reply contains "yes" in any letter case,
        otherwise False.
    """
    prompt = f"Is '{tag}' a neighborhood in Southern California? Answer with 'Yes' or 'No'."
    # invoke() is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = ollama.invoke(prompt)
    # Compare case-insensitively so replies such as "yes" or "YES" count too
    return "yes" in response.lower()

# Function to determine the neighborhood of an address
def get_neighborhood_from_address(address):
    """Ask the LLM which neighborhood *address* is located in.

    Uses the module-level ``ollama`` model, which must be initialised before
    this function is called.

    Args:
        address: Address text to embed in the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when *address* is None or blank (no query is made then).
    """
    # A blank address gives the model nothing to work with; return "" so no
    # LLM call is made and callers' truthiness checks skip the result.
    if not address or not address.strip():
        return ""
    prompt = f"What neighborhood is the following address located in?\n\nAddress: {address}\n\nProvide the neighborhood name ONLY."
    # invoke() is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = ollama.invoke(prompt)
    return response.strip()

# Function to update the location tags of the data
def update_location_tags(data):
    """Rewrite each entry's ``location_tag`` list around its neighborhood.

    For every entry:
      1. tags that ``is_neighborhood_in_socal`` accepts are removed;
      2. the neighborhood returned by ``get_neighborhood_from_address`` for
         the entry's address is appended when it is non-empty and not
         already among the remaining tags.

    The entry dictionaries are updated in place.

    Args:
        data: Iterable of restaurant dictionaries.

    Returns:
        A new list containing the same (mutated) entries, in original order.
    """
    updated_data = []
    for entry in data:
        raw_tags = entry.get('location_tag')
        if not raw_tags:
            # Absent key, null, empty list or empty string: no tags to check
            location_tags = []
        elif isinstance(raw_tags, str):
            # A bare string would otherwise be iterated character by character
            location_tags = [raw_tags]
        else:
            location_tags = list(raw_tags)

        # `address` may be absent or explicitly null; `or ''` avoids calling
        # .strip() on None.
        address = (entry.get('address') or '').strip()

        # Remove tags that are neighborhoods in Southern California
        location_tags = [tag for tag in location_tags if not is_neighborhood_in_socal(tag)]

        # Get the neighborhood of the address; skip the lookup when there is
        # no address to ask about
        neighborhood = get_neighborhood_from_address(address) if address else ''

        # If the neighborhood is not in the updated location_tags, add it
        if neighborhood and neighborhood not in location_tags:
            location_tags.append(neighborhood)

        # Update the entry
        entry['location_tag'] = location_tags
        updated_data.append(entry)
    return updated_data

In [None]:
# Stage 1: gather the raw recommendations, then apply the offline clean-up steps
directory = 'results'
combined_data = load_json_files(directory)
for stage in (remove_duplicates, clean_data):
    combined_data = stage(combined_data)

# Stage 2: the LLM-backed steps read the module-level `ollama` object, so it
# has to exist before they run
ollama = Ollama(model="llama3:8b-instruct-q5_0")
for stage in (filter_southern_california, update_location_tags):
    combined_data = stage(combined_data)

In [None]:
# Persist the processed entries next to the input files as pretty-printed,
# non-ASCII-preserving JSON
output_path = os.path.join(directory, 'combined_recs.json')
with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(combined_data, out_file, ensure_ascii=False, indent=4)