# Basic Coral RAG 



In [3]:
from fileinput import filename
import os
import getpass
import pandas as pd
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from IPython.display import Image, display
from langgraph.graph import START, END, StateGraph
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from pydantic import Field, BaseModel
from langchain_core.prompts import ChatPromptTemplate
from typing import Optional, TypedDict
# For fuzzy matching
from thefuzz import fuzz, process

In [4]:
# Load environment variables from .env file. This runs before the _set_env
# checks below, so any key already present in the environment afterwards is
# not prompted for.
load_dotenv()

def _set_env(var: str):
    """Ensure the environment variable *var* has a non-empty value.

    When the variable is unset or empty, the value is requested with a
    hidden ``getpass`` prompt labelled with the variable name and stored in
    ``os.environ``. An existing non-empty value is left untouched.
    """
    already_set = bool(os.environ.get(var))
    if already_set:
        return
    prompt = f"{var}: "
    os.environ[var] = getpass.getpass(prompt)


# Prompt for whichever API keys are still missing from the environment.
_set_env("OPENAI_API_KEY")
_set_env("LANGSMITH_API_KEY")

# LangSmith settings are always overwritten with these fixed values.
os.environ.update(
    LANGSMITH_TRACING="true",
    LANGSMITH_PROJECT="langchain-academy",
)

## Step 1: Create a codebook lookup workflow

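- Create a very simple workflow that takes a species' common name and searches the codebook CSV for the matching row, so that the species fields of the agent state (e.g. `species_id_1`) can be filled in.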

In [5]:
# State dictionary handed to (and returned by) the workflow functions.
# Declared with the functional TypedDict syntax; every key is required.
AgentState = TypedDict(
    "AgentState",
    {
        # Name of the file the state refers to.
        "filename": str,
        # Validity flag (NOTE(review): what is being validated is not shown
        # in this notebook; confirm with the producer of this field).
        "is_valid": bool,
        # Common name used as the search term for the codebook lookup.
        "common_name": Optional[str],
        # Lookup result: the 'Group Abbre' value of the best codebook match.
        "species_id_1": Optional[str],
    },
)

In [6]:
class SpeciesCodeBook():
    """Lookup helper over the species codebook CSV.

    The CSV is read once into ``self.codebook`` (a pandas DataFrame). Both
    search methods filter on its ``'Common Name'`` column and return the
    matching rows.
    """

    def __init__(self, filename: str):
        """Read the codebook CSV at *filename* and print its column names."""
        self.codebook = pd.read_csv(filename)
        print("self.codebook columns:", self.codebook.columns.tolist())

    def search_by_common_name_exact(self, common_name: str) -> pd.DataFrame:
        """Return the rows whose common name contains *common_name*.

        The test is a case-insensitive, *literal* substring match: regex
        metacharacters in the query (``(``, ``.``, ``+`` ...) have no special
        meaning. Rows with a missing common name never match. A ``None``
        query yields an empty frame with the codebook's columns.
        """
        if common_name is None:
            return self.codebook.iloc[0:0]

        names = self.codebook['Common Name']
        # regex=False: the query is user text, not a pattern. With the pandas
        # default (regex=True) a name such as "Coral (branching" raises and
        # "c.r" would match "Cor".
        mask = names.str.contains(common_name, case=False, na=False, regex=False)
        return self.codebook[mask]

    def search_by_common_name_fuzzy(self, common_name: str, threshold=70) -> pd.DataFrame:
        """Return the rows whose common name fuzzily matches *common_name*.

        Up to five candidate names are scored with ``fuzz.partial_ratio``;
        those scoring at least *threshold* are kept. The result carries an
        extra ``'similarity_score'`` column and is sorted best-first. An
        empty DataFrame is returned when the query is empty/``None`` or no
        candidate reaches the threshold.
        """
        if not common_name:
            # Nothing to compare against; skip the scorer entirely.
            return pd.DataFrame()

        # Distinct, non-missing names only, so a name that appears on several
        # rows cannot use up more than one of the `limit` slots.
        candidate_names = self.codebook['Common Name'].dropna().unique().tolist()

        scored = process.extract(
            common_name, candidate_names, limit=5, scorer=fuzz.partial_ratio
        )
        score_by_name = {name: score for name, score in scored if score >= threshold}
        if not score_by_name:
            return pd.DataFrame()  # Empty dataframe if no good matches

        # Pull every row carrying one of the accepted names, then attach the
        # score each name received.
        is_hit = self.codebook['Common Name'].isin(list(score_by_name))
        result = self.codebook[is_hit].copy()
        result['similarity_score'] = result['Common Name'].map(score_by_name)
        return result.sort_values('similarity_score', ascending=False)

    def get_all_columns(self):
        """Get all column names in the codebook"""
        return self.codebook.columns.tolist()

In [7]:
# Module-level codebook instance; constructing it reads the CSV and prints
# the column names (see the cell output below).
# NOTE(review): the path is absolute and specific to one machine, so this
# cell only runs where that file exists.
codebook = SpeciesCodeBook("/Users/rylanlorance/Dear_Ocean/Dear_Ocean_Digital_Coral_Ark_Agent/data/codebook/Master - DCA Metadata Codebook - Master.csv")

self.codebook columns: ['Common Name', 'Grouping', 'Other Search Terms', 'Family', 'Kingdom', 'Phylum', 'Class', 'Sub-Class', 'Order', 'Group Abbre', 'Common Abbre', 'LENGTH', 'LENGTH.1', 'Date of Addition', 'Rank']


In [11]:
def retrieve_species_name_from_common_name(state: AgentState) -> AgentState:
    """Look up the state's common name in the codebook and record its code.

    Reads ``state["common_name"]`` and tries a fuzzy match (threshold 60)
    first, then falls back to a substring match. ``state["species_id_1"]``
    is set to the ``'Group Abbre'`` value of the best matching row, or to
    ``None`` when the name is missing/blank, nothing matches, or that cell
    is empty in the codebook.

    The passed-in *state* is updated in place and also returned.
    """
    # NOTE(review): the codebook is re-read from a hard-coded absolute path
    # on every call; consider loading it once and passing it in.
    codebook = SpeciesCodeBook("/Users/rylanlorance/Dear_Ocean/Dear_Ocean_Digital_Coral_Ark_Agent/data/codebook/Master - DCA Metadata Codebook - Master.csv")

    # `or ""` also covers a key that is present but holds None, which the
    # two-argument form of .get() does not.
    common_name = state.get("common_name") or ""
    print("common_name", common_name)

    state["species_id_1"] = None

    if not common_name.strip():
        # A blank search term would substring-match every row and silently
        # pick the first codebook entry, so treat it as "no match".
        print(f"No matches found for '{common_name}'")
        return state

    # Try fuzzy matching first
    fuzzy_matches = codebook.search_by_common_name_fuzzy(common_name, threshold=60)

    if not fuzzy_matches.empty:
        best_match = fuzzy_matches.iloc[0]
        print(f"Fuzzy match found: '{best_match['Common Name']}' (similarity: {best_match['similarity_score']}%)")
    else:
        # Fallback to exact search
        exact_matches = codebook.search_by_common_name_exact(common_name)
        if exact_matches.empty:
            print(f"No matches found for '{common_name}'")
            return state
        best_match = exact_matches.iloc[0]
        print(f"Exact match found: '{best_match['Common Name']}'")

    group_code = best_match.get("Group Abbre", None)
    # An empty CSV cell is read as NaN; store None instead so the field
    # stays Optional[str].
    state["species_id_1"] = None if pd.isna(group_code) else group_code

    return state

In [None]:
# Smoke test: call the lookup function directly on a hand-built state.
init_state: AgentState = {
    "filename": "Antler Coral Pocillopora eydouxi entangled Hanauma Bay 20210421_25_Roberts_Anke - HAN.JPG",
    "is_valid": True,
    "common_name": "Potter's Angelfish",
    # Not looked up yet; present so the dict carries every AgentState key.
    "species_id_1": None,
}
res = retrieve_species_name_from_common_name(init_state)

self.codebook columns: ['Common Name', 'Grouping', 'Other Search Terms', 'Family', 'Kingdom', 'Phylum', 'Class', 'Sub-Class', 'Order', 'Group Abbre', 'Common Abbre', 'LENGTH', 'LENGTH.1', 'Date of Addition', 'Rank']
common_name Potter's Angelfish
Fuzzy match found: 'Potter's Angelfish' (similarity: 100%)
