# 📘 Matching Incoming Quotes to Historical Repair Projects
## Using Snowflake Cortex Search + Multi-Index Search

---

This notebook demonstrates how to automatically match incoming quotes to the most similar historical repair projects using the trucking maintenance dataset stored in Snowflake.

### Business Goals
Help maintenance teams quickly determine:
- Whether a new quote resembles previous work
- Whether the quote might be a duplicate or follow-on repair
- What similar repairs historically cost
- Which vendor or department handled similar work

### Workflow Overview
1. **Cortex Search Services** — Create multi-index search on projects, vendors, and parts
2. **Batch Search** — Retrieve likely historical candidates for each quote
3. **Similarity Scoring** — Compute a unified match score combining semantic + categorical + temporal signals
4. **Best Match Selection** — Select the top match per quote
5. **Pricing Context** — Add historical cost comparison
6. **Validation** — Use Streamlit UI for manual review



---
## 📦 1. Setup & Data Exploration


In [None]:
-- Set context
-- Select the database, schema, and warehouse for this session so the
-- unqualified object names used in the following cells (projects, quotes,
-- payments, trucking_projects_multi_search) resolve against TRUCKING_DEMO.DEMO.
USE DATABASE TRUCKING_DEMO;
USE SCHEMA DEMO;
USE WAREHOUSE COMPUTE_WH;


In [None]:
-- Explore projects table (historical repair records)
-- Ad-hoc preview of at most 10 rows; there is no ORDER BY, so which rows are
-- returned is not deterministic.
SELECT * FROM projects LIMIT 10;

In [None]:
-- Explore quotes table (incoming quotes to match)
-- Ad-hoc preview of at most 10 rows; there is no ORDER BY, so which rows are
-- returned is not deterministic.
SELECT * FROM quotes LIMIT 10;

In [None]:
-- Explore payments table (optional - for financial context)
-- Ad-hoc preview of at most 10 rows; there is no ORDER BY, so which rows are
-- returned is not deterministic. This table is not referenced by the search
-- service or the batch-search cells visible in this notebook section.
SELECT * FROM payments LIMIT 10;

---
## 🔍 2. Create Multi-Index Cortex Search Service

We create a **single multi-index service** with both TEXT and VECTOR indexes:

| Index Type | Columns | Purpose |
|------------|---------|---------|
| **TEXT INDEXES** | `description`, `vendor`, `part_type`, `department` | Keyword/lexical matching for exact terms |
| **VECTOR INDEXES** | `description` | Semantic similarity using embeddings |

This multi-index approach provides:
- **Keyword search** on vendor/part_type for exact matches
- **Semantic search** on descriptions for meaning-based matches
- **Configurable weights** to balance lexical vs semantic relevance
- **Field-specific boosts** to prioritize certain columns


In [None]:
-- Create Multi-Index Cortex Search Service
-- TEXT INDEXES: For keyword/lexical matching (exact term matches)
-- VECTOR INDEXES: For semantic similarity (meaning-based matches)
-- NOTE: description is in BOTH indexes for hybrid keyword + semantic search
--
-- Every column selected from projects is either text-indexed (description,
-- vendor, part_type, department) or listed under ATTRIBUTES (project_id,
-- manager, start_date, price).
-- CREATE OR REPLACE means re-running this cell replaces an existing service
-- with the same name.
-- The service name is unqualified, so it is created in the current
-- database/schema; a later cell addresses it as
-- TRUCKING_DEMO.DEMO.TRUCKING_PROJECTS_MULTI_SEARCH.
CREATE OR REPLACE CORTEX SEARCH SERVICE trucking_projects_multi_search
    TEXT INDEXES description, vendor, part_type, department
    VECTOR INDEXES description (model = 'snowflake-arctic-embed-m-v1.5')
    ATTRIBUTES project_id, manager, start_date, price
    WAREHOUSE = COMPUTE_WH
    TARGET_LAG = '1 minute'
AS (
    SELECT 
        project_id,
        description,
        vendor,
        part_type,
        manager,
        department,
        start_date,
        price
    FROM projects
);

In [None]:
-- Verify the multi-index service is created
-- Lists Cortex Search services; trucking_projects_multi_search should appear
-- once the CREATE statement above has succeeded.
SHOW CORTEX SEARCH SERVICES;

-- Describe the service to see indexed columns
DESC CORTEX SEARCH SERVICE trucking_projects_multi_search;


In [None]:
-- Test multi-index query with hybrid search on description
-- Description is queried BOTH as text (keyword) AND vector (semantic)
-- scoring_config.weights controls the balance between text vs vector relevance
--
-- This smoke test uses fixed search terms: "engine overhaul repair" for
-- description, "Peterbilt" for vendor, and "Engine" for part_type.
-- Weights here are texts=2, vectors=3, reranker=1, with per-column text boosts
-- of 2 (description), 1.5 (vendor), and 1 (part_type).
-- The request asks for at most 5 results with five columns each. The response
-- string is parsed with PARSE_JSON and only its `results` element is returned.
SELECT PARSE_JSON(
    SNOWFLAKE.CORTEX.SEARCH_PREVIEW(
        'trucking_projects_multi_search',
        '{
            "multi_index_query": {
                "description": [{"text": "engine overhaul repair"}],
                "vendor": [{"text": "Peterbilt"}],
                "part_type": [{"text": "Engine"}]
            },
            "scoring_config": {
                "weights": {"texts": 2, "vectors": 3, "reranker": 1},
                "functions": {
                    "text_boosts": [
                        {"column": "description", "weight": 2},
                        {"column": "vendor", "weight": 1.5},
                        {"column": "part_type", "weight": 1}
                    ]
                }
            },
            "columns": ["project_id", "description", "vendor", "part_type", "price"],
            "limit": 5
        }'
    )
):results AS multi_index_results;


In [None]:
# Batch search: for every incoming quote, run one multi-index Cortex Search
# query against trucking_projects_multi_search and collect the top candidate
# projects for all quotes into a single DataFrame (final_results).
import json

import pandas as pd
from snowflake.snowpark.context import get_active_session

session = get_active_session()
top_n_matches = 5  # number of candidate projects requested per quote


def _text_or_empty(value):
    """Return value as search text, or "" when it is missing.

    Covers None, NaN/NA (which are truthy and would otherwise be sent to the
    search service as the literal text "nan"), and other falsy values.
    """
    if pd.isna(value) or not value:
        return ""
    return str(value)


def _sql_string_literal(text):
    """Return text wrapped as a single-quoted Snowflake string constant.

    Backslashes are doubled first, then single quotes. Snowflake interprets
    backslash escape sequences inside single-quoted constants, so the escapes
    that json.dumps emits (for embedded double quotes, newlines, backslashes)
    would otherwise be rewritten before the JSON reaches SEARCH_PREVIEW and
    could corrupt the payload.
    """
    escaped = str(text).replace("\\", "\\\\").replace("'", "''")
    return f"'{escaped}'"


# Get all quotes
quotes_df = session.sql("""
    SELECT quote_id, description, vendor, part_type, quote_date
    FROM quotes
""").to_pandas()

# Upper-case the column names so the row lookups below are case-stable.
quotes_df.columns = [col.upper() for col in quotes_df.columns]
all_results = []

for _, row in quotes_df.iterrows():
    # One text query per indexed column; description is also vector-indexed,
    # so it contributes to both the text and the vector score.
    search_query = {
        "multi_index_query": {
            "description": [{"text": str(row['DESCRIPTION'])}],
            "vendor": [{"text": _text_or_empty(row['VENDOR'])}],
            "part_type": [{"text": _text_or_empty(row['PART_TYPE'])}]
        },
        "scoring_config": {
            # Vector (semantic) relevance is weighted 3x relative to text.
            "weights": {"texts": 1, "vectors": 3, "reranker": 1},
            "functions": {
                "text_boosts": [{"column": "description", "weight": 2}],
                "vector_boosts": [{"column": "description", "weight": 2}]
            }
        },
        "columns": ["project_id", "description", "vendor", "part_type", "price"],
        "limit": top_n_matches
    }

    # The query is embedded as an escaped SQL string constant, as in the
    # original cell; the quote id goes through the same escaping because it is
    # data and must not be able to terminate the literal.
    quote_id_literal = _sql_string_literal(row['QUOTE_ID'])
    query_literal = _sql_string_literal(json.dumps(search_query))

    # Flatten the `results` array of the response into one row per candidate.
    search_sql = f"""
        SELECT {quote_id_literal} AS quote_id,
               value['project_id']::text AS project_id,
               value['description']::text AS project_description,
               value['vendor']::text AS project_vendor,
               value['part_type']::text AS project_part_type,
               value['price']::float AS project_price,
               value['@score']::float AS score
        FROM TABLE(FLATTEN(PARSE_JSON(SNOWFLAKE.CORTEX.SEARCH_PREVIEW(
            'trucking_projects_multi_search', {query_literal}
        ))['results']))
    """

    results = session.sql(search_sql).to_pandas()
    all_results.append(results)

if all_results:
    final_results = pd.concat(all_results, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list (no quotes to match);
    # fall back to an empty frame with the same columns the query produces.
    final_results = pd.DataFrame(columns=[
        'QUOTE_ID', 'PROJECT_ID', 'PROJECT_DESCRIPTION', 'PROJECT_VENDOR',
        'PROJECT_PART_TYPE', 'PROJECT_PRICE', 'SCORE',
    ])
display(final_results)

In [None]:
# Preview the first rows of the batch-search results built in the previous cell.
final_results.head()

In [None]:
# Batch search with detailed scoring breakdown
# For every quote, call SEARCH_PREVIEW once, parse the raw JSON response in
# Python, and record one row per returned candidate, including the score
# components found under '@scores'.
# Relies on `session` created in an earlier cell.
import json

import pandas as pd

top_n_matches = 5  # number of candidate projects requested per quote


def _text_or_empty(value):
    """Return value as search text, or "" when it is missing.

    Covers None, NaN/NA (which are truthy and would otherwise be sent to the
    search service as the literal text "nan"), and other falsy values.
    """
    if pd.isna(value) or not value:
        return ""
    return str(value)


# Get all quotes
quotes_df = session.sql("""
    SELECT quote_id, description, vendor, part_type, quote_date
    FROM quotes
""").to_pandas()

# Upper-case the column names so the row lookups below are case-stable.
quotes_df.columns = [col.upper() for col in quotes_df.columns]
all_results = []

for _, row in quotes_df.iterrows():
    search_query = {
        "multi_index_query": {
            "description": [{"text": str(row['DESCRIPTION'])}],
            "vendor": [{"text": _text_or_empty(row['VENDOR'])}],
            "part_type": [{"text": _text_or_empty(row['PART_TYPE'])}]
        },
        "scoring_config": {
            # All weights and boosts are 1 here, so the per-component scores
            # extracted below are not skewed by configuration.
            "weights": {"texts": 1, "vectors": 1, "reranker": 1},
            "functions": {
                "text_boosts": [{"column": "description", "weight": 1},
                                {"column": "vendor", "weight": 1},
                                {"column": "part_type", "weight": 1}],
                "vector_boosts": [{"column": "description", "weight": 1}]
            }
        },
        "columns": ["project_id", "description", "vendor", "part_type", "price"],
        "limit": top_n_matches
    }

    # Escape for a single-quoted Snowflake string constant: double the
    # backslashes first, then the single quotes. Snowflake interprets
    # backslash escapes inside single-quoted constants, so the escapes that
    # json.dumps emits (embedded double quotes, newlines, backslashes) would
    # otherwise be rewritten before the JSON reaches SEARCH_PREVIEW.
    query_literal = (
        json.dumps(search_query).replace("\\", "\\\\").replace("'", "''")
    )

    # Get raw JSON response to extract detailed scores
    search_sql = f"""
        SELECT SNOWFLAKE.CORTEX.SEARCH_PREVIEW(
            'TRUCKING_DEMO.DEMO.TRUCKING_PROJECTS_MULTI_SEARCH',
            '{query_literal}'
        ) AS search_result
    """

    try:
        result_json = session.sql(search_sql).collect()[0]['SEARCH_RESULT']
        result_data = json.loads(result_json) if isinstance(result_json, str) else result_json

        # rank is 1-based and follows the order the service returned.
        for rank, result in enumerate(result_data.get('results', []), 1):
            match_data = {
                'QUOTE_ID': row['QUOTE_ID'],
                'QUOTE_DESCRIPTION': row['DESCRIPTION'],
                'PROJECT_ID': result.get('project_id', ''),
                'PROJECT_DESCRIPTION': result.get('description', ''),
                'PROJECT_VENDOR': result.get('vendor', ''),
                'PROJECT_PRICE': result.get('price', 0),
                'RANK': rank,
                'OVERALL_SCORE': result.get('@score', 0.0)
            }

            # Extract detailed scores
            # NOTE(review): the key names read below ('text_match',
            # 'cosine_similarity', 'function_scores', 'weighted_score') are
            # assumed to match the service response; missing keys fall back
            # to 0.0 rather than raising, so confirm against a real response.
            scores = result.get('@scores', {})
            match_data['TEXT_MATCH'] = scores.get('text_match', 0.0)
            match_data['COSINE_SIMILARITY'] = scores.get('cosine_similarity', 0.0)

            # Extract function scores (per-column breakdown)
            function_scores = scores.get('function_scores', {})

            # Vector boost scores
            vector_boost = function_scores.get('vector_boost', {})
            match_data['VECTOR_WEIGHTED_SCORE'] = vector_boost.get('weighted_score', 0.0)

            # Text boost scores
            text_boost = function_scores.get('text_boost', {})
            match_data['TEXT_WEIGHTED_SCORE'] = text_boost.get('weighted_score', 0.0)

            all_results.append(match_data)
    except Exception as e:
        # Best-effort batch: report the failing quote and continue with the rest.
        print(f"Error for {row['QUOTE_ID']}: {e}")

final_results = pd.DataFrame(all_results)
print(f"Total results: {len(final_results)}")
final_results.head(20)