In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r"/workspaces/Supply-Chain-Management/Data/supplier_contracts_dataset.csv")
df = df.replace({np.nan: None})
# Rename columns to remove spaces
# df.columns = [col.replace(' ', '_') for col in df.columns]

In [47]:
# df.columns = df.columns.str.lower()
# df.to_csv(r"/workspaces/Supply-Chain-Management/Data/supplier_contracts_dataset.csv",index=False)

In [48]:
df.columns

Index(['supplier_id', 'supplier_name', 'supplier_type', 'risk_level',
       'compliance_issues', 'key_terms', 'past_performance',
       'negotiate_recommendation', 'supply_chain_disruption',
       'quality_metrics', 'cost_metrics'],
      dtype='object')

In [3]:

df = df.rename(columns={'supplier_id':'id'})
# # Convert only the relevant text-based fields to string
text_fields = ['supplier_name', 'supplier_type', 'risk_level',
       'compliance_issues', 'key_terms', 'past_performance',
       'negotiate_recommendation', 'supply_chain_disruption',
       'quality_metrics', 'cost_metrics']

# Ensure the specified text fields are strings
for field in text_fields:
    df[field] = df[field].astype(str)

**MINSEARCH**

In [50]:
# df.columns = df.columns.str.lower()
# df.to_csv(r"/workspaces/Supply-Chain-Management/Data/supplier_contracts_dataset.csv",index=False)

In [4]:
documents = df.to_dict(orient='records')

Get top 10 list of high risk level contracts

In [5]:
import minsearch

# Create an index
index = minsearch.Index(
    text_fields=text_fields,
    keyword_fields=[]
)

# Fit the index with the documents
index.fit(documents)

# Example search query
query = "high risk level"
results = index.search(query, num_results=10)

# Print results
for result in results:
    print(result)

{'id': 'S0430', 'supplier_name': 'Supplier 430', 'supplier_type': 'Service Provider', 'risk_level': 'High', 'compliance_issues': 'None', 'key_terms': '45-day payment, 10-day delivery', 'past_performance': 'Poor', 'negotiate_recommendation': 'Adjust delivery schedules, Include compliance monitoring, Include penalty clauses for late delivery', 'supply_chain_disruption': 'Yes', 'quality_metrics': '4.22% defect rate, Meets standards', 'cost_metrics': '$69.97/unit, $6563.98 total cost'}
{'id': 'S1311', 'supplier_name': 'Supplier 1311', 'supplier_type': 'Manufacturer', 'risk_level': 'High', 'compliance_issues': 'Non-Compliance with Standards', 'key_terms': '45-day payment, 10-day delivery', 'past_performance': 'Poor', 'negotiate_recommendation': 'Adjust delivery schedules, Include compliance monitoring, Include penalty clauses for late delivery', 'supply_chain_disruption': 'Yes', 'quality_metrics': '4.61% defect rate, Meets standards', 'cost_metrics': '$52.85/unit, $6165.72 total cost'}
{'id

Get the Contract types that has high risk and their count

In [6]:
import minsearch
from collections import Counter
# Create an index
index = minsearch.Index(
    text_fields=text_fields,
    keyword_fields=['risk_level']
)

# Fit the index with the documents
index.fit(documents)

# Perform the search for high-risk level contracts
filter_dict = {'risk_level': 'High'}
results = index.search(query='high risk level', filter_dict=filter_dict, num_results=len(documents))

# Extract and print contract types with high risk level
high_risk_contract_types = [result['supplier_type'] for result in results]

# Count the occurrences of each contract type
contract_type_counts = Counter(high_risk_contract_types)

# Print the count of each contract type
print("Count of each supplier type with high risk level:")
for contract_type, count in contract_type_counts.items():
    print(f"{contract_type}: {count}")

Count of each supplier type with high risk level:
Retailer: 197
Distributor: 198
Manufacturer: 201
Service Provider: 219


**GROQ API**

In [6]:
import os

from groq import Groq

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

In [7]:

# Function to perform the search on supplier contracts based on query

def search(query, filter_dict=None, max_results=10):
    # Filter the DataFrame based on risk level (if provided)
    if filter_dict:
        filtered_df = df[df['risk_level'] == filter_dict.get('risk_level', '')]
    else:
        filtered_df = df
    # Convert the filtered data to a list of dictionaries and limit the number of results
    results = filtered_df.to_dict(orient='records')[:max_results]
    return results

# Function to build a clearer prompt for Groq API
def build_clear_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context += (
            f"- **Supplier_Type**: {doc['supplier_type']}\n"
            f"  **Supplier_Name**: {doc['supplier_name']}\n"
            f"  **Risk_Level**: {doc['risk_level']}\n"
            f"  **Compliance_Issues**: {doc['compliance_issues']}\n"
            f"  **Key_Terms**: {doc['key_terms']}\n"
            f"  **Negotiate_Recommendation**: {doc['negotiate_recommendation']}\n"
            f"  **Quality_Metrics**: {doc['quality_metrics']}\n"
            f"  **Past_Performance**: {doc['past_performance']}\n"
            f"  **Supply_Chain_Disruption**: {doc['supply_chain_disruption']}\n"
            f"  **Cost_Metrics**: {doc['cost_metrics']}\n\n"
        )
    
    prompt = (
        f"QUESTION: {query}\n\n"
        f"CONTEXT:\n{context}"
    )
    
    return prompt

# Function to call the LLM (Groq API)
def llm(prompt, model='Llama3-groq-70b-8192-tool-use-preview'):
    # Assuming client is the Groq API client instance
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model
    )
    
    return response.choices[0].message.content

# Function to perform the full RAG (Retrieve and Generate) process
def rag(query, model='Llama3-groq-70b-8192-tool-use-preview'):
    # Search for high-risk contracts (you can modify filter_dict based on needs)
    search_results = search(query, filter_dict={'risk_level': 'High'})
    
    # Build the prompt using the search results
    prompt = build_clear_prompt(query, search_results)
    
    # Get the LLM response based on the prompt
    answer = llm(prompt, model=model)
    
    return answer

# Example usage
question = "Give supplier types, quality metrics, supply chain disruptions, and their negotiation recommendations for high-risk contracts"
answer = rag(question)
print(answer)

Supplier Types: Manufacturer, Retailer, Distributor, Service Provider
Quality Metrics: 3.43% defect rate, 4.38% defect rate, 3.84% defect rate, 2.45% defect rate, 1.81% defect rate, 4.27% defect rate, 3.01% defect rate, 1.48% defect rate, 4.68% defect rate, 1.76% defect rate
Supply Chain Disruptions: Yes, Yes, Yes, Yes, No, Yes, No, No, Yes, No
Negotiation Recommendations: Seek alternative suppliers, Include penalty clauses for late delivery, Adjust delivery schedules, Include compliance monitoring


In [9]:
print("\n## Risk-Based Queries")

question = "Which suppliers have the most non-compliance issues, regardless of risk level?"
answer = rag(question)
print(answer)

print("\n## Compliance & Legal Queries:")

question = "Identify suppliers where compliance monitoring is recommended."
answer = rag(question)
print(answer)

print("\n## Cost and Financial Metrics:")

question = "List suppliers that offer the best cost metrics but are classified as high risk."
answer = rag(question)
print(answer)

print("\n## Contractual Terms and Recommendations:")

question = "Which suppliers have penalty clauses for late delivery in their contracts, and what are the associated risks?"
answer = rag(question)
print(answer)

print("\n## Supplier Relationship Queries:")

question = "What are the relationship metrics for suppliers with the best past performance scores?"
answer = rag(question)
print(answer)

print("\n## Opportunity and Innovation Queries:")

question = "Identify suppliers with innovative solutions despite having poor quality metrics."
answer = rag(question)
print(answer)

print("\n# Custom Queries")

question = "Show me all suppliers with a combination of poor quality metrics, high compliance issues, and good past performance scores."
answer = rag(question)
print(answer)

print("\n")

question = "What are the patterns between compliance issues and cost metrics in supplier contracts?"
answer = rag(question)
print(answer)



## Risk-Based Queries
Based on the provided data, the suppliers with the most non-compliance issues, regardless of risk level, are:

1. Supplier 6 (Manufacturer) - Non-Compliance with Standards
2. Supplier 8 (Retailer) - Non-Compliance with Standards
3. Supplier 12 (Distributor) - Substandard Quality
4. Supplier 14 (Service Provider) - Non-Compliance with Standards
5. Supplier 15 (Distributor) - Non-Compliance with Standards
6. Supplier 26 (Distributor) - Late Delivery
7. Supplier 28 (Retailer) - Late Delivery
8. Supplier 32 (Retailer) - Non-Compliance with Standards
9. Supplier 38 (Retailer) - Late Delivery
10. Supplier 40 (Retailer) - Non-Compliance with Standards

## Compliance & Legal Queries:
Based on the provided data, the suppliers where compliance monitoring is recommended are:

1. Supplier 12 (Distributor) - High risk level, compliance issues with substandard quality.
2. Supplier 40 (Retailer) - High risk level, compliance issues with non-compliance with standards.

## Cost a

**Retrieval Evaluation**

In [8]:
from tqdm.auto import tqdm

In [9]:
df_question = pd.read_csv(r'/workspaces/Supply-Chain-Management/Data/ground-truth-retrieval.csv')

In [10]:
ground_truth = df_question.to_dict(orient='records')

In [60]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score += 1 / (rank + 1)
                break  
    return total_score / len(relevance_total)

def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

from tqdm.auto import tqdm

evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/12500 [00:00<?, ?it/s]

{'hit_rate': 0.86384, 'mrr': 0.8502181587301586}

**Best Retrieval Method**

Approach 1

In [61]:
from sklearn.model_selection import train_test_split

df_validation, df_test = train_test_split(df_question, test_size=0.5, random_state=42)
gt_val = df_validation.to_dict(orient='records')

In [62]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},  # Adjust filters if needed
        boost_dict=boost,
        num_results=10
    )

    return results


This function interacts with your search index. If no boost parameters are provided, it defaults to an empty dictionary. It returns the top 10 search results based on the query and the optional boost parameters. The boost_dict modifies the importance of specific fields during the search.

In [63]:

param_ranges = {
    'supplier_name': (0.0, 3.0),
    'supplier_type': (0.0, 3.0),
    'risk_level': (0.0, 3.0),
    'compliance_issues': (0.0, 3.0),
    'key_terms': (0.0, 3.0),
    'past_performance': (0.0, 3.0),
    'negotiate_recommendation': (0.0, 3.0),
    'supply_chain_disruption': (0.0, 3.0),
    'quality_metrics': (0.0, 3.0),
    'cost_metrics': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']


This dictionary defines the range of values that each parameter can take during optimization. The values indicate how much weight or importance is assigned to each of these fields when retrieving results from the search index.

The function evaluates the effectiveness of a particular set of boost parameters. It does so by calling minsearch_search with the boost parameters and calculating the Mean Reciprocal Rank (MRR) over the validation set (gt_val). The higher the MRR score, the better the ranking of relevant results.

In [64]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [65]:
best_params, best_score = simple_optimize(param_ranges, objective, n_iterations=20)

print("Best Boost Parameters:", best_params)
print("Best MRR Score:", best_score)


  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

Best Boost Parameters: {'supplier_name': 2.8691396940069374, 'supplier_type': 1.8940336800638193, 'risk_level': 0.923442405280464, 'compliance_issues': 2.407565036605866, 'key_terms': 0.3208613898026407, 'past_performance': 1.4030636901968452, 'negotiate_recommendation': 0.7292841547630614, 'supply_chain_disruption': 1.650281040936573, 'quality_metrics': 0.15471113546029314, 'cost_metrics': 1.2247811572567455}
Best MRR Score: 0.9346627936507936


This is the optimized search function using the best boost parameters obtained from the optimization step. These parameters are applied to the search function, and it is evaluated using the full ground truth dataset.

In [66]:
def minsearch_improved(query):
    boost = {
        'supplier_name': best_params['supplier_name'],
        'supplier_type': best_params['supplier_type'],
        'risk_level': best_params['risk_level'],
        'compliance_issues': best_params['compliance_issues'],
        'key_terms': best_params['key_terms'],
        'past_performance': best_params['past_performance'],
        'negotiate_recommendation': best_params['negotiate_recommendation'],
        'supply_chain_disruption': best_params['supply_chain_disruption'],
        'quality_metrics': best_params['quality_metrics'],
        'cost_metrics': best_params['cost_metrics']
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))


  0%|          | 0/12500 [00:00<?, ?it/s]

{'hit_rate': 0.93696, 'mrr': 0.9342981269841271}

Approach 2

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
# Preprocessing function
def preprocess_text(text):
    # Ensure the text is a string
    if isinstance(text, str):
        text = text.lower()
        # Additional preprocessing steps like removing punctuation can be added here
        return text
    return ''

# Apply preprocessing
df_question['processed_question'] = df_question['question'].apply(preprocess_text)

# Prepare TF-IDF Vectorizer with custom settings
corpus = df_question['processed_question'].tolist()
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Bi-grams
    stop_words='english',  # Use English stop words
    sublinear_tf=True  # Sublinear term frequency scaling
)
X = vectorizer.fit_transform(corpus)

def tfidf_search(query, num_results=10):
    query_processed = preprocess_text(query)
    query_vec = vectorizer.transform([query_processed])
    similarities = cosine_similarity(query_vec, X).flatten()
    top_indices = np.argsort(similarities)[::-1][:num_results]
    return df_question.iloc[top_indices].to_dict(orient='records')

def evaluate_tfidf(ground_truth):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = tfidf_search(q['question'])
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Evaluate TF-IDF retrieval approach
tfidf_results = evaluate_tfidf(ground_truth)
print('TF-IDF Hit Rate:', tfidf_results['hit_rate'])
print('TF-IDF MRR:', tfidf_results['mrr'])

  0%|          | 0/12500 [00:00<?, ?it/s]

TF-IDF Hit Rate: 0.97296
TF-IDF MRR: 0.9553497460317472


In [69]:
#Best one among the the two 
def compare_methods(ground_truth, best_params):
    # Evaluate TF-IDF
    tfidf_results = evaluate_tfidf(ground_truth)
    
    # Evaluate Minsearch
    minsearch_results = evaluate(ground_truth, best_params)

    print("TF-IDF Results:")
    print('Hit Rate:', tfidf_results['hit_rate'])
    print('MRR:', tfidf_results['mrr'])

    print("Minsearch Results:")
    print('Hit Rate:', minsearch_results['hit_rate'])
    print('MRR:', minsearch_results['mrr'])

    if tfidf_results['mrr'] > minsearch_results['mrr']:
        return 'TF-IDF', tfidf_results
    else:
        return 'Minsearch', minsearch_results

In [70]:
results = compare_methods(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

TF-IDF Results:
Hit Rate: 0.97296
MRR: 0.9553497460317472
Minsearch Results:
Hit Rate: 0.93696
MRR: 0.9342981269841271


In [71]:
results

('TF-IDF', {'hit_rate': 0.97296, 'mrr': 0.9553497460317472})

**RAG evaluation**

In [11]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [73]:
len(ground_truth)

12500

In [12]:
record = ground_truth[0]

In [13]:
question = record['question']
answer_llm = rag(question)

In [14]:
print(answer_llm)

Supplier 1 is not listed in the provided context. It seems like there might be some confusion with the supplier names or IDs. Could you please clarify or provide more details about Supplier 1?


In [15]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the risk level of Supplier 1 and what compliance issues have they had?
Generated Answer: Supplier 1 is not listed in the provided context. It seems like there might be some confusion with the supplier names or IDs. Could you please clarify or provide more details about Supplier 1?

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [16]:
import json


In [17]:
df_sample = df_question.sample(n=1250, random_state=1)
sample = df_sample.to_dict(orient='records')

In [18]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )
+
    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/1250 [00:00<?, ?it/s]

In [19]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [20]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.6760
NON_RELEVANT       0.3088
PARTLY_RELEVANT    0.0152
Name: proportion, dtype: float64

In [24]:
df_eval.to_csv(r'/workspaces/Supply-Chain-Management/Data/Evaluation/rag-eval-Llama3.csv', index=False)

In [22]:

df_eval[df_eval.relevance == 'NON_RELEVANT']


Unnamed: 0,answer,id,question,relevance,explanation
2,Supplier 163 is not mentioned in the provided ...,S0163,Have there been any supply chain disruptions w...,NON_RELEVANT,The generated answer does not provide any info...
5,Supplier 961 is not mentioned in the provided ...,S0961,What are the quality and cost metrics for Supp...,NON_RELEVANT,The generated answer does not provide any info...
7,Supplier 929 has not been mentioned in the pro...,S0929,How has Supplier 929 performed in the past?,NON_RELEVANT,The generated answer does not provide any info...
8,Supplier 2322 is not listed in the provided co...,S2322,What is the risk level of Supplier 2322?,NON_RELEVANT,The generated answer does not provide any info...
11,Supplier 652 is not mentioned in the provided ...,S0652,Has there been any supply chain disruption wit...,NON_RELEVANT,The generated answer does not provide any info...
...,...,...,...,...,...
1239,Supplier 309 is not listed in the provided con...,S0309,What are the quality and cost metrics for Supp...,NON_RELEVANT,The generated answer does not provide any info...
1241,Supplier 1855 is not mentioned in the provided...,S1855,How has Supplier 1855 performed in the past an...,NON_RELEVANT,The generated answer does not address the ques...
1242,Supplier 1200 is not mentioned in the provided...,S1200,What is the risk level of Supplier 1200?,NON_RELEVANT,The generated answer does not provide any info...
1248,Supplier 263 has not been mentioned in the pro...,S0263,What compliance issues has Supplier 263 faced ...,NON_RELEVANT,The generated answer does not address the ques...


In [23]:

evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/1250 [00:00<?, ?it/s]

NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt-4o` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'code': 'model_not_found'}}

In [None]:

df_eval = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [None]:
df_eval.to_csv(r'/workspaces/Supply-Chain-Management/Data/Evaluation/rag-eval-gpt-4o.csv', index=False)

In [22]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import pipeline, AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


# Pre-trained sentence similarity model for embedding-based similarity
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Pre-trained model for perplexity evaluation
perplexity_model = pipeline("perplexity", model="Llama3-groq-70b-8192-tool-use-preview")

# Weak supervision rule-based function (Modify the rules based on your data)
def auto_classify_relevance(question, answer_llm):
    """
    Automatically classify relevance based on simple rules or heuristics.
    You can modify or expand this function.
    """
    # Define rule-based heuristics for automatic classification
    if "important_keyword" in answer_llm:  # Example rule based on keyword
        return "RELEVANT"
    elif len(answer_llm) > 50:  # Rule based on the length of the answer
        return "PARTLY_RELEVANT"
    else:
        return "NON_RELEVANT"

# Embedding-based similarity evaluation
def compute_embedding_similarity(question, answer_llm):
    """
    Compute cosine similarity between the question and the generated answer using sentence embeddings.
    """
    embeddings = embedding_model.encode([question, answer_llm])
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return similarity_score

# Perplexity-based evaluation
def compute_perplexity(answer_llm):
    """
    Compute perplexity score of the generated answer.
    """
    perplexity_score = perplexity_model(answer_llm)
    return perplexity_score

# Evaluate on a sample set without manual relevance classification
def evaluate_generated_answers(df_sample):
    evaluations = []

    for record in tqdm(df_sample.to_dict(orient='records')):
        question = record['question']

        # Generate answer with RAG system (Placeholder for your actual RAG system call)
        answer_llm = rag(question)  # Replace with the RAG system call

        # Automatic classification using weak supervision rules
        relevance = auto_classify_relevance(question, answer_llm)

        # Compute embedding similarity
        similarity_score = compute_embedding_similarity(question, answer_llm)

        # Compute perplexity
        perplexity_score = compute_perplexity(answer_llm)

        evaluations.append({
            'id': record['id'],
            'question': question,
            'answer': answer_llm,
            'relevance': relevance,
            'similarity_score': similarity_score,
            'perplexity': perplexity_score
        })

    return pd.DataFrame(evaluations)

# Sample data from the ground truth (Assuming 12,500 records)
df_sample = df_question.sample(n=1250, random_state=1)

# Perform automatic RAG evaluation
df_eval_auto = evaluate_generated_answers(df_sample)

# Analyze the results
print(df_eval_auto['relevance'].value_counts(normalize=True))
print(df_eval_auto[['similarity_score', 'perplexity']].describe())

# Save results to CSV
# df_eval_auto.to_csv(r'/workspaces/Supply-Chain-Management/Data/Evaluation/automatic_rag_evaluation.csv', index=False)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

OSError: Llama3-groq-70b-8192-tool-use-preview is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`