<a href="https://colab.research.google.com/github/scandy-dxd/scandy-dxd/blob/main/Untitled14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
!pip install pdfplumber



In [77]:

import pdfplumber
import spacy  # For NLP tasks
from transformers import AutoModel, AutoTokenizer
import torch


In [94]:
def load_nlp_model(model_name="en_core_web_sm"):
    """Return a spaCy pipeline, loading it from disk only on first use.

    Loading a spaCy model is slow, and this function is called from several
    helpers, so the loaded pipeline is cached per model name and the same
    object is handed back on later calls.

    Args:
        model_name: Name of the installed spaCy model to load. Defaults to
            "en_core_web_sm"; pass a larger model for better accuracy.

    Returns:
        The loaded spaCy pipeline. The object is shared between callers, so
        avoid mutating it (e.g. adding pipes) unless that is intended globally.
    """
    # The cache lives on the function object so no extra import or
    # module-level global is needed.
    cache = load_nlp_model.__dict__.setdefault("_loaded_models", {})
    if model_name not in cache:
        cache[model_name] = spacy.load(model_name)
    return cache[model_name]
def preprocess_text(text):
    """Tokenize ``text`` with the spaCy pipeline and return the lowercased token strings.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list with one lowercase string per token, in document order.
    """
    pipeline = load_nlp_model()
    lowered_tokens = []
    for tok in pipeline(text):
        lowered_tokens.append(tok.text.lower())
    return lowered_tokens


In [95]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined with newlines, or ``None`` if the file could
        not be opened or parsed (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None for a page without a text layer;
            # treat that as an empty page instead of failing the whole document.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        # Best-effort: callers test the result for truthiness.
        print(f"Error extracting text from PDF: {e}")
        return None
    # Start from the page contents only (not the file path) and keep a
    # separator so words at page boundaries do not run together.
    return "\n".join(page_texts)


In [102]:
def extract_candidate_rules(policy_text):
    """Extract candidate rules (condition/action pairs) from policy text.

    The text is parsed with the spaCy pipeline and inspected one sentence at
    a time. The single example rule fires when a sentence mentions both
    "harassment" and "report" (case-insensitive) without also mentioning
    "supervisor" (case-insensitive) or "HR" (exact case, to avoid matching
    unrelated lowercase "hr").

    Args:
        policy_text: Full policy text as a string.

    Returns:
        A list of ``{"condition": ..., "action": ...}`` dicts. The list is
        empty when nothing matches; the example rule is emitted at most once
        because every match would produce an identical entry.
    """
    nlp = load_nlp_model()
    nlp_doc = nlp(policy_text)
    candidate_rules = []

    harassment_rule = {
        "condition": "If a sentence mentions 'harassment' and 'report' but not 'supervisor' or 'HR'",
        "action": "it suggests a potential policy violation.",
    }

    for sentence in nlp_doc.sents:
        # Build the word sets once per sentence instead of once per token.
        exact_words = {token.text for token in sentence}
        lowered_words = {word.lower() for word in exact_words}

        mentions_topic = {"harassment", "report"} <= lowered_words
        names_contact = "supervisor" in lowered_words or "HR" in exact_words
        if mentions_topic and not names_contact:
            candidate_rules.append(harassment_rule)
            break  # further matches would only duplicate the same rule

    # Return only after every sentence has been examined.
    return candidate_rules



In [103]:
def _embed_text(model, tokenizer, text):
    """Return a (1, hidden_dim) sentence embedding via attention-masked mean pooling."""
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    # Zero out padded positions so they do not dilute the average.
    mask = encoded["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts


def evaluate_rules_with_model(model, tokenizer, rules, user_prompt, policy_sections):
    """Score each rule against a user prompt by embedding similarity.

    Sentence embeddings are the attention-masked mean of the encoder's token
    states, which is the pooling sentence-transformers checkpoints are
    designed for and which also works for encoders without a ``pooler_output``.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Matching tokenizer; called with ``return_tensors="pt"``.
        rules: Iterable of dicts with at least a ``"condition"`` key.
        user_prompt: Text the rules are compared against.
        policy_sections: Section strings forwarded to ``evaluate_rule_in_policy``.

    Returns:
        One dict per rule with keys ``"rule"``, ``"average_similarity"``
        (cosine similarity averaged over the rule's snippets) and
        ``"policy_sections"`` (sections matching the rule's condition).
    """
    user_prompt_embedding = _embed_text(model, tokenizer, user_prompt)

    rule_evaluations = []
    for rule in rules:
        # One snippet per rule for now; the averaging below already supports several.
        condition_snippets = [f"I was {rule['condition']}."]
        rule_similarities = []
        for snippet in condition_snippets:
            snippet_embedding = _embed_text(model, tokenizer, snippet)
            similarity = torch.nn.functional.cosine_similarity(
                user_prompt_embedding, snippet_embedding
            ).item()
            rule_similarities.append(similarity)

        average_similarity = sum(rule_similarities) / len(rule_similarities)
        rule_evaluations.append({
            "rule": rule,
            "average_similarity": average_similarity,
            "policy_sections": evaluate_rule_in_policy(policy_sections, rule["condition"]),
        })

    return rule_evaluations



In [104]:
def evaluate_rule_in_policy(policy_sections, rule_condition):
    """Return the policy sections that contain the rule condition verbatim.

    This is a plain, case-sensitive substring test; swap in a richer NLP
    comparison if literal matching proves too strict.

    Args:
        policy_sections: Iterable of section strings to search.
        rule_condition: Text looked for inside each section.

    Returns:
        A list of the matching sections, in their original order.
    """
    return [section_text for section_text in policy_sections if rule_condition in section_text]



In [105]:
def refine_and_integrate_rules(rule_evaluations, threshold=0.7):
    """Interactively keep the rules that pass the automatic checks and a manual review.

    A rule is shown to the user only when its average similarity is strictly
    above ``threshold`` and at least one policy section matched it. The rule
    is kept when the user answers "y" (any letter case).

    Args:
        rule_evaluations: Dicts with ``"rule"``, ``"average_similarity"`` and
            ``"policy_sections"`` keys.
        threshold: Similarity a rule must exceed to be reviewed.

    Returns:
        The list of rule dicts the user confirmed.
    """
    accepted = []
    for item in rule_evaluations:
        # Automatic filters first: similarity, then presence in the policy.
        if not (item["average_similarity"] > threshold):
            continue
        if not item["policy_sections"]:
            continue

        candidate = item["rule"]
        print(f"Rule: {candidate['condition']}\nAction: {candidate['action']}")
        print(f"Similarity: {item['average_similarity']}\nPolicy Sections: {item['policy_sections']}")

        answer = input("Is this rule valid (y/n)? ")
        if answer.lower() != "y":
            continue
        accepted.append(item["rule"])

    return accepted

In [116]:
# Demo run: substitute your own checkpoint, tokenizer and PDF location.
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
pdf_path = "/content/sample_data/Company-Policy-and-Procedure-June-1.18-V6.0.pdf"

# Step 1: pull the raw text out of the PDF.
policy_text = extract_text_from_pdf(pdf_path)

if not policy_text:
    print("Error extracting text from the PDF.")
else:
    # Step 2: mine candidate rules from the extracted text.
    rules = extract_candidate_rules(str(policy_text))

    # Placeholder list: supply the real policy sections (hand-picked or parsed).
    policy_sections = ["HEALTH LITERACY "]

    if not rules:
        print("No candidate rules extracted from the policy text.")
    else:
        # Step 3: score the candidates against the prompt, then review them manually.
        rule_evaluations = evaluate_rules_with_model(
            model,
            tokenizer,
            rules,
            user_prompt="What language should consumer materials be in ?.",
            policy_sections=policy_sections,
        )
        refined_rules = refine_and_integrate_rules(rule_evaluations)

        if not refined_rules:
            print("No refined rules generated based on the current settings.")
        else:
            print("Refined Rules:")
            for rule in refined_rules:
                print(f"- {rule['condition']}: {rule['action']}")

No candidate rules extracted from the policy text.
