# 🧠 Code Requirement Inference Tool
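This notebook extracts features (functions, variables, string literals, and comments) from a Python script and infers business requirements by matching them against a glossary. The glossary itself is auto-generated from the extracted terms with the help of spaCy.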

In [None]:
import ast
import tokenize
import difflib
import json
import re
from io import StringIO
from pathlib import Path
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# guess_description() uses it to tag parts of speech in glossary terms.
nlp = spacy.load("en_core_web_sm")

def extract_features_from_code(code: str):
    """Collect functions, assigned names, string literals and comments from source.

    Args:
        code: Python source text to analyse.

    Returns:
        A dict with four lists:

        * ``functions`` - one ``{"name": ..., "doc": ...}`` dict per ``def`` /
          ``async def``, where ``doc`` is the first docstring line (or ``""``).
        * ``variables`` - names bound by plain ``name = value`` assignments.
        * ``strings`` - the value of every string constant, docstrings
          included, each recorded once per occurrence.
        * ``comments`` - ``#`` comments with surrounding whitespace stripped.

        All four lists are empty when *code* cannot be parsed.
    """
    features = {
        "functions": [],
        "variables": [],
        "strings": [],
        "comments": []
    }
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError covers sources containing null bytes on older Pythons.
        return features

    # Comments are not part of the AST, so they are taken from the token stream.
    try:
        for token_type, token_string, *_ in tokenize.generate_tokens(StringIO(code).readline):
            if token_type == tokenize.COMMENT:
                features["comments"].append(token_string.strip())
    except tokenize.TokenError:
        # Best effort: keep whatever comments were seen before the error.
        pass

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            doc = ast.get_docstring(node)
            features["functions"].append({
                "name": node.name,
                "doc": doc.splitlines()[0] if doc else ""
            })
        elif isinstance(node, ast.Assign):
            for target in node.targets:
                if isinstance(target, ast.Name):
                    features["variables"].append(target.id)
        elif isinstance(node, ast.Constant) and isinstance(node.value, str):
            # ast.walk also visits the Constant inside expression statements
            # (docstrings), so handling Constant alone records each string
            # exactly once. The value is the str itself, not another node.
            features["strings"].append(node.value)
    return features


In [None]:
def guess_description(term):
    """Build a one-sentence placeholder description for a glossary term.

    Underscores in *term* are replaced by spaces before it is run through
    the module-level spaCy pipeline. If any token is tagged NOUN or PROPN,
    the first such token (lower-cased) names the concept. Otherwise the
    sentence falls back to quoting the term itself, with different wording
    depending on whether the term is a valid identifier.
    """
    prefix = "This represents"
    parsed = nlp(term.replace("_", " "))
    noun_like = [tok.text for tok in parsed if tok.pos_ in ("NOUN", "PROPN")]

    if noun_like:
        return f"{prefix} a {noun_like[0].lower()}."
    if term.isidentifier():
        return f"{prefix} `{term}` in the system."
    return f"{prefix} an unknown concept: `{term}`."

def scaffold_glossary(features):
    """Create a term -> description glossary from extracted features.

    The distinct entries of ``features["variables"]`` and
    ``features["strings"]`` (either may be absent) become the glossary keys.
    Non-string entries are skipped. Each description comes from
    ``guess_description``.
    """
    candidates = set(features.get("variables", []) + features.get("strings", []))
    glossary = {}
    for candidate in candidates:
        if not isinstance(candidate, str):
            continue
        glossary[candidate] = guess_description(candidate)
    return glossary


In [None]:
def fuzzy_match(term, glossary, cutoff=0.6):
    """Find the glossary key that most closely resembles *term*, ignoring case.

    Args:
        term: The string to look up.
        glossary: Mapping of glossary term -> description.
        cutoff: Minimum ``difflib`` similarity ratio (0..1) for a match.

    Returns:
        ``(key, description)`` for the best match, where ``key`` is the
        glossary key exactly as stored, or ``(None, None)`` when nothing is
        similar enough.
    """
    # Lower-case both sides: difflib compares characters case-sensitively, so
    # lowering only the term would make keys such as "ID" unmatchable.
    # setdefault keeps the first key when two keys differ only by case.
    keys_by_lowered = {}
    for key in glossary:
        keys_by_lowered.setdefault(key.lower(), key)

    matches = difflib.get_close_matches(
        term.lower(), list(keys_by_lowered), n=1, cutoff=cutoff
    )
    if not matches:
        return None, None
    original_key = keys_by_lowered[matches[0]]
    return original_key, glossary[original_key]

def infer_requirements(features, glossary):
    """Derive candidate requirement statements from extracted code features.

    Three heuristics are applied, in this order:

    1. A function whose name contains "fee" or "penalt" (case-insensitive)
       yields a penalty/fee requirement.
    2. A variable that fuzzy-matches a glossary term yields a "uses" statement
       quoting the glossary description.
    3. A comment containing "validate" (case-insensitive) yields a validation
       requirement.

    Args:
        features: Dict with optional ``functions``, ``variables`` and
            ``comments`` lists; missing keys are treated as empty.
        glossary: Mapping of glossary term -> description.

    Returns:
        A list of dicts with ``text``, ``confidence`` and ``evidence`` keys.
    """
    requirements = []
    fee_pattern = re.compile(r"fee|penalt", re.IGNORECASE)

    def score_confidence(evidence_count):
        # 0.5 baseline plus 0.1 per piece of evidence, capped at 1.0.
        return round(min(1.0, 0.5 + 0.1 * evidence_count), 2)

    for func in features.get("functions", []):
        if fee_pattern.search(func["name"]):
            requirements.append({
                "text": "The system calculates a penalty or fee for a condition.",
                "confidence": score_confidence(2),
                "evidence": f"Function: `{func['name']}`"
            })

    for var in features.get("variables", []):
        match, desc = fuzzy_match(var, glossary)
        if match is not None:
            # Descriptions may already end in punctuation; only add a period
            # when one is missing so the sentence does not end in "..".
            sentence = str(desc)
            if not sentence.endswith((".", "!", "?")):
                sentence += "."
            requirements.append({
                "text": f"The system uses `{match}`: {sentence}",
                "confidence": score_confidence(2),
                "evidence": f"Variable: `{var}` matched glossary: `{match}`"
            })

    for comment in features.get("comments", []):
        if "validate" in comment.lower():
            requirements.append({
                "text": "The system includes validation logic.",
                "confidence": score_confidence(1),
                "evidence": f"Comment: `{comment}`"
            })

    return requirements


In [None]:
# 📥 Input Python script (you can paste it here or load from file)
# The sample below contains one comment, one function and one string
# assignment, so each kind of extracted feature has something to find.
code = """# Calculate late fees
def calculate_late_fee(days_overdue):
    if days_overdue > 0:
        return days_overdue * 5
    return 0

customer_id = "ABC123"
"""

# Extract functions, variables, strings and comments from the sample source.
features = extract_features_from_code(code)
# Bare expression: shown as the cell's output when run in a notebook.
features


In [None]:
# 📚 Generate glossary from code
# Maps each extracted variable name / string literal to a guessed description.
glossary = scaffold_glossary(features)
# Bare expression: shown as the cell's output when run in a notebook.
glossary


In [None]:
# 💡 Infer requirements
# Produces a list of dicts with "text", "confidence" and "evidence" keys.
requirements = infer_requirements(features, glossary)
# Bare expression: shown as the cell's output when run in a notebook.
requirements
