In [1]:
# Cell 1: imports and helper
import hashlib
import json
import os
from pathlib import Path
import time
from typing import Dict, Tuple

def hash_file(path: Path, method: str = "sha256", chunk_size: int = 8192) -> str:
    """
    Return the hex digest of the file at `path`, reading it in chunks so that
    large files never have to fit in memory.

    Args:
        path: File to hash. A plain string path is accepted as well as a `Path`.
        method: Hash algorithm, case-insensitive: 'md5' or 'sha256'
            (the spellings 'sha-256' and 'sha_256' are accepted too).
        chunk_size: Number of bytes requested per read; must be positive.

    Returns:
        Hexadecimal digest of the file contents.

    Raises:
        ValueError: If `method` is not supported or `chunk_size` is not positive.
        OSError: If the file cannot be opened or read.
    """
    method = method.lower()
    if method == "md5":
        hasher = hashlib.md5()
    elif method in ("sha256", "sha-256", "sha_256"):
        hasher = hashlib.sha256()
    else:
        raise ValueError("Unsupported hash method. Use 'md5' or 'sha256'.")

    # read(0) returns b"" immediately, which would end the loop before any data
    # is hashed and silently yield the digest of empty input for every file.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    # Path(...) is a no-op for Path inputs and lets callers pass plain strings.
    with Path(path).open("rb") as f:
        while chunk := f.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()


In [2]:
# Cell 2: create baseline and load/save
def create_baseline(root_dir: str, baseline_path: str, hash_method: str = "sha256") -> Dict[str, Dict]:
    """
    Walk `root_dir` recursively and record a baseline entry for every file.

    Each entry maps the file's path relative to the resolved root (as rendered
    by `str()`, i.e. with the platform's separators) to a dict with the keys
    "hash", "mtime" and "size". The mapping is written to `baseline_path` as
    JSON together with the resolved root, the hash method and a generation
    timestamp, and a one-line confirmation is printed.

    Args:
        root_dir: Directory whose files should be fingerprinted.
        baseline_path: Destination of the JSON baseline (overwritten if present).
        hash_method: Algorithm name passed through to `hash_file`.

    Returns:
        The relative_path -> {hash, mtime, size} mapping that was saved.

    Raises:
        ValueError: From `hash_file` when a file is hashed with an unsupported
            `hash_method`.
        OSError: If a file cannot be read or the baseline cannot be written.
    """
    root = Path(root_dir).resolve()
    baseline = {}
    # Sorted traversal keeps the saved JSON stable between runs, so two
    # baselines of the same tree can be diffed directly.
    for p in sorted(root.rglob("*")):
        if not p.is_file():
            continue
        file_hash = hash_file(p, method=hash_method)
        # A single stat() call so mtime and size describe the same snapshot.
        info = p.stat()
        baseline[str(p.relative_to(root))] = {
            "hash": file_hash,
            "mtime": info.st_mtime,
            "size": info.st_size
        }
    # Save to JSON (json.dump escapes non-ASCII by default; the explicit
    # encoding just removes any dependence on the locale's default codec).
    with open(baseline_path, "w", encoding="utf-8") as f:
        json.dump({
            "root": str(root),
            "hash_method": hash_method,
            "generated_at": time.time(),
            "files": baseline
        }, f, indent=2)
    print(f"✅ Baseline created for '{root}' ({len(baseline)} files) -> '{baseline_path}'")
    return baseline

def load_baseline(baseline_path: str) -> Tuple[Path, str, Dict[str, Dict]]:
    """
    Read a baseline JSON file and unpack the parts needed for comparison.

    Returns a 3-tuple: the recorded root as a `Path`, the recorded hash
    method name, and the per-file mapping stored under "files".
    Raises KeyError if one of the keys "root", "hash_method" or "files" is
    missing from the JSON document.
    """
    with open(baseline_path, "r") as handle:
        payload = json.load(handle)
    return Path(payload["root"]), payload["hash_method"], payload["files"]


In [3]:
# Cell 3: detection function
def _print_change_report(report: dict, baseline_files: dict, current_hashes: dict) -> None:
    """Print the human-readable integrity report for an already computed `report`."""
    print("🔍 File Integrity Check Report")
    print("=" * 40)
    s = report["summary"]
    print(f"Baseline files: {s['baseline_files']}  |  Current files: {s['current_files']}")
    print(f"Added: {s['added']}  |  Deleted: {s['deleted']}  |  Modified: {s['modified']}")
    print("-" * 40)
    if report["added"]:
        print("➕ Added files:")
        for name in report["added"]:
            print("   ", name)
    if report["deleted"]:
        print("❌ Deleted files:")
        for name in report["deleted"]:
            print("   ", name)
    if report["modified"]:
        print("⚠️ Modified files:")
        for name in report["modified"]:
            print("   ", name)
            print(f"      baseline hash: {baseline_files[name]['hash']}")
            print(f"      current  hash: {current_hashes[name]}")
    if not (report["added"] or report["deleted"] or report["modified"]):
        print("✅ No changes detected (files unchanged).")
    print("=" * 40)


def detect_changes(baseline_path: str, verbose: bool = True) -> Dict[str, list]:
    """
    Compare the files currently under the baseline's root with the baseline JSON.

    Files are matched by path relative to the root and compared by content
    hash only, using the hash method recorded in the baseline. The baseline
    file itself is ignored on both sides when it lives under the root: it is
    rewritten every time a baseline is created, so it would otherwise always
    show up as added or modified.

    Args:
        baseline_path: Path of the JSON file written by `create_baseline`.
        verbose: When true, also print a formatted report.

    Returns:
        A dict with the sorted lists "added", "deleted", "modified" and
        "unchanged" (relative paths), plus "summary", a dict holding the
        counts "baseline_files", "current_files", "added", "deleted",
        "modified" and "unchanged".

    Raises:
        OSError: If the baseline or a monitored file cannot be read.
        KeyError: If the baseline JSON lacks an expected key.
    """
    root, method, baseline_files = load_baseline(baseline_path)
    root = root.resolve()

    # Relative key of the baseline file when it is stored inside the root.
    try:
        baseline_key = str(Path(baseline_path).resolve().relative_to(root))
    except ValueError:
        baseline_key = None  # baseline lives outside the monitored tree
    if baseline_key in baseline_files:
        baseline_files = {k: v for k, v in baseline_files.items() if k != baseline_key}

    # Only the content hash takes part in the comparison, so nothing else is
    # collected for the current tree.
    current_hashes = {}
    for p in root.rglob("*"):
        if not p.is_file():
            continue
        rel = str(p.relative_to(root))
        if rel == baseline_key:
            continue
        current_hashes[rel] = hash_file(p, method=method)

    baseline_set = set(baseline_files)
    current_set = set(current_hashes)

    added = sorted(current_set - baseline_set)
    deleted = sorted(baseline_set - current_set)

    modified = []
    unchanged = []
    for rel in sorted(baseline_set & current_set):
        if baseline_files[rel]["hash"] != current_hashes[rel]:
            modified.append(rel)
        else:
            unchanged.append(rel)

    report = {
        "added": added,
        "deleted": deleted,
        "modified": modified,
        "unchanged": unchanged,
        "summary": {
            "baseline_files": len(baseline_files),
            "current_files": len(current_hashes),
            "added": len(added),
            "deleted": len(deleted),
            "modified": len(modified),
            "unchanged": len(unchanged)
        }
    }

    if verbose:
        _print_change_report(report, baseline_files, current_hashes)
    return report


In [4]:
# Cell 4: demonstration (safe small workspace in notebook)
from pathlib import Path
import shutil

# Work inside a throwaway directory; wipe any leftover from a previous run
# so the demo always starts from the same state.
demo_root = Path("integrity_demo")
if demo_root.exists():
    shutil.rmtree(demo_root)
demo_root.mkdir()

# Sample tree: relative path -> initial file contents
files = {
    "bin/run.exe": b"original binary content v1",
    "lib/module.py": b"print('hello world')\n",
    "config/settings.ini": b"[DEFAULT]\nsetting=1\n",
    "readme.txt": b"This is a demo for integrity checking.\n"
}

# Materialise the sample tree, creating parent folders on demand
for rel_name, payload in files.items():
    target = demo_root / rel_name
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(payload)

# Record the pristine state
baseline_file = "baseline_demo.json"
create_baseline(str(demo_root), baseline_file, hash_method="sha256")

# Simulate tampering with one change of each kind:
#  - overwrite lib/module.py          (modified)
#  - remove config/settings.ini       (deleted)
#  - drop in a new suspicious.txt     (added)
mod = demo_root / "lib" / "module.py"
mod.write_text("print('pwned by malware')\n")
(demo_root / "config" / "settings.ini").unlink()
(demo_root / "suspicious.txt").write_text("malicious changes here\n")

# Compare the tampered tree against the baseline
report = detect_changes(baseline_file)


✅ Baseline created for 'C:\Users\awans\integrity_demo' (4 files) -> 'baseline_demo.json'
🔍 File Integrity Check Report
Baseline files: 4  |  Current files: 4
Added: 1  |  Deleted: 1  |  Modified: 1
----------------------------------------
➕ Added files:
    suspicious.txt
❌ Deleted files:
    config\settings.ini
⚠️ Modified files:
    lib\module.py
      baseline hash: 2d543015627a771436b30ea79fd0ecda8df8bcd77b3d55661caf5a0d6e809886
      current  hash: 8d37da138367eeeb83c76cfca6262dfbfcf2e598cc4466d070ef0078e6202a59
