# TF‑IDF & Cosine Similarity — Derivations

This notebook documents the mathematical formulas behind TF‑IDF‑based compliance drift detection and provides reproducible code snippets that implement them.

Outline:
1. Environment & dependencies
2. Project layout & path variables
3. Load metadata and documents
4. Text preprocessing
5. TF‑IDF vectorization
6. Cosine similarity calculations
7. Drift computation across versions
8. Threshold calibration & statistical checks
9. Alerting & export of results
10. Streamlit dashboard integration
11. Unit tests for core modules
12. Reproduce experiments & notebooks
13. Run & debug in VS Code

In [None]:
# Section 1 — Environment & dependencies

# Install packages (run in your environment once)
!pip install -r ../requirements.txt

# Verify python and key package versions
!python --version
python -c "import sklearn, pandas, streamlit; import sys; print('sklearn', sklearn.__version__); print('pandas', pandas.__version__); print('streamlit', streamlit.__version__)"

In [None]:
# Section 2 — Project layout & path variables
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to run from `<project>/notebooks`).
ROOT = Path('..').resolve()
DATA = ROOT / 'data'
SRC = ROOT / 'src'
NOTEBOOKS = ROOT / 'notebooks'
RESULTS = ROOT / 'results'
DASHBOARD = ROOT / 'dashboard'

# Later cells import from the `src` package, which lives under ROOT. ROOT is
# not the working directory, so put it on the import path explicitly; this is
# a no-op when it is already there (e.g. on re-running the cell).
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print('ROOT:', ROOT)
print('DATA:', DATA)

# helper: list the .txt documents found in each corpus folder
for subdir in ('reference', 'internal'):
    print(f'\n{subdir} files:')
    for p in sorted((DATA / subdir).glob('*.txt')):
        print('-', p.name)

METADATA_CSV = DATA / 'metadata.csv'
print('\nmetadata:', METADATA_CSV)


In [None]:
# Section 3 — Load metadata and documents
import pandas as pd
from pathlib import Path
from IPython.display import display

meta = pd.read_csv(METADATA_CSV)

# Fail fast with a readable message if the CSV lacks a column that this
# notebook reads later (`filename` below; `doc_id`, `version`, `date` in the
# preview and export cells).
_missing_cols = {'doc_id', 'version', 'date', 'filename'} - set(meta.columns)
assert not _missing_cols, f'metadata.csv is missing columns: {sorted(_missing_cols)}'

# A bare expression in the middle of a cell is evaluated and thrown away, so
# the preview has to be displayed explicitly.
display(meta.head())


def _doc_path(fn):
    """Resolve a metadata filename to its location under DATA.

    Files whose name starts with 'guideline' are looked up in `reference/`,
    everything else in `internal/`.
    """
    subdir = 'reference' if fn.startswith('guideline') else 'internal'
    return DATA / subdir / fn


# Map doc_id -> file path (assumes filenames in metadata)
meta['path'] = meta['filename'].apply(_doc_path)

# Load text into dataframe
def load_texts(df):
    """Read every row's document from disk into a ``text`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``path`` column whose values provide ``read_text``
        (this notebook stores ``pathlib.Path`` objects there).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: a ``text`` column is added
        (or overwritten) with the UTF-8 decoded file contents. Rows whose file
        cannot be read get an empty string.

    Warns
    -----
    UserWarning
        One warning per unreadable document, naming the path and the error, so
        that missing or mis-encoded files do not silently become empty texts.
    """
    import warnings

    texts = []
    for p in df['path']:
        try:
            texts.append(p.read_text(encoding='utf-8'))
        except Exception as exc:
            # Best-effort load: keep the row so downstream indices stay
            # aligned, but make the failure visible instead of hiding it.
            warnings.warn(f'Could not read {p!r}: {exc!r}; using empty text', stacklevel=2)
            texts.append('')
    df['text'] = texts
    return df

# Attach the document bodies, then preview the first rows restricted to the
# identifying columns plus the freshly loaded text.
meta = load_texts(meta)
meta.head()[['doc_id', 'version', 'date', 'path', 'text']]

In [None]:
# Section 4 — Text preprocessing (using src/preprocess.py)
from src.preprocess import clean_text

# Demo input: the first loaded document, or a canned sentence when the
# metadata table has no rows.
if len(meta) > 0:
    sample = meta['text'].iloc[0]
else:
    sample = 'Example text, with punctuation.'

cleaned = clean_text(sample)

# Show only the first 200 characters of each version to keep the output short.
print('Original:', sample[:200])
print('\nCleaned:', cleaned[:200])

# Whitespace tokenization followed by a minimal sanity check on the result type.
tokens = cleaned.split()
assert isinstance(tokens, list)
print('\nTokens count:', len(tokens))

In [None]:
# Section 5 — TF‑IDF vectorization (src/vectorize.py)
from IPython.display import Markdown
from src.vectorize import fit_vectorizer, transform_documents

# prepare corpora: reference vs internal
# `meta['path']` holds pathlib.Path objects, which pandas' `.str` methods do
# not match against, so a `.str.contains('reference')` mask cannot be used.
# Classify by the containing folder instead; this also avoids false positives
# from 'reference' appearing elsewhere in the absolute path or the filename.
is_reference = meta['path'].map(lambda p: p.parent.name == 'reference').astype(bool)
refs = meta.loc[is_reference, 'text'].tolist()
internals = meta.loc[~is_reference, 'text'].tolist()

# Fit the vocabulary/IDF on the reference corpus only, then project the
# internal documents into that same feature space.
vect, X_refs = fit_vectorizer(refs, max_features=2000)
X_internals = transform_documents(vect, internals)

print('Reference matrix shape:', X_refs.shape)
print('Internal matrix shape:', X_internals.shape)

# TF-IDF formula
# Raw string: in a normal string literal the `\f` of `\frac` is a form-feed
# escape, which corrupts the LaTeX before it reaches the renderer.
# NOTE(review): this is the textbook weighting; the exact variant applied by
# `fit_vectorizer` is not visible here (scikit-learn's TfidfVectorizer, for
# example, defaults to a smoothed IDF with L2 row normalisation) — confirm
# against src/vectorize.py.
display_markdown = r"""
The TF‑IDF weighting used is:

$$tfidf_{t,d} = tf_{t,d} \cdot \log\frac{N}{df_t}$$

where $tf_{t,d}$ is term frequency and $df_t$ is document frequency.
"""

Markdown(display_markdown)


In [None]:
# Section 6 — Cosine similarity calculations (src/similarity.py)
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from IPython.display import Markdown

# compute pairwise similarities between internals and refs
# Rows index internal documents, columns index reference documents.
sim_matrix = cosine_similarity(X_internals, X_refs)
print('Similarity matrix shape (internals x refs):', sim_matrix.shape)

# show top-1 match for each internal doc
# `top_indices[i]` is the column (reference position) with the highest
# similarity for internal document `i`.
top_indices = np.argmax(sim_matrix, axis=1)
for i, idx in enumerate(top_indices):
    print(f'Internal doc {i} best matches reference {idx} with sim={sim_matrix[i, idx]:.3f}')

# formula
# Raw string: in a normal string literal the `\f` of `\frac` is a form-feed
# escape, which corrupts the LaTeX before it reaches the renderer.
Markdown(r'''$$\cos(\mathbf{a},\mathbf{b})=\frac{\mathbf{a}\cdot\mathbf{b}}{\|\mathbf{a}\|\,\|\mathbf{b}\|}$$''')

In [None]:
# Section 7 — Drift computation across versions (src/drift.py)
import numpy as np
from src.drift import compute_drift

# compute drift per internal doc as 1 - max similarity to refs
drift_scores = compute_drift(sim_matrix)
print('Drift scores:', drift_scores)

# If metadata contains versions, we can aggregate per doc_id across versions
# Example: construct a simple dataframe tying internals to doc_ids (if present)
# `meta['path']` holds pathlib.Path objects, which pandas' `.str` methods do
# not match against, so select internal rows by their containing folder.
is_internal = meta['path'].map(lambda p: p.parent.name != 'reference').astype(bool)
internal_meta = meta.loc[is_internal].reset_index(drop=True)

# One score per internal document is required for the column assignment
# below; check it up front so a mismatch is reported in plain terms.
assert len(drift_scores) == len(internal_meta), (
    f'{len(drift_scores)} drift scores for {len(internal_meta)} internal documents'
)
internal_meta['drift_score'] = drift_scores
internal_meta[['doc_id','version','date','drift_score']].head()

In [None]:
# Section 8 — Threshold calibration & statistical checks
import numpy as np

# Calibrate on observed scores only. pandas' mean()/std() already skip NaN, so
# the percentile uses the same sample instead of treating missing scores as
# zero drift (which would pull the threshold down).
valid_scores = internal_meta['drift_score'].dropna()

# percentile-based threshold example
# With no valid scores there is nothing to take a percentile of; use NaN so
# consumers can detect the situation and fall back to a default.
threshold_95 = np.percentile(valid_scores, 95) if len(valid_scores) > 0 else np.nan
print('95th percentile threshold:', threshold_95)

# simple z-score calibration
ZSCORE_EPS = 1e-9  # keeps the division finite when all scores are identical (sigma == 0)
mu = valid_scores.mean()
sigma = valid_scores.std(ddof=0)  # population standard deviation
internal_meta['zscore'] = (internal_meta['drift_score'] - mu) / (sigma + ZSCORE_EPS)
print('\nmean, std:', mu, sigma)

# bootstrap example (small, illustrative only)
import random
def bootstrap_percentile(scores, n=1000, seed=None):
    """Bootstrap a 95% confidence interval for the 95th percentile of ``scores``.

    Parameters
    ----------
    scores : iterable of float
        Observed values to resample from.
    n : int, default 1000
        Number of bootstrap resamples; each has the same size as ``scores``
        and is drawn with replacement.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used so the result is
        reproducible and the global ``random`` state is left untouched. When
        ``None``, the global ``random`` module state is used.

    Returns
    -------
    numpy.ndarray
        Two values: the 2.5th and 97.5th percentiles of the resampled
        95th-percentile estimates. Both are NaN when ``scores`` is empty or
        ``n`` is not positive.
    """
    # Materialise once; rebuilding the list for every drawn element makes the
    # resampling needlessly quadratic in len(scores).
    pool = list(scores)
    if not pool or n <= 0:
        return np.full(2, np.nan)

    rng = random if seed is None else random.Random(seed)
    size = len(pool)
    estimates = [np.percentile(rng.choices(pool, k=size), 95) for _ in range(n)]
    return np.percentile(estimates, [2.5, 97.5])

# Seed the generator right before resampling so the printed interval is the
# same on every Restart & Run All.
random.seed(42)

# Resample only observed scores; missing values carry no information about
# the drift distribution.
bootstrap_scores = internal_meta['drift_score'].dropna().values
if len(bootstrap_scores) > 0:
    ci = bootstrap_percentile(bootstrap_scores, n=200)
    print('Bootstrap 95% CI for 95th percentile:', ci)
else:
    print('Not enough data for bootstrap example')

In [None]:
# Section 9 — Alerting & export of results (src/alerts.py)
import pandas as pd
from src.alerts import make_alerts

# Default used when the calibrated percentile threshold is NaN.
FALLBACK_THRESHOLD = 0.4
threshold = float(threshold_95) if not np.isnan(threshold_95) else FALLBACK_THRESHOLD
alerts_df = make_alerts(internal_meta['doc_id'].tolist(), internal_meta['drift_score'].fillna(0).values, threshold)

# add extra metadata columns
# NOTE(review): these assignments align on index labels, so they assume
# `make_alerts` returns one row per input document, in input order, with a
# default 0..n-1 index — confirm against src/alerts.py.
alerts_df['version'] = internal_meta['version']
alerts_df['date'] = internal_meta['date']

# save results
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists (a no-op when it is already there).
RESULTS.mkdir(parents=True, exist_ok=True)
RESULTS_PATH = RESULTS / 'drift_alerts.csv'
SIM_PATH = RESULTS / 'similarity_scores.csv'
alerts_df.to_csv(RESULTS_PATH, index=False)
print('Wrote', RESULTS_PATH)

# simplified similarity export: top match per internal doc
# `ref_id` is the column position of the best-matching reference in the
# similarity matrix; -1 with similarity 0.0 marks rows without a match entry.
sim_rows = []
for i, doc_id in enumerate(internal_meta['doc_id']):
    has_match = i < len(top_indices)
    ref_idx = int(top_indices[i]) if has_match else -1
    similarity = float(sim_matrix[i, ref_idx]) if has_match else 0.0
    sim_rows.append({'doc_id': doc_id, 'ref_id': ref_idx, 'similarity': similarity})

pd.DataFrame(sim_rows).to_csv(SIM_PATH, index=False)
print('Wrote', SIM_PATH)


In [None]:
# Section 10 — Streamlit dashboard integration (dashboard/app.py)

# Run with:
# streamlit run ../dashboard/app.py

import pandas as pd
from IPython.display import Markdown, display

# Only the last expression of a cell is rendered automatically, so anything
# shown earlier in the cell (or inside a block) needs an explicit display().
display(Markdown('Streamlit app uses `results/drift_alerts.csv` and `results/similarity_scores.csv`.'))

# Quick preview of results
try:
    df_alerts = pd.read_csv(RESULTS_PATH)
except (NameError, FileNotFoundError, pd.errors.EmptyDataError):
    # NameError: the export cell has not run in this kernel (RESULTS_PATH is
    # undefined); the other two: the CSV is absent or has no content yet.
    # Any other failure is a real problem and is allowed to surface.
    print('No results yet — run the export cell to generate CSVs.')
else:
    display(df_alerts.head())


In [None]:
# Section 11 — Unit tests for core modules (examples)

# Sample pytest module, held as a string so the notebook can show it without
# executing it. Save the text as tests/test_core.py to run it with pytest.
example_tests = """
import pytest
from src.preprocess import clean_text
from src.vectorize import fit_vectorizer


def test_clean_text():
    s = 'Hello, WORLD!!!'
    assert clean_text(s) == 'hello world'


def test_vectorizer_small():
    docs = ['a b c', 'a b', 'b c']
    vect, X = fit_vectorizer(docs, max_features=10)
    assert X.shape[0] == 3
"""

# Emit the snippet as plain text.
print(example_tests)

# Run tests: `pytest -q`

In [None]:
# Section 12 — Reproduce experiments & notebooks

import random
import numpy as np
import pandas as pd
from IPython.display import display

# Fix both global generators so any stochastic step below is repeatable.
random.seed(42)
np.random.seed(42)

# Example: sweep max_features
params = [500, 1000, 2000]
results = []
for mf in params:
    # Use a sweep-local name for the vectorizer so the `vect` fitted in the
    # vectorization cell is not overwritten by this experiment.
    sweep_vect, Xr = fit_vectorizer(refs, max_features=mf)
    Xi = transform_documents(sweep_vect, internals)
    sim = cosine_similarity(Xi, Xr)
    # Drift per internal document: one minus its best reference similarity.
    drift = 1 - sim.max(axis=1)
    results.append({'max_features': mf, 'mean_drift': float(drift.mean())})

# More statements follow in this cell, so the table would not be rendered as
# the cell's value; display it explicitly.
display(pd.DataFrame(results))

# Section 13 — Run & debug in VS Code

# Tips:
# - Run individual cells with the Run Cell button.
# - Use the integrated terminal to run modules: `python -m src.vectorize` (if you add an entrypoint).
# - Run tests from Test Explorer or via `pytest -q`.
# - To debug, open a `.py` file, set breakpoints, and Run > Start Debugging or use the debug pane.

print('Notebook: environment & reproducible math derivations for TF-IDF drift detection')