In [14]:
import os
import random
import pickle
import hashlib
from typing import List, Set

class MinHash:
    """Compute MinHash signatures for sets of string tokens.

    Each of the ``num_hashes`` signature slots is the minimum, over all
    tokens, of an affine hash ``(a * h(token) + b) % max_hash`` where the
    ``(a, b)`` pairs are drawn once from a seeded pseudo-random generator.

    The base token hash ``h`` is derived from a cryptographic digest rather
    than the built-in ``hash()``: Python salts ``str`` hashes per process, so
    signatures built with ``hash()`` cannot be compared with signatures that
    were persisted by an earlier interpreter session.

    Args:
        num_hashes: Number of hash functions, i.e. the signature length.
        seed: Seed for the private generator that draws the ``(a, b)``
            coefficients. The default reproduces the coefficients obtained
            from ``random.seed(42)``.

    Raises:
        ValueError: If ``num_hashes`` is not positive.
    """

    def __init__(self, num_hashes: int = 100, seed: int = 42):
        if num_hashes <= 0:
            raise ValueError("num_hashes must be a positive integer")
        self.num_hashes = num_hashes
        self.seed = seed
        self.max_hash = (1 << 32) - 1
        self.hash_funcs = self._generate_hash_functions()

    def _generate_hash_functions(self):
        """Return ``num_hashes`` pairs ``(a, b)`` with ``a >= 1`` and ``b >= 0``."""
        # A private generator keeps construction from reseeding the global
        # `random` module state that other code may be relying on.
        rng = random.Random(self.seed)
        return [
            (rng.randint(1, self.max_hash), rng.randint(0, self.max_hash))
            for _ in range(self.num_hashes)
        ]

    @staticmethod
    def _stable_hash(x: str) -> int:
        """Map a token to a 64-bit integer that is identical in every process."""
        digest = hashlib.blake2b(
            x.encode("utf-8", "surrogatepass"), digest_size=8
        ).digest()
        return int.from_bytes(digest, "big")

    def _hash(self, x: str, a: int, b: int) -> int:
        """Apply the affine hash with coefficients ``(a, b)`` to token ``x``."""
        return (a * self._stable_hash(x) + b) % self.max_hash

    def compute_signature(self, tokens: Set[str]) -> List[int]:
        """Return the MinHash signature (length ``num_hashes``) of ``tokens``.

        An empty token collection yields a signature filled with
        ``max_hash``. Real slot values are always strictly smaller than
        ``max_hash`` because of the modulus, so that sentinel can never equal
        a slot produced from actual tokens.
        """
        # Digest every token once and reuse it for all hash functions.
        base_hashes = [self._stable_hash(token) for token in tokens]
        if not base_hashes:
            return [self.max_hash] * self.num_hashes

        modulus = self.max_hash
        return [
            min((a * base + b) % modulus for base in base_hashes)
            for a, b in self.hash_funcs
        ]

class LSHIndex:
    """Banded locality-sensitive-hashing index over fixed-length signatures.

    A signature is cut into ``num_bands`` consecutive bands of
    ``len(signature) // num_bands`` rows each (trailing rows that do not fill
    a band are ignored). Each band is hashed into its own bucket table, and
    two documents become candidates for each other when they share a bucket
    in at least one band.

    Attributes:
        num_bands: Number of bands each signature is split into.
        buckets: One dict per band mapping band hash -> set of document ids.
        documents: Mapping of document id -> full signature.
    """

    def __init__(self, num_bands: int = 20):
        if num_bands <= 0:
            raise ValueError("num_bands must be a positive integer")
        self.num_bands = num_bands
        self.buckets = [{} for _ in range(num_bands)]
        self.documents = {}

    def _band_hash(self, band: List[int]) -> str:
        """Return the SHA-1 hex digest of the band's ``str()`` representation."""
        return hashlib.sha1(str(band).encode()).hexdigest()

    def _check_signature(self, signature: List[int]) -> None:
        """Reject signatures that cannot supply at least one row per band."""
        # With zero rows per band every band would be the empty list, so all
        # such signatures would collide with each other in every band.
        if len(signature) < self.num_bands:
            raise ValueError(
                f"signature of length {len(signature)} is too short for "
                f"{self.num_bands} bands"
            )

    def _band_hashes(self, signature: List[int]) -> List[str]:
        """Return one bucket key per band for ``signature``."""
        rows_per_band = len(signature) // self.num_bands
        return [
            self._band_hash(signature[i * rows_per_band: (i + 1) * rows_per_band])
            for i in range(self.num_bands)
        ]

    def _discard(self, doc_id: str) -> None:
        """Remove ``doc_id`` and its bucket memberships, if it is indexed."""
        if doc_id not in self.documents:
            return
        previous = self.documents.pop(doc_id)
        for bucket, band_hash in zip(self.buckets, self._band_hashes(previous)):
            members = bucket.get(band_hash)
            if members is None:
                continue
            members.discard(doc_id)
            if not members:
                del bucket[band_hash]

    def add(self, doc_id: str, signature: List[int]):
        """Index ``signature`` under ``doc_id``, replacing any earlier entry.

        Raises:
            ValueError: If the signature has fewer entries than ``num_bands``.
        """
        self._check_signature(signature)
        # Drop the old bucket memberships first so a re-added document is not
        # still returned for bands of its previous signature.
        self._discard(doc_id)
        self.documents[doc_id] = signature
        for bucket, band_hash in zip(self.buckets, self._band_hashes(signature)):
            bucket.setdefault(band_hash, set()).add(doc_id)

    def query(self, signature: List[int]) -> Set[str]:
        """Return the ids of all documents sharing at least one band bucket.

        Raises:
            ValueError: If the signature has fewer entries than ``num_bands``.
        """
        self._check_signature(signature)
        candidates = set()
        for bucket, band_hash in zip(self.buckets, self._band_hashes(signature)):
            candidates.update(bucket.get(band_hash, ()))
        return candidates

    def save(self, path: str):
        """Pickle ``(buckets, documents)`` to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump((self.buckets, self.documents), f)

    def load(self, path: str):
        """Replace this index's contents with the pickle stored at ``path``.

        The band count is taken from the persisted bucket list, so an index
        saved with a different ``num_bands`` than this instance was created
        with is queried with the banding it was built with.

        Raises:
            ValueError: If the persisted index contains no bands.
        """
        with open(path, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only load index
            # files that were produced locally by `save`.
            buckets, documents = pickle.load(f)
        if len(buckets) == 0:
            raise ValueError(f"index file {path!r} contains no bands")
        self.buckets, self.documents = buckets, documents
        self.num_bands = len(buckets)

class LicenseClassifier:
    """Decide whether a text resembles one of a set of indexed license files.

    Texts are reduced to sets of lowercase whitespace-separated tokens,
    summarised as MinHash signatures, and stored in an LSH index. A query is
    classified as a license when some indexed candidate has an estimated
    Jaccard similarity at or above the requested threshold.

    Args:
        num_hashes: Signature length passed to ``MinHash``.
        num_bands: Band count passed to ``LSHIndex``.
    """

    def __init__(self, num_hashes: int = 100, num_bands: int = 20):
        self.minhash = MinHash(num_hashes)
        self.index = LSHIndex(num_bands)

    def _tokenize(self, text: str) -> Set[str]:
        """Return the set of lowercase whitespace-delimited tokens in ``text``."""
        return set(text.lower().split())

    def index_from_folders(self, folder_paths: List[str], n_samples: int = 2):
        """Index up to ``n_samples`` randomly chosen files from each folder.

        Only regular files directly inside each folder are considered. Files
        are sampled with the global ``random`` module, so seed it beforehand
        for a reproducible selection. Files that contain no tokens are
        skipped. Each indexed document is identified by its file path.
        """
        for folder in folder_paths:
            # os.listdir returns entries in arbitrary order; sorting makes the
            # sample depend only on the RNG state, not on the filesystem.
            all_files = sorted(
                path
                for path in (os.path.join(folder, name) for name in os.listdir(folder))
                if os.path.isfile(path)
            )
            selected = random.sample(all_files, min(n_samples, len(all_files)))
            for file_path in selected:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
                tokens = self._tokenize(text)
                if not tokens:
                    # Nothing to hash; an empty file cannot represent a license.
                    continue
                self.index.add(file_path, self.minhash.compute_signature(tokens))

    def classify(self, text: str, threshold: float = 0.8) -> bool:
        """Return True if ``text`` is similar enough to an indexed document.

        Similarity is estimated over whole token sets, so a short excerpt of
        a long license can score well below ``threshold`` even though every
        one of its tokens occurs in the license. Text without any tokens is
        never classified as a license.
        """
        tokens = self._tokenize(text)
        if not tokens:
            return False
        signature = self.minhash.compute_signature(tokens)
        return any(
            self._jaccard_similarity(signature, self.index.documents[candidate]) >= threshold
            for candidate in self.index.query(signature)
        )

    def _jaccard_similarity(self, sig1: List[int], sig2: List[int]) -> float:
        """Estimate Jaccard similarity as the fraction of equal signature slots.

        Returns 0.0 when both signatures are empty.

        Raises:
            ValueError: If the signatures differ in length, which means they
                were produced with different ``num_hashes`` settings and are
                not comparable.
        """
        if len(sig1) != len(sig2):
            raise ValueError(
                f"signature lengths differ ({len(sig1)} vs {len(sig2)}); "
                "was the index built with a different num_hashes?"
            )
        if not sig1:
            return 0.0
        matches = sum(1 for i, j in zip(sig1, sig2) if i == j)
        return matches / len(sig1)

    def save_index(self, path: str):
        """Persist the underlying LSH index to ``path``."""
        self.index.save(path)

    def load_index(self, path: str):
        """Replace the underlying LSH index with the one stored at ``path``."""
        self.index.load(path)


In [15]:
# Create the classifier
classifier = LicenseClassifier(num_hashes=100, num_bands=20)

# Index up to N_SAMPLES_PER_FOLDER randomly chosen files from each license folder.
# The sampler uses the global `random` module, so seed it explicitly here: the
# selection then stays reproducible on Restart & Run All without relying on any
# seeding done as a side effect elsewhere.
SAMPLE_SEED = 42
N_SAMPLES_PER_FOLDER = 40
license_folders = ["Split-DB-Foss-Licenses", "Split-SPDX-licenses"]
random.seed(SAMPLE_SEED)
classifier.index_from_folders(license_folders, n_samples=N_SAMPLES_PER_FOLDER)

In [16]:
# Persist the freshly built index so later cells can reload it from disk
classifier.save_index(path="license_index.pkl")

In [20]:
# Step 1: Start from a fresh classifier and restore the index saved on disk
classifier = LicenseClassifier()
classifier.load_index(path="license_index.pkl")

# Step 2: Hand-labelled queries as (text, expected) pairs
# expected is True for license text and False for anything else
test_cases = [
    ("Permission is hereby granted, free of charge, to any person obtaining a copy...", True),
    ("This software is licensed under the MIT License.", True),
    ("Welcome to our website. We use cookies for analytics.", False),
    ("Please enter your name and email address below.", False),
    ("Redistribution and use in source and binary forms, with or without modification...", True),
    ("To continue, please update your payment method.", False)
]

# Step 3: Classify each query, report it, and tally the correct predictions
correct = 0
for query_no, (text, expected) in enumerate(test_cases, start=1):
    predicted = classifier.classify(text, threshold=0.5)
    is_match = predicted == expected
    result = "✅" if is_match else "❌"
    print(f"Query {query_no}: {result} | Expected: {expected} | Predicted: {predicted}")
    correct += int(is_match)

# Step 4: Report overall accuracy
total = len(test_cases)
accuracy_pct = (correct / total) * 100
print(f"\nAccuracy: {correct}/{total} ({accuracy_pct:.2f}%)")

Query 1: ❌ | Expected: True | Predicted: False
Query 2: ❌ | Expected: True | Predicted: False
Query 3: ✅ | Expected: False | Predicted: False
Query 4: ✅ | Expected: False | Predicted: False
Query 5: ❌ | Expected: True | Predicted: False
Query 6: ✅ | Expected: False | Predicted: False

Accuracy: 3/6 (50.00%)
