# MMD 2024, Problem Sheet 6

Group: Daniela Fichiu, Aaron Maekel, Manuel Senger

# Exercise 1

# Exercise 2
 

# Exercise 3

# Exercise 4

In [1]:
from mpmath import mp
import random


def compute_k_shingles(s: str, k: int):
    """
    Compute k-shingles for a given string of digits.
    Return a sorted list of integers corresponding to the k-shingles.
    """
    shingles = set()
    for i in range(len(s) - k + 1):
        # Extract k-shingle
        k_shingle_str = s[i:i+k]
        # Convert to integer position
        k_shingle_int = int(k_shingle_str)
        shingles.add(k_shingle_int)
    return sorted(shingles)

example_s = "1234567"
example_k = 4
positions = compute_k_shingles(example_s, example_k)
print("Positions for s='1234567', k=4:", positions)

# Set precision (a bit higher than needed)
mp.dps = 10100
pi_str = str(mp.pi)
pi_digits = pi_str[2:10002]  # first 10,000 digits after decimal

k = 12
positions_pi = compute_k_shingles(pi_digits, k)

# Write to file
output_file = "pi_10000digits_k12_positions.txt"
with open(output_file, "w") as f:
    for pos in positions_pi:
        f.write(str(pos) + "\n")

print(f"Number of distinct k-shingles: {len(positions_pi)}")
print(f"Positions saved to {output_file}")

def minhash_signatures(positions, hash_funcs):
    """
    Compute the MinHash signature for a single column using given hash functions.

    Parameters:
    - positions: sorted list of integer positions of set bits.
    - hash_funcs: list of tuples (a, b, p)

    Returns:
    - A list containing the minhash signature, one entry per hash function.
    """
    signature = []
    N = 10**12
    for (a, b, p) in hash_funcs:
        min_val = None
        for x in positions:
            h = ((a*x + b) % p) % N+1
            if min_val is None or h < min_val:
                min_val = h
        signature.append(min_val)
    return signature

# Given parameters:
p_base = 10**15
hash_funcs = []
# First hash function:
hash_funcs.append((37, 126, p_base + 223))

# Generate remaining 4 hash functions
# p-values: p_base + 37, p_base + 91, p_base + 159, p_base + 187
p_offsets = [37, 91, 159, 187]
for i in range(4):
    a = random.randint(0, 10**12)
    b = random.randint(0, 10**12)
    p = p_base + p_offsets[i]
    hash_funcs.append((a, b, p))

print("Hash functions used (a,b,p):")
for hf in hash_funcs:
    print(hf)

# Run on pi_positions obtained above
signature = minhash_signatures(positions_pi, hash_funcs)

print("MinHash signature:", signature)



Positions for s='1234567', k=4: [1234, 2345, 3456, 4567]
Number of distinct k-shingles: 9989
Positions saved to pi_10000digits_k12_positions.txt
Hash functions used (a,b,p):
(37, 126, 1000000000000223)
(157798959461, 340145837323, 1000000000000037)
(525745399221, 405055330803, 1000000000000091)
(512627524501, 116083211787, 1000000000000159)
(734773935500, 444234672761, 1000000000000187)
MinHash signature: [24916664, 14851362, 20317102, 49706047, 64743897]


# Exercise 5

a)

In [86]:
import numpy as np
def create_dataset(m=100,q=20000,delta=0.02,N=10**8):
    C= []
    C.append( np.random.choice(np.arange(N),size=q,replace=False))
    print(C)
    for i in range(m-1):
        last_col = C[-1] 
        fract_old_values= np.random.choice(np.arange(q),size=int(delta*q),replace=False)
        replacements = np.random.randint(N,size=len(fract_old_values))
        for i in range(len(replacements)):
            while replacements[i] in last_col:
               
                print("replacing ",i)
                replacements[i] = np.random.randint(N,size=1)[0]
        
        existing_old_values = np.delete(last_col,fract_old_values)
        C.append(np.concatenate((existing_old_values,replacements)))
    return C

print(create_dataset())


[array([11676322, 52507936,  4499864, ..., 98510569, 39974635, 85749576])]
replacing  243
replacing  226
replacing  45
replacing  41
replacing  183
replacing  377
replacing  22
[array([11676322, 52507936,  4499864, ..., 98510569, 39974635, 85749576]), array([11676322, 52507936,  4499864, ...,  6201751, 16427795, 29337054]), array([11676322, 52507936,  4499864, ..., 55099085, 39155570,  5313333]), array([11676322, 52507936,  4499864, ..., 99274390, 83825795, 94407369]), array([11676322, 52507936,  4499864, ..., 90368153,  1590663, 99566196]), array([11676322, 52507936,  4499864, ..., 14939640, 73994100, 39725901]), array([11676322, 52507936,  4499864, ..., 29850549, 58250773,  4174383]), array([11676322, 52507936,  4499864, ..., 37559881,   545719, 47248897]), array([11676322, 52507936,  4499864, ..., 53574007, 92636667, 35047726]), array([11676322, 52507936,  4499864, ..., 80478041, 39573085, 15643248]), array([11676322, 52507936,  4499864, ..., 56166246, 36317433, 36909995]), array([1

# Exercise 6

# Exercise 7