In [109]:
import struct
import numpy as np
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching


def read_float_binary_file(filename):
    """Read a float32 matrix from a binary file that starts with a shape header.

    Layout read by this function: two native-order C ints (row count, column
    count) followed by rows * columns float32 values, interpreted row by row.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(num_rows, num_columns, data)`` where ``data`` is a float32
        array of shape ``(num_rows, num_columns)``. If the file does not
        exist, a message is printed and ``None`` is returned instead.

    Raises:
        ValueError: If the header is incomplete, a dimension is negative, or
            the file holds fewer values than the header announces.
    """
    header_format = 'ii'
    header_size = struct.calcsize(header_format)
    try:
        with open(filename, 'rb') as f:
            header = f.read(header_size)
            if len(header) != header_size:
                raise ValueError(
                    f"File '{filename}' is too short to hold the "
                    f"{header_size}-byte shape header."
                )
            num_rows, num_columns = struct.unpack(header_format, header)

            # A negative product would be passed to fromfile as a negative
            # count, which means "read everything" rather than an error.
            if num_rows < 0 or num_columns < 0:
                raise ValueError(
                    f"File '{filename}' declares an invalid shape "
                    f"({num_rows}, {num_columns})."
                )

            # Read the entire data matrix at once.
            expected_count = num_rows * num_columns
            raw_data = np.fromfile(f, dtype=np.float32, count=expected_count)
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return None

    # Report a short payload with the file name instead of letting it
    # surface as a reshape failure.
    if raw_data.size != expected_count:
        raise ValueError(
            f"File '{filename}' holds {raw_data.size} values but its header "
            f"announces {num_rows} x {num_columns} = {expected_count}."
        )
    return num_rows, num_columns, raw_data.reshape(num_rows, num_columns)

def read_i8_binary_file(filename):
    """Read an int8 matrix from a binary file and return it as float32.

    Layout read by this function: two native-order C ints (row count, column
    count) followed by rows * columns int8 values, interpreted row by row.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(num_rows, num_columns, data)`` where ``data`` is the matrix
        converted to float32 with shape ``(num_rows, num_columns)``. If the
        file does not exist, a message is printed and ``None`` is returned.

    Raises:
        ValueError: If the header is incomplete, a dimension is negative, or
            the file holds fewer values than the header announces.
    """
    header_format = 'ii'
    header_size = struct.calcsize(header_format)
    try:
        with open(filename, 'rb') as f:
            header = f.read(header_size)
            if len(header) != header_size:
                raise ValueError(
                    f"File '{filename}' is too short to hold the "
                    f"{header_size}-byte shape header."
                )
            num_rows, num_columns = struct.unpack(header_format, header)

            # A negative product would be passed to fromfile as a negative
            # count, which means "read everything" rather than an error.
            if num_rows < 0 or num_columns < 0:
                raise ValueError(
                    f"File '{filename}' declares an invalid shape "
                    f"({num_rows}, {num_columns})."
                )

            # Read the entire data matrix at once.
            expected_count = num_rows * num_columns
            raw_data = np.fromfile(f, dtype=np.int8, count=expected_count)
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return None

    # Report a short payload with the file name instead of letting it
    # surface as a reshape failure.
    if raw_data.size != expected_count:
        raise ValueError(
            f"File '{filename}' holds {raw_data.size} values but its header "
            f"announces {num_rows} x {num_columns} = {expected_count}."
        )
    data = raw_data.reshape(num_rows, num_columns).astype(np.float32)
    return num_rows, num_columns, data

def read_u8_binary_file(filename):
    """Read a uint8 matrix from a binary file and return it as float32.

    Layout read by this function: two native-order C ints (row count, column
    count) followed by rows * columns uint8 values, interpreted row by row.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(num_rows, num_columns, data)`` where ``data`` is the matrix
        converted to float32 with shape ``(num_rows, num_columns)``. If the
        file does not exist, a message is printed and ``None`` is returned.

    Raises:
        ValueError: If the header is incomplete, a dimension is negative, or
            the file holds fewer values than the header announces.
    """
    header_format = 'ii'
    header_size = struct.calcsize(header_format)
    try:
        with open(filename, 'rb') as f:
            header = f.read(header_size)
            if len(header) != header_size:
                raise ValueError(
                    f"File '{filename}' is too short to hold the "
                    f"{header_size}-byte shape header."
                )
            num_rows, num_columns = struct.unpack(header_format, header)

            # A negative product would be passed to fromfile as a negative
            # count, which means "read everything" rather than an error.
            if num_rows < 0 or num_columns < 0:
                raise ValueError(
                    f"File '{filename}' declares an invalid shape "
                    f"({num_rows}, {num_columns})."
                )

            # Read the entire data matrix at once.
            expected_count = num_rows * num_columns
            raw_data = np.fromfile(f, dtype=np.uint8, count=expected_count)
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return None

    # Report a short payload with the file name instead of letting it
    # surface as a reshape failure.
    if raw_data.size != expected_count:
        raise ValueError(
            f"File '{filename}' holds {raw_data.size} values but its header "
            f"announces {num_rows} x {num_columns} = {expected_count}."
        )
    data = raw_data.reshape(num_rows, num_columns).astype(np.float32)
    return num_rows, num_columns, data


def write_float_binary_file(filename, data):
    """Write a 2-D matrix as float32 to a binary file with a shape header.

    The file starts with two native-order C ints (row count, column count)
    followed by the matrix values as float32, row by row. This is the layout
    that ``read_float_binary_file`` reads back.

    Args:
        filename: Path of the file to create or overwrite.
        data: 2-D array-like. Values are converted to float32 when they are
            not already stored that way.

    Raises:
        ValueError: If ``data`` is not two-dimensional.
    """
    # Force float32 and C order so the bytes on disk always match the header;
    # an array that is already float32 and C-contiguous is used without a copy.
    matrix = np.ascontiguousarray(data, dtype=np.float32)
    if matrix.ndim != 2:
        raise ValueError(
            f"Expected a 2-D matrix, got an array with {matrix.ndim} dimension(s)."
        )

    # Get the shape of the matrix and pack it as the header.
    n_rows, n_columns = matrix.shape
    header = struct.pack('ii', n_rows, n_columns)

    # Write the header and data to the binary file.
    with open(filename, 'wb') as f:
        f.write(header)
        f.write(matrix)

def compute_mean(data):
    """Return the mean of ``data`` taken along axis 0.

    For a 2-D matrix this yields one value per column, i.e. the average of
    all rows.
    """
    return np.mean(data, axis=0)
    
def center_data(data, row_means):
    """Subtract ``row_means`` from every row of ``data``, modifying it in place.

    ``row_means`` is given a leading axis of length one so that it lines up
    with the last axis of ``data``. Nothing is returned; callers observe the
    result through ``data`` itself.
    """
    offsets = row_means[None, :]
    data -= offsets



In [110]:

# Example: minimum-weight full matching on a small weighted bipartite graph.
# Entry (i, j) of the 3x3 biadjacency matrix is the weight of the edge between
# row vertex i and column vertex j; every entry is non-zero, so the graph is
# complete, and the weights differ from edge to edge.
biadjacency_matrix = csr_matrix(
    [
        [3, 4, 1],
        [3, 1, 2],
        [1, 2, 3],
    ]
)

# Solve for the matching; the result pairs row_ind[i] with col_ind[i].
row_ind, col_ind = min_weight_full_bipartite_matching(biadjacency_matrix)

# Show the matched index pairs.
print("Row indices:", row_ind)
print("Column indices:", col_ind)

Row indices: [0 1 2]
Column indices: [2 1 0]


In [126]:
import numpy as np

# Centre a base/query dataset pair on the mean of the base vectors and save
# both results next to the inputs with a "_centered" suffix.

# Known datasets: name -> (base file, query file, reader for the element type).
DATASETS = {
    'fbv5': (
        '/mnt/nvme0/datasets/fbv5/fbv5_rnd1m_data.bin',
        '/mnt/nvme0/datasets/fbv5/fbv5_query.bin',
        read_i8_binary_file,
    ),
    'fbv6': (
        '/mnt/nvme0/datasets/fbv6/fbv6_rnd1m_data.bin',
        '/mnt/nvme0/datasets/fbv6/fbv6_query.bin',
        read_i8_binary_file,
    ),
    'text2image1B': (
        '/mnt/nvme0/datasets/text2image1B/base_rnd1m_data.bin',
        '/mnt/nvme0/datasets/text2image1B/query.public.100K.fbin',
        read_float_binary_file,
    ),
    'wikipedia_large': (
        '/mnt/nvme0/datasets/wikipedia_large/wikipedia_rnd1m_data.bin',
        '/mnt/nvme0/datasets/wikipedia_large/wikipedia_query.bin',
        read_float_binary_file,
    ),
    'yfcc100M': (
        '/mnt/nvme0/datasets/yfcc100M/yfcc_rnd1m_data.bin',
        '/mnt/nvme0/datasets/yfcc100M/query.public.100K.u8bin',
        read_u8_binary_file,
    ),
}

# Pick the dataset to process here. Previously every loader was commented out,
# so this cell only ran when A, Q, filename, ... were left over in the kernel.
# NOTE(review): the default is a guess based on the 768 columns in the
# recorded output - confirm which dataset was intended before running.
DATASET = 'wikipedia_large'

filename, queryfile, read_matrix = DATASETS[DATASET]

# The readers return None (after printing a message) when a file is missing;
# stop here instead of failing later on tuple unpacking.
loaded_base = read_matrix(filename)
loaded_query = read_matrix(queryfile)
if loaded_base is None or loaded_query is None:
    raise FileNotFoundError(
        f"Could not load dataset '{DATASET}'; check the paths in DATASETS."
    )
rows, columns, A = loaded_base
rq, cq, Q = loaded_query

# The base mean is subtracted from the queries too, so the widths must agree.
if cq != columns:
    raise ValueError(
        f"Query vectors have {cq} columns but base vectors have {columns}."
    )

print(f"Number of rows: {rows}")
print(f"Number of columns: {columns}")

# center_data modifies A and Q in place; both are shifted by the base mean.
mu = compute_mean(A)
center_data(A, mu)
center_data(Q, mu)

savebase = filename + "_centered"
savequery = queryfile + "_centered"
write_float_binary_file(savebase, A)
write_float_binary_file(savequery, Q)



Number of rows: 979568
Number of columns: 768


In [103]:

# Reduced SVD of A (full_matrices=False):
#   U  - left singular vectors
#   S  - singular values
#   VT - right singular vectors (transposed)
U, S, VT = np.linalg.svd(A, full_matrices=False)

# Keep only the leading k singular values/vectors by slicing the factors
# computed above; together they approximate the original matrix A.
k = 10
U_truncated, S_truncated, VT_truncated = U[:, :k], S[:k], VT[:k, :]

# This would give you the approximation of the original matrix A using only the top k singular values/vectors.



In [104]:
# Report the factor shapes (one per line), then the full spectrum.
print(U.shape, VT.shape, U_truncated.shape, sep="\n")
print(S)

# Keep a plain-text copy of the singular values.
np.savetxt("svals.txt", S)

(1000586, 192)
(192, 192)
(1000586, 10)
[94489.39   71252.484  65543.234  52269.742  48433.586  46481.35
 44180.484  39244.94   38199.73   37274.68   36611.766  34043.523
 32901.785  31737.547  31013.914  29359.715  28837.68   28212.18
 27835.459  27533.197  27272.121  26238.479  26101.326  25575.314
 24954.107  24826.785  24786.848  24218.105  23907.52   23414.385
 23081.846  22977.086  22747.277  22314.668  22221.037  21881.436
 21769.145  21593.906  21399.049  21102.041  20871.883  20809.951
 20392.408  20351.57   20109.479  20041.54   19911.256  19738.055
 19627.883  19519.115  19478.83   19388.604  19167.58   19029.537
 18793.594  18705.592  18546.064  18501.043  18444.916  18404.78
 18278.664  18065.736  17988.684  17927.184  17847.082  17786.254
 17679.018  17655.316  17638.86   17567.955  17484.814  17256.617
 17177.621  17069.467  17058.447  16961.646  16864.002  16759.758
 16701.312  16592.836  16545.213  16520.184  16455.172  16408.432
 16334.125  16281.187  16173.98   16079