In [17]:
import os
import json
import numpy as np
from tqdm import tqdm

In [21]:
def convert_mmc4_shard(jsonl_filepath, embed_filepath, shard_index, dataset):
    """Convert one MMC4 shard into interleaved-text documents plus an embedding matrix.

    Every line of the JSONL file is parsed as one document. For each entry of
    the document's ``image_info`` a placeholder of the form
    ``<image><<shard_index,row,dataset,image>></image>`` is inserted into the
    document's ``text_list`` at the entry's ``matched_index``; ``row`` is the
    row of the returned matrix that stores that image's embedding. The
    resulting list is joined with single spaces and stored under ``text``.

    ``matched_index`` is interpreted as a position in the document's
    *original* ``text_list`` (before any placeholder is added), so
    placeholders are inserted from the highest position down and never shift
    one another. Images sharing a position end up with the later-listed image
    first, which is what sequential insertion at one index produces.

    Args:
        jsonl_filepath: Path to the shard's JSONL document file (read as UTF-8).
        embed_filepath: Path to a file that ``np.load(..., allow_pickle=True)``
            loads as a mapping from image name to an embedding; each
            embedding is expected to be a 1-D vector of the same length.
        shard_index: Shard identifier written into every placeholder.
        dataset: Dataset name written into every placeholder.

    Returns:
        A ``(jsonl_list, embeddings)`` tuple. ``jsonl_list`` holds the parsed
        documents, each with placeholders added to ``text_list`` and a new
        ``text`` key. ``embeddings`` is a float64 array with one row per entry
        of the embedding mapping; rows are assigned in order of first
        appearance of each image name across the documents, and rows for
        images never referenced by a document stay zero.

    Raises:
        KeyError: If a document lacks ``text_list``/``image_info``, an image
            entry lacks ``image_name``/``matched_index``, or an image name is
            missing from the embedding mapping.
    """
    # SECURITY: allow_pickle=True unpickles the file, which can execute
    # arbitrary code. Only load embedding files from a trusted source.
    embed_dict = np.load(embed_filepath, allow_pickle=True)

    # Pre-allocate the embedding matrix. The width comes from any one entry;
    # an empty mapping yields a (0, 0) array instead of an IndexError.
    num_rows = len(embed_dict)
    first_key = next(iter(embed_dict.keys()), None)
    num_cols = 0 if first_key is None else embed_dict[first_key].shape[0]
    embeddings = np.zeros((num_rows, num_cols))

    # Maps image name -> row of `embeddings`, shared across all documents so
    # that a repeated image reuses its row.
    image_id_to_index = {}

    jsonl_list = []
    with open(jsonl_filepath, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            data = json.loads(line)
            text_list = data['text_list']

            # First pass: assign embedding rows in document order and record
            # where each placeholder belongs.
            placements = []
            for image in data['image_info']:
                image_name = image['image_name']
                image_index = image_id_to_index.get(image_name)
                if image_index is None:
                    image_index = len(image_id_to_index)
                    image_id_to_index[image_name] = image_index
                    embeddings[image_index] = embed_dict[image_name]

                placements.append((
                    image['matched_index'],
                    f"<image><<{shard_index},{image_index},{dataset},image>></image>",
                ))

            # Second pass: insert from the highest position down so earlier
            # insertions cannot shift the target of later ones. sorted() is
            # stable even with reverse=True, so ties keep their listed order.
            for position, placeholder in sorted(placements, key=lambda item: item[0], reverse=True):
                text_list.insert(position, placeholder)

            data['text'] = " ".join(text_list)
            jsonl_list.append(data)

    return jsonl_list, embeddings

In [22]:
# Directory holding the MMC4 shard files; both inputs for shard 0 live under it.
mmc4_data_dir = '/path/to/mmc4/'
jsonl_filepath, embed_filepath = (
    os.path.join(mmc4_data_dir, shard_file)
    for shard_file in ('docs_no_face_shard_0_v2.jsonl', "clip_vitl14_shard_0_features.pkl")
)

In [26]:
# Convert shard 0, tagging every image placeholder with the 'mmc4' dataset name.
new_jsonl_list, embed = convert_mmc4_shard(
    jsonl_filepath,
    embed_filepath,
    shard_index=0,
    dataset='mmc4',
)

3001it [00:00, 14339.46it/s]


In [31]:
# Sanity-check the conversion against the untouched shard inputs.
# The file is read inside a `with` block so the handle is closed promptly.
with open(jsonl_filepath, 'r', encoding='utf-8') as original_file:
    original_jsonl = original_file.readlines()

# SECURITY: allow_pickle=True unpickles the file; only load trusted files.
original_embed = np.load(embed_filepath, allow_pickle=True)

# One converted document per input line. (The previous check compared against
# `f`, a name that does not exist at notebook scope.)
assert len(new_jsonl_list) == len(original_jsonl), "document count changed during conversion"
# One embedding row per entry of the original feature mapping.
assert embed.shape[0] == len(original_embed.keys()), "embedding row count does not match feature count"