# Imports

In [None]:
import os
import bson
import itertools
import pandas as pd
from tqdm.notebook import tqdm
import gc

# Configuration

In [None]:
class Config:
    """Static settings shared by the metadata-extraction helpers."""

    # Directory that holds the competition's input files.
    DATA_DIR = "../input/cdiscount-image-classification-challenge"

    # Record counts used as progress-bar totals for the train / test dumps.
    NPRODS_TRAIN = 7069896
    NPRODS_TEST = 1768182

    # Starting number of records between forced garbage collections.
    GC_LIMIT = 1000

    # Every DECAY records the GC interval is multiplied by DECAY_AMOUNT.
    DECAY = 100000
    DECAY_AMOUNT = 0.99

    @classmethod
    def get_data_path(cls, filename):
        """Return ``filename`` joined onto ``DATA_DIR``."""
        base_dir = cls.DATA_DIR
        return os.path.join(base_dir, filename)

In [None]:
def get_metadata(filename, *, nprods=None, has_labels=True):
    """Scan a BSON dump and collect per-document offset metadata.

    The file is opened in binary mode and decoded one document at a time.
    For every document the file position before and after decoding is used
    to derive where the document starts and how many bytes it spans.

    Args:
        filename: Name of the BSON file; resolved with
            ``Config.get_data_path``.
        nprods: Optional cap on the number of documents to read. When
            ``None`` the whole file is read and the progress-bar total is
            ``Config.NPRODS_TRAIN`` if ``"train"`` occurs in ``filename``,
            otherwise ``Config.NPRODS_TEST``.
        has_labels: When ``True`` each record also carries the document's
            ``category_id``.

    Returns:
        A list of tuples ``(_id, start, length, n_imgs)`` with
        ``category_id`` appended when ``has_labels`` is ``True``.
    """
    filepath = Config.get_data_path(filename)

    gc_limit = Config.GC_LIMIT
    decay = Config.DECAY
    decay_amt = Config.DECAY_AMOUNT

    # The decay period never changes, so its comparison target can be
    # computed once up front.
    decay_minus_1 = decay - 1

    with open(filepath, 'rb') as f:
        bdata = bson.decode_file_iter(f)

        if nprods is not None:
            # Limit the number of records
            bdata = itertools.islice(bdata, None, nprods)
        else:
            nprods = Config.NPRODS_TRAIN if "train" in filename else Config.NPRODS_TEST

        pbdata = tqdm(bdata, total=nprods, desc=filename)

        metadata = []

        # Position of the first document.
        curr = f.tell()

        # Records processed since the last forced collection. A running
        # counter is used instead of ``idx % gc_limit`` compared against a
        # value computed once: that target went stale as soon as gc_limit
        # shrank, so the check could never match again and collection
        # stopped after the first decay step.
        since_gc = 0

        for idx, d in enumerate(pbdata):
            # By the time a document is yielded the file pointer is already
            # past it, so the previous position is this document's start and
            # the new position is the next document's start.
            # NOTE(review): assumes the decoder reads exactly one document
            # per iteration with no read-ahead - confirm against bson docs.
            start, curr = curr, f.tell()

            record = (d["_id"], start, curr - start, len(d["imgs"]))

            if has_labels is True:
                record += (d["category_id"],)

            metadata.append(record)

            # Drop the reference to the decoded document to manage RAM usage.
            del d

            # Force garbage collection every gc_limit records.
            since_gc += 1
            if since_gc >= gc_limit:
                gc.collect()
                since_gc = 0

            # Increase the frequency of garbage collection as time
            # progresses; clamp to 1 so the interval can never reach zero.
            if idx % decay == decay_minus_1:
                gc_limit = max(1, int(gc_limit * decay_amt))

        return metadata

In [None]:
def make_csv(filename, *, nprods=None, has_labels=True):
    """Write the metadata of a BSON file to ``<name>_metadata.csv``.

    The records come from ``get_metadata`` (called with the same ``nprods``
    and ``has_labels``) and are written without an index column. The output
    name is ``filename`` with its extension replaced by ``_metadata.csv``.

    Args:
        filename: Name of the BSON file to summarise.
        nprods: Optional cap on the number of records, forwarded unchanged.
        has_labels: When ``True`` a ``category_id`` column is included.
    """
    records = get_metadata(filename, nprods=nprods, has_labels=has_labels)

    gc.collect()

    # Column order mirrors the layout of the record tuples.
    header = ["pid", "start", "length", "n_imgs"]
    if has_labels is True:
        header += ["category_id"]

    frame = pd.DataFrame(records, columns=header)

    stem = os.path.splitext(filename)[0]
    frame.to_csv(f"{stem}_metadata.csv", index=False)

    # Release the large intermediates before returning.
    del records, frame
    gc.collect()

In [None]:
# Build the metadata CSV for each dump in turn (train has labels, test does
# not), collecting garbage after each one.
for _bson_name, _labelled in (("train.bson", True), ("test.bson", False)):
    make_csv(_bson_name, has_labels=_labelled)
    gc.collect()