In [1]:
import numpy as np
from gensim.models.fasttext import load_facebook_model, _load_fasttext_format

In [2]:
import struct
# https://github.com/RaRe-Technologies/gensim/blob/68ec5b8ed7f18e75e0b13689f4da53405ef3ed96/gensim/models/_fasttext_bin.py#L162
def _struct_unpack(fin, fmt):
    num_bytes = struct.calcsize(fmt)
    return struct.unpack(fmt, fin.read(num_bytes))

In [3]:
import gzip
# Open the gzip-compressed fastText binary for reading in binary mode. The
# following cells read from this stream sequentially, so it is left open here.
# NOTE(review): the path is machine-specific and the handle is not closed in
# the cells visible in this notebook.
fin = gzip.open(filename="e:/langon/resources/cc.zh.300.bin.gz", mode="rb")

In [4]:
# Read the first two native C ints from the stream ('@2i'); they are bound to
# `magic` (compared against the fastText magic constant later) and `version`.
magic, version = _struct_unpack(fin=fin, fmt='@2i')

In [5]:
# Display the two integers read from the start of the file.
magic, version

(793712314, 12)

In [6]:
# Value the first int32 of the file is compared against to recognise the
# newer binary format.
_FASTTEXT_FILEFORMAT_MAGIC = np.int32(793712314)
# Header layout of the newer format: (field name, struct format code) pairs,
# listed in the order the fields are read from the file.
_NEW_HEADER_FORMAT = [
    ('dim', 'i'),
    ('ws', 'i'),
    ('epoch', 'i'),
    ('min_count', 'i'),
    ('neg', 'i'),
    ('word_ngrams', 'i'),   # Unused in loading
    ('loss', 'i'),
    ('model', 'i'),
    ('bucket', 'i'),
    ('minn', 'i'),
    ('maxn', 'i'),
    ('lr_update_rate', 'i'),   # Unused in loading
    ('t', 'd'),
]
header_spec = _NEW_HEADER_FORMAT
new_format = magic == _FASTTEXT_FILEFORMAT_MAGIC
if not new_format:
    # Only the new-format layout is defined in this notebook, so a file whose
    # magic does not match cannot be parsed correctly by the read below.
    # Fail loudly instead of producing a header dict full of wrong values.
    raise ValueError(
        "unsupported fastText binary: expected magic %d, got %d"
        % (_FASTTEXT_FILEFORMAT_MAGIC, magic)
    )
# Read the header fields one by one. Every _struct_unpack call advances `fin`,
# so re-running this cell without reopening the file reads from the wrong offset.
model = {name: _struct_unpack(fin, fmt)[0] for (name, fmt) in header_spec}

In [7]:
# Display the header fields parsed from the binary.
model

{'dim': 300,
 'ws': 5,
 'epoch': 1,
 'min_count': 5,
 'neg': 10,
 'word_ngrams': 1,
 'loss': 2,
 'model': 1,
 'bucket': 2000000,
 'minn': 5,
 'maxn': 5,
 'lr_update_rate': 100,
 't': 9.999999747378752e-06}

In [11]:
from gensim.models._fasttext_bin import _load_vocab
import inspect
# Print the source of the installed gensim `_load_vocab` for reference.
# NOTE(review): it is a private helper (leading underscore, imported from
# gensim.models._fasttext_bin), so its signature and return value may differ
# between gensim versions — confirm against the version in use.
print(inspect.getsource(_load_vocab))

def _load_vocab(fin, new_format, encoding='utf-8'):
    """Load a vocabulary from a FB binary.

    Before the vocab is ready for use, call the prepare_vocab function and pass
    in the relevant parameters from the model.

    Parameters
    ----------
    fin : file
        An open file pointer to the binary.
    new_format: boolean
        True if the binary is of the newer format.
    encoding : str
        The encoding to use when decoding binary data into words.

    Returns
    -------
    tuple
        The loaded vocabulary.  Keys are words, values are counts.
        The vocabulary size.
        The number of words.
    """
    vocab_size, nwords, nlabels = _struct_unpack(fin, '@3i')

    # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
    if nlabels > 0:
        raise NotImplementedError("Supervised fastText models are not supported")
    logger.info("loading %s words for fastText model from %s", vocab_size, fin

In [8]:
# Load the vocabulary by continuing to read from the current position of `fin`
# (assumes the header cell above has already consumed the header).
# Per the _load_vocab docstring the result is: the vocabulary (keys are words,
# values are counts), the vocabulary size, and the number of words.
raw_vocab, vocab_size, nwords = _load_vocab(fin=fin, new_format=new_format, encoding="UTF-8")

In [9]:
# Display the vocabulary size returned by _load_vocab.
vocab_size

2000000

In [13]:
import wasp
import pickle
# Persist the raw vocabulary (word -> count mapping) as a pickle at the path
# returned by wasp.get_resource_path.
# NOTE(review): unpickling can execute arbitrary code, so whoever loads this
# file back must treat it as trusted input only.
with open(wasp.get_resource_path("", "fasttext_vocab_freq.pkl"), mode="wb") as fout:
    pickle.Pickler(fout).dump(raw_vocab)