# Training ECCO

In [2]:
import os
import gensim
import gzip
import random
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)  # Add this line


def train_word2vec_models(input_file, num_runs=1, n=7_000_000):
    logging.info(f"Loading {input_file}")
    all_sentences = []
    with gzip.open(input_file) as f:
        for line in f:
            all_sentences.append(line.decode("utf-8").strip().split())
    logging.info(f"Loaded {len(all_sentences):,} sentences")

    for run in range(1, num_runs + 1):
        ofn = ["word2vec"]
        ofn += os.path.basename(input_file).split(".")[1:3]
        ofn += [f"run_{run:02d}", "skipgram_n=5", "model"]
        # Save model vectors
        ofn_model = os.path.join("../data/models/ecco", ".".join(ofn) + ".txt.gz")
        ofn_vocab = os.path.join("../data/models/ecco", ".".join(ofn) + ".vocab.txt")

        if os.path.exists(ofn_model):
            logging.info(f"Skipping {ofn_model} because it already exists")
            continue

        # Train model
        sentences = (
            random.sample(all_sentences, n)
            if n and len(all_sentences) > n
            else all_sentences
        )
        model = gensim.models.Word2Vec(
            sentences=sentences,
            vector_size=100,
            window=5,
            min_count=10,
            workers=8,
            sg=1,  # skipgram
        )

        # Generate output filename
        

        os.makedirs(os.path.dirname(ofn_model), exist_ok=True)

        model.wv.save_word2vec_format(ofn_model, fvocab=ofn_vocab, binary=False)

        print(f"Saved {ofn_model}")

In [1]:
# train_word2vec_models('../data/skipgrams/ecco/skipgrams.ECCO.1700-1709.txt.gz')
idir = '../data/skipgrams/ecco'
for fn in sorted(os.listdir(idir)):
    if fn.endswith('.txt.gz'):
        dec = fn.split('.')[2]
        if dec < '1770': continue
        logging.info(f"Training {fn}")
        train_word2vec_models(os.path.join(idir, fn), num_runs=10)


2024-11-22 23:07:36,094 : INFO : Training skipgrams.ECCO.1770-1779.txt.gz
2024-11-22 23:07:36,094 : INFO : Loading ../data/skipgrams/ecco/skipgrams.ECCO.1770-1779.txt.gz
2024-11-22 23:14:19,464 : INFO : Loaded 26,241,539 sentences
2024-11-22 23:14:19,509 : INFO : Skipping ../data/models/ecco/word2vec.ECCO.1770-1779.run_01.skipgram_n=5.model.txt.gz because it already exists
2024-11-22 23:14:19,512 : INFO : Skipping ../data/models/ecco/word2vec.ECCO.1770-1779.run_02.skipgram_n=5.model.txt.gz because it already exists
2024-11-22 23:20:05,378 : INFO : collecting all words and their counts
2024-11-22 23:20:05,388 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-11-22 23:20:06,899 : INFO : PROGRESS: at sentence #10000, processed 99998 words, keeping 27393 word types
2024-11-22 23:20:07,999 : INFO : PROGRESS: at sentence #20000, processed 199994 words, keeping 46795 word types
2024-11-22 23:20:08,838 : INFO : PROGRESS: at sentence #30000, processed 299990 words,

Saved ../data/models/ecco/word2vec.ECCO.1770-1779.run_03.skipgram_n=5.model.txt.gz


2024-11-22 23:59:54,392 : INFO : collecting all words and their counts
2024-11-22 23:59:54,415 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-11-22 23:59:56,448 : INFO : PROGRESS: at sentence #10000, processed 99996 words, keeping 27598 word types
2024-11-22 23:59:57,926 : INFO : PROGRESS: at sentence #20000, processed 199991 words, keeping 47402 word types
2024-11-22 23:59:58,949 : INFO : PROGRESS: at sentence #30000, processed 299987 words, keeping 64982 word types
2024-11-22 23:59:59,777 : INFO : PROGRESS: at sentence #40000, processed 399983 words, keeping 80912 word types
2024-11-23 00:00:01,067 : INFO : PROGRESS: at sentence #50000, processed 499980 words, keeping 96382 word types
2024-11-23 00:00:01,963 : INFO : PROGRESS: at sentence #60000, processed 599979 words, keeping 110974 word types
2024-11-23 00:00:02,732 : INFO : PROGRESS: at sentence #70000, processed 699975 words, keeping 125092 word types
2024-11-23 00:00:03,609 : INFO : PROGRESS: at