### Imports and logging setup

In [2]:
import logging
import re
import csv
import time
from pathlib import Path
from gensim.parsing.preprocessing import * #provides a number of convenience preprocessing functions optimized for speed
from gensim.models.word2vec import Word2Vec
import gensim
from multiprocessing import cpu_count
import pandas as pd
import os
from langua import Predict
languagePredictor = Predict()
import time

from itertools import islice
#logging.getLogger('').handlers = []  #To delete previous logging configuration
from dataDictionariesLexicons import dataDict, LexiconsEnsembl


# Configure the root logger: timestamped messages at INFO level and above.
# To log into a file as well as to standard output, additionally pass e.g.
#   handlers=[logging.FileHandler("{0}/{1}.log".format('./', uniName)),  # path, file name
#             logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

from outletsBiasRatings import outletsBiasRatingsAllSides, outletAbbreviationToFullName 

# The AllSides table is the bias-rating reference used in this notebook; iterating it
# yields its keys (the outlet identifiers), which are shown together with their count.
outletsBiasRatings = outletsBiasRatingsAllSides
outlets = [*outletsBiasRatings]
print(len(outlets), outlets)

47 ['alternet', 'democracynow', 'db', 'hp', 'theintercept', 'jacobin', 'motherjones', 'thenewyorker', 'thenation', 'slate', 'vox', 'cnn', 'nyt', 'abcnews', 'theatlantic', 'buzzfeed', 'cbs', 'economist', 'guardian', 'nbcnews', 'politico', 'timemagazine', 'wp', 'npr', 'ap', 'bbc', 'bloomberg', 'csm', 'reuters', 'thehill', 'usatoday', 'wsj', 'reason', 'we', 'wt', 'fox', 'americanspectator', 'bre', 'theblaze', 'cbn', 'dailycaller', 'dailymail', 'dailywire', 'thefederalist', 'nationalreview', 'nyp', 'newsmax']


In [22]:
# Pick the outlets to analyze, considering the time range of their articles and their
# AllSides ratings. This replaces the full `outlets` list derived from the ratings table.
# NOTE(review): the list below holds 9 outlets, not the "6 (2 left, 2 center, 2 right)"
# this comment used to state -- presumably 3 per leaning; confirm against the ratings.
outlets = ['cbs','reuters','nyt', 'slate', 'csm', 'thehill', 'wt', 'nyp', 'dailymail']

## Computing year intervals

In [12]:
startYear = 2000
endYear = 2019
years = list(range(startYear, endYear + 1))
intervalLength = 5


def build_year_intervals(start_year, end_year, interval_length):
    """Group the years start_year..end_year (inclusive) into consecutive intervals.

    A new interval begins at every year that is a multiple of ``interval_length``,
    so boundaries are calendar-aligned (2000, 2005, 2010, ... for a length of 5).
    When ``start_year`` is not aligned, the first interval is simply shorter; it is
    NOT discarded. Empty intervals are never emitted.

    Args:
        start_year: first year to include.
        end_year: last year to include.
        interval_length: number of years per aligned interval (must be > 0).

    Returns:
        A list of lists of consecutive years; empty when start_year > end_year.
    """
    intervals = []
    current = []
    for year in range(start_year, end_year + 1):
        # Close the running interval at each aligned boundary, but only if it holds
        # something -- this replaces the former unconditional ``pop(0)``, which dropped
        # a real interval whenever start_year was not a multiple of interval_length.
        if year % interval_length == 0 and current:
            intervals.append(current)
            current = []
        current.append(year)
    if current:
        intervals.append(current)
    return intervals


intervalsList = build_year_intervals(startYear, endYear, intervalLength)

# Show the interval labels (these are also used as file names further down).
for interval in intervalsList:
    intervalName = str(interval[0])+'-'+str(interval[-1])
    print(intervalName)
intervalsList

2000-2004
2005-2009
2010-2014
2015-2019


[[2000, 2001, 2002, 2003, 2004],
 [2005, 2006, 2007, 2008, 2009],
 [2010, 2011, 2012, 2013, 2014],
 [2015, 2016, 2017, 2018, 2019]]

In [13]:
# Number of worker threads used for Word2Vec training. Two cores are left free for the
# rest of the system, but the value is clamped to at least 1: on machines with two or
# fewer cores ``cpu_count() - 2`` would be zero or negative, leaving no usable workers.
workers = max(1, cpu_count() - 2)
print('Number of workers:', workers)

Number of workers: 14


In [23]:
# Paths are resolved relative to the working directory: its parent is taken as the
# project root, which holds both the input corpora and the trained models.
current_dir = Path.cwd()
base_dir = current_dir.parent
input_dir, model_dir = (base_dir / folder for folder in ("scrambledArticlesForWord2vec", "models"))

In [24]:
import logging

def load_and_preprocess_sentences(file_path):
    """Read a space-tokenised corpus file into a list of token lists.

    Every line of the file is one sentence; tokens are separated by spaces.
    Blank lines are skipped and empty tokens (produced by repeated spaces) are
    dropped, so no ``''`` pseudo-word ever reaches the vocabulary.

    Args:
        file_path: a ``pathlib.Path``-like object providing ``.open()``.

    Returns:
        A list of sentences, each a non-empty list of string tokens. An empty
        list is returned (and the error logged) when the file cannot be read.
    """
    sentences = []
    try:
        with file_path.open('r', encoding='utf-8') as file:
            # Iterate the handle directly instead of readlines(): avoids holding a
            # second full copy of the file in memory.
            for line in file:
                tokens = [token for token in line.strip().split(' ') if token]
                if tokens:
                    sentences.append(tokens)
    except Exception as e:
        # Best effort by design: callers treat an empty result as "skip this file".
        logging.error(f"Failed to read from {file_path}: {e}")
        return []
    return sentences


def _new_word2vec_model():
    """Create an untrained Word2Vec model with this notebook's hyperparameters."""
    return Word2Vec(vector_size=300, window=10, min_count=5, workers=workers,
                    hs=0, negative=10, sample=0.0001, epochs=5)


def train_model(sentences, previous_model_path):
    """Train a Word2Vec model on ``sentences``, optionally continuing a saved model.

    If ``previous_model_path`` points to an existing file, that model is loaded, its
    vocabulary is extended with the new sentences and training continues from it.
    Otherwise -- or when loading/updating fails -- a fresh model is created. Both the
    regular and the fallback path use the same hyperparameters and train exactly once.

    Args:
        sentences: list of token lists to train on.
        previous_model_path: path of a previously saved model, or None.

    Returns:
        The trained model, or None when ``sentences`` is empty.
    """
    if not sentences:
        print("No sentences provided for training.")
        return None  # Nothing to train on

    tic = time.time()
    model = None
    if previous_model_path and os.path.exists(previous_model_path):
        try:
            print(f"Loading model from {previous_model_path}")
            model = Word2Vec.load(previous_model_path)
            model.build_vocab(sentences, update=True)  # Extend the existing vocabulary
        except Exception as e:
            print(f"Error loading the model from {previous_model_path}: {e}")
            print("Training new model instead")
            model = None  # Discard a partially loaded/updated model
    else:
        print("Training new model")

    if model is None:
        # The model is created WITHOUT sentences: handing them to the constructor would
        # already build the vocabulary and train, so the explicit train() call below
        # would train a second time.
        model = _new_word2vec_model()
        model.build_vocab(sentences)

    model.train(sentences, total_examples=len(sentences), epochs=model.epochs)

    toc = time.time()
    computation_time = toc - tic
    print(f"Computing time for training the model: {computation_time} seconds")
    words_in_corpus = sum(len(sentence) for sentence in sentences)
    # A tiny corpus can finish within the timer resolution; avoid dividing by zero.
    words_per_second = words_in_corpus / computation_time if computation_time > 0 else float('inf')
    print(f"Number of words processed per second: {words_per_second}")
    # NOTE(review): for a continued model this list looks like it keeps the order of the
    # previously loaded vocabulary rather than being re-ranked -- confirm before relying on it.
    print(f"Most frequent words in model: {model.wv.index_to_key[:10]}")

    return model



In [25]:
# Train one model per outlet and interval. Within an outlet, each interval continues
# from the model saved for the previous interval (when there is one).
for outlet in outlets:
    logging.info(outlet)
    previous_model_path = None  # No model to continue from at the start of an outlet

    for interval in intervalsList:
        output_file_name = f"{interval[0]}-{interval[-1]}"
        print(f"Processing interval {output_file_name}")
        input_file_path = input_dir / outlet / f"{output_file_name}.csv"
        output_path = model_dir / outlet
        output_path.mkdir(parents=True, exist_ok=True)

        sentences = load_and_preprocess_sentences(input_file_path)
        if not sentences:
            # previous_model_path is kept, so the next interval continues from the
            # last model that was actually trained.
            print(f"No sentences found in {input_file_path}, skipping interval {output_file_name}")
            continue

        # NOTE(review): a very small corpus still yields a (degenerate) model that then
        # seeds the following intervals -- consider a minimum corpus size here.
        model = train_model(sentences, previous_model_path)
        if model:
            output_file_path = output_path / f"{output_file_name}.model"
            model.save(str(output_file_path))
            print(f"Model saved to {output_file_path}")
            previous_model_path = str(output_file_path)  # Update the previous model path for the next interval

            # Evaluation runs only when a model exists; previously a missing model
            # surfaced as a misleading "exception during evaluation" log entry.
            # Both benchmark files are looked up relative to the working directory.
            try:
                logging.info(" Similarity evaluation: %s", model.wv.evaluate_word_pairs('wordsim353.tsv', restrict_vocab=50000))
            except Exception as e:
                logging.error("Exception during word similarity evaluation: %s", e)
            # Analogies
            try:
                model.wv.evaluate_word_analogies('questions-words.txt', restrict_vocab=30000)
            except Exception as e:
                logging.error("Exception during word analogy evaluation: %s", e)
        else:
            print(f"\033[1;31mSkipping training for interval {output_file_name} due to insufficient data\033[0m")
            previous_model_path = None  # Reset if training was not successful

        logging.info("----------------------------------------------------")
    logging.info("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

2024-05-01 03:10:43,867 : INFO : cbs


Processing interval 2000-2004


2024-05-01 03:10:44,716 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T03:10:44.716234', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 03:10:44,716 : INFO : collecting all words and their counts
2024-05-01 03:10:44,716 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:10:44,909 : INFO : PROGRESS: at sentence #10000, processed 2792625 words, keeping 75162 word types


Training new model


2024-05-01 03:10:45,133 : INFO : PROGRESS: at sentence #20000, processed 5919188 words, keeping 117194 word types
2024-05-01 03:10:45,413 : INFO : PROGRESS: at sentence #30000, processed 9138063 words, keeping 151612 word types
2024-05-01 03:10:45,670 : INFO : PROGRESS: at sentence #40000, processed 12630907 words, keeping 181733 word types
2024-05-01 03:10:45,919 : INFO : PROGRESS: at sentence #50000, processed 16271370 words, keeping 208892 word types
2024-05-01 03:10:46,243 : INFO : PROGRESS: at sentence #60000, processed 19919024 words, keeping 235961 word types
2024-05-01 03:10:46,273 : INFO : collected 238302 word types from a corpus of 20224891 raw words and 60869 sentences
2024-05-01 03:10:46,274 : INFO : Creating a fresh vocabulary
2024-05-01 03:10:46,365 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 69924 unique words (29.34% of original 238302, drops 168378)', 'datetime': '2024-05-01T03:10:46.364998', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep

Computing time for training the model: 26.49495577812195 seconds
Number of words processed per second: 763348.7358639255
Most frequent words in model: ['said', 'he', 'his', 'she', 'people', 'new', 'says', 'her', 'year', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/cbs/2000-2004.model


2024-05-01 03:11:11,875 : INFO : capital-common-countries: 50.4% (233/462)
2024-05-01 03:11:13,134 : INFO : capital-world: 41.7% (500/1200)
2024-05-01 03:11:13,421 : INFO : currency: 12.5% (5/40)
2024-05-01 03:11:15,515 : INFO : city-in-state: 17.6% (399/2266)
2024-05-01 03:11:15,900 : INFO : family: 43.1% (181/420)
2024-05-01 03:11:16,510 : INFO : gram1-adjective-to-adverb: 3.4% (28/812)
2024-05-01 03:11:16,806 : INFO : gram2-opposite: 2.6% (10/380)
2024-05-01 03:11:17,984 : INFO : gram3-comparative: 24.0% (320/1332)
2024-05-01 03:11:18,771 : INFO : gram4-superlative: 8.0% (79/992)
2024-05-01 03:11:19,362 : INFO : gram5-present-participle: 32.9% (231/702)
2024-05-01 03:11:20,350 : INFO : gram6-nationality-adjective: 58.0% (713/1229)
2024-05-01 03:11:21,506 : INFO : gram7-past-tense: 22.4% (350/1560)
2024-05-01 03:11:22,439 : INFO : gram8-plural: 36.5% (409/1122)
2024-05-01 03:11:22,808 : INFO : gram9-plural-verbs: 17.4% (73/420)
2024-05-01 03:11:22,809 : INFO : Quadruplets with out-of

Processing interval 2005-2009


2024-05-01 03:11:24,611 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/cbs/2000-2004.model
2024-05-01 03:11:24,618 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/cbs/2000-2004.model.wv.* with mmap=None
2024-05-01 03:11:24,618 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/cbs/2000-2004.model.wv.vectors.npy with mmap=None
2024-05-01 03:11:24,628 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/cbs/2000-2004.model.syn1neg.npy with mmap=None
2024-05-01 03:11:24,642 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:11:24,807 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/cbs/2000-2004.model', 'datetime': '2024-05-01T03:11:24.807582', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:11:24,807 : INFO : collecting all words and their count

Loading model from /Users/eddyji/GitRepos/01_TAD/models/cbs/2000-2004.model


2024-05-01 03:11:25,033 : INFO : PROGRESS: at sentence #10000, processed 3295231 words, keeping 84298 word types
2024-05-01 03:11:25,237 : INFO : PROGRESS: at sentence #20000, processed 6401074 words, keeping 119269 word types
2024-05-01 03:11:25,427 : INFO : PROGRESS: at sentence #30000, processed 9317936 words, keeping 148554 word types
2024-05-01 03:11:25,617 : INFO : PROGRESS: at sentence #40000, processed 12121341 words, keeping 175631 word types
2024-05-01 03:11:25,791 : INFO : PROGRESS: at sentence #50000, processed 14769631 words, keeping 202504 word types
2024-05-01 03:11:25,958 : INFO : PROGRESS: at sentence #60000, processed 17293821 words, keeping 227186 word types
2024-05-01 03:11:26,143 : INFO : PROGRESS: at sentence #70000, processed 19963177 words, keeping 250545 word types
2024-05-01 03:11:26,302 : INFO : PROGRESS: at sentence #80000, processed 22383269 words, keeping 272492 word types
2024-05-01 03:11:26,451 : INFO : PROGRESS: at sentence #90000, processed 24592863 wo

Computing time for training the model: 53.675474882125854 seconds
Number of words processed per second: 750804.3494445167
Most frequent words in model: ['said', 'he', 'his', 'she', 'people', 'new', 'says', 'her', 'year', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/cbs/2005-2009.model


2024-05-01 03:12:18,885 : INFO : capital-common-countries: 61.9% (286/462)
2024-05-01 03:12:20,245 : INFO : capital-world: 52.5% (630/1200)
2024-05-01 03:12:20,328 : INFO : currency: 20.0% (8/40)
2024-05-01 03:12:22,087 : INFO : city-in-state: 58.9% (1334/2266)
2024-05-01 03:12:22,370 : INFO : family: 53.6% (225/420)
2024-05-01 03:12:22,935 : INFO : gram1-adjective-to-adverb: 8.4% (68/812)
2024-05-01 03:12:23,236 : INFO : gram2-opposite: 9.5% (36/380)
2024-05-01 03:12:24,300 : INFO : gram3-comparative: 39.8% (530/1332)
2024-05-01 03:12:25,009 : INFO : gram4-superlative: 14.3% (142/992)
2024-05-01 03:12:25,535 : INFO : gram5-present-participle: 44.9% (315/702)
2024-05-01 03:12:26,270 : INFO : gram6-nationality-adjective: 72.2% (887/1229)
2024-05-01 03:12:27,304 : INFO : gram7-past-tense: 40.3% (628/1560)
2024-05-01 03:12:28,105 : INFO : gram8-plural: 57.0% (639/1122)
2024-05-01 03:12:28,419 : INFO : gram9-plural-verbs: 31.2% (131/420)
2024-05-01 03:12:28,420 : INFO : Quadruplets with ou

Processing interval 2010-2014


2024-05-01 03:12:44,321 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/cbs/2005-2009.model
2024-05-01 03:12:44,336 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/cbs/2005-2009.model.wv.* with mmap=None
2024-05-01 03:12:44,336 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/cbs/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 03:12:44,378 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/cbs/2005-2009.model.syn1neg.npy with mmap=None
2024-05-01 03:12:44,442 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/cbs/2005-2009.model


2024-05-01 03:12:44,697 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/cbs/2005-2009.model', 'datetime': '2024-05-01T03:12:44.697196', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:12:44,697 : INFO : collecting all words and their counts
2024-05-01 03:12:44,697 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:12:44,876 : INFO : PROGRESS: at sentence #10000, processed 2552525 words, keeping 80950 word types
2024-05-01 03:12:45,052 : INFO : PROGRESS: at sentence #20000, processed 5127253 words, keeping 118009 word types
2024-05-01 03:12:45,218 : INFO : PROGRESS: at sentence #30000, processed 7583755 words, keeping 148671 word types
2024-05-01 03:12:45,386 : INFO : PROGRESS: at sentence #40000, processed 10018896 words, keeping 174774 word types
2024-05-01 03:12:45,557 : INFO : PROGRESS: at sentence #

Computing time for training the model: 70.08129405975342 seconds
Number of words processed per second: 731724.3879126571
Most frequent words in model: ['said', 'he', 'his', 'she', 'people', 'new', 'says', 'her', 'year', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/cbs/2010-2014.model


2024-05-01 03:13:55,182 : INFO : capital-common-countries: 67.5% (312/462)
2024-05-01 03:13:56,702 : INFO : capital-world: 58.1% (697/1200)
2024-05-01 03:13:56,739 : INFO : currency: 12.5% (5/40)
2024-05-01 03:13:58,958 : INFO : city-in-state: 70.9% (1606/2266)
2024-05-01 03:13:59,340 : INFO : family: 53.6% (225/420)
2024-05-01 03:14:00,194 : INFO : gram1-adjective-to-adverb: 16.5% (134/812)
2024-05-01 03:14:00,607 : INFO : gram2-opposite: 17.6% (67/380)
2024-05-01 03:14:01,759 : INFO : gram3-comparative: 43.5% (579/1332)
2024-05-01 03:14:02,614 : INFO : gram4-superlative: 21.3% (211/992)
2024-05-01 03:14:03,181 : INFO : gram5-present-participle: 45.4% (319/702)
2024-05-01 03:14:04,184 : INFO : gram6-nationality-adjective: 82.3% (1012/1229)
2024-05-01 03:14:05,458 : INFO : gram7-past-tense: 44.1% (688/1560)
2024-05-01 03:14:06,440 : INFO : gram8-plural: 68.2% (765/1122)
2024-05-01 03:14:06,794 : INFO : gram9-plural-verbs: 24.5% (103/420)
2024-05-01 03:14:06,796 : INFO : Quadruplets wit

Processing interval 2015-2019


2024-05-01 03:14:08,837 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/cbs/2010-2014.model
2024-05-01 03:14:08,853 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/cbs/2010-2014.model.wv.* with mmap=None
2024-05-01 03:14:08,853 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/cbs/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 03:14:08,886 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/cbs/2010-2014.model.syn1neg.npy with mmap=None
2024-05-01 03:14:08,936 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/cbs/2010-2014.model


2024-05-01 03:14:09,248 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/cbs/2010-2014.model', 'datetime': '2024-05-01T03:14:09.248379', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:14:09,248 : INFO : collecting all words and their counts
2024-05-01 03:14:09,248 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:14:09,428 : INFO : PROGRESS: at sentence #10000, processed 2586396 words, keeping 81607 word types
2024-05-01 03:14:09,613 : INFO : PROGRESS: at sentence #20000, processed 5204377 words, keeping 126713 word types
2024-05-01 03:14:09,800 : INFO : PROGRESS: at sentence #30000, processed 7875535 words, keeping 160195 word types
2024-05-01 03:14:09,994 : INFO : PROGRESS: at sentence #40000, processed 10634564 words, keeping 189692 word types
2024-05-01 03:14:10,179 : INFO : PROGRESS: at sentence #

Computing time for training the model: 54.65019631385803 seconds
Number of words processed per second: 722918.9218846734
Most frequent words in model: ['said', 'he', 'his', 'she', 'people', 'new', 'says', 'her', 'year', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/cbs/2015-2019.model


2024-05-01 03:15:04,150 : INFO : capital-common-countries: 67.7% (313/462)
2024-05-01 03:15:05,267 : INFO : capital-world: 57.9% (695/1200)
2024-05-01 03:15:05,302 : INFO : currency: 10.0% (4/40)
2024-05-01 03:15:07,087 : INFO : city-in-state: 78.9% (1789/2266)
2024-05-01 03:15:07,411 : INFO : family: 55.5% (233/420)
2024-05-01 03:15:08,070 : INFO : gram1-adjective-to-adverb: 18.2% (148/812)
2024-05-01 03:15:08,362 : INFO : gram2-opposite: 14.2% (54/380)
2024-05-01 03:15:09,447 : INFO : gram3-comparative: 46.4% (618/1332)
2024-05-01 03:15:10,262 : INFO : gram4-superlative: 25.8% (256/992)
2024-05-01 03:15:10,819 : INFO : gram5-present-participle: 44.6% (313/702)
2024-05-01 03:15:11,710 : INFO : gram6-nationality-adjective: 84.3% (1036/1229)
2024-05-01 03:15:12,939 : INFO : gram7-past-tense: 44.2% (690/1560)
2024-05-01 03:15:13,754 : INFO : gram8-plural: 69.5% (780/1122)
2024-05-01 03:15:14,097 : INFO : gram9-plural-verbs: 35.7% (150/420)
2024-05-01 03:15:14,101 : INFO : Quadruplets wit

Processing interval 2000-2004


2024-05-01 03:15:14,391 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T03:15:14.391634', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 03:15:14,391 : INFO : collecting all words and their counts
2024-05-01 03:15:14,392 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:15:14,392 : INFO : collected 2 word types from a corpus of 6 raw words and 6 sentences
2024-05-01 03:15:14,392 : INFO : Creating a fresh vocabulary
2024-05-01 03:15:14,392 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1 unique words (50.00% of original 2, drops 1)', 'datetime': '2024-05-01T03:15:14.392553', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-05-01 03:

Training new model
Computing time for training the model: 0.01347494125366211 seconds
Number of words processed per second: 445.2709579249089
Most frequent words in model: [',']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/reuters/2000-2004.model
Processing interval 2005-2009


2024-05-01 03:15:27,102 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/reuters/2000-2004.model
2024-05-01 03:15:27,104 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/reuters/2000-2004.model.wv.* with mmap=None
2024-05-01 03:15:27,104 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:15:27,104 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/reuters/2000-2004.model', 'datetime': '2024-05-01T03:15:27.104745', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:15:27,104 : INFO : collecting all words and their counts
2024-05-01 03:15:27,105 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:15:27,239 : INFO : PROGRESS: at sentence #10000, processed 1815463 words, keeping 64874 word types


Loading model from /Users/eddyji/GitRepos/01_TAD/models/reuters/2000-2004.model


2024-05-01 03:15:27,347 : INFO : PROGRESS: at sentence #20000, processed 3348646 words, keeping 92128 word types
2024-05-01 03:15:27,489 : INFO : PROGRESS: at sentence #30000, processed 4881552 words, keeping 115747 word types
2024-05-01 03:15:27,620 : INFO : PROGRESS: at sentence #40000, processed 6415632 words, keeping 136677 word types
2024-05-01 03:15:27,760 : INFO : PROGRESS: at sentence #50000, processed 8043182 words, keeping 156818 word types
2024-05-01 03:15:27,900 : INFO : PROGRESS: at sentence #60000, processed 9672001 words, keeping 175960 word types
2024-05-01 03:15:28,041 : INFO : PROGRESS: at sentence #70000, processed 11308488 words, keeping 193906 word types
2024-05-01 03:15:28,150 : INFO : PROGRESS: at sentence #80000, processed 12740474 words, keeping 210145 word types
2024-05-01 03:15:28,277 : INFO : PROGRESS: at sentence #90000, processed 14215240 words, keeping 226493 word types
2024-05-01 03:15:28,412 : INFO : PROGRESS: at sentence #100000, processed 15872466 wor

Computing time for training the model: 185.5567228794098 seconds
Number of words processed per second: 753931.4977604813
Most frequent words in model: [',', 'somalia', 'mogadishu', 'adds', 'ethiopian', 'islamist', 'said', 'prisoners', 'jan', 'war']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/reuters/2005-2009.model


2024-05-01 03:18:33,910 : INFO : capital-common-countries: 69.9% (323/462)
2024-05-01 03:18:35,357 : INFO : capital-world: 72.6% (1265/1743)
2024-05-01 03:18:35,489 : INFO : currency: 7.8% (16/206)
2024-05-01 03:18:36,476 : INFO : city-in-state: 65.7% (921/1402)
2024-05-01 03:18:36,704 : INFO : family: 46.3% (126/272)
2024-05-01 03:18:37,196 : INFO : gram1-adjective-to-adverb: 14.8% (104/702)
2024-05-01 03:18:37,504 : INFO : gram2-opposite: 13.1% (55/420)
2024-05-01 03:18:38,367 : INFO : gram3-comparative: 38.0% (479/1260)
2024-05-01 03:18:38,802 : INFO : gram4-superlative: 14.8% (89/600)
2024-05-01 03:18:39,272 : INFO : gram5-present-participle: 34.0% (204/600)
2024-05-01 03:18:39,948 : INFO : gram6-nationality-adjective: 92.0% (1006/1093)
2024-05-01 03:18:40,894 : INFO : gram7-past-tense: 36.9% (519/1406)
2024-05-01 03:18:41,437 : INFO : gram8-plural: 52.8% (371/702)
2024-05-01 03:18:41,745 : INFO : gram9-plural-verbs: 19.5% (90/462)
2024-05-01 03:18:41,746 : INFO : Quadruplets with 

Processing interval 2010-2014


2024-05-01 03:20:17,405 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/reuters/2005-2009.model
2024-05-01 03:20:17,426 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/reuters/2005-2009.model.wv.* with mmap=None
2024-05-01 03:20:17,427 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/reuters/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 03:20:17,518 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/reuters/2005-2009.model.syn1neg.npy with mmap=None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/reuters/2005-2009.model


2024-05-01 03:20:17,612 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:20:18,001 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/reuters/2005-2009.model', 'datetime': '2024-05-01T03:20:18.001258', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:20:18,001 : INFO : collecting all words and their counts
2024-05-01 03:20:18,002 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:20:18,173 : INFO : PROGRESS: at sentence #10000, processed 1907352 words, keeping 62284 word types
2024-05-01 03:20:18,356 : INFO : PROGRESS: at sentence #20000, processed 3812573 words, keeping 91068 word types
2024-05-01 03:20:18,524 : INFO : PROGRESS: at sentence #30000, processed 5761658 words, keeping 115577 word types
2024-05-01 03:20:18,700 : INFO : PROGRESS: at sentence #40000, processed 7711174 words, kee

Computing time for training the model: 362.68059372901917 seconds
Number of words processed per second: 716066.0137058233
Most frequent words in model: [',', 'somalia', 'mogadishu', 'adds', 'ethiopian', 'islamist', 'said', 'prisoners', 'jan', 'war']


2024-05-01 03:26:21,139 : INFO : Word2Vec lifecycle event {'fname_or_handle': '/Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-05-01T03:26:21.139509', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'saving'}
2024-05-01 03:26:21,140 : INFO : storing np array 'vectors' to /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model.wv.vectors.npy
2024-05-01 03:26:21,178 : INFO : storing np array 'syn1neg' to /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model.syn1neg.npy
2024-05-01 03:26:21,228 : INFO : not storing attribute cum_table
2024-05-01 03:26:21,292 : INFO : saved /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model
2024-05-01 03:26:21,302 : INFO : Skipping line #15 with OOV words: cucumber	potato	5.92
2024-05-01 03:26:21,303 : INFO : Skipping line #34 with OOV words: profe

Model saved to /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model


2024-05-01 03:26:22,061 : INFO : capital-common-countries: 75.3% (348/462)
2024-05-01 03:26:23,640 : INFO : capital-world: 76.3% (1330/1743)
2024-05-01 03:26:23,814 : INFO : currency: 9.7% (20/206)
2024-05-01 03:26:24,891 : INFO : city-in-state: 75.2% (1054/1402)
2024-05-01 03:26:25,135 : INFO : family: 45.6% (124/272)
2024-05-01 03:26:25,638 : INFO : gram1-adjective-to-adverb: 19.8% (139/702)
2024-05-01 03:26:25,973 : INFO : gram2-opposite: 17.1% (72/420)
2024-05-01 03:26:26,789 : INFO : gram3-comparative: 40.6% (512/1260)
2024-05-01 03:26:27,244 : INFO : gram4-superlative: 14.3% (86/600)
2024-05-01 03:26:27,692 : INFO : gram5-present-participle: 41.8% (251/600)
2024-05-01 03:26:28,345 : INFO : gram6-nationality-adjective: 95.5% (1044/1093)
2024-05-01 03:26:29,350 : INFO : gram7-past-tense: 42.0% (590/1406)
2024-05-01 03:26:29,838 : INFO : gram8-plural: 59.0% (414/702)
2024-05-01 03:26:30,148 : INFO : gram9-plural-verbs: 22.1% (102/462)
2024-05-01 03:26:30,150 : INFO : Quadruplets wit

Processing interval 2015-2019


2024-05-01 03:28:20,515 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model
2024-05-01 03:28:20,557 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model.wv.* with mmap=None
2024-05-01 03:28:20,558 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 03:28:20,704 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model.syn1neg.npy with mmap=None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model


2024-05-01 03:28:20,798 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:28:21,395 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/reuters/2010-2014.model', 'datetime': '2024-05-01T03:28:21.395140', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:28:21,395 : INFO : collecting all words and their counts
2024-05-01 03:28:21,395 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:28:21,565 : INFO : PROGRESS: at sentence #10000, processed 1793755 words, keeping 63447 word types
2024-05-01 03:28:21,717 : INFO : PROGRESS: at sentence #20000, processed 3535851 words, keeping 92607 word types
2024-05-01 03:28:21,871 : INFO : PROGRESS: at sentence #30000, processed 5265145 words, keeping 118217 word types
2024-05-01 03:28:22,033 : INFO : PROGRESS: at sentence #40000, processed 7008941 words, kee

Computing time for training the model: 244.28743505477905 seconds
Number of words processed per second: 716063.6975077114
Most frequent words in model: [',', 'somalia', 'mogadishu', 'adds', 'ethiopian', 'islamist', 'said', 'prisoners', 'jan', 'war']


2024-05-01 03:32:25,686 : INFO : Word2Vec lifecycle event {'fname_or_handle': '/Users/eddyji/GitRepos/01_TAD/models/reuters/2015-2019.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-05-01T03:32:25.686439', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'saving'}
2024-05-01 03:32:25,686 : INFO : storing np array 'vectors' to /Users/eddyji/GitRepos/01_TAD/models/reuters/2015-2019.model.wv.vectors.npy
2024-05-01 03:32:25,730 : INFO : storing np array 'syn1neg' to /Users/eddyji/GitRepos/01_TAD/models/reuters/2015-2019.model.syn1neg.npy
2024-05-01 03:32:25,787 : INFO : not storing attribute cum_table
2024-05-01 03:32:25,851 : INFO : saved /Users/eddyji/GitRepos/01_TAD/models/reuters/2015-2019.model
2024-05-01 03:32:25,866 : INFO : Skipping line #15 with OOV words: cucumber	potato	5.92
2024-05-01 03:32:25,866 : INFO : Skipping line #34 with OOV words: profe

Model saved to /Users/eddyji/GitRepos/01_TAD/models/reuters/2015-2019.model


2024-05-01 03:32:26,465 : INFO : capital-common-countries: 78.6% (363/462)
2024-05-01 03:32:28,042 : INFO : capital-world: 73.3% (1277/1743)
2024-05-01 03:32:28,206 : INFO : currency: 9.2% (19/206)
2024-05-01 03:32:29,398 : INFO : city-in-state: 61.9% (868/1402)
2024-05-01 03:32:29,612 : INFO : family: 54.0% (147/272)
2024-05-01 03:32:30,183 : INFO : gram1-adjective-to-adverb: 19.2% (135/702)
2024-05-01 03:32:30,571 : INFO : gram2-opposite: 14.5% (61/420)
2024-05-01 03:32:31,456 : INFO : gram3-comparative: 30.9% (389/1260)
2024-05-01 03:32:31,923 : INFO : gram4-superlative: 13.7% (82/600)
2024-05-01 03:32:32,511 : INFO : gram5-present-participle: 34.0% (204/600)
2024-05-01 03:32:33,337 : INFO : gram6-nationality-adjective: 95.1% (1039/1093)
2024-05-01 03:32:34,321 : INFO : gram7-past-tense: 45.6% (641/1406)
2024-05-01 03:32:34,858 : INFO : gram8-plural: 61.0% (428/702)
2024-05-01 03:32:35,213 : INFO : gram9-plural-verbs: 21.6% (100/462)
2024-05-01 03:32:35,220 : INFO : Quadruplets with

Processing interval 2000-2004


2024-05-01 03:32:42,561 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T03:32:42.561897', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 03:32:42,562 : INFO : collecting all words and their counts
2024-05-01 03:32:42,562 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training new model


2024-05-01 03:32:42,794 : INFO : PROGRESS: at sentence #10000, processed 3322280 words, keeping 100470 word types
2024-05-01 03:32:43,066 : INFO : PROGRESS: at sentence #20000, processed 6707830 words, keeping 142905 word types
2024-05-01 03:32:43,314 : INFO : PROGRESS: at sentence #30000, processed 10100748 words, keeping 178008 word types
2024-05-01 03:32:43,546 : INFO : PROGRESS: at sentence #40000, processed 13513916 words, keeping 208941 word types
2024-05-01 03:32:43,787 : INFO : PROGRESS: at sentence #50000, processed 16938381 words, keeping 237055 word types
2024-05-01 03:32:44,040 : INFO : PROGRESS: at sentence #60000, processed 20374082 words, keeping 264216 word types
2024-05-01 03:32:44,283 : INFO : PROGRESS: at sentence #70000, processed 23884707 words, keeping 290217 word types
2024-05-01 03:32:44,532 : INFO : PROGRESS: at sentence #80000, processed 27427271 words, keeping 324129 word types
2024-05-01 03:32:44,791 : INFO : PROGRESS: at sentence #90000, processed 30963406 

Computing time for training the model: 212.0238058567047 seconds
Number of words processed per second: 671712.2137513803
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'she', 'her', 'year', 'like', 'people']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyt/2000-2004.model


2024-05-01 03:36:15,529 : INFO : capital-common-countries: 69.3% (320/462)
2024-05-01 03:36:16,413 : INFO : capital-world: 58.9% (538/914)
2024-05-01 03:36:16,488 : INFO : currency: 11.6% (10/86)
2024-05-01 03:36:18,077 : INFO : city-in-state: 39.9% (823/2064)
2024-05-01 03:36:18,477 : INFO : family: 67.9% (285/420)
2024-05-01 03:36:19,129 : INFO : gram1-adjective-to-adverb: 17.7% (154/870)
2024-05-01 03:36:19,394 : INFO : gram2-opposite: 15.4% (71/462)
2024-05-01 03:36:20,259 : INFO : gram3-comparative: 60.6% (807/1332)
2024-05-01 03:36:20,932 : INFO : gram4-superlative: 20.4% (166/812)
2024-05-01 03:36:21,431 : INFO : gram5-present-participle: 46.7% (328/702)
2024-05-01 03:36:22,691 : INFO : gram6-nationality-adjective: 79.5% (1033/1299)
2024-05-01 03:36:23,826 : INFO : gram7-past-tense: 56.8% (886/1560)
2024-05-01 03:36:24,594 : INFO : gram8-plural: 70.2% (741/1056)
2024-05-01 03:36:24,974 : INFO : gram9-plural-verbs: 39.1% (198/506)
2024-05-01 03:36:24,976 : INFO : Quadruplets with

Processing interval 2005-2009


2024-05-01 03:36:51,812 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/nyt/2000-2004.model
2024-05-01 03:36:51,840 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/nyt/2000-2004.model.wv.* with mmap=None
2024-05-01 03:36:51,841 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/nyt/2000-2004.model.wv.vectors.npy with mmap=None
2024-05-01 03:36:51,938 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/nyt/2000-2004.model.syn1neg.npy with mmap=None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/nyt/2000-2004.model


2024-05-01 03:36:52,032 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:36:52,509 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/nyt/2000-2004.model', 'datetime': '2024-05-01T03:36:52.509196', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:36:52,509 : INFO : collecting all words and their counts
2024-05-01 03:36:52,509 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:36:52,856 : INFO : PROGRESS: at sentence #10000, processed 4564803 words, keeping 115715 word types
2024-05-01 03:36:53,172 : INFO : PROGRESS: at sentence #20000, processed 9025703 words, keeping 162439 word types
2024-05-01 03:36:53,529 : INFO : PROGRESS: at sentence #30000, processed 13591788 words, keeping 201459 word types
2024-05-01 03:36:53,866 : INFO : PROGRESS: at sentence #40000, processed 18136641 words, kee

Computing time for training the model: 172.32420706748962 seconds
Number of words processed per second: 638408.8624119653
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'she', 'her', 'year', 'like', 'people']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyt/2005-2009.model


2024-05-01 03:39:44,954 : INFO : capital-common-countries: 74.2% (343/462)
2024-05-01 03:39:45,759 : INFO : capital-world: 66.7% (610/914)
2024-05-01 03:39:45,887 : INFO : currency: 11.6% (10/86)
2024-05-01 03:39:47,384 : INFO : city-in-state: 55.3% (1141/2064)
2024-05-01 03:39:47,695 : INFO : family: 66.9% (281/420)
2024-05-01 03:39:48,308 : INFO : gram1-adjective-to-adverb: 19.9% (173/870)
2024-05-01 03:39:48,649 : INFO : gram2-opposite: 18.4% (85/462)
2024-05-01 03:39:49,555 : INFO : gram3-comparative: 66.6% (887/1332)
2024-05-01 03:39:50,203 : INFO : gram4-superlative: 26.2% (213/812)
2024-05-01 03:39:50,739 : INFO : gram5-present-participle: 50.4% (354/702)
2024-05-01 03:39:51,642 : INFO : gram6-nationality-adjective: 84.5% (1098/1299)
2024-05-01 03:39:52,730 : INFO : gram7-past-tense: 58.6% (914/1560)
2024-05-01 03:39:53,496 : INFO : gram8-plural: 76.0% (803/1056)
2024-05-01 03:39:53,836 : INFO : gram9-plural-verbs: 44.9% (227/506)
2024-05-01 03:39:53,839 : INFO : Quadruplets wit

Processing interval 2010-2014


2024-05-01 03:40:01,573 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/nyt/2005-2009.model
2024-05-01 03:40:01,601 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/nyt/2005-2009.model.wv.* with mmap=None
2024-05-01 03:40:01,601 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/nyt/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 03:40:01,663 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/nyt/2005-2009.model.syn1neg.npy with mmap=None
2024-05-01 03:40:01,754 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/nyt/2005-2009.model


2024-05-01 03:40:02,300 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/nyt/2005-2009.model', 'datetime': '2024-05-01T03:40:02.300818', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:40:02,301 : INFO : collecting all words and their counts
2024-05-01 03:40:02,301 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:40:02,563 : INFO : PROGRESS: at sentence #10000, processed 3305014 words, keeping 98585 word types
2024-05-01 03:40:02,791 : INFO : PROGRESS: at sentence #20000, processed 6476359 words, keeping 142051 word types
2024-05-01 03:40:03,038 : INFO : PROGRESS: at sentence #30000, processed 9640808 words, keeping 177236 word types
2024-05-01 03:40:03,273 : INFO : PROGRESS: at sentence #40000, processed 12925349 words, keeping 208591 word types
2024-05-01 03:40:03,509 : INFO : PROGRESS: at sentence #

Computing time for training the model: 268.58248710632324 seconds
Number of words processed per second: 615169.7111010561
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'she', 'her', 'year', 'like', 'people']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyt/2010-2014.model


2024-05-01 03:44:30,981 : INFO : capital-common-countries: 79.9% (369/462)
2024-05-01 03:44:31,851 : INFO : capital-world: 73.9% (675/914)
2024-05-01 03:44:31,926 : INFO : currency: 14.0% (12/86)
2024-05-01 03:44:33,563 : INFO : city-in-state: 61.2% (1264/2064)
2024-05-01 03:44:33,897 : INFO : family: 74.8% (314/420)
2024-05-01 03:44:34,648 : INFO : gram1-adjective-to-adverb: 20.5% (178/870)
2024-05-01 03:44:35,013 : INFO : gram2-opposite: 13.4% (62/462)
2024-05-01 03:44:36,176 : INFO : gram3-comparative: 62.0% (826/1332)
2024-05-01 03:44:36,849 : INFO : gram4-superlative: 33.3% (270/812)
2024-05-01 03:44:37,386 : INFO : gram5-present-participle: 55.7% (391/702)
2024-05-01 03:44:38,338 : INFO : gram6-nationality-adjective: 91.8% (1192/1299)
2024-05-01 03:44:39,594 : INFO : gram7-past-tense: 57.7% (900/1560)
2024-05-01 03:44:40,412 : INFO : gram8-plural: 72.6% (767/1056)
2024-05-01 03:44:40,867 : INFO : gram9-plural-verbs: 37.4% (189/506)
2024-05-01 03:44:40,868 : INFO : Quadruplets wit

Processing interval 2015-2019


2024-05-01 03:45:05,944 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/nyt/2010-2014.model
2024-05-01 03:45:05,991 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/nyt/2010-2014.model.wv.* with mmap=None
2024-05-01 03:45:05,991 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/nyt/2010-2014.model.wv.vectors.npy with mmap=None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/nyt/2010-2014.model


2024-05-01 03:45:06,145 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/nyt/2010-2014.model.syn1neg.npy with mmap=None
2024-05-01 03:45:06,328 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:45:07,050 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/nyt/2010-2014.model', 'datetime': '2024-05-01T03:45:07.050479', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:45:07,050 : INFO : collecting all words and their counts
2024-05-01 03:45:07,051 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:45:07,434 : INFO : PROGRESS: at sentence #10000, processed 3889240 words, keeping 108512 word types
2024-05-01 03:45:07,774 : INFO : PROGRESS: at sentence #20000, processed 7726461 words, keeping 153749 word types
2024-05-01 03:45:08,147 : INFO : PROGRESS: at sentence #30000, proces

Computing time for training the model: 204.40066409111023 seconds
Number of words processed per second: 641528.3070780543
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'she', 'her', 'year', 'like', 'people']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyt/2015-2019.model


2024-05-01 03:48:31,415 : INFO : capital-common-countries: 79.7% (368/462)
2024-05-01 03:48:32,233 : INFO : capital-world: 75.6% (691/914)
2024-05-01 03:48:32,343 : INFO : currency: 15.1% (13/86)
2024-05-01 03:48:34,141 : INFO : city-in-state: 67.1% (1384/2064)
2024-05-01 03:48:34,535 : INFO : family: 73.3% (308/420)
2024-05-01 03:48:35,309 : INFO : gram1-adjective-to-adverb: 21.1% (184/870)
2024-05-01 03:48:35,690 : INFO : gram2-opposite: 14.5% (67/462)
2024-05-01 03:48:36,992 : INFO : gram3-comparative: 56.9% (758/1332)
2024-05-01 03:48:37,719 : INFO : gram4-superlative: 34.2% (278/812)
2024-05-01 03:48:38,269 : INFO : gram5-present-participle: 55.4% (389/702)
2024-05-01 03:48:39,315 : INFO : gram6-nationality-adjective: 93.1% (1210/1299)
2024-05-01 03:48:40,710 : INFO : gram7-past-tense: 59.7% (931/1560)
2024-05-01 03:48:41,688 : INFO : gram8-plural: 73.7% (778/1056)
2024-05-01 03:48:42,126 : INFO : gram9-plural-verbs: 37.5% (190/506)
2024-05-01 03:48:42,127 : INFO : Quadruplets wit

Processing interval 2000-2004


2024-05-01 03:48:43,135 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T03:48:43.135953', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 03:48:43,136 : INFO : collecting all words and their counts
2024-05-01 03:48:43,136 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training new model


2024-05-01 03:48:43,420 : INFO : PROGRESS: at sentence #10000, processed 3985181 words, keeping 103954 word types
2024-05-01 03:48:43,628 : INFO : collected 133455 word types from a corpus of 6620723 raw words and 15313 sentences
2024-05-01 03:48:43,628 : INFO : Creating a fresh vocabulary
2024-05-01 03:48:43,683 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 50838 unique words (38.09% of original 133455, drops 82617)', 'datetime': '2024-05-01T03:48:43.683334', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-05-01 03:48:43,683 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 6493934 word corpus (98.08% of original 6620723, drops 126789)', 'datetime': '2024-05-01T03:48:43.683687', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'prepare_voc

Computing time for training the model: 9.962915182113647 seconds
Number of words processed per second: 664536.72233265
Most frequent words in model: ['he', 'his', 'new', 'like', 'bush', 'her', 'she', 'people', 'time', 'him']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/slate/2000-2004.model


2024-05-01 03:48:53,734 : INFO : capital-common-countries: 4.8% (20/420)
2024-05-01 03:48:54,363 : INFO : capital-world: 8.8% (59/674)
2024-05-01 03:48:54,422 : INFO : currency: 0.0% (0/68)
2024-05-01 03:48:55,294 : INFO : city-in-state: 2.2% (24/1083)
2024-05-01 03:48:55,762 : INFO : family: 10.2% (43/420)
2024-05-01 03:48:56,518 : INFO : gram1-adjective-to-adverb: 0.8% (7/870)
2024-05-01 03:48:57,072 : INFO : gram2-opposite: 0.3% (2/702)
2024-05-01 03:48:58,218 : INFO : gram3-comparative: 3.8% (50/1332)
2024-05-01 03:48:58,902 : INFO : gram4-superlative: 1.6% (14/870)
2024-05-01 03:48:59,440 : INFO : gram5-present-participle: 11.7% (76/650)
2024-05-01 03:49:00,469 : INFO : gram6-nationality-adjective: 20.5% (238/1161)
2024-05-01 03:49:01,764 : INFO : gram7-past-tense: 9.6% (142/1482)
2024-05-01 03:49:02,610 : INFO : gram8-plural: 14.1% (149/1056)
2024-05-01 03:49:02,984 : INFO : gram9-plural-verbs: 3.6% (15/420)
2024-05-01 03:49:02,986 : INFO : Quadruplets with out-of-vocabulary word

Processing interval 2005-2009


2024-05-01 03:49:03,473 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/slate/2000-2004.model
2024-05-01 03:49:03,479 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/slate/2000-2004.model.wv.* with mmap=None
2024-05-01 03:49:03,479 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/slate/2000-2004.model.wv.vectors.npy with mmap=None
2024-05-01 03:49:03,486 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/slate/2000-2004.model.syn1neg.npy with mmap=None
2024-05-01 03:49:03,494 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:49:03,614 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/slate/2000-2004.model', 'datetime': '2024-05-01T03:49:03.614521', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:49:03,614 : INFO : collecting all words and t

Loading model from /Users/eddyji/GitRepos/01_TAD/models/slate/2000-2004.model


2024-05-01 03:49:03,949 : INFO : PROGRESS: at sentence #10000, processed 4579533 words, keeping 114535 word types
2024-05-01 03:49:04,198 : INFO : PROGRESS: at sentence #20000, processed 8213939 words, keeping 153475 word types
2024-05-01 03:49:04,336 : INFO : collected 172328 word types from a corpus of 10184282 raw words and 26045 sentences
2024-05-01 03:49:04,336 : INFO : Updating model with new vocabulary
2024-05-01 03:49:04,439 : INFO : Word2Vec lifecycle event {'msg': 'added 16493 new unique words (9.57% of original 172328) and increased the count of 44142 pre-existing words (25.62% of original 172328)', 'datetime': '2024-05-01T03:49:04.439744', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-05-01 03:49:04,525 : INFO : deleting the raw counts dictionary of 172328 items
2024-05-01 03:49:04,527 : INFO : sample=0.0001 downsamples 621 most-common words
2024-05-01 03:49:04,

Computing time for training the model: 15.57312297821045 seconds
Number of words processed per second: 653965.2974069241
Most frequent words in model: ['he', 'his', 'new', 'like', 'bush', 'her', 'she', 'people', 'time', 'him']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/slate/2005-2009.model


2024-05-01 03:49:19,641 : INFO : capital-common-countries: 18.6% (78/420)
2024-05-01 03:49:20,423 : INFO : capital-world: 21.4% (144/674)
2024-05-01 03:49:20,480 : INFO : currency: 8.8% (6/68)
2024-05-01 03:49:21,328 : INFO : city-in-state: 7.3% (79/1083)
2024-05-01 03:49:21,607 : INFO : family: 32.9% (138/420)
2024-05-01 03:49:22,226 : INFO : gram1-adjective-to-adverb: 2.3% (20/870)
2024-05-01 03:49:22,711 : INFO : gram2-opposite: 3.0% (21/702)
2024-05-01 03:49:23,647 : INFO : gram3-comparative: 23.5% (313/1332)
2024-05-01 03:49:24,227 : INFO : gram4-superlative: 4.9% (43/870)
2024-05-01 03:49:24,699 : INFO : gram5-present-participle: 24.9% (162/650)
2024-05-01 03:49:25,558 : INFO : gram6-nationality-adjective: 46.0% (534/1161)
2024-05-01 03:49:26,555 : INFO : gram7-past-tense: 28.4% (421/1482)
2024-05-01 03:49:27,281 : INFO : gram8-plural: 41.3% (436/1056)
2024-05-01 03:49:27,658 : INFO : gram9-plural-verbs: 13.6% (57/420)
2024-05-01 03:49:27,661 : INFO : Quadruplets with out-of-voca

Processing interval 2010-2014


2024-05-01 03:49:28,484 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/slate/2005-2009.model
2024-05-01 03:49:28,491 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/slate/2005-2009.model.wv.* with mmap=None
2024-05-01 03:49:28,491 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/slate/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 03:49:28,501 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/slate/2005-2009.model.syn1neg.npy with mmap=None
2024-05-01 03:49:28,511 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:49:28,663 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/slate/2005-2009.model', 'datetime': '2024-05-01T03:49:28.663669', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:49:28,664 : INFO : collecting all words and t

Loading model from /Users/eddyji/GitRepos/01_TAD/models/slate/2005-2009.model


2024-05-01 03:49:28,893 : INFO : PROGRESS: at sentence #10000, processed 3192767 words, keeping 100797 word types
2024-05-01 03:49:29,102 : INFO : PROGRESS: at sentence #20000, processed 6198933 words, keeping 142296 word types
2024-05-01 03:49:29,302 : INFO : PROGRESS: at sentence #30000, processed 9139696 words, keeping 171616 word types
2024-05-01 03:49:29,509 : INFO : PROGRESS: at sentence #40000, processed 12179687 words, keeping 199574 word types
2024-05-01 03:49:29,723 : INFO : PROGRESS: at sentence #50000, processed 15371516 words, keeping 227897 word types
2024-05-01 03:49:29,947 : INFO : PROGRESS: at sentence #60000, processed 18699205 words, keeping 256445 word types
2024-05-01 03:49:29,986 : INFO : collected 261046 word types from a corpus of 19269803 raw words and 61621 sentences
2024-05-01 03:49:29,986 : INFO : Updating model with new vocabulary
2024-05-01 03:49:30,141 : INFO : Word2Vec lifecycle event {'msg': 'added 22666 new unique words (8.68% of original 261046) and i

Computing time for training the model: 30.105632066726685 seconds
Number of words processed per second: 640073.0254488612
Most frequent words in model: ['he', 'his', 'new', 'like', 'bush', 'her', 'she', 'people', 'time', 'him']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/slate/2010-2014.model


2024-05-01 03:49:59,099 : INFO : capital-common-countries: 44.8% (188/420)
2024-05-01 03:50:00,091 : INFO : capital-world: 38.6% (260/674)
2024-05-01 03:50:00,156 : INFO : currency: 14.7% (10/68)
2024-05-01 03:50:01,160 : INFO : city-in-state: 21.8% (236/1083)
2024-05-01 03:50:01,600 : INFO : family: 55.7% (234/420)
2024-05-01 03:50:02,477 : INFO : gram1-adjective-to-adverb: 7.5% (65/870)
2024-05-01 03:50:03,112 : INFO : gram2-opposite: 9.4% (66/702)
2024-05-01 03:50:04,455 : INFO : gram3-comparative: 47.1% (627/1332)
2024-05-01 03:50:05,153 : INFO : gram4-superlative: 15.7% (137/870)
2024-05-01 03:50:05,686 : INFO : gram5-present-participle: 51.1% (332/650)
2024-05-01 03:50:06,637 : INFO : gram6-nationality-adjective: 68.6% (797/1161)
2024-05-01 03:50:07,844 : INFO : gram7-past-tense: 40.8% (605/1482)
2024-05-01 03:50:08,842 : INFO : gram8-plural: 63.2% (667/1056)
2024-05-01 03:50:09,206 : INFO : gram9-plural-verbs: 26.4% (111/420)
2024-05-01 03:50:09,207 : INFO : Quadruplets with out

Processing interval 2015-2019


2024-05-01 03:50:10,283 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/slate/2010-2014.model
2024-05-01 03:50:10,293 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/slate/2010-2014.model.wv.* with mmap=None
2024-05-01 03:50:10,293 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/slate/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 03:50:10,305 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/slate/2010-2014.model.syn1neg.npy with mmap=None
2024-05-01 03:50:10,318 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/slate/2010-2014.model


2024-05-01 03:50:10,515 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/slate/2010-2014.model', 'datetime': '2024-05-01T03:50:10.515553', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:50:10,515 : INFO : collecting all words and their counts
2024-05-01 03:50:10,516 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:50:10,758 : INFO : PROGRESS: at sentence #10000, processed 3407191 words, keeping 98586 word types
2024-05-01 03:50:10,995 : INFO : PROGRESS: at sentence #20000, processed 6902607 words, keeping 139596 word types
2024-05-01 03:50:11,231 : INFO : PROGRESS: at sentence #30000, processed 10439686 words, keeping 172783 word types
2024-05-01 03:50:11,496 : INFO : PROGRESS: at sentence #40000, processed 14418409 words, keeping 202980 word types
2024-05-01 03:50:11,767 : INFO : PROGRESS: at sentenc

Computing time for training the model: 35.991223096847534 seconds
Number of words processed per second: 666782.2856540268
Most frequent words in model: ['he', 'his', 'new', 'like', 'bush', 'her', 'she', 'people', 'time', 'him']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/slate/2015-2019.model


2024-05-01 03:50:46,830 : INFO : capital-common-countries: 35.7% (150/420)
2024-05-01 03:50:47,664 : INFO : capital-world: 35.0% (236/674)
2024-05-01 03:50:47,743 : INFO : currency: 14.7% (10/68)
2024-05-01 03:50:48,737 : INFO : city-in-state: 42.2% (457/1083)
2024-05-01 03:50:49,093 : INFO : family: 55.2% (232/420)
2024-05-01 03:50:49,930 : INFO : gram1-adjective-to-adverb: 11.1% (97/870)
2024-05-01 03:50:50,513 : INFO : gram2-opposite: 9.7% (68/702)
2024-05-01 03:50:51,671 : INFO : gram3-comparative: 48.3% (644/1332)
2024-05-01 03:50:52,384 : INFO : gram4-superlative: 13.6% (118/870)
2024-05-01 03:50:52,928 : INFO : gram5-present-participle: 48.5% (315/650)
2024-05-01 03:50:53,883 : INFO : gram6-nationality-adjective: 75.0% (871/1161)
2024-05-01 03:50:55,096 : INFO : gram7-past-tense: 44.7% (663/1482)
2024-05-01 03:50:55,972 : INFO : gram8-plural: 70.7% (747/1056)
2024-05-01 03:50:56,417 : INFO : gram9-plural-verbs: 29.0% (122/420)
2024-05-01 03:50:56,421 : INFO : Quadruplets with ou

Processing interval 2000-2004


2024-05-01 03:50:57,208 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T03:50:57.208425', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 03:50:57,208 : INFO : collecting all words and their counts
2024-05-01 03:50:57,208 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training new model


2024-05-01 03:50:57,467 : INFO : PROGRESS: at sentence #10000, processed 3334477 words, keeping 94482 word types
2024-05-01 03:50:57,722 : INFO : PROGRESS: at sentence #20000, processed 7014507 words, keeping 137375 word types
2024-05-01 03:50:57,993 : INFO : PROGRESS: at sentence #30000, processed 10876366 words, keeping 174687 word types
2024-05-01 03:50:58,262 : INFO : PROGRESS: at sentence #40000, processed 14823390 words, keeping 208547 word types
2024-05-01 03:50:58,337 : INFO : collected 217433 word types from a corpus of 15905452 raw words and 42787 sentences
2024-05-01 03:50:58,337 : INFO : Creating a fresh vocabulary
2024-05-01 03:50:58,421 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 69322 unique words (31.88% of original 217433, drops 148111)', 'datetime': '2024-05-01T03:50:58.421333', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-05-

Computing time for training the model: 23.539870262145996 seconds
Number of words processed per second: 675681.3789911682
Most frequent words in model: ['he', 'says', 'his', 'new', 'she', 'people', 'her', 'year', 'like', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/csm/2000-2004.model


2024-05-01 03:51:21,613 : INFO : capital-common-countries: 42.2% (195/462)
2024-05-01 03:51:23,178 : INFO : capital-world: 35.7% (459/1284)
2024-05-01 03:51:23,304 : INFO : currency: 15.1% (16/106)
2024-05-01 03:51:25,391 : INFO : city-in-state: 10.9% (206/1882)
2024-05-01 03:51:25,821 : INFO : family: 40.5% (154/380)
2024-05-01 03:51:26,538 : INFO : gram1-adjective-to-adverb: 3.6% (29/812)
2024-05-01 03:51:27,107 : INFO : gram2-opposite: 1.1% (7/650)
2024-05-01 03:51:28,132 : INFO : gram3-comparative: 26.1% (348/1332)
2024-05-01 03:51:28,954 : INFO : gram4-superlative: 7.4% (69/930)
2024-05-01 03:51:29,497 : INFO : gram5-present-participle: 36.5% (237/650)
2024-05-01 03:51:30,744 : INFO : gram6-nationality-adjective: 46.9% (577/1229)
2024-05-01 03:51:32,469 : INFO : gram7-past-tense: 28.3% (442/1560)
2024-05-01 03:51:33,651 : INFO : gram8-plural: 42.8% (539/1260)
2024-05-01 03:51:34,138 : INFO : gram9-plural-verbs: 10.1% (56/552)
2024-05-01 03:51:34,139 : INFO : Quadruplets with out-o

Processing interval 2005-2009


2024-05-01 03:51:34,880 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/csm/2000-2004.model
2024-05-01 03:51:34,888 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/csm/2000-2004.model.wv.* with mmap=None
2024-05-01 03:51:34,888 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/csm/2000-2004.model.wv.vectors.npy with mmap=None
2024-05-01 03:51:34,898 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/csm/2000-2004.model.syn1neg.npy with mmap=None
2024-05-01 03:51:34,908 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:51:35,060 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/csm/2000-2004.model', 'datetime': '2024-05-01T03:51:35.060266', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:51:35,060 : INFO : collecting all words and their count

Loading model from /Users/eddyji/GitRepos/01_TAD/models/csm/2000-2004.model


2024-05-01 03:51:35,354 : INFO : PROGRESS: at sentence #10000, processed 3752865 words, keeping 102266 word types
2024-05-01 03:51:35,618 : INFO : PROGRESS: at sentence #20000, processed 7662011 words, keeping 148578 word types
2024-05-01 03:51:35,864 : INFO : PROGRESS: at sentence #30000, processed 11273086 words, keeping 182784 word types
2024-05-01 03:51:36,084 : INFO : PROGRESS: at sentence #40000, processed 14571885 words, keeping 213150 word types
2024-05-01 03:51:36,180 : INFO : collected 225287 word types from a corpus of 16034390 raw words and 44535 sentences
2024-05-01 03:51:36,180 : INFO : Updating model with new vocabulary
2024-05-01 03:51:36,306 : INFO : Word2Vec lifecycle event {'msg': 'added 15987 new unique words (7.10% of original 225287) and increased the count of 53798 pre-existing words (23.88% of original 225287)', 'datetime': '2024-05-01T03:51:36.306745', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1

Computing time for training the model: 24.34507727622986 seconds
Number of words processed per second: 658629.6612685522
Most frequent words in model: ['he', 'says', 'his', 'new', 'she', 'people', 'her', 'year', 'like', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/csm/2005-2009.model


2024-05-01 03:52:00,000 : INFO : capital-common-countries: 51.5% (238/462)
2024-05-01 03:52:01,283 : INFO : capital-world: 45.8% (588/1284)
2024-05-01 03:52:01,421 : INFO : currency: 25.5% (27/106)
2024-05-01 03:52:03,145 : INFO : city-in-state: 20.1% (379/1882)
2024-05-01 03:52:03,482 : INFO : family: 56.8% (216/380)
2024-05-01 03:52:04,194 : INFO : gram1-adjective-to-adverb: 4.4% (36/812)
2024-05-01 03:52:04,675 : INFO : gram2-opposite: 5.2% (34/650)
2024-05-01 03:52:05,667 : INFO : gram3-comparative: 33.1% (441/1332)
2024-05-01 03:52:06,485 : INFO : gram4-superlative: 12.0% (112/930)
2024-05-01 03:52:06,950 : INFO : gram5-present-participle: 41.5% (270/650)
2024-05-01 03:52:07,951 : INFO : gram6-nationality-adjective: 60.1% (739/1229)
2024-05-01 03:52:09,075 : INFO : gram7-past-tense: 35.2% (549/1560)
2024-05-01 03:52:10,008 : INFO : gram8-plural: 59.5% (750/1260)
2024-05-01 03:52:10,458 : INFO : gram9-plural-verbs: 18.1% (100/552)
2024-05-01 03:52:10,460 : INFO : Quadruplets with o

Processing interval 2010-2014


2024-05-01 03:52:11,807 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/csm/2005-2009.model
2024-05-01 03:52:11,815 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/csm/2005-2009.model.wv.* with mmap=None
2024-05-01 03:52:11,815 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/csm/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 03:52:11,826 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/csm/2005-2009.model.syn1neg.npy with mmap=None
2024-05-01 03:52:11,838 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/csm/2005-2009.model


2024-05-01 03:52:12,026 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/csm/2005-2009.model', 'datetime': '2024-05-01T03:52:12.026448', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:52:12,026 : INFO : collecting all words and their counts
2024-05-01 03:52:12,026 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:52:12,283 : INFO : PROGRESS: at sentence #10000, processed 3273025 words, keeping 89866 word types
2024-05-01 03:52:12,495 : INFO : PROGRESS: at sentence #20000, processed 6483386 words, keeping 126960 word types
2024-05-01 03:52:12,712 : INFO : PROGRESS: at sentence #30000, processed 9826095 words, keeping 159726 word types
2024-05-01 03:52:12,935 : INFO : PROGRESS: at sentence #40000, processed 13213559 words, keeping 188079 word types
2024-05-01 03:52:13,165 : INFO : PROGRESS: at sentence #

Computing time for training the model: 46.823031187057495 seconds
Number of words processed per second: 671296.9067386705
Most frequent words in model: ['he', 'says', 'his', 'new', 'she', 'people', 'her', 'year', 'like', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/csm/2010-2014.model


2024-05-01 03:52:59,312 : INFO : capital-common-countries: 60.8% (281/462)
2024-05-01 03:53:00,639 : INFO : capital-world: 55.8% (717/1284)
2024-05-01 03:53:00,740 : INFO : currency: 21.7% (23/106)
2024-05-01 03:53:02,059 : INFO : city-in-state: 44.3% (833/1882)
2024-05-01 03:53:02,343 : INFO : family: 61.6% (234/380)
2024-05-01 03:53:02,876 : INFO : gram1-adjective-to-adverb: 10.1% (82/812)
2024-05-01 03:53:03,291 : INFO : gram2-opposite: 13.7% (89/650)
2024-05-01 03:53:04,284 : INFO : gram3-comparative: 42.3% (563/1332)
2024-05-01 03:53:04,970 : INFO : gram4-superlative: 21.5% (200/930)
2024-05-01 03:53:05,401 : INFO : gram5-present-participle: 48.3% (314/650)
2024-05-01 03:53:06,318 : INFO : gram6-nationality-adjective: 81.3% (999/1229)
2024-05-01 03:53:07,350 : INFO : gram7-past-tense: 44.6% (696/1560)
2024-05-01 03:53:08,314 : INFO : gram8-plural: 67.3% (848/1260)
2024-05-01 03:53:08,673 : INFO : gram9-plural-verbs: 23.6% (130/552)
2024-05-01 03:53:08,686 : INFO : Quadruplets with

Processing interval 2015-2019


2024-05-01 03:53:14,385 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/csm/2010-2014.model
2024-05-01 03:53:14,397 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/csm/2010-2014.model.wv.* with mmap=None
2024-05-01 03:53:14,397 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/csm/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 03:53:14,413 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/csm/2010-2014.model.syn1neg.npy with mmap=None
2024-05-01 03:53:14,429 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/csm/2010-2014.model


2024-05-01 03:53:14,672 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/csm/2010-2014.model', 'datetime': '2024-05-01T03:53:14.672451', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:53:14,672 : INFO : collecting all words and their counts
2024-05-01 03:53:14,672 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:53:14,945 : INFO : PROGRESS: at sentence #10000, processed 3480364 words, keeping 93714 word types
2024-05-01 03:53:15,177 : INFO : PROGRESS: at sentence #20000, processed 6956765 words, keeping 133309 word types
2024-05-01 03:53:15,422 : INFO : PROGRESS: at sentence #30000, processed 10632517 words, keeping 167382 word types
2024-05-01 03:53:15,697 : INFO : PROGRESS: at sentence #40000, processed 14717286 words, keeping 198762 word types
2024-05-01 03:53:15,980 : INFO : collected 229325 word 

Computing time for training the model: 28.048979997634888 seconds
Number of words processed per second: 666969.2802225769
Most frequent words in model: ['he', 'says', 'his', 'new', 'she', 'people', 'her', 'year', 'like', 'time']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/csm/2015-2019.model


2024-05-01 03:53:43,258 : INFO : capital-common-countries: 62.3% (288/462)
2024-05-01 03:53:44,132 : INFO : capital-world: 54.0% (693/1284)
2024-05-01 03:53:44,255 : INFO : currency: 22.6% (24/106)
2024-05-01 03:53:45,539 : INFO : city-in-state: 52.9% (996/1882)
2024-05-01 03:53:45,845 : INFO : family: 63.2% (240/380)
2024-05-01 03:53:46,397 : INFO : gram1-adjective-to-adverb: 12.2% (99/812)
2024-05-01 03:53:46,921 : INFO : gram2-opposite: 12.8% (83/650)
2024-05-01 03:53:47,875 : INFO : gram3-comparative: 40.2% (535/1332)
2024-05-01 03:53:48,525 : INFO : gram4-superlative: 17.6% (164/930)
2024-05-01 03:53:49,021 : INFO : gram5-present-participle: 46.5% (302/650)
2024-05-01 03:53:49,915 : INFO : gram6-nationality-adjective: 85.8% (1055/1229)
2024-05-01 03:53:50,945 : INFO : gram7-past-tense: 40.3% (629/1560)
2024-05-01 03:53:52,299 : INFO : gram8-plural: 69.3% (873/1260)
2024-05-01 03:53:52,822 : INFO : gram9-plural-verbs: 27.0% (149/552)
2024-05-01 03:53:52,823 : INFO : Quadruplets wit

Processing interval 2000-2004
Training new model
Computing time for training the model: 0.07237982749938965 seconds
Number of words processed per second: 82.89602513974762
Most frequent words in model: [',']


2024-05-01 03:53:53,063 : INFO : Word2Vec lifecycle event {'fname_or_handle': '/Users/eddyji/GitRepos/01_TAD/models/thehill/2000-2004.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-05-01T03:53:53.063270', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'saving'}
2024-05-01 03:53:53,064 : INFO : not storing attribute cum_table
2024-05-01 03:53:53,066 : INFO : saved /Users/eddyji/GitRepos/01_TAD/models/thehill/2000-2004.model
2024-05-01 03:53:53,066 : INFO : Skipping line #2 with OOV words: love	sex	6.77
2024-05-01 03:53:53,067 : INFO : Skipping line #3 with OOV words: tiger	cat	7.35
2024-05-01 03:53:53,068 : INFO : Skipping line #4 with OOV words: tiger	tiger	10.00
2024-05-01 03:53:53,069 : INFO : Skipping line #5 with OOV words: book	paper	7.46
2024-05-01 03:53:53,070 : INFO : Skipping line #6 with OOV words: computer	keyboard	7.62
2024-05-01 03:53:53

Model saved to /Users/eddyji/GitRepos/01_TAD/models/thehill/2000-2004.model
Processing interval 2005-2009


2024-05-01 03:53:53,449 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/thehill/2000-2004.model
2024-05-01 03:53:53,449 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/thehill/2000-2004.model.wv.* with mmap=None
2024-05-01 03:53:53,450 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:53:53,450 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/thehill/2000-2004.model', 'datetime': '2024-05-01T03:53:53.450427', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:53:53,450 : INFO : collecting all words and their counts
2024-05-01 03:53:53,450 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:53:53,567 : INFO : PROGRESS: at sentence #10000, processed 1627793 words, keeping 53388 word types


Loading model from /Users/eddyji/GitRepos/01_TAD/models/thehill/2000-2004.model


2024-05-01 03:53:53,706 : INFO : PROGRESS: at sentence #20000, processed 3694220 words, keeping 75915 word types
2024-05-01 03:53:53,764 : INFO : PROGRESS: at sentence #30000, processed 4463237 words, keeping 89962 word types
2024-05-01 03:53:53,850 : INFO : collected 103940 word types from a corpus of 5766423 raw words and 37352 sentences
2024-05-01 03:53:53,851 : INFO : Updating model with new vocabulary
2024-05-01 03:53:53,895 : INFO : Word2Vec lifecycle event {'msg': 'added 30958 new unique words (29.78% of original 103940) and increased the count of 0 pre-existing words (0.00% of original 103940)', 'datetime': '2024-05-01T03:53:53.894982', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-05-01 03:53:53,937 : INFO : deleting the raw counts dictionary of 103940 items
2024-05-01 03:53:53,938 : INFO : sample=0.0001 downsamples 691 most-common words
2024-05-01 03:53:53,939 : I

Computing time for training the model: 7.281463861465454 seconds
Number of words processed per second: 791931.8298778812
Most frequent words in model: [',', 'bastion', 'congressional', 'bent', 'southern', 'lead', 'believe', 'followed', 'county', 'california']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/thehill/2005-2009.model


2024-05-01 03:54:01,208 : INFO : capital-common-countries: 2.4% (10/420)
2024-05-01 03:54:01,816 : INFO : capital-world: 0.9% (4/424)
2024-05-01 03:54:01,848 : INFO : currency: 0.0% (0/28)
2024-05-01 03:54:03,525 : INFO : city-in-state: 5.7% (104/1827)
2024-05-01 03:54:03,875 : INFO : family: 12.0% (41/342)
2024-05-01 03:54:04,586 : INFO : gram1-adjective-to-adverb: 0.7% (5/756)
2024-05-01 03:54:05,198 : INFO : gram2-opposite: 0.8% (5/650)
2024-05-01 03:54:06,420 : INFO : gram3-comparative: 2.0% (26/1332)
2024-05-01 03:54:07,196 : INFO : gram4-superlative: 0.9% (7/812)
2024-05-01 03:54:07,844 : INFO : gram5-present-participle: 10.8% (70/650)
2024-05-01 03:54:08,686 : INFO : gram6-nationality-adjective: 12.9% (125/969)
2024-05-01 03:54:09,961 : INFO : gram7-past-tense: 2.0% (31/1560)
2024-05-01 03:54:10,858 : INFO : gram8-plural: 8.2% (87/1056)
2024-05-01 03:54:11,459 : INFO : gram9-plural-verbs: 2.4% (12/506)
2024-05-01 03:54:11,460 : INFO : Quadruplets with out-of-vocabulary words: 42

Processing interval 2010-2014


2024-05-01 03:54:12,875 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/thehill/2005-2009.model
2024-05-01 03:54:12,893 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/thehill/2005-2009.model.wv.* with mmap=None
2024-05-01 03:54:12,893 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:54:12,962 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/thehill/2005-2009.model', 'datetime': '2024-05-01T03:54:12.962272', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:54:12,962 : INFO : collecting all words and their counts
2024-05-01 03:54:12,962 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:54:13,072 : INFO : PROGRESS: at sentence #10000, processed 1610306 words, keeping 49214 word types


Loading model from /Users/eddyji/GitRepos/01_TAD/models/thehill/2005-2009.model


2024-05-01 03:54:13,191 : INFO : PROGRESS: at sentence #20000, processed 3450123 words, keeping 72901 word types
2024-05-01 03:54:13,335 : INFO : PROGRESS: at sentence #30000, processed 5299713 words, keeping 91593 word types
2024-05-01 03:54:13,458 : INFO : PROGRESS: at sentence #40000, processed 7240890 words, keeping 108370 word types
2024-05-01 03:54:13,585 : INFO : PROGRESS: at sentence #50000, processed 9260163 words, keeping 124277 word types
2024-05-01 03:54:13,705 : INFO : PROGRESS: at sentence #60000, processed 11173587 words, keeping 139060 word types
2024-05-01 03:54:13,829 : INFO : PROGRESS: at sentence #70000, processed 13120605 words, keeping 153418 word types
2024-05-01 03:54:13,964 : INFO : PROGRESS: at sentence #80000, processed 15205842 words, keeping 168362 word types
2024-05-01 03:54:14,096 : INFO : PROGRESS: at sentence #90000, processed 17272153 words, keeping 182968 word types
2024-05-01 03:54:14,239 : INFO : PROGRESS: at sentence #100000, processed 19510368 wor

Computing time for training the model: 40.84404683113098 seconds
Number of words processed per second: 788420.7980942694
Most frequent words in model: [',', 'bastion', 'congressional', 'bent', 'southern', 'lead', 'believe', 'followed', 'county', 'california']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/thehill/2010-2014.model


2024-05-01 03:54:54,255 : INFO : capital-common-countries: 22.1% (93/420)
2024-05-01 03:54:54,763 : INFO : capital-world: 17.2% (73/424)
2024-05-01 03:54:54,781 : INFO : currency: 17.9% (5/28)
2024-05-01 03:54:55,958 : INFO : city-in-state: 36.2% (662/1827)
2024-05-01 03:54:56,173 : INFO : family: 27.2% (93/342)
2024-05-01 03:54:56,702 : INFO : gram1-adjective-to-adverb: 3.4% (26/756)
2024-05-01 03:54:57,204 : INFO : gram2-opposite: 7.7% (50/650)
2024-05-01 03:54:58,053 : INFO : gram3-comparative: 18.2% (243/1332)
2024-05-01 03:54:58,585 : INFO : gram4-superlative: 8.0% (65/812)
2024-05-01 03:54:59,064 : INFO : gram5-present-participle: 32.8% (213/650)
2024-05-01 03:54:59,763 : INFO : gram6-nationality-adjective: 39.8% (386/969)
2024-05-01 03:55:00,706 : INFO : gram7-past-tense: 22.4% (349/1560)
2024-05-01 03:55:01,577 : INFO : gram8-plural: 25.4% (268/1056)
2024-05-01 03:55:01,911 : INFO : gram9-plural-verbs: 13.4% (68/506)
2024-05-01 03:55:01,918 : INFO : Quadruplets with out-of-voca

Processing interval 2015-2019


2024-05-01 03:55:05,010 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/thehill/2010-2014.model
2024-05-01 03:55:05,018 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/thehill/2010-2014.model.wv.* with mmap=None
2024-05-01 03:55:05,018 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/thehill/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 03:55:05,030 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/thehill/2010-2014.model.syn1neg.npy with mmap=None
2024-05-01 03:55:05,041 : INFO : setting ignored attribute cum_table to None
2024-05-01 03:55:05,164 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/thehill/2010-2014.model', 'datetime': '2024-05-01T03:55:05.164875', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:55:05,165 : INFO : collecting all w

Loading model from /Users/eddyji/GitRepos/01_TAD/models/thehill/2010-2014.model


2024-05-01 03:55:05,362 : INFO : PROGRESS: at sentence #10000, processed 2569004 words, keeping 55758 word types
2024-05-01 03:55:05,573 : INFO : PROGRESS: at sentence #20000, processed 5256548 words, keeping 79700 word types
2024-05-01 03:55:05,744 : INFO : PROGRESS: at sentence #30000, processed 7871479 words, keeping 99932 word types
2024-05-01 03:55:05,905 : INFO : PROGRESS: at sentence #40000, processed 10388763 words, keeping 118119 word types
2024-05-01 03:55:06,064 : INFO : PROGRESS: at sentence #50000, processed 12864927 words, keeping 134867 word types
2024-05-01 03:55:06,231 : INFO : PROGRESS: at sentence #60000, processed 15474375 words, keeping 151893 word types
2024-05-01 03:55:06,401 : INFO : PROGRESS: at sentence #70000, processed 18139986 words, keeping 170018 word types
2024-05-01 03:55:06,578 : INFO : PROGRESS: at sentence #80000, processed 20897397 words, keeping 187275 word types
2024-05-01 03:55:06,759 : INFO : PROGRESS: at sentence #90000, processed 23807565 word

Computing time for training the model: 83.61921668052673 seconds
Number of words processed per second: 781116.71686122
Most frequent words in model: [',', 'bastion', 'congressional', 'bent', 'southern', 'lead', 'believe', 'followed', 'county', 'california']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/thehill/2015-2019.model


2024-05-01 03:56:29,239 : INFO : capital-common-countries: 40.0% (168/420)
2024-05-01 03:56:29,993 : INFO : capital-world: 31.4% (133/424)
2024-05-01 03:56:30,029 : INFO : currency: 17.9% (5/28)
2024-05-01 03:56:31,337 : INFO : city-in-state: 52.4% (958/1827)
2024-05-01 03:56:31,555 : INFO : family: 43.3% (148/342)
2024-05-01 03:56:32,150 : INFO : gram1-adjective-to-adverb: 8.7% (66/756)
2024-05-01 03:56:32,678 : INFO : gram2-opposite: 13.1% (85/650)
2024-05-01 03:56:33,743 : INFO : gram3-comparative: 21.9% (292/1332)
2024-05-01 03:56:34,423 : INFO : gram4-superlative: 10.5% (85/812)
2024-05-01 03:56:34,888 : INFO : gram5-present-participle: 34.0% (221/650)
2024-05-01 03:56:35,595 : INFO : gram6-nationality-adjective: 68.8% (667/969)
2024-05-01 03:56:36,706 : INFO : gram7-past-tense: 31.9% (498/1560)
2024-05-01 03:56:37,443 : INFO : gram8-plural: 34.5% (364/1056)
2024-05-01 03:56:37,871 : INFO : gram9-plural-verbs: 27.9% (141/506)
2024-05-01 03:56:37,872 : INFO : Quadruplets with out-o

Processing interval 2000-2004


2024-05-01 03:56:42,763 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T03:56:42.763374', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 03:56:42,763 : INFO : collecting all words and their counts
2024-05-01 03:56:42,764 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training new model


2024-05-01 03:56:43,073 : INFO : PROGRESS: at sentence #10000, processed 4320886 words, keeping 89087 word types
2024-05-01 03:56:43,371 : INFO : PROGRESS: at sentence #20000, processed 8455895 words, keeping 127128 word types
2024-05-01 03:56:43,639 : INFO : PROGRESS: at sentence #30000, processed 12325169 words, keeping 158978 word types
2024-05-01 03:56:43,900 : INFO : PROGRESS: at sentence #40000, processed 16186985 words, keeping 189994 word types
2024-05-01 03:56:44,174 : INFO : PROGRESS: at sentence #50000, processed 20122601 words, keeping 217788 word types
2024-05-01 03:56:44,427 : INFO : PROGRESS: at sentence #60000, processed 23910232 words, keeping 243041 word types
2024-05-01 03:56:44,696 : INFO : PROGRESS: at sentence #70000, processed 28009835 words, keeping 268643 word types
2024-05-01 03:56:44,961 : INFO : PROGRESS: at sentence #80000, processed 32028634 words, keeping 292602 word types
2024-05-01 03:56:45,229 : INFO : PROGRESS: at sentence #90000, processed 36019153 w

Computing time for training the model: 58.888429164886475 seconds
Number of words processed per second: 707731.7665123076
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'year', 'state', 'people', 'time', 'president']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/wt/2000-2004.model


2024-05-01 03:57:42,556 : INFO : capital-common-countries: 47.0% (217/462)
2024-05-01 03:57:43,976 : INFO : capital-world: 44.7% (610/1366)
2024-05-01 03:57:44,069 : INFO : currency: 10.5% (9/86)
2024-05-01 03:57:45,039 : INFO : city-in-state: 23.0% (432/1881)
2024-05-01 03:57:45,305 : INFO : family: 47.9% (182/380)
2024-05-01 03:57:45,930 : INFO : gram1-adjective-to-adverb: 5.2% (42/812)
2024-05-01 03:57:46,367 : INFO : gram2-opposite: 9.3% (56/600)
2024-05-01 03:57:47,316 : INFO : gram3-comparative: 34.6% (461/1332)
2024-05-01 03:57:47,827 : INFO : gram4-superlative: 16.0% (139/870)
2024-05-01 03:57:48,336 : INFO : gram5-present-participle: 33.2% (233/702)
2024-05-01 03:57:49,143 : INFO : gram6-nationality-adjective: 71.0% (824/1161)
2024-05-01 03:57:50,206 : INFO : gram7-past-tense: 39.2% (611/1560)
2024-05-01 03:57:50,904 : INFO : gram8-plural: 48.3% (542/1122)
2024-05-01 03:57:51,238 : INFO : gram9-plural-verbs: 25.1% (116/462)
2024-05-01 03:57:51,239 : INFO : Quadruplets with out

Processing interval 2005-2009


2024-05-01 03:57:53,642 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/wt/2000-2004.model
2024-05-01 03:57:53,652 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/wt/2000-2004.model.wv.* with mmap=None
2024-05-01 03:57:53,652 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/wt/2000-2004.model.wv.vectors.npy with mmap=None
2024-05-01 03:57:53,672 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/wt/2000-2004.model.syn1neg.npy with mmap=None
2024-05-01 03:57:53,692 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/wt/2000-2004.model


2024-05-01 03:57:53,899 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/wt/2000-2004.model', 'datetime': '2024-05-01T03:57:53.899539', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:57:53,899 : INFO : collecting all words and their counts
2024-05-01 03:57:53,900 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:57:54,197 : INFO : PROGRESS: at sentence #10000, processed 3859677 words, keeping 93904 word types
2024-05-01 03:57:54,449 : INFO : PROGRESS: at sentence #20000, processed 7679033 words, keeping 133082 word types
2024-05-01 03:57:54,697 : INFO : PROGRESS: at sentence #30000, processed 11470874 words, keeping 164860 word types
2024-05-01 03:57:54,946 : INFO : PROGRESS: at sentence #40000, processed 15237018 words, keeping 193005 word types
2024-05-01 03:57:55,205 : INFO : PROGRESS: at sentence #

Computing time for training the model: 80.06557893753052 seconds
Number of words processed per second: 693285.7756928131
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'year', 'state', 'people', 'time', 'president']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/wt/2005-2009.model


2024-05-01 03:59:14,437 : INFO : capital-common-countries: 60.2% (278/462)
2024-05-01 03:59:15,590 : INFO : capital-world: 52.0% (711/1366)
2024-05-01 03:59:15,680 : INFO : currency: 12.8% (11/86)
2024-05-01 03:59:16,684 : INFO : city-in-state: 42.6% (802/1881)
2024-05-01 03:59:16,969 : INFO : family: 59.7% (227/380)
2024-05-01 03:59:17,577 : INFO : gram1-adjective-to-adverb: 11.9% (97/812)
2024-05-01 03:59:18,029 : INFO : gram2-opposite: 11.5% (69/600)
2024-05-01 03:59:19,018 : INFO : gram3-comparative: 52.8% (703/1332)
2024-05-01 03:59:19,569 : INFO : gram4-superlative: 15.2% (132/870)
2024-05-01 03:59:20,086 : INFO : gram5-present-participle: 41.7% (293/702)
2024-05-01 03:59:20,927 : INFO : gram6-nationality-adjective: 80.3% (932/1161)
2024-05-01 03:59:21,876 : INFO : gram7-past-tense: 48.7% (760/1560)
2024-05-01 03:59:22,599 : INFO : gram8-plural: 60.4% (678/1122)
2024-05-01 03:59:22,960 : INFO : gram9-plural-verbs: 31.4% (145/462)
2024-05-01 03:59:22,961 : INFO : Quadruplets with 

Processing interval 2010-2014


2024-05-01 03:59:31,598 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/wt/2005-2009.model
2024-05-01 03:59:31,612 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/wt/2005-2009.model.wv.* with mmap=None
2024-05-01 03:59:31,612 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/wt/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 03:59:31,639 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/wt/2005-2009.model.syn1neg.npy with mmap=None
2024-05-01 03:59:31,667 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/wt/2005-2009.model


2024-05-01 03:59:31,952 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/wt/2005-2009.model', 'datetime': '2024-05-01T03:59:31.952732', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 03:59:31,953 : INFO : collecting all words and their counts
2024-05-01 03:59:31,953 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 03:59:32,242 : INFO : PROGRESS: at sentence #10000, processed 3769150 words, keeping 82625 word types
2024-05-01 03:59:32,438 : INFO : PROGRESS: at sentence #20000, processed 6808785 words, keeping 113452 word types
2024-05-01 03:59:32,641 : INFO : PROGRESS: at sentence #30000, processed 9941953 words, keeping 139294 word types
2024-05-01 03:59:32,839 : INFO : PROGRESS: at sentence #40000, processed 13046651 words, keeping 162591 word types
2024-05-01 03:59:33,047 : INFO : PROGRESS: at sentence #5

Computing time for training the model: 164.61941981315613 seconds
Number of words processed per second: 703976.2509886965
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'year', 'state', 'people', 'time', 'president']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/wt/2010-2014.model


2024-05-01 04:02:17,037 : INFO : capital-common-countries: 67.7% (313/462)
2024-05-01 04:02:18,190 : INFO : capital-world: 57.0% (778/1366)
2024-05-01 04:02:18,267 : INFO : currency: 4.7% (4/86)
2024-05-01 04:02:19,682 : INFO : city-in-state: 59.5% (1119/1881)
2024-05-01 04:02:20,008 : INFO : family: 53.7% (204/380)
2024-05-01 04:02:20,596 : INFO : gram1-adjective-to-adverb: 15.5% (126/812)
2024-05-01 04:02:21,111 : INFO : gram2-opposite: 14.2% (85/600)
2024-05-01 04:02:22,007 : INFO : gram3-comparative: 45.5% (606/1332)
2024-05-01 04:02:22,650 : INFO : gram4-superlative: 17.2% (150/870)
2024-05-01 04:02:23,172 : INFO : gram5-present-participle: 40.5% (284/702)
2024-05-01 04:02:24,139 : INFO : gram6-nationality-adjective: 85.2% (989/1161)
2024-05-01 04:02:25,303 : INFO : gram7-past-tense: 50.4% (786/1560)
2024-05-01 04:02:26,230 : INFO : gram8-plural: 63.6% (714/1122)
2024-05-01 04:02:26,585 : INFO : gram9-plural-verbs: 30.7% (142/462)
2024-05-01 04:02:26,588 : INFO : Quadruplets with 

Processing interval 2015-2019


2024-05-01 04:03:18,022 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/wt/2010-2014.model
2024-05-01 04:03:18,055 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/wt/2010-2014.model.wv.* with mmap=None
2024-05-01 04:03:18,055 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/wt/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 04:03:18,146 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/wt/2010-2014.model.syn1neg.npy with mmap=None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/wt/2010-2014.model


2024-05-01 04:03:18,238 : INFO : setting ignored attribute cum_table to None
2024-05-01 04:03:18,661 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/wt/2010-2014.model', 'datetime': '2024-05-01T04:03:18.661055', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 04:03:18,661 : INFO : collecting all words and their counts
2024-05-01 04:03:18,661 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:03:18,821 : INFO : PROGRESS: at sentence #10000, processed 1724782 words, keeping 68026 word types
2024-05-01 04:03:18,980 : INFO : PROGRESS: at sentence #20000, processed 3473402 words, keeping 98148 word types
2024-05-01 04:03:19,140 : INFO : PROGRESS: at sentence #30000, processed 5173920 words, keeping 122282 word types
2024-05-01 04:03:19,312 : INFO : PROGRESS: at sentence #40000, processed 6934304 words, keeping 

Computing time for training the model: 372.5968370437622 seconds
Number of words processed per second: 699591.7090122381
Most frequent words in model: ['said', 'he', 'mr', 'his', 'new', 'year', 'state', 'people', 'time', 'president']


2024-05-01 04:09:31,490 : INFO : Word2Vec lifecycle event {'fname_or_handle': '/Users/eddyji/GitRepos/01_TAD/models/wt/2015-2019.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-05-01T04:09:31.490215', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'saving'}
2024-05-01 04:09:31,490 : INFO : storing np array 'vectors' to /Users/eddyji/GitRepos/01_TAD/models/wt/2015-2019.model.wv.vectors.npy
2024-05-01 04:09:31,558 : INFO : storing np array 'syn1neg' to /Users/eddyji/GitRepos/01_TAD/models/wt/2015-2019.model.syn1neg.npy
2024-05-01 04:09:31,597 : INFO : not storing attribute cum_table
2024-05-01 04:09:31,666 : INFO : saved /Users/eddyji/GitRepos/01_TAD/models/wt/2015-2019.model
2024-05-01 04:09:31,681 : INFO : Skipping line #42 with OOV words: fuck	sex	9.44
2024-05-01 04:09:31,681 : INFO : Skipping line #56 with OOV words: physics	proton	8.12
2024-05-01 0

Model saved to /Users/eddyji/GitRepos/01_TAD/models/wt/2015-2019.model


2024-05-01 04:09:32,224 : INFO : capital-common-countries: 63.2% (292/462)
2024-05-01 04:09:33,842 : INFO : capital-world: 61.2% (836/1366)
2024-05-01 04:09:33,932 : INFO : currency: 4.7% (4/86)
2024-05-01 04:09:35,264 : INFO : city-in-state: 68.8% (1295/1881)
2024-05-01 04:09:35,611 : INFO : family: 64.5% (245/380)
2024-05-01 04:09:36,212 : INFO : gram1-adjective-to-adverb: 21.8% (177/812)
2024-05-01 04:09:36,675 : INFO : gram2-opposite: 17.5% (105/600)
2024-05-01 04:09:37,648 : INFO : gram3-comparative: 42.9% (572/1332)
2024-05-01 04:09:38,242 : INFO : gram4-superlative: 19.9% (173/870)
2024-05-01 04:09:38,784 : INFO : gram5-present-participle: 42.9% (301/702)
2024-05-01 04:09:39,829 : INFO : gram6-nationality-adjective: 83.8% (973/1161)
2024-05-01 04:09:41,222 : INFO : gram7-past-tense: 49.8% (777/1560)
2024-05-01 04:09:42,181 : INFO : gram8-plural: 64.3% (722/1122)
2024-05-01 04:09:42,573 : INFO : gram9-plural-verbs: 32.0% (148/462)
2024-05-01 04:09:42,575 : INFO : Quadruplets with

Processing interval 2000-2004


2024-05-01 04:09:45,772 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T04:09:45.772972', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 04:09:45,773 : INFO : collecting all words and their counts
2024-05-01 04:09:45,773 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:09:45,952 : INFO : PROGRESS: at sentence #10000, processed 2576363 words, keeping 74561 word types


Training new model


2024-05-01 04:09:46,152 : INFO : PROGRESS: at sentence #20000, processed 5191462 words, keeping 106105 word types
2024-05-01 04:09:46,358 : INFO : PROGRESS: at sentence #30000, processed 7862838 words, keeping 131817 word types
2024-05-01 04:09:46,531 : INFO : PROGRESS: at sentence #40000, processed 10392251 words, keeping 155151 word types
2024-05-01 04:09:46,679 : INFO : PROGRESS: at sentence #50000, processed 12539072 words, keeping 175320 word types
2024-05-01 04:09:46,826 : INFO : PROGRESS: at sentence #60000, processed 14697266 words, keeping 194343 word types
2024-05-01 04:09:46,973 : INFO : PROGRESS: at sentence #70000, processed 16815762 words, keeping 212429 word types
2024-05-01 04:09:47,117 : INFO : PROGRESS: at sentence #80000, processed 18920452 words, keeping 230143 word types
2024-05-01 04:09:47,266 : INFO : PROGRESS: at sentence #90000, processed 21038739 words, keeping 247415 word types
2024-05-01 04:09:47,411 : INFO : PROGRESS: at sentence #100000, processed 23207117

Computing time for training the model: 47.307759046554565 seconds
Number of words processed per second: 702097.344482417
Most frequent words in model: ['he', 'said', 'his', 'year', 'new', 'share', 'whatsappemailcopy', 'him', 'facebooktwitter', 'flipboard']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyp/2000-2004.model


2024-05-01 04:10:33,973 : INFO : capital-common-countries: 31.6% (120/380)
2024-05-01 04:10:34,487 : INFO : capital-world: 17.9% (80/448)
2024-05-01 04:10:34,669 : INFO : currency: 7.4% (4/54)
2024-05-01 04:10:36,150 : INFO : city-in-state: 5.7% (99/1746)
2024-05-01 04:10:36,451 : INFO : family: 41.4% (174/420)
2024-05-01 04:10:37,008 : INFO : gram1-adjective-to-adverb: 1.7% (14/812)
2024-05-01 04:10:37,514 : INFO : gram2-opposite: 2.7% (16/600)
2024-05-01 04:10:38,485 : INFO : gram3-comparative: 42.5% (566/1332)
2024-05-01 04:10:39,161 : INFO : gram4-superlative: 11.1% (103/930)
2024-05-01 04:10:39,692 : INFO : gram5-present-participle: 31.4% (204/650)
2024-05-01 04:10:40,318 : INFO : gram6-nationality-adjective: 46.2% (419/906)
2024-05-01 04:10:41,408 : INFO : gram7-past-tense: 33.5% (497/1482)
2024-05-01 04:10:42,106 : INFO : gram8-plural: 39.8% (395/992)
2024-05-01 04:10:42,467 : INFO : gram9-plural-verbs: 31.7% (133/420)
2024-05-01 04:10:42,468 : INFO : Quadruplets with out-of-voc

Processing interval 2005-2009


2024-05-01 04:10:49,618 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/nyp/2000-2004.model
2024-05-01 04:10:49,626 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/nyp/2000-2004.model.wv.* with mmap=None
2024-05-01 04:10:49,626 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/nyp/2000-2004.model.wv.vectors.npy with mmap=None
2024-05-01 04:10:49,641 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/nyp/2000-2004.model.syn1neg.npy with mmap=None
2024-05-01 04:10:49,653 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/nyp/2000-2004.model


2024-05-01 04:10:49,831 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/nyp/2000-2004.model', 'datetime': '2024-05-01T04:10:49.831516', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 04:10:49,831 : INFO : collecting all words and their counts
2024-05-01 04:10:49,831 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:10:50,018 : INFO : PROGRESS: at sentence #10000, processed 2233861 words, keeping 72480 word types
2024-05-01 04:10:50,173 : INFO : PROGRESS: at sentence #20000, processed 4457658 words, keeping 102903 word types
2024-05-01 04:10:50,324 : INFO : PROGRESS: at sentence #30000, processed 6710857 words, keeping 127719 word types
2024-05-01 04:10:50,471 : INFO : PROGRESS: at sentence #40000, processed 8926480 words, keeping 150409 word types
2024-05-01 04:10:50,617 : INFO : PROGRESS: at sentence #5

Computing time for training the model: 57.782593965530396 seconds
Number of words processed per second: 685927.098109227
Most frequent words in model: ['he', 'said', 'his', 'year', 'new', 'share', 'whatsappemailcopy', 'him', 'facebooktwitter', 'flipboard']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyp/2005-2009.model


2024-05-01 04:11:48,066 : INFO : capital-common-countries: 35.0% (133/380)
2024-05-01 04:11:48,446 : INFO : capital-world: 29.9% (134/448)
2024-05-01 04:11:48,494 : INFO : currency: 5.6% (3/54)
2024-05-01 04:11:50,180 : INFO : city-in-state: 18.0% (315/1746)
2024-05-01 04:11:50,482 : INFO : family: 47.4% (199/420)
2024-05-01 04:11:51,039 : INFO : gram1-adjective-to-adverb: 6.5% (53/812)
2024-05-01 04:11:51,495 : INFO : gram2-opposite: 6.0% (36/600)
2024-05-01 04:11:52,402 : INFO : gram3-comparative: 46.2% (616/1332)
2024-05-01 04:11:53,036 : INFO : gram4-superlative: 20.0% (186/930)
2024-05-01 04:11:53,550 : INFO : gram5-present-participle: 33.4% (217/650)
2024-05-01 04:11:54,155 : INFO : gram6-nationality-adjective: 62.5% (566/906)
2024-05-01 04:11:55,467 : INFO : gram7-past-tense: 45.7% (677/1482)
2024-05-01 04:11:56,177 : INFO : gram8-plural: 48.7% (483/992)
2024-05-01 04:11:56,491 : INFO : gram9-plural-verbs: 27.1% (114/420)
2024-05-01 04:11:56,494 : INFO : Quadruplets with out-of-

Processing interval 2010-2014


2024-05-01 04:11:59,193 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/nyp/2005-2009.model
2024-05-01 04:11:59,204 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/nyp/2005-2009.model.wv.* with mmap=None
2024-05-01 04:11:59,204 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/nyp/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 04:11:59,222 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/nyp/2005-2009.model.syn1neg.npy with mmap=None
2024-05-01 04:11:59,239 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/nyp/2005-2009.model


2024-05-01 04:11:59,480 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/nyp/2005-2009.model', 'datetime': '2024-05-01T04:11:59.480438', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 04:11:59,480 : INFO : collecting all words and their counts
2024-05-01 04:11:59,481 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:11:59,661 : INFO : PROGRESS: at sentence #10000, processed 2147319 words, keeping 75175 word types
2024-05-01 04:11:59,803 : INFO : PROGRESS: at sentence #20000, processed 4241837 words, keeping 106391 word types
2024-05-01 04:11:59,946 : INFO : PROGRESS: at sentence #30000, processed 6369256 words, keeping 132492 word types
2024-05-01 04:12:00,082 : INFO : PROGRESS: at sentence #40000, processed 8417141 words, keeping 155180 word types
2024-05-01 04:12:00,232 : INFO : PROGRESS: at sentence #5

Computing time for training the model: 84.91034078598022 seconds
Number of words processed per second: 698193.0993473119
Most frequent words in model: ['he', 'said', 'his', 'year', 'new', 'share', 'whatsappemailcopy', 'him', 'facebooktwitter', 'flipboard']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyp/2010-2014.model


2024-05-01 04:13:24,647 : INFO : capital-common-countries: 62.6% (238/380)
2024-05-01 04:13:25,030 : INFO : capital-world: 48.4% (217/448)
2024-05-01 04:13:25,218 : INFO : currency: 1.9% (1/54)
2024-05-01 04:13:26,435 : INFO : city-in-state: 27.1% (474/1746)
2024-05-01 04:13:26,714 : INFO : family: 48.3% (203/420)
2024-05-01 04:13:27,212 : INFO : gram1-adjective-to-adverb: 10.7% (87/812)
2024-05-01 04:13:27,608 : INFO : gram2-opposite: 11.7% (70/600)
2024-05-01 04:13:28,496 : INFO : gram3-comparative: 51.4% (685/1332)
2024-05-01 04:13:29,151 : INFO : gram4-superlative: 25.5% (237/930)
2024-05-01 04:13:29,579 : INFO : gram5-present-participle: 38.2% (248/650)
2024-05-01 04:13:30,271 : INFO : gram6-nationality-adjective: 76.7% (695/906)
2024-05-01 04:13:31,186 : INFO : gram7-past-tense: 48.3% (716/1482)
2024-05-01 04:13:31,817 : INFO : gram8-plural: 55.6% (552/992)
2024-05-01 04:13:32,130 : INFO : gram9-plural-verbs: 24.8% (104/420)
2024-05-01 04:13:32,137 : INFO : Quadruplets with out-o

Processing interval 2015-2019


2024-05-01 04:13:38,026 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/nyp/2010-2014.model
2024-05-01 04:13:38,042 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/nyp/2010-2014.model.wv.* with mmap=None
2024-05-01 04:13:38,042 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/nyp/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 04:13:38,083 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/nyp/2010-2014.model.syn1neg.npy with mmap=None
2024-05-01 04:13:38,115 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/nyp/2010-2014.model


2024-05-01 04:13:38,434 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/nyp/2010-2014.model', 'datetime': '2024-05-01T04:13:38.434056', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 04:13:38,434 : INFO : collecting all words and their counts
2024-05-01 04:13:38,434 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:13:38,644 : INFO : PROGRESS: at sentence #10000, processed 2569541 words, keeping 85748 word types
2024-05-01 04:13:38,818 : INFO : PROGRESS: at sentence #20000, processed 5114296 words, keeping 124004 word types
2024-05-01 04:13:38,991 : INFO : PROGRESS: at sentence #30000, processed 7610327 words, keeping 155020 word types
2024-05-01 04:13:39,175 : INFO : PROGRESS: at sentence #40000, processed 10142276 words, keeping 183205 word types
2024-05-01 04:13:39,348 : INFO : PROGRESS: at sentence #

Computing time for training the model: 92.99123096466064 seconds
Number of words processed per second: 675140.3906445655
Most frequent words in model: ['he', 'said', 'his', 'year', 'new', 'share', 'whatsappemailcopy', 'him', 'facebooktwitter', 'flipboard']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/nyp/2015-2019.model


2024-05-01 04:15:11,629 : INFO : capital-common-countries: 70.0% (266/380)
2024-05-01 04:15:12,511 : INFO : capital-world: 56.9% (255/448)
2024-05-01 04:15:12,577 : INFO : currency: 5.6% (3/54)
2024-05-01 04:15:14,045 : INFO : city-in-state: 42.8% (748/1746)
2024-05-01 04:15:14,330 : INFO : family: 54.8% (230/420)
2024-05-01 04:15:14,955 : INFO : gram1-adjective-to-adverb: 14.8% (120/812)
2024-05-01 04:15:15,374 : INFO : gram2-opposite: 9.0% (54/600)
2024-05-01 04:15:16,302 : INFO : gram3-comparative: 49.2% (656/1332)
2024-05-01 04:15:17,056 : INFO : gram4-superlative: 31.4% (292/930)
2024-05-01 04:15:17,551 : INFO : gram5-present-participle: 52.8% (343/650)
2024-05-01 04:15:18,211 : INFO : gram6-nationality-adjective: 82.6% (748/906)
2024-05-01 04:15:19,256 : INFO : gram7-past-tense: 52.2% (774/1482)
2024-05-01 04:15:20,208 : INFO : gram8-plural: 62.6% (621/992)
2024-05-01 04:15:20,500 : INFO : gram9-plural-verbs: 28.8% (121/420)
2024-05-01 04:15:20,501 : INFO : Quadruplets with out-o

Processing interval 2000-2004


2024-05-01 04:15:21,341 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2024-05-01T04:15:21.341225', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'created'}
2024-05-01 04:15:21,341 : INFO : collecting all words and their counts
2024-05-01 04:15:21,341 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:15:21,443 : INFO : PROGRESS: at sentence #10000, processed 1457655 words, keeping 48903 word types


Training new model


2024-05-01 04:15:21,557 : INFO : PROGRESS: at sentence #20000, processed 3106209 words, keeping 73134 word types
2024-05-01 04:15:21,701 : INFO : PROGRESS: at sentence #30000, processed 4831677 words, keeping 93014 word types
2024-05-01 04:15:21,819 : INFO : PROGRESS: at sentence #40000, processed 6590216 words, keeping 111129 word types
2024-05-01 04:15:21,931 : INFO : PROGRESS: at sentence #50000, processed 8251021 words, keeping 128872 word types
2024-05-01 04:15:22,035 : INFO : PROGRESS: at sentence #60000, processed 9842054 words, keeping 145435 word types
2024-05-01 04:15:22,097 : INFO : collected 153523 word types from a corpus of 10763388 raw words and 64491 sentences
2024-05-01 04:15:22,097 : INFO : Creating a fresh vocabulary
2024-05-01 04:15:22,143 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 38437 unique words (25.04% of original 153523, drops 115086)', 'datetime': '2024-05-01T04:15:22.143347', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2

Computing time for training the model: 13.119465827941895 seconds
Number of words processed per second: 820413.5855193197
Most frequent words in model: ['year', 'said', 'he', 'view', 'comments', 'updated', 'aedt', 'company', 'money', 'new']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/dailymail/2000-2004.model


2024-05-01 04:15:35,190 : INFO : capital-common-countries: 12.9% (49/380)
2024-05-01 04:15:35,546 : INFO : capital-world: 6.6% (25/379)
2024-05-01 04:15:35,703 : INFO : currency: 5.3% (8/152)
2024-05-01 04:15:36,015 : INFO : city-in-state: 0.2% (1/424)
2024-05-01 04:15:36,245 : INFO : family: 7.2% (22/306)
2024-05-01 04:15:36,799 : INFO : gram1-adjective-to-adverb: 2.6% (21/812)
2024-05-01 04:15:37,278 : INFO : gram2-opposite: 1.0% (7/702)
2024-05-01 04:15:38,134 : INFO : gram3-comparative: 11.5% (145/1260)
2024-05-01 04:15:38,575 : INFO : gram4-superlative: 3.8% (27/702)
2024-05-01 04:15:39,043 : INFO : gram5-present-participle: 5.7% (40/702)
2024-05-01 04:15:39,682 : INFO : gram6-nationality-adjective: 35.0% (297/848)
2024-05-01 04:15:40,620 : INFO : gram7-past-tense: 9.0% (134/1482)
2024-05-01 04:15:41,152 : INFO : gram8-plural: 9.9% (80/812)
2024-05-01 04:15:41,474 : INFO : gram9-plural-verbs: 5.1% (26/506)
2024-05-01 04:15:41,479 : INFO : Quadruplets with out-of-vocabulary words: 

Processing interval 2005-2009


2024-05-01 04:15:43,504 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2000-2004.model
2024-05-01 04:15:43,509 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2000-2004.model.wv.* with mmap=None
2024-05-01 04:15:43,509 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2000-2004.model.wv.vectors.npy with mmap=None
2024-05-01 04:15:43,517 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2000-2004.model.syn1neg.npy with mmap=None
2024-05-01 04:15:43,524 : INFO : setting ignored attribute cum_table to None
2024-05-01 04:15:43,608 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/dailymail/2000-2004.model', 'datetime': '2024-05-01T04:15:43.608874', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 04:15:43,609 : INFO : collec

Loading model from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2000-2004.model


2024-05-01 04:15:43,771 : INFO : PROGRESS: at sentence #10000, processed 2303251 words, keeping 59027 word types
2024-05-01 04:15:43,960 : INFO : PROGRESS: at sentence #20000, processed 4781599 words, keeping 86005 word types
2024-05-01 04:15:44,142 : INFO : PROGRESS: at sentence #30000, processed 7479504 words, keeping 125043 word types
2024-05-01 04:15:44,328 : INFO : PROGRESS: at sentence #40000, processed 10192686 words, keeping 159684 word types
2024-05-01 04:15:44,528 : INFO : PROGRESS: at sentence #50000, processed 12813638 words, keeping 186468 word types
2024-05-01 04:15:44,712 : INFO : PROGRESS: at sentence #60000, processed 15576821 words, keeping 211769 word types
2024-05-01 04:15:44,892 : INFO : PROGRESS: at sentence #70000, processed 18289690 words, keeping 234875 word types
2024-05-01 04:15:45,082 : INFO : PROGRESS: at sentence #80000, processed 21035337 words, keeping 256106 word types
2024-05-01 04:15:45,279 : INFO : PROGRESS: at sentence #90000, processed 23850255 wor

Computing time for training the model: 68.33641695976257 seconds
Number of words processed per second: 692125.5181969717
Most frequent words in model: ['year', 'said', 'he', 'view', 'comments', 'updated', 'aedt', 'company', 'money', 'new']
Model saved to /Users/eddyji/GitRepos/01_TAD/models/dailymail/2005-2009.model


2024-05-01 04:16:52,503 : INFO : capital-common-countries: 65.0% (247/380)
2024-05-01 04:16:52,870 : INFO : capital-world: 50.9% (193/379)
2024-05-01 04:16:53,019 : INFO : currency: 2.6% (4/152)
2024-05-01 04:16:53,461 : INFO : city-in-state: 20.0% (85/424)
2024-05-01 04:16:53,780 : INFO : family: 48.0% (147/306)
2024-05-01 04:16:54,457 : INFO : gram1-adjective-to-adverb: 11.2% (91/812)
2024-05-01 04:16:55,113 : INFO : gram2-opposite: 6.7% (47/702)
2024-05-01 04:16:56,386 : INFO : gram3-comparative: 44.0% (555/1260)
2024-05-01 04:16:57,011 : INFO : gram4-superlative: 22.1% (155/702)
2024-05-01 04:16:57,594 : INFO : gram5-present-participle: 50.3% (353/702)
2024-05-01 04:16:58,322 : INFO : gram6-nationality-adjective: 83.3% (706/848)
2024-05-01 04:16:59,636 : INFO : gram7-past-tense: 45.1% (669/1482)
2024-05-01 04:17:00,454 : INFO : gram8-plural: 58.5% (475/812)
2024-05-01 04:17:00,927 : INFO : gram9-plural-verbs: 20.0% (101/506)
2024-05-01 04:17:00,927 : INFO : Quadruplets with out-of-

Processing interval 2010-2014


2024-05-01 04:17:39,622 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2005-2009.model
2024-05-01 04:17:39,635 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2005-2009.model.wv.* with mmap=None
2024-05-01 04:17:39,635 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2005-2009.model.wv.vectors.npy with mmap=None
2024-05-01 04:17:39,688 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2005-2009.model.syn1neg.npy with mmap=None
2024-05-01 04:17:39,740 : INFO : setting ignored attribute cum_table to None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2005-2009.model


2024-05-01 04:17:39,978 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/dailymail/2005-2009.model', 'datetime': '2024-05-01T04:17:39.978100', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 04:17:39,978 : INFO : collecting all words and their counts
2024-05-01 04:17:39,978 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:17:40,329 : INFO : PROGRESS: at sentence #10000, processed 2954343 words, keeping 82593 word types
2024-05-01 04:17:40,582 : INFO : PROGRESS: at sentence #20000, processed 5926603 words, keeping 117193 word types
2024-05-01 04:17:40,814 : INFO : PROGRESS: at sentence #30000, processed 8859759 words, keeping 145705 word types
2024-05-01 04:17:41,058 : INFO : PROGRESS: at sentence #40000, processed 11852427 words, keeping 172150 word types
2024-05-01 04:17:41,291 : INFO : PROGRESS: at sent

Computing time for training the model: 383.061888217926 seconds
Number of words processed per second: 648131.0687289136
Most frequent words in model: ['year', 'said', 'he', 'view', 'comments', 'updated', 'aedt', 'company', 'money', 'new']


2024-05-01 04:24:02,975 : INFO : saved /Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model
2024-05-01 04:24:02,986 : INFO : Skipping line #35 with OOV words: king	cabbage	0.23
2024-05-01 04:24:02,986 : INFO : Skipping line #37 with OOV words: king	rook	5.92
2024-05-01 04:24:02,987 : INFO : Skipping line #42 with OOV words: fuck	sex	9.44
2024-05-01 04:24:02,987 : INFO : Skipping line #55 with OOV words: movie	theater	7.92
2024-05-01 04:24:02,988 : INFO : Skipping line #87 with OOV words: food	rooster	4.42
2024-05-01 04:24:02,989 : INFO : Skipping line #97 with OOV words: rooster	voyage	0.62
2024-05-01 04:24:02,989 : INFO : Skipping line #110 with OOV words: tiger	feline	8.00
2024-05-01 04:24:02,989 : INFO : Skipping line #111 with OOV words: tiger	carnivore	7.08
2024-05-01 04:24:02,989 : INFO : Skipping line #112 with OOV words: tiger	mammal	6.85
2024-05-01 04:24:02,990 : INFO : Skipping line #114 with OOV words: tiger	organism	4.77
2024-05-01 04:24:02,990 : INFO : Skipping l

Model saved to /Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model


2024-05-01 04:24:03,683 : INFO : capital-common-countries: 79.2% (301/380)
2024-05-01 04:24:04,071 : INFO : capital-world: 76.8% (291/379)
2024-05-01 04:24:04,201 : INFO : currency: 13.2% (20/152)
2024-05-01 04:24:04,643 : INFO : city-in-state: 68.4% (290/424)
2024-05-01 04:24:04,831 : INFO : family: 53.3% (163/306)
2024-05-01 04:24:05,647 : INFO : gram1-adjective-to-adverb: 26.1% (212/812)
2024-05-01 04:24:06,210 : INFO : gram2-opposite: 14.4% (101/702)
2024-05-01 04:24:07,615 : INFO : gram3-comparative: 51.7% (651/1260)
2024-05-01 04:24:08,255 : INFO : gram4-superlative: 28.8% (202/702)
2024-05-01 04:24:08,860 : INFO : gram5-present-participle: 60.4% (424/702)
2024-05-01 04:24:09,489 : INFO : gram6-nationality-adjective: 94.5% (801/848)
2024-05-01 04:24:10,742 : INFO : gram7-past-tense: 58.5% (867/1482)
2024-05-01 04:24:11,374 : INFO : gram8-plural: 74.5% (605/812)
2024-05-01 04:24:11,783 : INFO : gram9-plural-verbs: 27.3% (138/506)
2024-05-01 04:24:11,784 : INFO : Quadruplets with o

Processing interval 2015-2019


2024-05-01 04:26:50,253 : INFO : loading Word2Vec object from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model
2024-05-01 04:26:50,285 : INFO : loading wv recursively from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model.wv.* with mmap=None
2024-05-01 04:26:50,286 : INFO : loading vectors from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model.wv.vectors.npy with mmap=None
2024-05-01 04:26:50,372 : INFO : loading syn1neg from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model.syn1neg.npy with mmap=None


Loading model from /Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model


2024-05-01 04:26:50,490 : INFO : setting ignored attribute cum_table to None
2024-05-01 04:26:51,036 : INFO : Word2Vec lifecycle event {'fname': '/Users/eddyji/GitRepos/01_TAD/models/dailymail/2010-2014.model', 'datetime': '2024-05-01T04:26:51.036666', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-05-01 04:26:51,037 : INFO : collecting all words and their counts
2024-05-01 04:26:51,037 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-01 04:26:51,327 : INFO : PROGRESS: at sentence #10000, processed 3257846 words, keeping 89053 word types
2024-05-01 04:26:51,617 : INFO : PROGRESS: at sentence #20000, processed 6459303 words, keeping 127392 word types
2024-05-01 04:26:51,900 : INFO : PROGRESS: at sentence #30000, processed 9794667 words, keeping 158870 word types
2024-05-01 04:26:52,197 : INFO : PROGRESS: at sentence #40000, processed 13117258 words,

Computing time for training the model: 708.8723270893097 seconds


2024-05-01 04:38:43,610 : INFO : Word2Vec lifecycle event {'fname_or_handle': '/Users/eddyji/GitRepos/01_TAD/models/dailymail/2015-2019.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-05-01T04:38:43.610633', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:17:37) [Clang 14.0.6 ]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'saving'}
2024-05-01 04:38:43,611 : INFO : storing np array 'vectors' to /Users/eddyji/GitRepos/01_TAD/models/dailymail/2015-2019.model.wv.vectors.npy
2024-05-01 04:38:43,676 : INFO : storing np array 'syn1neg' to /Users/eddyji/GitRepos/01_TAD/models/dailymail/2015-2019.model.syn1neg.npy
2024-05-01 04:38:43,765 : INFO : not storing attribute cum_table


Number of words processed per second: 629685.790998247
Most frequent words in model: ['year', 'said', 'he', 'view', 'comments', 'updated', 'aedt', 'company', 'money', 'new']


2024-05-01 04:38:43,903 : INFO : saved /Users/eddyji/GitRepos/01_TAD/models/dailymail/2015-2019.model
2024-05-01 04:38:43,913 : INFO : Skipping line #35 with OOV words: king	cabbage	0.23
2024-05-01 04:38:43,913 : INFO : Skipping line #37 with OOV words: king	rook	5.92
2024-05-01 04:38:43,913 : INFO : Skipping line #42 with OOV words: fuck	sex	9.44
2024-05-01 04:38:43,914 : INFO : Skipping line #55 with OOV words: movie	theater	7.92
2024-05-01 04:38:43,915 : INFO : Skipping line #87 with OOV words: food	rooster	4.42
2024-05-01 04:38:43,916 : INFO : Skipping line #97 with OOV words: rooster	voyage	0.62
2024-05-01 04:38:43,916 : INFO : Skipping line #110 with OOV words: tiger	feline	8.00
2024-05-01 04:38:43,917 : INFO : Skipping line #111 with OOV words: tiger	carnivore	7.08
2024-05-01 04:38:43,917 : INFO : Skipping line #112 with OOV words: tiger	mammal	6.85
2024-05-01 04:38:43,917 : INFO : Skipping line #114 with OOV words: tiger	organism	4.77
2024-05-01 04:38:43,917 : INFO : Skipping l

Model saved to /Users/eddyji/GitRepos/01_TAD/models/dailymail/2015-2019.model


2024-05-01 04:38:44,404 : INFO : capital-common-countries: 77.9% (296/380)
2024-05-01 04:38:44,761 : INFO : capital-world: 77.6% (294/379)
2024-05-01 04:38:44,945 : INFO : currency: 15.8% (24/152)
2024-05-01 04:38:45,326 : INFO : city-in-state: 69.8% (296/424)
2024-05-01 04:38:45,556 : INFO : family: 59.2% (181/306)
2024-05-01 04:38:46,298 : INFO : gram1-adjective-to-adverb: 27.5% (223/812)
2024-05-01 04:38:46,891 : INFO : gram2-opposite: 12.8% (90/702)
2024-05-01 04:38:48,118 : INFO : gram3-comparative: 48.3% (608/1260)
2024-05-01 04:38:48,652 : INFO : gram4-superlative: 31.8% (223/702)
2024-05-01 04:38:49,195 : INFO : gram5-present-participle: 61.5% (432/702)
2024-05-01 04:38:49,775 : INFO : gram6-nationality-adjective: 94.8% (804/848)
2024-05-01 04:38:50,912 : INFO : gram7-past-tense: 62.0% (919/1482)
2024-05-01 04:38:51,601 : INFO : gram8-plural: 72.8% (591/812)
2024-05-01 04:38:51,991 : INFO : gram9-plural-verbs: 31.6% (160/506)
2024-05-01 04:38:51,992 : INFO : Quadruplets with ou

In [26]:
import csv
from pathlib import Path

# One "<firstYear>-<lastYear>" label per interval; these labels double as the CSV file stems.
file_name_list = [
    f'{year_span[0]}-{year_span[-1]}'
    for year_span in intervalsList
]

# Outlets whose per-interval CSV files are counted below.
outlets = [
    'cbs', 'nyt', 'slate',
    'reuters', 'thehill', 'csm',
    'wt', 'nyp', 'dailymail',
]

def count_csv_lines(outlets, file_name_list, base_dir="../scrambledArticlesForWord2vec"):
    """Count the CSV records in every ``<base_dir>/<outlet>/<interval>.csv`` file.

    Parameters
    ----------
    outlets : sequence of str
        Names of the per-outlet sub-directories under ``base_dir``.
    file_name_list : sequence of str
        File stems (e.g. ``"2000-2004"``); ``.csv`` is appended to each one.
    base_dir : str or Path, optional
        Root directory containing the per-outlet folders. Defaults to the
        location that was previously hard-coded in this function.

    Returns
    -------
    dict
        Nested mapping ``{outlet: {interval: record_count}}``. A record is one
        row as parsed by ``csv.reader``, so a quoted field that spans several
        physical lines is counted once.

    Raises
    ------
    FileNotFoundError
        If one of the expected CSV files does not exist.
    csv.Error
        If a field is larger than the field size limit set below.

    Notes
    -----
    ``csv.field_size_limit`` is a process-wide setting; the value set here
    stays in effect after the function returns.
    """
    root = Path(base_dir)

    # Raise the csv module's field size limit so very long text fields can be parsed.
    csv.field_size_limit(1000000)

    line_counts = {}
    for outlet in outlets:
        per_interval = {}
        for interval in file_name_list:
            csv_path = root / outlet / f'{interval}.csv'
            # newline='' lets the csv module handle line endings itself,
            # as its documentation requires for file objects.
            with open(csv_path, "r", newline="") as handle:
                per_interval[interval] = sum(1 for _ in csv.reader(handle))
        line_counts[outlet] = per_interval

    return line_counts

# Tally the rows of every outlet/interval CSV into a nested dict keyed by outlet, then interval.
count = count_csv_lines(outlets=outlets, file_name_list=file_name_list)

# Tabulate the nested dict: one column per outlet, one row per interval.
count_df = pd.DataFrame(data=count)
count_df

Unnamed: 0,cbs,nyt,slate,reuters,thehill,csm,wt,nyp,dailymail
2000-2004,60869,421143,15313,6,6,42787,104164,146136,64491
2005-2009,155509,303699,26045,790883,37352,44535,150534,187012,170786
2010-2014,203569,460506,61621,1322954,155641,92239,513780,257433,803932
2015-2019,137473,290703,61284,1180341,220846,48144,1299226,255737,1356851
