# Koselleck

## Non-local imports

### stdlib (plus third-party ujson and pickle5)

In [1]:
import os,sys,time,random,ujson,json,warnings
import pickle5 as pickle
from pprint import pprint
import logging
from collections import Counter,defaultdict

In [3]:
# settings: ignore all warnings (no category/module filter is given)
warnings.filterwarnings('ignore')

### other

In [1]:
import gensim
import networkx as nx
import plotnine as p9
import pandas as pd
import numpy as np
from fastdist import fastdist
from tqdm import tqdm
tqdm.pandas()
from scipy.stats import ttest_ind
try:
    from ipywidgets import interact, interactive, fixed, interact_manual
    import ipywidgets as widgets
except ImportError as e:
    pass
from ftfy import fix_text
import cv2
from pandas.core.groupby.groupby import DataError
from gensim.models import KeyedVectors,Word2Vec
from loguru import logger
from scipy.spatial.distance import cosine
from scipy.stats import percentileofscore
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
try:
    from IPython.display import Image
    from ipywidgets import interact, interactive, fixed, interact_manual
    import ipywidgets as widgets
except ImportError as e:
    print('!!',e)
    pass
from sqlitedict import SqliteDict
pd.options.display.max_colwidth=None
import pandas as pd
import gspread
from gspread_pandas import Spread, Client
from IPython.display import Markdown
import markdown 



In [3]:
from ipywidgets import *

In [5]:
## custom
import lltk

### Logging

In [5]:
def disable_gensim_logging():
    """Set ``disabled = True`` on every registered logger named 'gensim...'.

    Walks the logging manager's registry and flags each entry whose name
    starts with 'gensim'. Entries with other names are left untouched.
    """
    registry = logging.Logger.manager.loggerDict
    for name in registry:
        if not name.startswith('gensim'):
            continue
        registry[name].disabled = True

def enable_gensim_logging():
    """Set ``disabled = False`` on every registered logger named 'gensim...'.

    Counterpart of disabling: walks the logging manager's registry and
    clears the flag on each entry whose name starts with 'gensim'.
    """
    registry = logging.Logger.manager.loggerDict
    gensim_names = [name for name in registry if name.startswith('gensim')]
    for name in gensim_names:
        registry[name].disabled = False

In [6]:
# Disable the gensim loggers that are registered at this point.
disable_gensim_logging()

In [28]:
# Disable sqlitedict's logger. getLogger() returns the existing logger or
# creates it on demand, so this cannot raise KeyError when the logger has not
# been registered yet (direct indexing into loggerDict could).
logging.getLogger('sqlitedict').disabled=True
# Reset loguru: remove() without a handler id drops the existing sinks (per
# the loguru docs), then one stderr sink with the "[Koselleck]" format is added.
logger.remove()
logger.add(sys.stderr, format="[Koselleck] ({time:HH:mm:ss}) {message}", level="INFO")
# Reference timestamp for log()'s elapsed-time suffix; starts at setup time
# and is overwritten on every log() call.
LASTTIME=time.time()
def log(*x,timer=True,**y):
    """Log a message at INFO level through loguru, print()-style.

    Positional arguments are converted with ``str()`` and joined with single
    spaces. When ``timer`` is true, a ``(+N.Ns)`` suffix is appended giving
    the seconds elapsed since the previous call (``LASTTIME``). Remaining
    keyword arguments are forwarded unchanged to ``logger.info``.

    Side effect: updates the module-level ``LASTTIME`` on every call,
    whether or not the timer suffix is shown.
    """
    global LASTTIME
    now=time.time()
    parts=[str(piece) for piece in x]
    # Only add the suffix when requested; appending an empty string would
    # leave a trailing space on the joined message.
    if timer:
        parts.append(f'(+{round(now-LASTTIME,1)}s)')
    logger.info(' '.join(parts),**y)
    LASTTIME=now
# Rebind the name `print` in this namespace so that later print(...) calls
# here are routed through log() instead of the builtin.
print = log

## Local imports

In [9]:
# NOTE: "__file__" is a string literal here, not the __file__ variable, so
# realpath() resolves it against the current working directory and `here`
# becomes that directory.
here=os.path.dirname(os.path.realpath("__file__"))
here

'/home/ryan/github/koselleck/koselleck'

In [10]:
# Project root: the parent directory of `here`.
root=os.path.dirname(os.path.abspath(here))
root

'/home/ryan/github/koselleck'

In [11]:
# `lib` folder under the project root; its subfolders are appended to
# sys.path for the semi-custom imports.
PATH_LIB=os.path.join(root,'lib')
PATH_LIB

'/home/ryan/github/koselleck/lib'

In [12]:
# Semi-custom imports -@HACK
# The folders PATH_LIB/Noise-Aware-Alignment and PATH_LIB/yapmap are appended
# to sys.path so that `noise_aware` and `yapmap` can be imported from them.
sys.path.append(os.path.join(PATH_LIB,'Noise-Aware-Alignment'))
# sys.path.append(os.path.join(PATH_LIB,'abslithist'))
sys.path.append(os.path.join(PATH_LIB,'yapmap'))
from noise_aware import noise_aware
from yapmap import *
# from abslithist import *

In [13]:
# other paths
# Two data roots sit under the project root: `data` (PATH_DATA) and `data1`
# (PATH_DATA_HD). Skipgrams and models are kept under PATH_DATA_HD; fields
# under PATH_DATA.
# NOTE(review): "HD" presumably refers to a separate/larger drive — confirm.
PATH_DATA=os.path.join(root,'data')
PATH_DATA_HD=os.path.join(root,'data1')
PATH_FIGS=os.path.join(root,'figures')
PATH_SKIPGRAMS=os.path.join(PATH_DATA_HD,'skipgrams')
PATH_MODELS=os.path.join(PATH_DATA_HD,'models')
PATH_SKIPGRAMS_YR=os.path.join(PATH_SKIPGRAMS,'years')
PATH_FIELDS=os.path.join(PATH_DATA,'fields')
PATH_MODELS_BPO=os.path.join(PATH_MODELS,'bpo')
PATH_DB=os.path.join(root,'db')

In [14]:
# urls
# URL_KEYWORDS points at a published Google Sheets document exported as CSV
# (see the `output=csv` query parameter).
URL_KEYWORDS='https://docs.google.com/spreadsheets/d/e/2PACX-1vRzHA7iqgW7BB9SCtR0Nr3Dge5zSkY9C6lOkUMFV7Bd4Bhap6LVR3sWrXnjovUNhL9HAUNUJNRB62rD/pub?gid=0&single=true&output=csv'
# NOTE(review): presumably a remote/upload folder for figures — consumer not visible here; confirm.
UPROOT='/Markdown/Drafts/TheGreatAbstraction/figures/'
# NOTE(review): presumably the spreadsheet name used with gspread/gspread_pandas — confirm.
GSPREAD_NAME='ComputingKoselleck'

In [15]:
# # filenames
# FN_CHANGE_RUNS_AVG = os.path.join(PATH_DATA,'data.measured_change.runs_avg.v2.csv')
# FN_CHANGE_RUNS = os.path.join(PATH_DATA,'data.measured_change.runs.v2.csv')
# FN_DATA_CACHE_DEC=os.path.join(PATH_DATA,'data.cache.decade_level_data.pkl')
# FN_VECTOR_SCORES_DIFFMEANS=os.path.join(PATH_DATA,'data.vector_scores_across_models.diff_means.csv')
# FN_FREQ_DEC_MODELS=os.path.join(PATH_DATA,'data.freq_across_decade_models.csv')
# FN_DATA_PACEOFCHANGE = os.path.join(PATH_DATA,'data.semantic_change_over_decades.1run.v10-local-k50-halfdec.pkl')
# FN_NOVELTY_DATA=os.path.join(PATH_DATA,'data.words_by_rateofchange.v4.pkl')
# FN_ALL_LOCALDISTS_V2=os.path.join(PATH_DATA,'data.all_local_dists.v5.pkl')
# FN_ALL_LOCALDISTS_V2_CACHE=os.path.join(PATH_DATA,'data.all_local_dists.v5.cache.pkl')
# FN_ALL_LOCALDISTS=os.path.join(PATH_DATA,'data.all_local_dists.v3.pkl')
# FN_ALL_LOCALDISTS_CACHE=os.path.join(PATH_DATA,'data.all_local_dists.v3.cache.pkl')
# FN_NOV_ALL_BYWORD = os.path.join(PATH_DATA,'data.novelty.by_word.pkl')
# FN_ALL_NEIGHBS=os.path.join(PATH_DATA,'data.all_local_neighbs.v2.pkl')
# FN_ALL_NEIGHBS_SIMPLE=os.path.join(PATH_DATA,'data.all_local_neighbs.v2.simple.pkl')
# FN_ALL_MODEL_CACHE=os.path.join(PATH_DATA,'data.all_models_halfdec.pkl')
# FN_ALL_NEIGHBS_SIMPLE=os.path.join(PATH_DATA,'data.all_local_neighbs.v2.simple.pkl')
# FN_VECLIB=os.path.join(PATH_DATA,'data.veclib.dbm')
# FN_AMBIGUITY=os.path.join(PATH_DATA,'data.ambiguity.runs.csv')
# PATH_FIELD_WILLIAMS_SRC=os.path.join(PATH_FIELDS,'williams-src.txt')
# PATH_FIELD_WILLIAMS=os.path.join(PATH_FIELDS,'williams.txt')
# PATH_FIELD_KOSELLECK_SRC=os.path.join(PATH_FIELDS,'koselleck-src.txt')
# PATH_FIELD_KOSELLECK=os.path.join(PATH_FIELDS,'koselleck.txt')
# FN_NOVELTY_DATA=os.path.join(PATH_DATA,'data.words_by_rateofchange.pkl')


In [2]:
# Data file locations currently in use. Most live under PATH_DATA; the
# local-distance, neighbour and novelty-cache pickles live under PATH_DATA_HD.
FN_WORDS=os.path.join(PATH_DATA,'data.mfw.txt')
FN_STOPWORDS=os.path.join(PATH_DATA,'stopwords.txt')
FN_FIELDS=os.path.join(PATH_DATA,'data.fields.json')
FN_ORIGFIELDS=os.path.join(PATH_DATA,'data.origfields.pkl')
FN_DEFAULT_MODEL_PATHS=os.path.join(PATH_DATA,'data.model.paths.default.pkl')
FN_ALL_LOCALDISTS_ORIGDATA=os.path.join(PATH_DATA_HD,'data.all_local_dists.v4.pkl')
FN_ALL_NEIGHBS=os.path.join(PATH_DATA_HD,'data.all_local_neighbs.v2.pkl')
FN_NOV_CACHE=os.path.join(PATH_DATA_HD,'data.nov_cache.pkl')
FN_VECTOR_SCORES_RUNS=os.path.join(PATH_DATA,'data.vector_scores_across_models.v2.pkl')
FN_VECTOR_SCORES_TTEST=os.path.join(PATH_DATA,'data.vector_scores_across_models.v2.ttests.pkl')
FN_LNM_TTEST=os.path.join(PATH_DATA,'data.lnm.ttests.pkl')

NameError: name 'os' is not defined

In [1]:
# # blank objects
# Module-level placeholders, initialised empty here.
# NOTE(review): presumably populated later by loader/caching code defined
# elsewhere — confirm.
# DF_LOCALDISTS=None
DFVECSCORES=None
DFALLNOV=None
# VECLIB={}
# DF_MODELS_DL=None
# NEIGHB_SIMPLE_D=None
MODEL_CACHE={}

## Config defaults

In [1]:
# NOTE(review): the meanings noted below are inferred from the names only;
# the code that consumes these values is not visible here — confirm each.
DEFAULT_NUM_SKIP=20000  # presumably default number of skipgrams to sample
NSKIP_PER_YR=20000  # presumably number of skipgrams sampled per year
YMIN=1720  # presumably first year of the period studied
YMAX=1900  # presumably last year of the period studied
FIELD_ABS_KEY='Abs-Conc.Median'  # presumably the key of the abstract/concrete field
FOOTE_W=5  # presumably window width for Foote novelty
YEARBIN=5  # presumably width of a period bin, in years
K=10  # presumably number of nearest neighbours
NBR_MAX_RANK=1000  # presumably highest neighbour rank considered

In [19]:
# Corpus defaults
# NOTE(review): presumably the name/ID of the default corpus (cf. the 'bpo'
# folder in PATH_MODELS_BPO) — confirm against the corpus-loading code.
CNAME='BPO'

## Local imports

In [1]:
from ipynb.fs.defs.tools import *
from ipynb.fs.defs.corpora import *
from ipynb.fs.defs.db import *
from ipynb.fs.defs.fields import *
from ipynb.fs.defs.models import *
from ipynb.fs.defs.vecs import *
from ipynb.fs.defs.distvecs import *
from ipynb.fs.defs.cdists import *
from ipynb.fs.defs.neighbs import *
from ipynb.fs.defs.shifts import *
from ipynb.fs.defs.distmat import *
from ipynb.fs.defs.novelty import *
from ipynb.fs.defs.tilts import *
from ipynb.fs.defs.info import *
from ipynb.fs.defs.info import *
from ipynb.fs.defs.passages import *


[Koselleck] (06:04:31) Alles bereit (+0.0s)


In [None]:
# Final "all ready" message; `print` was rebound to log() earlier, so this is
# emitted through the logger.
print('Alles bereit')

In [None]:
YMAX