# db2

In [1]:
from ipynb.fs.full.koselleck import *

In [2]:
# !dropdb koselleck
# !createdb koselleck

In [3]:
from peewee import *
from playhouse.postgres_ext import *
from datetime import datetime as dt
from psycopg2.errors import UniqueViolation
db=None

## Model definitions

In [4]:
# db=PostgresqlExtDatabase('koselleck')
FN_DB=os.path.join(PATH_DATA,'db2.koselleck.v2.sqlite')
# Rebuild the database from scratch on every run, but tolerate a first run
# (or a cleaned data directory) where the file does not exist yet.
if os.path.exists(FN_DB):
    os.remove(FN_DB)
db = SqliteDatabase(FN_DB)
db.connect()

True

In [5]:
class BaseModel(Model):
    """Common base class that binds every model in this notebook to the shared `db` handle."""
    class Meta:
        database = db

In [6]:
class Word(BaseModel):
    """One row per vocabulary word; rows are ingested from the `mfwdf` frame."""
    word = CharField(unique=True)  # the word string itself (unique)
    pos = CharField()
    pos0 = CharField()  # `is_valid` is derived from this at ingest: pos0 in {'n','v','j'}
    is_valid = BooleanField()
    count = IntegerField()
    fpm = FloatField()  # presumably frequency per million -- TODO confirm
    rank = IntegerField()

In [7]:
class Period(BaseModel):
    """A period referenced by Embedding rows; built by ingest_periods() from the model-path table."""
    period = CharField(unique=True)  # label such as '1720-1740'
    ymin = IntegerField()  # filled from the `period_start` column
    ymax = IntegerField()  # filled from the `period_end` column
    ybin = IntegerField()  # filled from the `period_len` column

In [8]:
class Corpus(BaseModel):
    """A source corpus that embeddings belong to."""
    corpus = CharField(unique=True)  # identifier, e.g. 'bpo'
    name = CharField(unique=True)  # label, e.g. 'BPO'

In [9]:
class Embedding(BaseModel):
    """One trained embedding model, identified by its file path."""
    path = CharField(unique=True)  # stored relative to PATH_MODELS (see ingest_model_row)
    period = ForeignKeyField(Period)
    run = IntegerField()  # numeric suffix of the run label, e.g. 'run_01' -> 1
    corpus = ForeignKeyField(Corpus)
#     df = BlobField()

In [10]:
class Vector(BaseModel):
    """The vector of one word in one embedding model."""
    word = ForeignKeyField(Word,backref='vectors')
    model = ForeignKeyField(Embedding,backref='vectors')
    #vector = ArrayField(field_class=FloatField, index=False)
    # Holds the bytes produced by pickle.dumps() on the word's vector (see ingest_vecs_from_model).
    vector = BlobField()

In [11]:
# class Distance(BaseModel):
#     word1=ForeignKeyField(Word,backref='dists1')
#     word2=ForeignKeyField(Word,backref='dists2')
#     model1=ForeignKeyField(Embedding)
#     model2=ForeignKeyField(Embedding)
#     type = CharField(unique=False)
#     value = FloatField()
#     rank = IntegerField()
#     z = FloatField()
#     perc = FloatField()

In [12]:
class Distance(BaseModel):
    """A distance value between two words, each taken from an embedding model.

    ingest_dists_model() writes rows with type='cos', model1 == model2 and
    word1 < word2 (string comparison), so each unordered pair is stored once.
    """
    #vector1=ForeignKeyField(Vector)
    #vector2=ForeignKeyField(Vector)
    type = CharField(unique=False)  # distance kind; only 'cos' is written in this notebook
    
    # NOTE(review): word1/word2 (and model1/model2) target the same model without
    # an explicit backref; the earlier commented-out draft used 'dists1'/'dists2'.
    # Confirm the default reverse accessors are not relied on.
    word1=ForeignKeyField(Word)
    word2=ForeignKeyField(Word)
    
    model1=ForeignKeyField(Embedding)
    model2=ForeignKeyField(Embedding)
    
    value = FloatField()
#     rank = IntegerField()
#     z = FloatField()
#     perc = FloatField()

## Init

In [13]:
# Every model that needs a table; creation runs inside a single transaction.
# (Uncomment the drop_tables line to wipe existing tables first.)
models = [
    Corpus,
    Period,
    Word,
    Embedding,
    Vector,
    Distance,
]
with db.atomic():
    # db.drop_tables(models)
    db.create_tables(models)

## Ingest

In [14]:
def ingest(cls, data_iter, batchsize=100):
    """Insert every record from `data_iter` as a row of model `cls`.

    All rows are written inside one outer transaction. Each insert runs in
    its own nested ``db.atomic()`` block (a savepoint), so a row that
    violates a constraint is undone on its own and skipped, while the rows
    written before it are kept.

    Args:
        cls: model class exposing ``create(**fields)``.
        data_iter: iterable of dicts mapping field names to values.
        batchsize: accepted for backward compatibility; currently unused.

    Returns:
        None.
    """
    with db.atomic():
        for d in data_iter:
            try:
                # Nested atomic() is a savepoint: only this row is rolled back on
                # failure. Calling db.rollback() here instead would discard every
                # row already written in the outer transaction.
                with db.atomic():
                    cls.create(**d)
            except IntegrityError:
                # Duplicate or otherwise constraint-violating row: skip it.
                continue

In [15]:

# Register the two corpora used in this notebook.
ingest(
    Corpus,
    [
        {'corpus': 'bpo', 'name': 'BPO'},
        {'corpus': 'ecco', 'name': 'ECCO'},
    ],
)

### Ingest words

In [16]:
# Cached word table (presumably "most frequent words" -- TODO confirm).
# To regenerate the cache:
#   get_corpus().mfw_df().to_pickle(mfwdfn)
mfwdfn = os.path.join(PATH_DATA, 'data.mfwdf.pkl')
mfwdf = pd.read_pickle(mfwdfn)

In [17]:
# Words already stored in the db, so only the missing ones get ingested below.
# Word has no `name` field -- the string lives in `Word.word`, which is also
# the column that `idf.word.isin(done)` compares against.
done={w.word for w in Word.select()}

In [18]:
# Flatten the index into columns, flag words whose pos0 is n/v/j as valid,
# and keep only the words that are not in the db yet.
idf = (
    mfwdf
    .reset_index()
    .assign(is_valid=lambda frame: frame['pos0'].isin({'n','v','j'}))
    .loc[lambda frame: ~frame['word'].isin(done)]
)

In [19]:
# One Word row per remaining record of idf.
ingest(Word, idf.to_dict(orient='records'))

## Ingest periods


In [20]:
# Only models whose period_len is one of these values are kept below.
period_lens = {5, 20, 70}

In [21]:
# Table of model paths on disk, restricted to the selected period lengths.
dfmodels = (
    get_pathdf_models(period_len=None)
    .loc[lambda frame: frame.period_len.isin(period_lens)]
)
dfmodels

Scanning directory for models: 3901it [00:00, 63888.03it/s]


Unnamed: 0,corpus,period_start,period_end,path,path_vocab,run,period,period_len,qstr
2090,bpo,1720,1740,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_01/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_01/vocab.txt,run_01,1720-1740,20,vecs(1720-1740_01)
2091,bpo,1720,1740,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_05/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_05/vocab.txt,run_05,1720-1740,20,vecs(1720-1740_05)
2092,bpo,1720,1740,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_10/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_10/vocab.txt,run_10,1720-1740,20,vecs(1720-1740_10)
2093,bpo,1720,1740,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_02/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_02/vocab.txt,run_02,1720-1740,20,vecs(1720-1740_02)
2094,bpo,1720,1740,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_08/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1720-1740/run_08/vocab.txt,run_08,1720-1740,20,vecs(1720-1740_08)
...,...,...,...,...,...,...,...,...,...
1851,bpo,1895,1900,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_11/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_11/vocab.txt,run_11,1895-1900,5,vecs(1895-1900_11)
1853,bpo,1895,1900,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_04/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_04/vocab.txt,run_04,1895-1900,5,vecs(1895-1900_04)
1854,bpo,1895,1900,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_12/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_12/vocab.txt,run_12,1895-1900,5,vecs(1895-1900_12)
1847,bpo,1895,1900,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_06/model.bin,/home/ryan/github/koselleck/data1/models/bpo/1895-1900/run_06/vocab.txt,run_06,1895-1900,5,vecs(1895-1900_06)


### Periods

In [22]:
def ingest_periods(dfmodels):
    """Store one Period row per distinct period found in `dfmodels`.

    The model table's period columns are renamed to the Period field names
    (period_len -> ybin, period_start -> ymin, period_end -> ymax) and
    duplicate combinations are dropped before ingestion.

    Returns whatever ingest() returns.
    """
    field_for_column = {
        'period': 'period',
        'period_len': 'ybin',
        'period_start': 'ymin',
        'period_end': 'ymax',
    }
    periods = (
        dfmodels[list(field_for_column)]
        .drop_duplicates()
        .rename(columns=field_for_column)
    )
    return ingest(Period, periods.to_dict('records'))

In [23]:
# One Period row per distinct (period, length, start, end) combination in dfmodels.
ingest_periods(dfmodels)

In [24]:
# PATH_MODELS

## Ingest embeddings


```python
class Embedding(BaseModel):
    path = CharField(unique=True)
    period = ForeignKeyField(Period)
    run = IntegerField()
    corpus = ForeignKeyField(Corpus)
```

In [25]:
# def model_to_vectors(mpath):
#     m=load_model(mpath)
#     return pd.DataFrame(m.wv.vectors, index=[m.wv.index_to_key[i] for i in range(len(m.wv.vectors))])

In [26]:
# model_to_vectors(dfmodels.iloc[0].path)

In [27]:
# pickle.dump(dfmodels)
def ingest_model_row(row):
    """Translate one row of the model-path table into Embedding field values.

    Reads `row.path`, `row.period`, `row.run` and `row.corpus`; looks up the
    matching Period and Corpus rows; returns a dict with the keys
    'path' (relative to PATH_MODELS), 'period', 'run' (integer suffix of the
    run label) and 'corpus'.
    """
    return {
        'path': os.path.relpath(row.path, PATH_MODELS),
        'period': Period.get(Period.period == row.period),
        # 'run_01' -> 1
        'run': int(row.run.split('_')[-1]),
        'corpus': Corpus.get(Corpus.corpus == row.corpus),
    }

In [28]:
def iter_ingest_models(dfmodels):
    """Yield Embedding field dicts, one per row of `dfmodels`.

    Rows are materialised first and then mapped through ingest_model_row()
    with pmap_iter using a single process.
    """
    rows = [pair[1] for pair in dfmodels.iterrows()]
    for record in pmap_iter(ingest_model_row, rows, num_proc=1):
        yield record

In [29]:
# o=next(iter_ingest_models(dfmodels))

In [30]:
# One Embedding row per model path. batchsize is passed through, but the body of ingest() does not use it.
ingest(Embedding, iter_ingest_models(dfmodels), batchsize=2)

Mapping ingest_model_row() [x1]: 100%|██████████| 1002/1002 [00:00<00:00, 1544.08it/s]


### Ingest vectors

```python
class WordVector(Vector):
    word = ForeignKeyField(Word,backref='vectors')
    model = ForeignKeyField(Embedding,backref='vectors')
    vector = ArrayField(field_class=FloatField, index=False)
```

In [31]:
# In-memory lookups from word string / embedding path to the stored row,
# used when building Vector and Distance records.
dbd_word = {entry.word: entry for entry in Word.select()}
dbd_path = {entry.path: entry for entry in Embedding.select()}

In [32]:
def ingest_vecs_from_model(row):
    """Build Vector field dicts for every stored word found in one model.

    Loads the model at `row.path`, keeps only the keys that exist in the Word
    table, and returns a list of dicts with 'word' and 'model' (looked up in
    `dbd_word` / `dbd_path`; None when missing) and 'vector' (the pickled
    vector bytes).
    """
    model = load_model(row.path)
    known_words = set()
    for entry in Word.select():
        known_words.add(entry.word)
    # Embedding rows are keyed by their path relative to PATH_MODELS.
    model_key = os.path.relpath(row.path, PATH_MODELS)
    return [
        {
            'word': dbd_word.get(token),
            'model': dbd_path.get(model_key),
            'vector': pickle.dumps(model.wv.vectors[position]),
        }
        for token, position in model.wv.key_to_index.items()
        if token in known_words
    ]

In [33]:
# next(ingest_vecs_from_model(dfmodels.iloc[0]))

In [34]:
def iter_ingest_vecs(dfmodels,num_proc=1):
    """Ingest the word vectors of every model listed in `dfmodels`.

    Despite the name this is not a generator: it runs to completion and
    returns None. Each row is passed to ingest_vecs_from_model() through
    pmap_iter, and each resulting list of records is stored with
    ingest(Vector, ...) as soon as it arrives.

    Args:
        dfmodels: table of models, one row per model.
        num_proc: forwarded to pmap_iter.
    """
    rows = [row for _, row in dfmodels.iterrows()]
    for records in pmap_iter(ingest_vecs_from_model, rows, num_proc=num_proc):
        ingest(Vector, records)

In [35]:
# iter_ingest_vecs(dfmodels)

In [36]:
# stop
# ingest(Vector, iter_ingest_vecs(dfmodels))

In [37]:
def get_vecs_from_db(period, run=1):
    """Load the stored word vectors of one embedding as a DataFrame.

    Args:
        period: period label matched against ``Period.period``.
        run: run number matched against ``Embedding.run``.

    Returns:
        DataFrame with one row per word (index = word string), built from the
        unpickled ``Vector.vector`` blobs.
    """
    with db.atomic():
        period_row = Period.get(Period.period == period)
        # Select Word together with Vector so that `vec.word.word` is served from
        # the joined row instead of issuing one extra query per vector.
        query = (
            Vector
            .select(Vector, Word)
            .join(Word)
            .switch(Vector)
            .join(Embedding)
            .where((Embedding.run == run) & (Embedding.period == period_row))
        )
        # The blobs were written with pickle.dumps() by ingest_vecs_from_model();
        # only unpickle from a database you trust.
        return pd.DataFrame(
            {vec.word.word: pickle.loads(vec.vector) for vec in tqdm(query)}
        ).T

In [38]:
# vecs('1720-1740')

## Distances

```python
class Distance(BaseModel):
    #vector1=ForeignKeyField(Vector)
    #vector2=ForeignKeyField(Vector)
    type = CharField(unique=False)
    
    word1=ForeignKeyField(Word)
    word2=ForeignKeyField(Word)
    
    model1=ForeignKeyField(Embedding)
    model2=ForeignKeyField(Embedding)
    
    value = FloatField()
#     rank = IntegerField()
#     z = FloatField()
#     perc = FloatField()
```

In [39]:
def get_valid_words_from_db():
    """Return the set of word strings whose Word row has is_valid set."""
    valid_words = set()
    with db.atomic():
        for entry in Word.select().where(Word.is_valid==True):
            valid_words.add(entry.word)
    return valid_words
get_valid_words = get_valid_words_from_db

In [40]:
# get_valid_words_from_db()

In [41]:
def ingest_dists_model(period,run=1,only_valid_words=True,lim=25000):
    """Compute pairwise cosine distances within one embedding and store them.

    Args:
        period: period label used to look up the Period and Embedding rows.
        run: run number of the embedding.
        only_valid_words: when true, keep only words returned by get_valid_words().
        lim: keep at most this many words (first rows of the vector table);
            a falsy value disables the limit.

    Returns:
        Whatever ingest() returns.

    Each unordered word pair is stored once (word1 < word2 by string
    comparison) as a Distance row with type='cos' and model1 == model2.
    """
    embed = Embedding.get((Embedding.period==Period.get(Period.period==period)) & (Embedding.run==run))

    # NOTE(review): vecs() is not defined in this notebook; presumably it comes
    # from the koselleck star import -- confirm it returns a word-indexed frame.
    dfvecs=vecs(period,run)
    if only_valid_words:
        # Boolean mask instead of indexing .loc with a set: pandas 2 rejects set
        # indexers, and a set has no stable order, which made the `lim`
        # truncation below pick an arbitrary subset of words.
        dfvecs=dfvecs[dfvecs.index.isin(get_valid_words())]
    if lim: dfvecs=dfvecs[:lim]
    print(dfvecs.shape)

    # The fastdist result is treated as a similarity matrix; distance = 1 - it.
    dfsim=pd.DataFrame(
        fastdist.cosine_pairwise_distance(
            dfvecs.values.astype(float),
            return_matrix=True
        ),
        index=dfvecs.index,
        columns=dfvecs.index
    )
    dfdist=1-dfsim

    # Long format: one row per (word1, word2) cell, then keep one triangle only.
    dfmelt=dfdist.rename_axis('word1').reset_index().melt(id_vars=['word1'],var_name='word2',value_name='value')
    dfmelt=dfmelt.query('word1<word2')

    def iterr():
        """Yield one Distance field dict per remaining word pair."""
        for w1,w2,v in tqdm(zip(dfmelt.word1,dfmelt.word2,dfmelt.value), total=len(dfmelt)):
            yield dict(
                type='cos',
                word1=dbd_word.get(w1),
                word2=dbd_word.get(w2),
                model1=embed,
                model2=embed,
                value=v
            )

    return ingest(Distance, iterr())

In [42]:
# Cosine distances for the default run (run=1) of the 1720-1740 model.
# NOTE: the recorded run below ended in KeyboardInterrupt at about 2% of
# 90,929,355 pairs (~6k rows/s, roughly four hours remaining).
ingest_dists_model('1720-1740')

[Koselleck] (18:36:32) (13486, 100) (+4.6s)
  2%|▏         | 1480055/90929355 [03:51<4:01:32, 6172.11it/s]
KeyboardInterrupt



In [None]:
# Spot check: the first stored Distance row whose word1 is 'creeping'.
Distance.select().where(Distance.word1==Word.get(Word.word=='creeping')).first()

In [None]:
# len(list(Distance.select()))