# db2

In [1]:
from ipynb.fs.full.koselleck import *

In [2]:
# Destructive reset: these IPython shell escapes run the PostgreSQL command-line
# tools to drop the `koselleck` database and recreate it empty.
!dropdb koselleck
!createdb koselleck

In [3]:
from peewee import *
from playhouse.postgres_ext import *
from datetime import datetime as dt
from psycopg2.errors import UniqueViolation
db=None

## Model definitions

In [4]:
# Bind the notebook-wide handle to the `koselleck` PostgreSQL database
# (no host or credentials are passed here, so driver defaults apply).
db=PostgresqlExtDatabase('koselleck')
# db = SqliteDatabase('../data/db2.koselleck.sqlite')
# Open the connection now; the model classes below are bound to this `db` object.
db.connect()

True

In [5]:
class BaseModel(Model):
    """Base class that binds the notebook's models to the shared `db` handle."""
    class Meta:
        # peewee models inherit Meta options, so subclasses use this database too.
        database = db

In [6]:
class Word(BaseModel):
    """One vocabulary entry; rows are ingested from `get_corpus().mfw_df()` in this notebook."""
    # Surface form of the word; unique, so a word can be stored only once.
    word = CharField(unique=True)
    # presumably a part-of-speech tag — TODO confirm the tag set
    pos = CharField()
    # presumably a coarse POS code; the words-ingest cell checks it against {'n','v','j'}
    pos0 = CharField()
    # The words-ingest cell sets this to whether `pos0` is one of 'n', 'v', 'j'.
    is_valid = BooleanField()
    # presumably the raw occurrence count — TODO confirm
    count = IntegerField()
    # presumably frequency per million — TODO confirm
    fpm = FloatField()
    # presumably the frequency rank — TODO confirm
    rank = IntegerField()

In [7]:
class Period(BaseModel):
    """A time period; `ingest_periods` in this notebook fills it from the model-path table."""
    # Unique period label (the notebook queries it with strings such as '1720-1740').
    period = CharField(unique=True)
    # Filled from the `period_start` column by `ingest_periods`.
    ymin = IntegerField()
    # Filled from the `period_end` column by `ingest_periods`.
    ymax = IntegerField()
    # Filled from the `period_len` column by `ingest_periods`.
    ybin = IntegerField()

In [8]:
class Corpus(BaseModel):
    """A text corpus that embeddings are trained on."""
    # Unique identifier; the notebook seeds the values 'bpo' and 'ecco'.
    corpus = CharField(unique=True)
    # Unique display name; the notebook seeds the values 'BPO' and 'ECCO'.
    name = CharField(unique=True)

In [13]:
# NOTE(review): later cells in this notebook refer to a model named `Embedding`
# (table creation, ingest and query cells); it looks like this class is the
# intended target after a rename — confirm and align the names.
class WordEmbedding(BaseModel):
    """One trained embedding model for a given corpus, period and run."""
    # Unique path of the model file (the ingest cell stores it relative to PATH_MODELS).
    path = CharField(unique=True)
    # Period the model covers.
    period = ForeignKeyField(Period)
    # Run number (the ingest cell parses it from the text after the last '_' of `row.run`).
    run = IntegerField()
    # Corpus the model was trained on.
    corpus = ForeignKeyField(Corpus)
    # Binary payload; presumably a serialized vectors DataFrame — TODO confirm the encoding.
    df = BlobField()

In [12]:
# class WordVector(BaseModel):
#     word = ForeignKeyField(Word,backref='vectors')
#     model = ForeignKeyField(Embedding,backref='vectors')
#     vector = ArrayField(field_class=FloatField, index=False)

In [11]:
# NOTE(review): `Vector` is not defined by any cell visible in this notebook, and
# the recorded output of this cell is "NameError: name 'Vector' is not defined".
# The foreign keys presumably should point at the vector model — confirm which one.
class VectorDistance(BaseModel):
    """A distance measurement between two stored vectors."""
    # First vector of the pair; reverse accessor on the target model is `dists1`.
    vector1=ForeignKeyField(Vector,backref='dists1')
    # Second vector of the pair; reverse accessor on the target model is `dists2`.
    vector2=ForeignKeyField(Vector,backref='dists2')
    # Label for the kind of distance; explicitly non-unique.
    type = CharField(unique=False)
    # The distance value itself.
    value = FloatField()
    # presumably the rank of this distance among comparable ones — TODO confirm
    rank = IntegerField()
    # presumably a z-score of `value` — TODO confirm
    z = FloatField()
    # presumably a percentile of `value` — TODO confirm
    perc = FloatField()

NameError: name 'Vector' is not defined

## Init

In [None]:
# Create one table per model inside a single transaction.
# NOTE(review): `Embedding`, `Vector` and `WordVector` are not defined by any
# executed cell visible in this notebook (the embedding model is declared as
# `WordEmbedding`), so this cell presumably raises NameError as written — confirm.
models =[Corpus,Period,Embedding,Vector,WordVector,Word,VectorDistance]
with db.atomic() as tx:
    #db.drop_tables(models)
    db.create_tables(models)

## Ingest

In [None]:
def ingest(cls, data_iter):
    """Insert every record of ``data_iter`` as a row of ``cls``, skipping conflicts.

    Args:
        cls: peewee model class to insert into.
        data_iter: iterable of dicts; each dict is passed to ``cls.create``
            as keyword arguments.

    Records that violate a constraint (``IntegrityError``, e.g. a duplicate
    value in a unique column) are skipped. All other records are committed
    together when the outer transaction exits.
    """
    with db.atomic():
        for record in data_iter:
            try:
                # A nested atomic() is a savepoint, so a failed insert is
                # undone on its own. The previous db.rollback() rolled back
                # the whole outer transaction, discarding every row that had
                # been inserted before the conflicting one.
                with db.atomic():
                    cls.create(**record)
            except IntegrityError:
                # The savepoint has already been rolled back; move on.
                continue

In [None]:

# Seed the Corpus table with the two corpora used in this notebook.
ingest(
    Corpus,
    [
        {'corpus': 'bpo', 'name': 'BPO'},
        {'corpus': 'ecco', 'name': 'ECCO'},
    ],
)

### Ingest words

In [None]:
# Words already stored in the database, so the next cell can drop them from the
# frame before ingesting. The Word model's text column is `word`; it has no
# `name` attribute, so the previous `w.name` raised AttributeError.
done={w.word for w in Word.select()}

In [None]:
# Most-frequent-word table with the index turned back into ordinary columns.
idf = get_corpus().mfw_df().reset_index()
# Flag rows whose `pos0` value is one of 'n', 'v', 'j'.
idf['is_valid'] = idf['pos0'].isin({'n', 'v', 'j'})
# Keep only the words that are not in the database yet.
idf = idf.loc[~idf['word'].isin(done)]

In [None]:
# Insert one Word row per remaining row of the frame.
ingest(Word, idf.to_dict(orient='records'))

## Ingest periods


In [None]:
# Period lengths to keep; the next cell filters the model table's `period_len`
# column against this set (presumably lengths in years — TODO confirm).
period_lens={5,20,70}

In [None]:
# Models
dfmodels = get_pathdf_models(period_len=None)
dfmodels = dfmodels[dfmodels.period_len.isin(period_lens)]
dfmodels

### Periods

In [None]:
def ingest_periods(dfmodels):
    """Insert one Period row per distinct period found in ``dfmodels``.

    Reads the ``period``, ``period_len``, ``period_start`` and ``period_end``
    columns, removes duplicate rows, and maps the last three onto the Period
    fields ``ybin``, ``ymin`` and ``ymax``. Returns whatever ``ingest`` returns.
    """
    source_columns = ['period', 'period_len', 'period_start', 'period_end']
    field_names = {
        'period_len': 'ybin',
        'period_start': 'ymin',
        'period_end': 'ymax',
    }
    unique_periods = dfmodels[source_columns].drop_duplicates()
    records = unique_periods.rename(columns=field_names).to_dict('records')
    return ingest(Period, records)

In [None]:
# Populate the Period table from the filtered model table.
ingest_periods(dfmodels)

In [None]:
# PATH_MODELS

## Ingest embeddings


```python
class Embedding(BaseModel):
    path = CharField(unique=True)
    period = ForeignKeyField(Period)
    run = IntegerField()
    corpus = ForeignKeyField(Corpus)
```

In [None]:
def model_to_vectors(mpath):
    """Load the model stored at ``mpath`` and return its vectors as a DataFrame.

    The frame has one row per entry of the model's ``wv.vectors``; each row is
    labelled with the key that ``wv.index_to_key`` holds at the same position.
    """
    keyed = load_model(mpath).wv
    vectors = keyed.vectors
    labels = [keyed.index_to_key[position] for position in range(len(vectors))]
    return pd.DataFrame(vectors, index=labels)

In [None]:
# model_to_vectors(dfmodels.iloc[0].path)

In [None]:
def iter_ingest_models(dfmodels):
    """Yield one record dict per row of ``dfmodels``, ready to pass to ``ingest``.

    Each dict has the keys ``path`` (model path relative to ``PATH_MODELS``),
    ``period`` and ``corpus`` (rows looked up in the database), ``run`` (the
    integer after the last ``'_'`` of ``row.run``) and ``df`` (the vectors
    DataFrame built by ``model_to_vectors``).
    """
    for _, model_row in dfmodels.iterrows():
        record = {}
        record['path'] = os.path.relpath(model_row.path, PATH_MODELS)
        record['period'] = Period.get(Period.period == model_row.period)
        record['run'] = int(model_row.run.rsplit('_', 1)[-1])
        record['corpus'] = Corpus.get(Corpus.corpus == model_row.corpus)
        # NOTE(review): `df` is a pandas DataFrame here, while the model's
        # `df` column is a BlobField; it presumably needs to be serialized to
        # bytes before it can be stored — confirm.
        record['df'] = model_to_vectors(model_row.path)
        yield record

In [None]:
# Smoke test: build and display only the first record the generator yields.
next(iter_ingest_models(dfmodels))

In [None]:
# Insert one embedding row per model listed in `dfmodels`.
# NOTE(review): `Embedding` is not defined by any executed cell visible in this
# notebook (the model class is declared as `WordEmbedding`) — confirm the name.
ingest(Embedding, iter_ingest_models(dfmodels))

In [None]:
# NOTE(review): `stop` is not assigned anywhere in the visible cells, so this
# presumably raises NameError on purpose to halt a "Run All" here — confirm.
stop

### Vecs

```python
class WordVector(Vector):
    word = ForeignKeyField(Word,backref='vectors')
    model = ForeignKeyField(Embedding,backref='vectors')
    vector = ArrayField(field_class=FloatField, index=False)
```

In [None]:
class 

In [None]:
def ingest_vecs_from_model(row):
    """Yield one word-vector record for every known word in one embedding model.

    Args:
        row: a row of the model table; its ``path`` attribute is the model
            file to load.

    Yields:
        dicts with the keys ``word`` (the Word row), ``model`` (the Embedding
        row whose ``path`` is the model path relative to ``PATH_MODELS``) and
        ``vector`` (a float array holding the word's vector). Words that have
        no Word row are skipped.

    Raises:
        Embedding.DoesNotExist: if no Embedding row is stored for this model.

    This is a generator, so the ``db.atomic()`` block stays open until the
    consumer has exhausted (or closed) it.
    """
    print(row)
    m=load_model(row.path)
    with db.atomic():
        # Fetch all Word rows once and index them by text. The earlier version
        # kept only the strings and re-queried Word for every single word.
        word_rows={w.word: w for w in Word.select()}
        pathid=os.path.relpath(row.path, PATH_MODELS)
        # The embedding row is the same for every word; look it up once
        # instead of once per yielded record.
        embedding=Embedding.get(Embedding.path==pathid)
        for word,index in m.wv.key_to_index.items():
            word_row=word_rows.get(word)
            if word_row is None:
                continue
            vec=m.wv.vectors[index]
            yield dict(
                word=word_row,
                model=embedding,
                vector=np.array([float(x) for x in vec]),
            )

In [None]:
# next(ingest_vecs_from_model(dfmodels.iloc[0]))

In [None]:
def iter_ingest_vecs(dfmodels):
    """Ingest word vectors for the models listed in ``dfmodels``.

    Every row of ``dfmodels`` is handed to ``ingest_vecs_from_model`` through
    ``pmap_iter`` (with ``num_proc=1``), and each result is passed to
    ``ingest`` as WordVector records. Returns None.
    """
    rows=[row for _,row in dfmodels.iterrows()]
    for records in pmap_iter(
        ingest_vecs_from_model,
        rows,
        num_proc=1
    ):
        ingest(WordVector, records)
        # NOTE(review): this stops after the first model; it looks like a
        # leftover debugging limit — confirm before relying on a full ingest.
        break

In [None]:
# iter_ingest_vecs(dfmodels)

In [None]:

# with db.atomic():
#     x=WordVector.get(
#         WordVector.word==Word.get(Word.word=='virtue')
#     )
#     print(x)

In [None]:
def get_vecs_from_db(period, run=1):
    """Return the stored word vectors of one period and run as a DataFrame.

    Args:
        period: period label, matched against ``Period.period``.
        run: run number of the embedding to read (default 1).

    Returns:
        A DataFrame built from a ``{word: vector}`` mapping, i.e. one column
        per word.
    """
    with db.atomic():
        joined = WordVector.select().join(Embedding)
        wanted_run = Embedding.run == run
        wanted_period = Embedding.period == Period.get(Period.period == period)
        matches = joined.where(wanted_run & wanted_period)
        columns = {}
        for entry in tqdm(matches):
            columns[entry.word.word] = entry.vector
        return pd.DataFrame(columns)

In [None]:
# Example query: vectors of the period labelled '1720-1740', default run.
get_vecs_from_db('1720-1740')

In [None]:
# NOTE(review): `vec` is only bound inside `get_vecs_from_db`, not at notebook
# level in the visible cells, so this presumably raises NameError — confirm.
vec.word.word