# Semantic fields

In [4]:
from ipynb.fs.full.koselleck import *

## Words to use everywhere

In [5]:
STOPWORDS = None

def get_stopwords(ifn=FN_STOPWORDS):
    """Return the stopword set, reading it from `ifn` on first use only.

    The file's text is tokenized with `tokenize_fast` and stored as a set in
    the module-level `STOPWORDS`; every later call returns that cached set,
    whatever `ifn` is passed.
    """
    global STOPWORDS
    if STOPWORDS is not None:
        return STOPWORDS
    with open(ifn) as fh:
        raw_text = fh.read()
    STOPWORDS = set(tokenize_fast(raw_text))
    return STOPWORDS

In [11]:
# %%timeit
# get_stopwords()

In [10]:
stops=get_stopwords()
# random.sample() requires a sequence (sets raise TypeError on Python 3.11+);
# sorting first also gives the population a deterministic order.
len(stops),random.sample(sorted(stops),10)

(1929,
 ['downwards',
  'ei',
  "here's",
  'indicated',
  'thousandths',
  'plus',
  'crikey',
  'nevertheless',
  'mid-thirties',
  'fifty-fifty'])

In [8]:
VALIDWORDS={}

def get_valid_words(only_pos={'n*','j*','v*'},max_rank=25000,force=False,lim=None,remove_stopwords=True):
    """Return the list of valid words for a set of part-of-speech tags.

    Lookup order: the in-memory `VALIDWORDS` cache, then a word-list file
    derived from `FN_WORDS`, and finally a rebuild from the corpus (which
    also rewrites the file).

    Args:
        only_pos: collection of POS tags passed to `mfw_df`. A falsy value
            (None or empty) reads/writes `FN_WORDS` itself.
        max_rank: upper bound applied to `ranks_avg` when rebuilding from
            the corpus.
        force: if true, skip both the in-memory cache and the file cache and
            rebuild from the corpus.
        lim: if given, truncate the returned list to this many words.
        remove_stopwords: if true, drop words found in `get_stopwords()`.

    Returns:
        A new list of words (stopword-filtered and truncated as requested).

    Note:
        Both caches are keyed by the POS tags only. `max_rank` is not part
        of the key, so a different `max_rank` takes effect only with
        `force=True`.
    """
    global VALIDWORDS
    # Tolerate only_pos=None / empty: the file-name logic below already
    # treats a falsy only_pos as "use FN_WORDS unchanged".
    posstr='-'.join(sorted(only_pos)) if only_pos else ''
    key=posstr
    # Honour `force` for the in-memory cache too, not just the file cache.
    if key in VALIDWORDS and not force:
        words=VALIDWORDS[key]
    else:
        fnfn=FN_WORDS.replace('.txt',f'.{posstr}.txt') if only_pos else FN_WORDS
        if not force and os.path.exists(fnfn):
            with open(fnfn) as f:
                words=tokenize_fast(f.read())
        else:
            C=get_corpus()
            mfwdf=C.mfw_df(only_pos=only_pos).query(f'ranks_avg<={max_rank}')
            words = list(mfwdf.index)
            with open(fnfn,'w') as of:
                of.write('\n'.join(words))
        VALIDWORDS[key]=words
    if remove_stopwords:
        stops=get_stopwords()
        words=[w for w in words if w not in stops]

    # Slicing always yields a fresh list, so the cached list is never exposed.
    return words[:lim]

In [12]:
# %%timeit
# get_valid_words()

In [10]:
def get_valid_word(**get_valid_words_opts):
    """Return one randomly chosen word from `get_valid_words(**get_valid_words_opts)`."""
    candidates = get_valid_words(**get_valid_words_opts)
    return random.choice(candidates)

In [16]:
# get_valid_word()

In [7]:
# Default valid-word list: its size and the first ten entries
words = get_valid_words(force=False)
(len(words), words[:10])

(15661,
 ['long',
  'lie',
  'ill',
  'house',
  'young',
  'tile',
  'church',
  'general',
  'english',
  'hand'])

### All nouns?

In [8]:
def get_all_nouns(**y):
    """Return `get_valid_words` restricted to the POS tags 'nn1' and 'nn2'.

    Any extra keyword arguments are forwarded to `get_valid_words`.
    """
    noun_tags = {'nn1', 'nn2'}
    return get_valid_words(only_pos=noun_tags, **y)

In [9]:
def get_all_nouns_adjs(**y):
    """Return `get_valid_words` restricted to the POS tags 'nn1', 'nn2' and 'jj'.

    Any extra keyword arguments are forwarded to `get_valid_words`.
    """
    noun_adj_tags = {'nn1', 'nn2', 'jj'}
    return get_valid_words(only_pos=noun_adj_tags, **y)

In [13]:
# Noun list: size, a random sample of ten, and whether 'labor' is a valid word
allnouns = get_all_nouns()
(len(allnouns), random.sample(allnouns, 10), 'labor' in get_valid_words())

(8774,
 ['reservation',
  'wrath',
  'distinctions',
  'dialogue',
  'inns',
  'lavender',
  'abstinence',
  'entreaties',
  'procession',
  'arrangements'],
 False)

In [11]:
# Nouns plus adjectives: size and the first ten entries
allwords = get_all_nouns_adjs()
(len(allwords), allwords[:10])

(11819,
 ['long',
  'ill',
  'house',
  'young',
  'tile',
  'church',
  'general',
  'english',
  'hand',
  'public'])

### Top nouns

In [11]:
def get_top_nouns():
    """Return the first 2000 entries of `get_all_nouns()`."""
    top_n = 2000
    return get_all_nouns(lim=top_n)

In [12]:
topnouns = set(get_all_nouns())
# random.sample() requires a sequence (sets raise TypeError on Python 3.11+);
# sorting first also gives the population a deterministic order.
len(topnouns),random.sample(sorted(topnouns),10)

(8775,
 ['views',
  'maxims',
  'uniforms',
  'lesson',
  'corporations',
  'property',
  'acknowledgments',
  'rate',
  'crab',
  'tine'])

In [13]:
# Check whether a few sample keywords are present in topnouns
for kw in ['culture','progress','value','liberty','sensibility']:
    # The "not in" branch previously referenced an undefined name (`fk`),
    # which raised NameError as soon as any keyword was missing.
    print(f'{kw} is in topnouns' if kw in topnouns else f'{kw} is not in topnouns')

[Koselleck] (17:22:23) culture is in topnouns (+5.2s)
[Koselleck] (17:22:23) progress is in topnouns (+0.0s)
[Koselleck] (17:22:23) value is in topnouns (+0.0s)
[Koselleck] (17:22:23) liberty is in topnouns (+0.0s)
[Koselleck] (17:22:23) sensibility is in topnouns (+0.0s)


## Keywords

In [14]:
def get_keywords_df(url=URL_KEYWORDS,just_words=False):
    """Read the keywords table from `url` and index it by lower-cased word.

    Missing values are replaced with empty strings. Rows whose word is '?'
    or empty are kept. `just_words` is accepted but not used.
    """
    keywords = pd.read_csv(url).fillna('')
    keywords['word'] = [w.lower() for w in keywords['word']]
    # keywords = keywords[~keywords.word.isin({'?',''})]
    return keywords.set_index('word')

In [15]:
# Load the keywords table and display it
dfkw = get_keywords_df()
dfkw

Unnamed: 0_level_0,changes,explanation,in_author,process,change_type,from,toward,when,use,note_transl,Word,Wort
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
?,,,Koselleck,,,,,,,Separation of powers,?,Gewaltenteilung
?,,,Koselleck,,,,,,,,?,Stand
?,,,Koselleck,,,,,,,,?,Völkerrecht
?,,,Koselleck,,,,,,,,?,politischer Körper
?,,,Koselleck,,,,,,,,?,Obrigkeit
...,...,...,...,...,...,...,...,...,...,...,...,...
western,,,Williams,,,,,,,,Western,
work,Man-- Interp++ Freq+ Sing+ Virtue+ Pos+ Perc- Acad+ Active+,,Williams,,,,,,,,Work,
work,Man-- Interp++ Freq+ Sing+ Virtue+ Pos+ Perc- Acad+ Active+,,Koselleck,,,,,,,,Work,Arbeit
world,Interp-- Sing+ Freq+ Abs-,,Koselleck,,,,,,,,World,Welt


In [16]:
# Keywords as a set of plain single-token words
def get_keywords():
    """Return the set of index entries of `get_keywords_df()` that are
    non-empty, start with an alphabetic character and contain no space."""
    keywords = set()
    for entry in get_keywords_df().index:
        if entry and entry[0].isalpha() and ' ' not in entry:
            keywords.add(entry)
    return keywords

# Alias for the same function
get_keywords_l = get_keywords

In [17]:
kwl=get_keywords()
# random.sample() requires a sequence (sets raise TypeError on Python 3.11+);
# sorting first also gives the population a deterministic order.
len(kwl),random.sample(sorted(kwl),10)

(213,
 ['fraternity',
  'honour',
  'self-sufficiency',
  'myth',
  'balance',
  'genetic',
  'evolution',
  'liberty',
  'law',
  'formalist'])

In [18]:
# Count how many keywords are (True) / are not (False) in topnouns
Counter(kw in topnouns for kw in kwl)

Counter({True: 123, False: 90})

## Fields

In [19]:
def get_origfields():
    """Unpickle and return the object stored in `FN_ORIGFIELDS`."""
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this on a trusted FN_ORIGFIELDS.
    with open(FN_ORIGFIELDS, 'rb') as fh:
        loaded = pickle.load(fh)
    return loaded

In [20]:
origfields=get_origfields()
# random.sample() requires a sequence; dict.keys() is a set-like view that
# raises TypeError on Python 3.11+, so materialise the keys as a list first.
len(origfields),random.sample(list(origfields),10)

(24,
 ['Abs-Conc.MT-Conc.Abs.orig',
  'Abs-Conc.PAV-Conc.Abs.orig',
  'Abs-Conc.PAV-Conc.Conc.orig',
  'Abs-Conc.LSN-Hapt.Neither.orig',
  'Abs-Conc.LSN-Imag.Abs.orig',
  'Abs-Conc.PAV-Conc.Neither.orig',
  'Abs-Conc.Median.Conc.orig',
  'Abs-Conc.Median.Abs.orig',
  'Abs-Conc.Median.Neither.orig',
  'Abs-Conc.MRC-Imag.Neither.orig'])

In [21]:
def get_fields():
    """Return the JSON mapping stored in `FN_FIELDS` merged with
    `get_origfields()`; on duplicate keys the `get_origfields()` value wins."""
    with open(FN_FIELDS) as fh:
        json_fields = json.load(fh)
    pickled_fields = get_origfields()
    return {**json_fields, **pickled_fields}

In [22]:
fields=get_fields()
# random.sample() requires a sequence; dict.keys() is a set-like view that
# raises TypeError on Python 3.11+, so materialise the keys as a list first.
len(fields),random.sample(list(fields),10)

(316,
 ['RH.Locke.Specific',
  'HGI.Vice',
  'LSN.Perceptual.Concrete',
  'HGI.Feel',
  'HGI.IAV',
  'HGI.Female',
  'HGI.FREQ',
  'HGI.EnlGain',
  'Abs-Conc.MRC-Conc.Conc.orig',
  'HGI.RspGain'])

## Contrasts

In [23]:
def get_origcontrasts():
    """Return `read_df` applied to 'data.origcontrasts.pkl' inside `PATH_DATA`."""
    contrasts_path = os.path.join(PATH_DATA, 'data.origcontrasts.pkl')
    return read_df(contrasts_path)

In [28]:
# 'pos' entry of the first row of the original contrasts table
get_origcontrasts().iloc[0]['pos']

{'lymph',
 'bloating',
 'hollowness',
 'mangy',
 'diving',
 'fleecy',
 'hairstyle',
 'corrugated',
 'leafy',
 'attach',
 'cutting',
 'marina',
 'cigarette',
 'hot',
 'lollipop',
 'stair',
 'cuticle',
 'sparerib',
 'lengthy',
 'flytrap',
 'crushed',
 'snatched',
 'certificate',
 'oral',
 'livestock',
 'puff',
 'sizes',
 'booty',
 'folded',
 'rusted',
 'newsletter',
 'earthliness',
 'samba',
 'absorbency',
 'humpback',
 'unhardened',
 'android',
 'stopwatch',
 'granulation',
 'sandbag',
 'incision',
 'hearthside',
 'fornication',
 'cone',
 'grapefruit',
 'grandson',
 'sled',
 'inflated',
 'aspirin',
 'snakeskin',
 'marauding',
 'barelegged',
 'calcification',
 'zipping',
 'tomato',
 'weeds',
 'porkpie',
 'scarred',
 'luscious',
 'neck',
 'blanketed',
 'slithery',
 'strings',
 'crocheting',
 'unshelled',
 'wee',
 'stereo',
 'hobby',
 'fanlight',
 'scruff',
 'house',
 'comfortable',
 'poppy',
 'centigrade',
 'chili',
 'uncleanly',
 'splint',
 'kit',
 'cornflower',
 'purse',
 'twist',
 'doo