In [1]:
import pandas as pd
import time
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool
import csv
import fastparquet
import spacy
import fasttext
import glob, os
import re
from os.path import isfile

In [14]:
fmodel = fasttext.load_model('/data/dharp/packages/lid.176.bin')
nlp = spacy.load('en_core_web_sm')



def sent_maker(sent_lst):
    ret_sents=[]
    for sent in sent_lst:
        cur_words=[]
        pos_sent=[]
        for word_pos in sent.split(' '):
            word,*pos=word_pos.rsplit('_',1)
            cur_words.append(word)
            cur_sent=' '.join(cur_words)
            cur_sent=re.sub('_', '', cur_sent)
        ret_sents.append(cur_sent)
    return ret_sents

nn='(?!NOUN).*'
comp='NOUN\sNOUN'
word='.*'

n1=f'^{comp}\s{nn}\s{comp}$'
n2=f'^{comp}\s{nn}\s{word}\s{word}$'
n3=f'^{nn}\s{comp}\s{nn}\s{word}$'
n4=f'^{word}\s{nn}\s{comp}\s{nn}$'
n5=f'^{word}\s{word}\s{nn}\s{comp}$'


def delist_lang(lst):
    lang_lst=[]
    for i,lang in enumerate(lst):
        if not lang:
            lang_lst.append(None)
        else:
            lang_lst.append(lang[0])
    return lang_lst


def significance(lst):
    significance_list=[]
    for l in lst:
        if len(l)>1:
            significance_list.append(abs(l[0]-l[1])/np.mean(l[0]+l[1])>0.1)
            #print(f'{conf[0]} {conf[1]} {abs(conf[0]-conf[1])/np.mean(conf[0]+conf[1])>0.1}')
        else:
            significance_list.append(True)
    return significance_list

In [3]:
def decader(x):
    return(x -x%10)
decades=[2000, 1990, 1980, 1970, 1960, 1950, 1940, 1930, 1920, 1900, 1910,
            1890, 1880, 1870, 1850, 1860, 1840, 1830, 1820, 1810, 1800]

In [4]:
def ner_lemma_reducer(sent):
    ner_sent=[]
    lemma=[]
    pos=[]
    parse=[]
    ner=[]
    parsed_sent=nlp(sent)
    for token in parsed_sent:
        parse.append(token.text)
        lemma.append(token.lemma_)
        pos.append(token.pos_)

    #print(parse)
    parse_sent=' '.join(parse)
    lemma_sent=' '.join(lemma)
    pos_sent=' '.join(pos)
    if parsed_sent.ents:
        for ent in parsed_sent.ents:
            cur_ner='_'.join([str(ent.start_char), str(ent.end_char), ent.label_])
            ner.append(cur_ner)
    else:
        ner.append('')
    ner_sent=' '.join(ner)
    
    return parse_sent,ner_sent,lemma_sent,pos_sent

In [5]:
def lang_tagger(parsed_sent):
    labels,confs=fmodel.predict(parsed_sent,k=-1,threshold=0.1)
    lang_list=delist_lang(labels)    
    significance_list=significance(confs)
    assert len(lang_list)==len(significance_list)
    return lang_list,significance_list

NOUN (nouns),
VERB (verbs), ADJ (adjectives), ADV (adverbs),
PRON (pronouns), DET (determiners and articles),
ADP (prepositions and postpositions), NUM (numerals), CONJ (conjunctions), PRT (particles), ‘.’
(punctuation marks) and X

In [6]:
unigrams="a b c d e f g h i j k l m n o other p pos punctuation q r s t u v w x y z"
unigram_list=unigrams.split()
np.random.shuffle(unigram_list)
num_cores=mp.cpu_count() -1

In [7]:
def trial(df):
    df.reset_index(inplace=True,drop=True)
    df['sent']=sent_maker(df.old_index)
    
    results=np.vectorize(ner_lemma_reducer)(df.sent.values)
    results_df=pd.DataFrame(results)
    results_df=results_df.transpose()
    results_df.columns=['parse_sent','ner_sent','lemma_sent','pos_sent']

    index_df=pd.concat([df,results_df],axis=1,ignore_index=False)
    lang_list,significance_list=lang_tagger(index_df.parse_sent.values.tolist())
    
    index_df['lang']=lang_list
    index_df['lang_conf']=significance_list
    index_df.lang=index_df.lang.str.split('_',n=4).str[-1]
    index_df=index_df.loc[(index_df.lang=='en') &(index_df.lang_conf==True)]
    index_df['nwords']=index_df.pos_sent.str.count(' ').add(1)
    index_df=index_df.loc[index_df.nwords==5]
    index_df.lemma_sent=index_df.lemma_sent.str.lower()
    index_df.pos_sent=index_df.pos_sent.str.replace('PROPN','NOUN')
    index_df['comp_class']=0

    index_df.loc[index_df.pos_sent.str.contains(n1),'comp_class']=1
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n2),'comp_class']=2
    index_df.loc[index_df.pos_sent.str.contains(n3),'comp_class']=3
    index_df.loc[index_df.pos_sent.str.contains(n4),'comp_class']=4
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n5),'comp_class']=5

    index_df.drop(['parse_sent','sent','lang','lang_conf','nwords'],axis=1,inplace=True)
    #print(index_df)
    return index_df

In [38]:
path_loc="http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-j.gz"
df   = pd.read_csv(path_loc, compression='gzip', header=None, sep="\t", quoting=csv.QUOTE_NONE,usecols=[0,1,2])    
df.columns=['fivegram_pos','year','count']
df=df.loc[df.year>=1800]

index_df=df.groupby(['fivegram_pos'])['count'].sum().reset_index()
index_df.columns=['old_index','total_count']

In [39]:
df=index_df.head(100_000).copy()

In [40]:
    df.reset_index(inplace=True,drop=True)
    df['sent']=sent_maker(df.old_index)
    
    results=np.vectorize(ner_lemma_reducer)(df.sent.values)
    results_df=pd.DataFrame(results)
    results_df=results_df.transpose()
    results_df.columns=['parse_sent','ner_sent','lemma_sent','pos_sent']

In [41]:
    index_df=pd.concat([df,results_df],axis=1,ignore_index=False)
    lang_list,significance_list=lang_tagger(index_df.parse_sent.values.tolist())
    
    index_df['lang']=lang_list
    index_df['lang_conf']=significance_list

In [49]:
contexts=pd.read_pickle('/data/dharp/compounds/Compounding/data/contexts.pkl')

['be_verb',
 'have_verb',
 'not_adv',
 'do_verb',
 'can_verb',
 'may_verb',
 'when_adv',
 'make_verb',
 'would_verb',
 'will_verb',
 'oth_adj',
 'time_noun',
 'see_verb',
 'say_verb',
 'so_adv',
 'man_noun',
 'take_verb',
 'also_adv',
 'such_adj',
 'use_verb',
 'give_verb',
 'only_adv',
 'more_adv',
 'go_verb',
 'could_verb',
 'year_noun',
 'state_noun',
 'then_adv',
 'know_verb',
 'should_verb',
 'come_verb',
 'find_verb',
 'first_adj',
 'great_adj',
 'many_adj',
 'as_adv',
 'good_adj',
 'people_noun',
 'now_adv',
 'where_adv',
 'way_noun',
 'even_adv',
 'very_adv',
 'must_verb',
 'life_noun',
 'day_noun',
 'new_adj',
 'more_adj',
 'how_adv',
 'same_adj',
 'part_noun',
 'case_noun',
 "'_verb",
 'work_noun',
 'get_verb',
 'become_verb',
 'think_verb',
 'system_noun',
 'most_adv',
 'high_adj',
 'well_adv',
 'child_noun',
 'show_verb',
 'however_adv',
 'new_noun',
 'own_adj',
 'number_noun',
 'world_noun',
 'woman_noun',
 'large_adj',
 'call_verb',
 'long_adj',
 'just_adv',
 'power_noun'

In [25]:



    index_df.lang=index_df.lang.str.split('_',n=4).str[-1]
    index_df=index_df.loc[(index_df.lang=='en') &(index_df.lang_conf==True)]
    index_df['nwords']=index_df.pos_sent.str.count(' ').add(1)
    index_df=index_df.loc[index_df.nwords==5]
    index_df.lemma_sent=index_df.lemma_sent.str.lower()
    index_df.pos_sent=index_df.pos_sent.str.replace('PROPN','NOUN')
    index_df['comp_class']=0

    index_df.loc[index_df.pos_sent.str.contains(n1),'comp_class']=1
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n2),'comp_class']=2
    index_df.loc[index_df.pos_sent.str.contains(n3),'comp_class']=3
    index_df.loc[index_df.pos_sent.str.contains(n4),'comp_class']=4
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n5),'comp_class']=5

Unnamed: 0,old_index,total_count,ner_sent,lemma_sent,pos_sent,comp_class
1244,J-g/kg,98,,j - g / kg,NOUN PUNCT NOUN SYM NOUN,0
1245,J-g/kg_NOUN,77,,j - g / kg,NOUN PUNCT NOUN SYM NOUN,0
1246,J-g/ml,117,0_3_PRODUCT,j - g / ml,NOUN PUNCT NOUN SYM NOUN,0
1247,J-g/ml_NOUN,100,0_3_PRODUCT,j - g / ml,NOUN PUNCT NOUN SYM NOUN,0
2577,J.Biomed.Mater.Res,145,,j.biomed . mater . res,NOUN PUNCT NOUN PUNCT NOUN,0
2578,J.Biomed.Mater.Res_NOUN,144,,j.biomed . mater . res,NOUN PUNCT NOUN PUNCT NOUN,0
3872,J.Fish.Res.Board,204,7_10_PERSON,j.fish . res . board,NOUN PUNCT NOUN PUNCT NOUN,0
6545,J.Pharmacol.Exp.Ther,94,12_15_PERSON,j.pharmacol . exp . ther,NOUN PUNCT NOUN PUNCT ADJ,0
7807,J.Vac.Sci.Tech,62,0_5_PERSON 6_9_PERSON,j.vac . sci . tech,NOUN PUNCT NOUN PUNCT NOUN,0
7808,J.Vac.Sci.Tech_NOUN,62,0_5_PERSON 6_9_PERSON,j.vac . sci . tech,NOUN PUNCT NOUN PUNCT NOUN,0


In [18]:
context_list=[]
pool = Pool(num_cores)
#print("Started with letter "+str(unigram_list))
for temp_contexts in pool.imap_unordered(dataset_extracter,unigram_list):
    context_list.append(temp_contexts)
        
pool.close()
pool.join()
contexts = pd.concat(context_list,ignore_index=True,sort=False)
contexts.decade=contexts.decade.astype("int32")
contexts_df=contexts.pivot_table(values='count',columns='decade',index='context',aggfunc=np.sum)

Started with letter(s) e
Started with letter(s) j
Started with letter(s) y
Started with letter(s) i
Started with letter(s) m
Started with letter(s) x
Started with letter(s) w
Started with letter(s) g
Started with letter(s) f
Started with letter(s) q
Started with letter(s) d
Started with letter(s) pos
Started with letter(s) r
Started with letter(s) b
Started with letter(s) punctuation
Started with letter(s) l
Started with letter(s) h
Started with letter(s) c
Started with letter(s) k
Started with letter(s) p
Started with letter(s) n
Started with letter(s) v
Started with letter(s) o
Started with letter(s) s
Started with letter(s) a
Started with letter(s) z
Started with letter(s) t
Started with letter(s) u
Started with letter(s) other
Finished with letter(s) pos ; Before : 4250, After : 0 Change in percentage : 100.00%
Letter(s) pos took time 0.24 seconds


Finished with letter(s) other ; Before : 97674, After : 0 Change in percentage : 100.00%
Letter(s) other took time 0.36 seconds


Fini

In [20]:
contexts_df

decade,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,...,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000
context,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'_adv,103972.0,125832.0,186022.0,240938.0,223659.0,345615.0,399334.0,633392.0,816868.0,901098.0,...,820092.0,854196.0,884490.0,750755.0,1402187.0,2833805.0,3900740.0,5487186.0,8571111.0,14614001.0
'_noun,19951.0,30788.0,43962.0,73431.0,139412.0,255480.0,206652.0,233312.0,255974.0,258352.0,...,201149.0,179398.0,167217.0,211614.0,343942.0,699959.0,842184.0,937638.0,1231765.0,1692352.0
'_verb,233902.0,287827.0,385176.0,478464.0,571687.0,909050.0,1085939.0,1435716.0,1969267.0,2915489.0,...,4965545.0,4709221.0,4917721.0,4969736.0,7490454.0,12289013.0,20162725.0,36118111.0,68141327.0,127572458.0
'c_noun,,,,,,,,,2.0,1.0,...,,2.0,2.0,1.0,2.0,6.0,9.0,11.0,26.0,67.0
'd_noun,1.0,,1.0,4.0,3.0,2.0,5.0,8.0,7.0,6.0,...,23.0,34.0,22.0,34.0,53.0,103.0,267.0,400.0,1084.0,2112.0
'h_noun,1.0,,,1.0,,,,,,,...,1.0,,2.0,3.0,2.0,1.0,11.0,10.0,5.0,10.0
'i_noun,,,,,,1.0,,,1.0,,...,2.0,1.0,,3.0,1.0,2.0,6.0,6.0,10.0,15.0
'j_noun,,1.0,,,,2.0,,,1.0,,...,1.0,1.0,3.0,2.0,2.0,5.0,9.0,15.0,11.0,24.0
'll_adj,3.0,,,,,2.0,1.0,,2.0,1.0,...,3.0,9.0,,9.0,12.0,21.0,29.0,73.0,114.0,188.0
'll_noun,,3.0,,,3.0,3.0,,3.0,6.0,8.0,...,48.0,37.0,54.0,92.0,90.0,158.0,408.0,628.0,1069.0,1833.0


In [22]:
total_freq_all=contexts_df.sum(axis=1)
total_freq_no_2000=contexts_df.drop(2000,axis=1).sum(axis=1)
all_decade_freq=contexts_df.dropna().sum(axis=1)

all_decade_contents=contexts_df[contexts_df>10]
all_decade_free_min_10=all_decade_contents.dropna().sum(axis=1)

In [23]:
top_total_freq_all=total_freq_all.sort_values(ascending=False).head(50_000).index.tolist()
top_total_freq_no_2000=total_freq_no_2000.sort_values(ascending=False).head(50_000).index.tolist()
top_all_decade_freq=all_decade_freq.sort_values(ascending=False).head(50_000).index.tolist()
top_all_decade_free_min_10=all_decade_free_min_10.sort_values(ascending=False).head(50_000).index.tolist()

In [24]:
len(set(top_total_freq_all).intersection(top_total_freq_no_2000))

48218

In [25]:
len(set(top_all_decade_freq).intersection(top_all_decade_free_min_10))

44488

In [29]:
len(set(top_total_freq_all).intersection(top_all_decade_free_min_10))

39664

In [21]:
chosen_context=contexts.sort_values('count',ascending=False).head(50_000)
chosen_context.context["of"]

KeyError: 'of'

In [32]:
top_all_decade_free_min_10[:100]

['be_verb',
 'have_verb',
 'not_adv',
 'do_verb',
 'can_verb',
 'may_verb',
 'when_adv',
 'make_verb',
 'would_verb',
 'will_verb',
 'oth_adj',
 'time_noun',
 'see_verb',
 'say_verb',
 'so_adv',
 'man_noun',
 'take_verb',
 'also_adv',
 'such_adj',
 'use_verb',
 'give_verb',
 'only_adv',
 'more_adv',
 'go_verb',
 'could_verb',
 'year_noun',
 'state_noun',
 'then_adv',
 'know_verb',
 'should_verb',
 'come_verb',
 'find_verb',
 'first_adj',
 'great_adj',
 'many_adj',
 'as_adv',
 'good_adj',
 'people_noun',
 'now_adv',
 'where_adv',
 'way_noun',
 'even_adv',
 'very_adv',
 'must_verb',
 'life_noun',
 'day_noun',
 'new_adj',
 'more_adj',
 'how_adv',
 'same_adj',
 'part_noun',
 'case_noun',
 "'_verb",
 'work_noun',
 'get_verb',
 'become_verb',
 'think_verb',
 'system_noun',
 'most_adv',
 'high_adj',
 'well_adv',
 'child_noun',
 'show_verb',
 'however_adv',
 'new_noun',
 'own_adj',
 'number_noun',
 'world_noun',
 'woman_noun',
 'large_adj',
 'call_verb',
 'long_adj',
 'just_adv',
 'power_noun'

Four extra columns are added, which are:

$cf$ : Collection frequency, which is the log of the sum of the term across decades, i.e. log(sum(term).

$presence$ : Number of decades a term is present in.

$pattern$ : A binary representation of a word. 1 if the $word$ exists in a decade, 0 otherwise.

In [34]:
pkl.dump(top_all_decade_free_min_10,open( "contexts.pkl", "wb" ) )