In [1]:
import pandas as pd
import fasttext
import time
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool
import csv
import spacy
import re
from urllib.request import urlopen
import gzip
import shutil

In [2]:
to_save_path='/data/dharp/compounds/datasets/'
keep_string=r"(.+_(NOUN|ADV|VERB|ADJ|X|PRT|CONJ|PRON|DET|ADP|NUM|\.)|_END_)\s*"

word='.*'

nn='(?!(?:NOUN|PROPN)).*'
comp='(?:NOUN|PROPN)\s(?:NOUN|PROPN)'

ner_cats=['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
n1=f'^{comp}\s{nn}\s{comp}$'
n2=f'^{comp}\s{nn}\s{word}\s{word}$'
n3=f'^{nn}\s{comp}\s{nn}\s{word}$'
n4=f'^{word}\s{nn}\s{comp}\s{nn}$'
n5=f'^{word}\s{word}\s{nn}\s{comp}$'

In [3]:
fmodel = fasttext.load_model('/data/dharp/packages/lid.176.bin')
nlp = spacy.load('en_core_web_lg')




In [4]:
def delist_lang(lst):
    lang_lst=[]
    for i,lang in enumerate(lst):
        if not lang:
            lang_lst.append(None)
        else:
            lang_lst.append(lang[0])
    return lang_lst


def significance(lst):
    significance_list=[]
    for l in lst:
        if len(l)>1:
            significance_list.append(abs(l[0]-l[1])/np.mean(l[0]+l[1])>0.1)
            #print(f'{conf[0]} {conf[1]} {abs(conf[0]-conf[1])/np.mean(conf[0]+conf[1])>0.1}')
        else:
            significance_list.append(True)
    return significance_list

In [5]:
def sent_maker(sent_lst):
    ret_sents=[]
    g_pos=[]
    for sent in sent_lst:
        cur_words=[]
        pos_sent=[]
        sent=sent.replace('_END_','@@@_.')
        for word_pos in sent.split(' '):
            word,pos=word_pos.rsplit('_',1)
            cur_words.append(word)
            pos_sent.append(pos)
            cur_sent=' '.join(cur_words)
            cur_pos=' '.join(pos_sent)
        ret_sents.append(cur_sent)
        g_pos.append(cur_pos)
    return ret_sents,g_pos

In [6]:
def ner_lemma_reducer(sent):
    ner_sent=[]
    lemma=[]
    pos=[]
    #parse=[]
    is_comp=False
    comp_ner_type=[]
    parsed_sent=nlp(sent)
    for token in parsed_sent:
        #parse.append(token.text)
        lemma.append(token.lemma_)
        pos.append(token.pos_)
        #if token.ent_type_=="":
            #to_add="NONNER"
        #else:
            #to_add=token.ent_type_
        if token.dep_=="compound":
            is_comp=True
            if token.ent_type_!="":
                comp_ner_type.append(token.ent_type_)
    #print(parse)
    comp_ner_sent=' '.join(comp_ner_type)
    lemma_sent=' '.join(lemma)
    pos_sent=' '.join(pos)
    #ner_token_sent=' '.join(ner_token)
    #dep_sent=' '.join(dep)
    #ner_length=0
    #if parsed_sent.ents:
    #    for ent in parsed_sent.ents:
            #cur_ner=
            #cur_ner='_'.join([str(ent.start_char), str(ent.end_char), ent.label_])
    #        ner_length+=ent.end_char-ent.start_char
            #ner.append(cur_ner)
    #else:
        #ner.append("")
    #ner_sent=' '.join(ner)
    
    return lemma_sent,pos_sent,is_comp,comp_ner_sent

In [7]:
def lang_tagger(parsed_sent):
    labels,confs=fmodel.predict(parsed_sent,k=-1,threshold=0.1)
    lang_list=delist_lang(labels)    
    significance_list=significance(confs)
    assert len(lang_list)==len(significance_list)
    return lang_list,significance_list

In [8]:
def index_processor(df):
    df.reset_index(inplace=True,drop=True)
    ret_lst=sent_maker(df.old_index)
    
    df['sent']=ret_lst[0]
    df['g_pos']=ret_lst[1]
    
    results=np.vectorize(ner_lemma_reducer)(df.sent.values)
    results_df=pd.DataFrame(results)
    results_df=results_df.transpose()
    results_df.columns=['lemma_sent','pos_sent','num_comp','comp_ner_sent']

    #results_df=results_df.loc[~results_df.ner_token_sent.str.contains("PERSON PERSON")]

    index_df=pd.concat([df,results_df],axis=1,ignore_index=False)

    lang_list,significance_list=lang_tagger(index_df.sent.values.tolist())
    index_df['lang']=lang_list
    index_df['lang_conf']=significance_list
    index_df.lang=index_df.lang.str.split('_',n=4).str[-1]
    index_df=index_df.loc[(index_df.lang=='en') &(index_df.lang_conf==True)]

    index_df['nwords']=index_df.pos_sent.str.count(' ').add(1)
    index_df=index_df.loc[index_df.nwords==5]
    
    index_df.lemma_sent=index_df.lemma_sent.str.lower()
    #index_df.pos_sent=index_df.pos_sent.str.replace('PROPN','NOUN',regex=False)
    #index_df.pos_sent=index_df.pos_sent.str.replace('AUX','VERB',regex=False)
    #index_df.pos_sent=index_df.pos_sent.str.replace('CCONJ','CONJ',regex=False)
    #index_df.g_pos=index_df.g_pos.str.replace('.','PUNCT',regex=False)
    #index_df.g_pos=index_df.g_pos.str.replace('PRT','ADP',regex=False)
    if index_df.shape[0]==0:
        return pd.DataFrame()
    index_df['lemma_pos']=str_joiner(index_df)
    index_df['nX']=index_df.pos_sent.str.count('X')-index_df.pos_sent.str.count('AUX')
    index_df=index_df.loc[~(index_df.nX>1)]
    
    #index_df['ner_perc']=index_df.ner_length/index_df.sent.str.len()
   
    index_df['comp_class']=0

    index_df.loc[index_df.pos_sent.str.contains(n1),'comp_class']=1
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n2),'comp_class']=2
    index_df.loc[index_df.pos_sent.str.contains(n3),'comp_class']=3
    index_df.loc[index_df.pos_sent.str.contains(n4),'comp_class']=4
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n5),'comp_class']=5
    
    index_df.drop(['old_index','g_pos','lang','lang_conf','nwords','nX','lemma_sent'],axis=1,inplace=True)
    index_year_df=year_count_split(index_df)
    index_df=index_df.merge(index_year_df, on='lemma_pos',how='right')
    index_df['count']=index_df['count'].astype("int64")
    index_df['year']=index_df['year'].astype("int64")
    index_df=index_df.groupby(['lemma_pos','pos_sent','year','comp_class','num_comp','comp_ner_sent'])['count'].sum().to_frame().reset_index()
    return index_df

In [9]:
def year_count_split(df):
    trial_df=pd.concat([df.lemma_pos, df.year_counts.str.split("\t", expand=True)], axis=1)
    trial_df=pd.melt(trial_df, id_vars=["lemma_pos"], value_vars=list(range(len(trial_df.columns)-1))).dropna().drop("variable", axis = 1)
    trial_df[['year','count']] = trial_df.value.str.split(",", n=3, expand=True)[[0,1]]
    return trial_df.drop(['value'],axis=1).reset_index(drop=True)

In [10]:
def str_joiner(df):
    #print(df)
    new_df=pd.DataFrame()
    try:
        new_df[['l1','l2','l3','l4','l5']]=df.lemma_sent.str.split(" ",expand=True)
        new_df[['p1','p2','p3','p4','p5']]=df.pos_sent.str.split(" ",expand=True)
    except:
        return pd.DataFrame()
    new_df['lemma_pos']=new_df.l1+"_"+new_df.p1+" "+\
                        new_df.l2+"_"+new_df.p2+" "+\
                        new_df.l3+"_"+new_df.p3+" "+\
                        new_df.l4+"_"+new_df.p4+" "+\
                        new_df.l5+"_"+new_df.p5
    return new_df['lemma_pos']

In [11]:
fname='5-01700-of-19423'
lnk=f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/{fname}.gz'
#index_df   = pd.read_csv(lnk, compression='gzip', squeeze=True, header=None,quoting=csv.QUOTE_NONE)   

In [24]:
import io
import requests

# defining the url
url = "https://data.brasil.io/dataset/covid19/caso_full.csv.gz"
response = requests.get(url)
content = response.content
print(type(content))

#compressed_file = io.BytesIO(response.read())
#decompressed_file = gzip.GzipFile(fileobj=compressed_file)

<class 'bytes'>


In [25]:
response

<http.client.HTTPResponse at 0x7f67cfc4e850>

In [21]:
tt=decompressed_file.read().split('\n')

TypeError: a bytes-like object is required, not 'str'

In [18]:
tt[:100]

b'D _NOUN_ _NOUN_ _ADP_ depression\t1952,2,1\t1953,5,3\t1956,3,3\t1959,1,1\t1964,1,1\t1967,1,1\t1971,1,1\t1980'

In [24]:
index_df[['old_index','year_counts']]=index_df[0].str.split('\t',n=1,expand=True)
index_df=index_df.loc[index_df.old_index.str.match("^"+keep_string*5+"$",na=False)]
index_df.drop(0,axis=1,inplace=True)
index_df

Unnamed: 0,old_index,year_counts
39,"D'_NOUN Artagnan_NOUN ,_. coloring_VERB a_DET","1888,1,1\t1890,1,1\t1892,1,1\t1893,8,8\t1894,1..."
551,D'_NOUN AMATO_NOUN :_. I_PRON understand_VERB,"1975,1,1\t1986,15,11\t1988,11,8\t1989,2,2\t199..."
679,D'_NOUN E_NOUN '_. is_VERB the_DET,"1826,4,1\t1832,8,2\t1835,4,1\t1836,4,1\t1838,8..."
798,"D'_NOUN Onofrio_NOUN -_. Flores_NOUN ,_.","1976,1,1\t1978,1,1\t1980,1,1\t1981,1,1\t1982,2..."
1235,"D'_NOUN you_PRON know_VERB ,_. only_ADV","1886,1,1\t1924,9,9\t1925,3,3\t1926,1,1\t1928,2..."
...,...,...
3198962,D'_NOUN Amato_NOUN Laxalt_NOUN Stennis_NOUN Da...,"1981,3,3\t1982,18,14\t1983,21,17\t1984,5,5\t19..."
3199391,D'_NOUN you_PRON mean_VERB that_DET ?_.,"1896,1,1\t1900,6,4\t1901,2,2\t1908,3,3\t1909,1..."
3199392,"D'_NOUN Hondt_NOUN ,_. W._NOUN ,_.","1966,3,1\t1982,5,1\t1985,2,1\t1987,3,3\t1988,2..."
3199422,D'_NOUN s_VERB 31_NUM consecutive_ADJ day_NOUN,"1974,5,5\t1981,1,1\t1986,1,1\t1987,8,8\t1989,2..."


In [25]:
if index_df.shape[0]<10_000:
    
    cur_time=time.time()
    new_index_df=index_processor(index_df)
    print(f'Total time taken {round(time.time()-cur_time)} secs')
    
else:
    num_partitions=round(0.95*mp.cpu_count())
    cur_time=time.time()
    df_split = np.array_split(index_df, num_partitions)
    pool = Pool(num_partitions)
    print('Started parallelization')
    results=pool.map_async(index_processor,df_split)
    pool.close()
    pool.join()
        
        
    curr_df_list=results.get()
    new_index_df=pd.concat(curr_df_list,ignore_index=True)
    print(f'Total time taken {round(time.time()-cur_time)} secs')

Started parallelization
Total time taken 18 secs


In [29]:
new_index_df

Unnamed: 0,lemma_pos,pos_sent,year,comp_class,num_comp,comp_ner_sent,count
0,"d'alessandro_NOUN ,_PUNCT a._NOUN (_PUNCT 2008...",NOUN PUNCT NOUN PUNCT NUM,1961,0,False,,1
1,"d'alessandro_NOUN ,_PUNCT a._NOUN (_PUNCT 2008...",NOUN PUNCT NOUN PUNCT NUM,2008,0,False,,1
2,"d'alessandro_NOUN ,_PUNCT a._NOUN (_PUNCT 2008...",NOUN PUNCT NOUN PUNCT NUM,2009,0,False,,2
3,"d'alessandro_NOUN ,_PUNCT a._NOUN (_PUNCT 2008...",NOUN PUNCT NOUN PUNCT NUM,2010,0,False,,12
4,"d'alessandro_NOUN ,_PUNCT a._NOUN (_PUNCT 2008...",NOUN PUNCT NOUN PUNCT NUM,2011,0,False,,9
...,...,...,...,...,...,...,...
706,d'artagnan_PROPN go_VERB up_ADP once_ADV more_ADV,PROPN VERB ADP ADV ADV,2015,0,False,,4
707,d'artagnan_PROPN go_VERB up_ADP once_ADV more_ADV,PROPN VERB ADP ADV ADV,2016,0,False,,4
708,d'artagnan_PROPN go_VERB up_ADP once_ADV more_ADV,PROPN VERB ADP ADV ADV,2017,0,False,,14
709,d'artagnan_PROPN go_VERB up_ADP once_ADV more_ADV,PROPN VERB ADP ADV ADV,2018,0,False,,21


In [28]:
new_index_df.to_pickle('/data/dharp/compounds/datasets/googleV3/1700.pkl')


In [None]:
N - N
N 's N


