In [1]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
pd.options.display.float_format = '{:,.3f}'.format
import argparse
pd.options.mode.chained_assignment = None
import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid", font_scale = 1.5)
sns.set_context(rc={"lines.markersize": 10}) # controls size of style markers in line plots

import matplotlib

import pickle as pkl
from matplotlib import pyplot as plt
new_plot_col=list(range(1800,2010,20))
from scipy.stats.stats import pearsonr
from scipy.stats.stats import pearsonr

from functools import reduce

In [2]:
br_to_us=pd.read_excel("../data/Book.xlsx",skiprows=[0])
br_to_us_dict=dict(zip(br_to_us.UK.tolist(),br_to_us.US.tolist()))
spelling_replacement={'modifier':br_to_us_dict,'head':br_to_us_dict}

def lemma_maker(x, y):
    #print(lemmatizer.lemmatize(x,y))
    return lemmatizer.lemmatize(x,y)

In [3]:
parser = argparse.ArgumentParser(description='Compute features from embeddings')

parser.add_argument('--temporal',  type=int,
                    help='Value to bin the temporal information: 0 (remove temporal information), 1 (no binning), 10 (binning to decades), 20 (binning each 20 years) or 50 (binning each 50 years)')

parser.add_argument('--cutoff', type=int, default=50,
                    help='Cut-off frequency for each compound per time period : none (0), 20, 50 and 100')

parser.add_argument('--contextual', action='store_true',
                    help='Is the model contextual')

args = parser.parse_args('--temporal 10 --cutoff 20'.split())

In [4]:
print(f'Cutoff: {args.cutoff}')
print(f'Time span:  {args.temporal}')

Cutoff: 20
Time span:  10


In [5]:
temp_cutoff_str=str(args.temporal)+'_'+str(args.cutoff)

In [6]:
if args.contextual:
    comp_df_path='../../datasets/compounds_CompoundAware_'+temp_cutoff_str+'_300.pkl'
    mod_df_path='../../datasets/modifiers_CompoundAware_'+temp_cutoff_str+'_300.pkl'
    head_df_path='../../datasets/heads_CompoundAware_'+temp_cutoff_str+'_300.pkl'
    features_df_path='../../datasets/features_CompoundAware_'+temp_cutoff_str+'_300.pkl'
else:
    comp_df_path='../../datasets/compounds_CompoundAgnostic_'+temp_cutoff_str+'_300.pkl'
    mod_df_path='../../datasets/constituents_CompoundAgnostic_'+temp_cutoff_str+'_300.pkl'
    head_df_path='../../datasets/constituents_CompoundAgnostic_'+temp_cutoff_str+'_300.pkl'
    features_df_path='../../datasets/features_CompoundAgnostic_'+temp_cutoff_str+'_300.pkl'

In [7]:
heads=pd.read_pickle(head_df_path)

if args.temporal!=0:
    heads.index.set_names('time', level=1,inplace=True)
    heads.index.set_names('head',level=0,inplace=True)

In [8]:
modifiers=pd.read_pickle(mod_df_path)

if args.temporal!=0:
    modifiers.index.set_names('time', level=1,inplace=True)
    modifiers.index.set_names('modifier',level=0,inplace=True)

In [20]:
compounds=pd.read_pickle(comp_df_path)

if args.temporal!=0:
    compounds.index.set_names('time', level=2,inplace=True)
compounds.drop(['common'],axis=1,inplace=True)
compounds=compounds+1

In [41]:
if args.temporal!=0:
    all_comps=compounds.reset_index()[['modifier','head','time']]
    mod_prod=compounds.groupby(['modifier','time']).size().to_frame()
    mod_prod.columns=['mod_prod']
    head_prod=compounds.groupby(['head','time']).size().to_frame()
    head_prod.columns=['head_prod']
    prod1=pd.merge(all_comps,mod_prod.reset_index(),how='left',on=['modifier','time'])
    productivity=pd.merge(prod1,head_prod.reset_index(),how='left',on=['head','time'])
    productivity.set_index(['modifier','head','time'],inplace=True)
else:
    all_comps=compounds.reset_index()[['modifier','head']]
    mod_prod=compounds.groupby(['modifier']).size().to_frame()
    mod_prod.columns=['mod_prod']
    head_prod=compounds.groupby(['head']).size().to_frame()
    head_prod.columns=['head_prod']
    prod1=pd.merge(all_comps,mod_prod.reset_index(),how='left',on=['modifier'])
    productivity=pd.merge(prod1,head_prod.reset_index(),how='left',on=['head'])
    productivity.set_index(['modifier','head'],inplace=True)   

In [10]:
if args.temporal!=0:
    
    compound_decade_counts=compounds.groupby('time').sum().sum(axis=1).to_frame()
    compound_decade_counts.columns=['N']

    XY=compounds.groupby(['modifier','head','time']).sum().sum(axis=1).to_frame()
    X_star=compounds.groupby(['modifier','time']).sum().sum(axis=1).to_frame()
    Y_star=compounds.groupby(['head','time']).sum().sum(axis=1).to_frame()


else:
    XY=compounds.groupby(['modifier','head']).sum().sum(axis=1).to_frame()
    X_star=compounds.groupby(['modifier']).sum().sum(axis=1).to_frame()
    Y_star=compounds.groupby(['head']).sum().sum(axis=1).to_frame()


    
XY.columns=['a']

X_star.columns=['x_star']
Y_star.columns=['star_y']


if args.temporal!=0:
 
    merge1=pd.merge(XY.reset_index(),X_star.reset_index(),on=['modifier','time'])

    information_feat=pd.merge(merge1,Y_star.reset_index(),on=['head','time'])
else:
    merge1=pd.merge(XY.reset_index(),X_star.reset_index(),on=['modifier'])

    information_feat=pd.merge(merge1,Y_star.reset_index(),on=['head'])    

information_feat['b']=information_feat['x_star']-information_feat['a']
information_feat['c']=information_feat['star_y']-information_feat['a']

if args.temporal!=0:
    information_feat=pd.merge(information_feat,compound_decade_counts.reset_index(),on=['time'])



else:
 
    information_feat['N']=compounds.reset_index().drop(['modifier','head'],axis=1).sum().sum()
    

information_feat['d']=information_feat['N']-(information_feat['a']+information_feat['b']+information_feat['c'])
information_feat['x_bar_star']=information_feat['N']-information_feat['x_star']
information_feat['star_y_bar']=information_feat['N']-information_feat['star_y']

if args.temporal!=0:

    information_feat.set_index(['modifier','head','time'],inplace=True)
else:
    information_feat.set_index(['modifier','head'],inplace=True)


information_feat.replace(0,0.0001,inplace=True)
information_feat['log_ratio']=2*(information_feat['a']*np.log((information_feat['a']*information_feat['N'])/(information_feat['x_star']*information_feat['star_y']))+\
information_feat['b']*np.log((information_feat['b']*information_feat['N'])/(information_feat['x_star']*information_feat['star_y_bar']))+\
information_feat['c']*np.log((information_feat['c']*information_feat['N'])/(information_feat['x_bar_star']*information_feat['star_y']))+\
information_feat['d']*np.log((information_feat['d']*information_feat['N'])/(information_feat['x_bar_star']*information_feat['star_y_bar'])))
information_feat['ppmi']=np.log2((information_feat['a']*information_feat['N'])/(information_feat['x_star']*information_feat['star_y']))
information_feat['local_mi']=information_feat['a']*information_feat['ppmi']
information_feat.ppmi.loc[information_feat.ppmi<=0]=0
information_feat.drop(['a','x_star','star_y','b','c','d','N','d','x_bar_star','star_y_bar'],axis=1,inplace=True)

In [49]:
information_feat

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,log_ratio,ppmi,local_mi
modifier,head,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
'_noun,a'isha_noun,1970,2766.334,6.642,1995.446
'_noun,a_noun,1970,1085.989,0.000,-625.496
a_noun,a_noun,1970,403.808,2.063,620.822
aa_noun,a_noun,1970,2000.089,6.170,1856.897
aaa_noun,a_noun,1970,1895.583,5.923,1782.637
...,...,...,...,...,...
york_noun,public_noun,1810,4171.651,9.947,2997.548
young_noun,colkitto_noun,1810,5000.581,11.876,3560.731
youth_noun,strain_noun,1810,5037.256,11.872,3586.584
zuniga_noun,vida_noun,1810,6523.014,14.193,4271.218


In [43]:
new_compounds=compounds-1


compound_modifier_sim=new_compounds.multiply(modifiers.reindex(new_compounds.index, method='ffill')).sum(axis=1).to_frame()
compound_modifier_sim.columns=['sim_with_modifier']


compound_head_sim=new_compounds.multiply(heads.reindex(new_compounds.index, method='ffill')).sum(axis=1).to_frame()
compound_head_sim.columns=['sim_with_head']

prod_mod=compound_modifier_sim.groupby('modifier').size().to_frame()
prod_mod.columns=['modifier_prod']

prod_head=compound_modifier_sim.groupby('head').size().to_frame()
prod_head.columns=['head_prod']

if args.temporal!=0:
    constituent_sim=new_compounds.reset_index()[['modifier','head','time']].merge(modifiers.reset_index(),how='left',on=['modifier','time'])
    constituent_sim.set_index(['modifier','head','time'],inplace=True)
else:
    constituent_sim=new_compounds.reset_index()[['modifier','head']].merge(modifiers.reset_index(),how='left',on=['modifier'])
    constituent_sim.set_index(['modifier','head'],inplace=True)


constituent_sim=constituent_sim.multiply(heads.reindex(constituent_sim.index, method='ffill')).sum(axis=1).to_frame()
constituent_sim.columns=['sim_bw_constituents']

In [14]:
dfs = [constituent_sim, compound_head_sim, compound_modifier_sim, information_feat,productivity]
compounds_final = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True), dfs)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sim_bw_constituents,sim_with_head,sim_with_modifier,log_ratio,ppmi,local_mi
modifier,head,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cattle_noun,society_noun,1900,0.014,0.149,0.149,713.586,2.964,886.908
pragmatist_noun,william_noun,1990,0.457,-0.013,-0.013,2150.121,6.539,1965.028
pembroke_noun,college_noun,1950,0.032,0.008,0.008,2215.085,6.573,1981.832
louisville_noun,water_noun,1900,0.043,-0.017,-0.017,1888.115,5.914,1767.303
ground_noun,coriander_noun,1970,0.900,0.002,0.002,4673.332,11.193,3370.050
...,...,...,...,...,...,...,...,...
onion_noun,vz_noun,2000,0.446,-0.030,-0.030,3745.462,10.368,3115.456
anna_noun,seward_noun,1850,0.253,0.017,0.017,4589.400,11.908,3586.269
parliament_noun,town_noun,1800,0.493,-0.020,-0.020,1969.783,6.046,1813.043
consumer_noun,advocacy_noun,1980,0.005,0.008,0.008,2491.116,7.395,2210.242


In [45]:
if args.temporal!=0:
    compounds_final=pd.pivot_table(compounds_final.reset_index(), index=['modifier','head'], columns=['time'])

    compounds_final.fillna(0,inplace=True)
    compounds_final -= compounds_final.min()
    compounds_final /= compounds_final.max()
    compounds_final_1=compounds_final.columns.get_level_values(0)
    compounds_final_2=compounds_final.columns.get_level_values(1)

    cur_year=0
    new_columns=[]
    for year in compounds_final_2:
        new_columns.append(str(year)+"_"+compounds_final_1[cur_year])
        cur_year+=1
    compounds_final.columns=new_columns


else:
    #compounds_final = reduce(lambda left,right: pd.merge(left,right,on=['modifier','head']), dfs)
    #compounds_final.drop(['head_denom','modifier_denom'],axis=1,inplace=True)
    compounds_final.set_index(['modifier','head'],inplace=True)
    compounds_final.fillna(0,inplace=True)
    compounds_final -= compounds_final.min()
    compounds_final /= compounds_final.max()

In [13]:
reddy_comp=pd.read_csv("../data/reddy_compounds.txt",sep="\t")
#print(reddy_comp.columns)
reddy_comp.columns=['compound','to_divide']
reddy_comp['modifier_mean'],reddy_comp['modifier_std'],reddy_comp['head_mean'],reddy_comp['head_std'],reddy_comp['compound_mean'],reddy_comp['compound_std'],_=reddy_comp.to_divide.str.split(" ",7).str
reddy_comp['modifier'],reddy_comp['head']=reddy_comp['compound'].str.split(" ",2).str
reddy_comp.modifier=reddy_comp.modifier.str[:-2]
reddy_comp['head']=reddy_comp['head'].str[:-2]
reddy_comp.drop(['compound','to_divide'],axis=1,inplace=True)
reddy_comp['modifier']=np.vectorize(lemma_maker)(reddy_comp['modifier'],'n')
reddy_comp['head']=np.vectorize(lemma_maker)(reddy_comp['head'],'n')
reddy_comp.replace(spelling_replacement,inplace=True)
#reddy_comp['modifier']=reddy_comp['modifier']+"_noun"
#reddy_comp['head']=reddy_comp['head']+"_noun"
reddy_comp=reddy_comp.apply(pd.to_numeric, errors='ignore')
#reddy_comp.set_index(['modifier','head'],inplace=True)

In [14]:
comp_90=pd.read_csv("../data/compounds90.txt",sep="\t")
comp_90['mod_pos'],comp_90['head_pos']=comp_90.compound_lemmapos.str.split('_').str
comp_90['modifier'],comp_90['mod_pos']=comp_90.mod_pos.str.split('/').str
comp_90['head'],comp_90['head_pos']=comp_90.head_pos.str.split('/').str
comp_90=comp_90.loc[~(comp_90.mod_pos=="ADJ")]
comp_90=comp_90.loc[:,['avgModifier','stdevModifier','avgHead','stdevHeadModifier','compositionality','stdevHeadModifier','modifier','head']]
comp_90.columns=reddy_comp.columns

In [15]:
comp_ext=pd.read_csv("../data/compounds_ext.txt",sep="\t")
comp_ext['mod_pos'],comp_ext['head_pos']=comp_ext.compound_lemmapos.str.split('_').str
comp_ext['modifier'],comp_ext['mod_pos']=comp_ext.mod_pos.str.split('/').str
comp_ext['head'],comp_ext['head_pos']=comp_ext.head_pos.str.split('/').str
comp_ext=comp_ext.loc[~(comp_ext.mod_pos=="ADJ")]

comp_ext=comp_ext.loc[:,['avgModifier','stdevModifier','avgHead','stdevHeadModifier','compositionality','stdevHeadModifier','modifier','head']]
comp_ext.columns=reddy_comp.columns

In [16]:
all_compounds=pd.concat([reddy_comp,comp_ext,comp_90],ignore_index=True)
all_compounds['modifier']=all_compounds['modifier']+"_noun"
all_compounds['head']=all_compounds['head']+"_noun"
all_compounds

Unnamed: 0,modifier_mean,modifier_std,head_mean,head_std,compound_mean,compound_std,modifier,head
0,3.867,1.118,4.867,0.340,4.250,0.871,end_noun,user_noun
1,1.607,1.655,1.893,1.496,1.704,1.717,firing_noun,line_noun
2,2.821,1.965,4.862,0.345,3.828,1.234,game_noun,plan_noun
3,4.767,0.423,4.862,0.345,4.800,0.476,application_noun,form_noun
4,0.600,0.800,4.586,1.099,1.310,1.021,snail_noun,mail_noun
...,...,...,...,...,...,...,...,...
205,2.684,1.493,4.158,1.344,3.158,1.344,street_noun,girl_noun
206,4.737,0.452,4.421,0.597,4.632,0.597,subway_noun,system_noun
207,2.222,1.800,4.333,1.757,2.500,1.757,tennis_noun,elbow_noun
208,4.100,1.252,0.350,1.395,1.050,1.395,top_noun,dog_noun


In [20]:
compounds_final['1800_index'].value_counts()

0.000    1127414
0.059          1
0.494          1
0.826          1
0.141          1
          ...   
0.772          1
0.119          1
0.369          1
0.589          1
0.138          1
Name: 1800_index, Length: 13088, dtype: int64

In [27]:
merge_df=all_compounds.merge(compounds_final.reset_index(),on=['modifier','head'],how='inner')
merge_df.set_index(["modifier", "head"], inplace = True)

merge_df.to_csv(features_df_path,sep='\t')