In [1]:
import pandas as pd
import numpy as np
import argparse
import time
import pickle as pkl

from itertools import product
from functools import reduce
import glob
import os

import seaborn as sns
sns.set(style="whitegrid", font_scale = 2.5)
sns.set_context(rc={"lines.markersize": 17, "lines.linewidth": 2})

import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
# Notebook configuration, expressed as an argparse parser so the same option
# names can be used from a script. The argument vector is supplied inline at
# the bottom instead of being read from sys.argv.
parser = argparse.ArgumentParser(
    description='Compute features from sparse dataset for google version',
)

# --- input / output locations ------------------------------------------------
parser.add_argument(
    '--inputdir',
    type=str,
    help='Provide directory where features are located',
)
parser.add_argument(
    '--outputdir',
    type=str,
    help='Where should the output be stored?',
)

# --- boolean switches (False unless the flag is given) -------------------------
parser.add_argument(
    '--tag',
    action='store_true',
    help='Should the POS tag be kept?',
)
parser.add_argument(
    '--ppmi',
    action='store_true',
    help='Should co-occurence matrix be converted to PPMI values',
)
parser.add_argument(
    '--plot',
    action='store_true',
    help='Should plots be saved',
)

# --- numeric settings ----------------------------------------------------------
parser.add_argument(
    '--temporal',
    type=int,
    help='Value to bin the temporal information: 10000 (remove temporal information), 1 (no binning), 10 (binning to decades), 20 (binning each 20 years) or 50 (binning each 50 years)',
)
parser.add_argument(
    '--cutoff',
    type=int,
    default=0,
    help='Cut-off frequency for each compound per time period : none (0), 20, 50 and 100',
)

# Settings used for this notebook run: decade bins, POS tags kept, and the
# defaults for everything else (no PPMI, no plots, cutoff 0).
args = parser.parse_args(
    '--inputdir /work/dhar/data/Compounding/ --outputdir /work/dhar/data/Compounding/google/ --temporal 10 --tag'.split()
)



In [3]:
# Load the three tab-separated rating tables and label every row with the
# table it came from, so the origin is still known after stacking.
reddy_df = pd.read_csv('data/reddy_90.txt', sep='\t').assign(source='reddy')
cordeiro90_df = pd.read_csv('data/cordeiro_90.txt', sep='\t').assign(source='cordeiro90')
cordeiro100_df = pd.read_csv('data/cordeiro_100.txt', sep='\t').assign(source='cordeiro100')

# Stack the tables under a fresh 0..n-1 index. Duplicate rows are kept at this
# point; the drop_duplicates call below is left disabled.
comp_ratings_df = pd.concat(
    [reddy_df, cordeiro90_df, cordeiro100_df],
    ignore_index=True,
)
#comp_ratings_df.drop_duplicates(inplace=True)
comp_ratings_df

Unnamed: 0,modifier,head,avgModifier,stdevModifier,avgHead,stdevHead,compositionality,stdevHeadModifier,is_adj,compound,source,is_original
0,end,user,3.866667,1.117537,4.866667,0.339935,4.250000,0.871165,False,end_user,reddy,True
1,firing,line,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,False,firing_line,reddy,True
2,game,plan,2.821429,1.964935,4.862069,0.344828,3.827586,1.233693,False,game_plan,reddy,True
3,application,form,4.766667,0.422953,4.862069,0.344828,4.800000,0.476095,False,application_form,reddy,True
4,snail,mail,0.600000,0.800000,4.586207,1.099129,1.310345,1.020596,False,snail_mail,reddy,True
...,...,...,...,...,...,...,...,...,...,...,...,...
282,wedding,day,4.764700,0.562300,4.058800,1.434900,4.941200,0.242500,False,wedding_day,cordeiro100,True
283,white,noise,0.652200,1.112300,4.043500,1.429500,1.173900,1.230400,True,white_noise,cordeiro100,True
284,white,spirit,1.538500,1.240300,2.038500,1.949000,1.307700,1.257600,True,white_spirit,cordeiro100,True
285,winter,solstice,5.000000,0.000000,4.681800,1.086100,4.545500,1.335500,False,winter_solstice,cordeiro100,True


In [4]:
def testset_tagger(df):

    #### NOUN NOUN
    
    copy_df_1=df.copy()
    copy_df_1.modifier=copy_df_1.modifier+'_NOUN'
    copy_df_1['head']=copy_df_1['head']+'_NOUN'

    ### PROPN NOUN

    copy_df_2=df.copy()
    copy_df_2.modifier=copy_df_2.modifier+'_PROPN'
    copy_df_2['head']=copy_df_2['head']+'_NOUN'
    
    ### NOUN PROPN

    copy_df_3=df.copy()
    copy_df_3.modifier=copy_df_3.modifier+'_NOUN'
    copy_df_3['head']=copy_df_3['head']+'_PROPN'
    
    ### PROPN PROPN    

    copy_df_4=df.copy()
    copy_df_4.modifier=copy_df_4.modifier+'_PROPN'
    copy_df_4['head']=copy_df_4['head']+'_PROPN'
    
   
    ### ADJ/NOUN NOUN
    
    copy_df_5=df.copy()
    
    copy_df_5.loc[copy_df_5.is_adj==True,"modifier"]+="_ADJ"
    copy_df_5.loc[copy_df_5.is_adj==False,"modifier"]+="_NOUN"
    copy_df_5['head']=copy_df_5['head']+'_NOUN'   
    
    
    ### ADJ/NOUN PROPN
    
    copy_df_6=df.copy()
    copy_df_6.loc[copy_df_6.is_adj==True,"modifier"]+="_ADJ"
    copy_df_6.loc[copy_df_6.is_adj==False,"modifier"]+="_NOUN"
    copy_df_6['head']=copy_df_6['head']+'_PROPN'  

    
    #### ADJ/PROPN NOUN
    
    copy_df_7=df.copy()
    copy_df_7.loc[copy_df_7.is_adj==True,"modifier"]+="_ADJ"
    copy_df_7.loc[copy_df_7.is_adj==False,"modifier"]+="_PROPN"
    copy_df_7['head']=copy_df_7['head']+'_NOUN' 
    
    
    #### ADJ/PROPN PROPN
    
    copy_df_8=df.copy()
    copy_df_8.loc[copy_df_8.is_adj==True,"modifier"]+="_ADJ"
    copy_df_8.loc[copy_df_8.is_adj==False,"modifier"]+="_PROPN"
    copy_df_8['head']=copy_df_8['head']+'_PROPN' 
    
    
    complete_df=pd.concat([copy_df_1,copy_df_2,copy_df_3,copy_df_4,copy_df_5,copy_df_6,copy_df_7,copy_df_8],ignore_index=True)
                           
    return complete_df     

In [5]:
# Expand the ratings into every POS-tag variant, then remove rows that are
# exact duplicates (original index labels of the surviving rows are kept).
# Caution: this cell rebinds comp_ratings_df from itself, so running it a
# second time appends the tag suffixes again.
comp_ratings_df = testset_tagger(comp_ratings_df).drop_duplicates()
comp_ratings_df

Unnamed: 0,modifier,head,avgModifier,stdevModifier,avgHead,stdevHead,compositionality,stdevHeadModifier,is_adj,compound,source,is_original
0,end_NOUN,user_NOUN,3.866667,1.117537,4.866667,0.339935,4.250000,0.871165,False,end_user,reddy,True
1,firing_NOUN,line_NOUN,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,False,firing_line,reddy,True
2,game_NOUN,plan_NOUN,2.821429,1.964935,4.862069,0.344828,3.827586,1.233693,False,game_plan,reddy,True
3,application_NOUN,form_NOUN,4.766667,0.422953,4.862069,0.344828,4.800000,0.476095,False,application_form,reddy,True
4,snail_NOUN,mail_NOUN,0.600000,0.800000,4.586207,1.099129,1.310345,1.020596,False,snail_mail,reddy,True
...,...,...,...,...,...,...,...,...,...,...,...,...
1701,radioactive_ADJ,waste_PROPN,4.916700,0.288700,4.583300,0.668600,4.583300,0.793000,True,radioactive_waste,cordeiro100,True
1702,rainy_ADJ,season_PROPN,4.590900,0.854100,3.772700,1.659900,4.227300,1.066000,True,rainy_season,cordeiro100,True
1707,social_ADJ,insurance_PROPN,3.551700,1.325200,3.103400,1.819400,2.827600,1.691800,True,social_insurance,cordeiro100,True
1718,white_ADJ,noise_PROPN,0.652200,1.112300,4.043500,1.429500,1.173900,1.230400,True,white_noise,cordeiro100,True


In [6]:
def compound_tag_remover(compounds):
    """Strip POS tags from compound constituents and re-aggregate the counts.

    ``_NOUN``/``_PROPN`` are removed from ``head`` and
    ``_NOUN``/``_PROPN``/``_ADJ`` from ``modifier``. Rows that become identical
    in (modifier, head, context) are then merged by summing ``count``.

    Unlike the previous implementation, the caller's frame is left untouched:
    the cleaned values are used only as grouping keys and never written back.

    Parameters
    ----------
    compounds : pandas.DataFrame
        Needs ``modifier``, ``head``, ``context`` and ``count`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``modifier``, ``head``, ``context``, ``count`` (one row per
        distinct key, sorted by key). Any other input column is not carried
        over into the result.
    """
    print('Removing tags for compound dataset')

    # Derived key Series keep their original names, so the grouped index levels
    # (and the columns after reset_index) are still called modifier/head/context.
    head_key = compounds['head'].str.replace('_NOUN|_PROPN', '', regex=True)
    modifier_key = compounds['modifier'].str.replace('_NOUN|_PROPN|_ADJ', '', regex=True)

    untagged = (
        compounds['count']
        .groupby([modifier_key, head_key, compounds['context']])
        .sum()
        .to_frame()
        .reset_index()
    )

    return untagged


def constituent_tag_remover(constituents,ctype='word'):
    """Strip POS tags from a constituent column and re-aggregate the counts.

    ``_NOUN``, ``_PROPN`` and ``_ADJ`` are removed from the ``ctype`` column.
    Rows that become identical in (``ctype``, context) are merged by summing
    ``count``.

    Unlike the previous implementation, the caller's frame is left untouched:
    the cleaned values are used only as a grouping key and never written back.

    Parameters
    ----------
    constituents : pandas.DataFrame
        Needs the ``ctype`` column plus ``context`` and ``count``.
    ctype : str, default 'word'
        Name of the column holding the tagged constituent.

    Returns
    -------
    pandas.DataFrame
        Columns ``ctype``, ``context``, ``count`` (one row per distinct key,
        sorted by key). Any other input column is not carried over.
    """
    print(f'Removing tags for {ctype} dataset')

    # The derived Series keeps the name ``ctype``, so the output column is
    # still called ``ctype`` after reset_index.
    cleaned_key = constituents[ctype].str.replace('_NOUN|_PROPN|_ADJ', '', regex=True)

    untagged = (
        constituents['count']
        .groupby([cleaned_key, constituents['context']])
        .sum()
        .to_frame()
        .reset_index()
    )

    return untagged

In [7]:
def process_cutoff_compound(df, cutoff=None):
    """Keep only rows of compounds whose total count in a time slice exceeds a cutoff.

    The counts are summed over all rows sharing (modifier, head, time); every
    row of a group is kept when that sum is strictly greater than the cutoff
    and dropped otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``modifier``, ``head``, ``time`` and ``count`` columns.
    cutoff : int or float, optional
        Threshold to compare against. When omitted, the notebook-level
        ``args.cutoff`` is read at call time, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df`` with their original index labels.
    """
    if cutoff is None:
        # Fall back to the global configuration for backward compatibility.
        cutoff = args.cutoff

    group_totals = df.groupby(['modifier','head','time'])['count'].transform('sum')

    return df.loc[group_totals.gt(cutoff)]


def process_cutoff_constituent(df,ctype='word',cutoff=None):
    """Keep only rows of constituents whose total count in a time slice exceeds a cutoff.

    The counts are summed over all rows sharing (``ctype``, time); every row of
    a group is kept when that sum is strictly greater than the cutoff and
    dropped otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the ``ctype`` column plus ``time`` and ``count``.
    ctype : str, default 'word'
        Name of the column identifying the constituent.
    cutoff : int or float, optional
        Threshold to compare against. When omitted, the notebook-level
        ``args.cutoff`` is read at call time, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df`` with their original index labels.
    """
    if cutoff is None:
        # Fall back to the global configuration for backward compatibility.
        cutoff = args.cutoff

    group_totals = df.groupby([ctype,'time'])['count'].transform('sum')

    return df.loc[group_totals.gt(cutoff)]

In [8]:
def ppmi(ppmi_df):
    """Replace raw co-occurrence counts by non-negative pointwise mutual information.

    For every row the value ``log2((XY * N) / (X * Y))`` is computed, where

    * ``XY`` is the row's own count,
    * ``Y``  is the total count of the row's context within its time slice,
    * ``X``  is the total count of the row's item (all columns other than
      ``context`` and the count column, i.e. including ``time``),
    * ``N``  is the total count of the time slice.

    Rows whose value is negative are dropped (this includes rows with a zero
    count, whose logarithm is ``-inf``); rows with value ``0`` are kept.

    The input frame is not modified. The previous implementation renamed the
    caller's count column to ``XY`` as a side effect.

    Parameters
    ----------
    ppmi_df : pandas.DataFrame
        Needs ``context`` and ``time`` columns, at least one item column, and a
        column whose name contains ``'count'``.

    Returns
    -------
    pandas.DataFrame
        The non-count input columns followed by a ``count`` column that now
        holds the PMI value.
    """
    # Work on a renamed copy so the caller keeps its own column names. Every
    # column whose name contains 'count' becomes 'XY', as before.
    ppmi_df=ppmi_df.rename(columns=lambda col: 'XY' if 'count' in col else col)
    ppmi_cols=ppmi_df.columns.tolist()

    # N: total mass per time slice.
    ppmi_time_counts=ppmi_df.groupby('time')['XY'].sum().to_frame()
    ppmi_time_counts.columns=['N']

    # Y: marginal of each context within a time slice.
    Y_star=ppmi_df.groupby(['context','time'])['XY'].sum().to_frame()
    Y_star.columns=['Y']

    ppmi_df=pd.merge(ppmi_df,Y_star.reset_index(),on=['context','time'])

    # X: marginal of each item; the item key is every column except the
    # context and the count itself.
    X_cols=[x for x in ppmi_cols if x not in ['context','XY'] ]

    X_star=ppmi_df.groupby(X_cols)['XY'].sum().to_frame()
    X_star.columns=['X']

    ppmi_df=pd.merge(ppmi_df,X_star.reset_index(),on=X_cols)
    ppmi_df=pd.merge(ppmi_df,ppmi_time_counts.reset_index(),on=['time'])

    ppmi_df['count']=np.log2((ppmi_df['XY']*ppmi_df['N'])/(ppmi_df['X']*ppmi_df['Y']))

    # Keep the non-negative associations only, then discard the helper columns.
    # drop() without inplace avoids mutating a slice of the merged frame.
    ppmi_df=ppmi_df.loc[ppmi_df['count']>=0]

    return ppmi_df.drop(['XY','X','Y','N'],axis=1)

In [9]:
def calculate_compound_features(compounds,modifiers,heads,all_comps,not_found_compounds_df,not_found_modifiers_df,not_found_heads_df):
    """Compute productivity, association and frequency features per (modifier, head, time).

    Parameters
    ----------
    compounds : pandas.DataFrame
        Uses ``modifier``, ``head``, ``time`` and a count column.
    modifiers : pandas.DataFrame
        Uses ``time`` and a count column.
    heads : pandas.DataFrame
        Uses ``time`` and a count column.
    all_comps : pandas.DataFrame
        Merged on ``['modifier','time']`` and then ``['head','time']``, so it
        needs ``modifier``, ``head`` and ``time``. Presumably the list of
        compounds for which features are wanted -- TODO confirm with caller.
    not_found_compounds_df : pandas.DataFrame
        Needs ``modifier``, ``head``, ``time``; these rows are given a
        compound count of 0.
    not_found_modifiers_df : pandas.DataFrame
        Needs ``modifier``, ``time``; these rows are given productivity and
        count 0.
    not_found_heads_df : pandas.DataFrame
        Needs ``head``, ``time``; these rows are given productivity and
        count 0.

    Returns
    -------
    pandas.DataFrame
        Inner merge, on (modifier, head, time), of the productivity,
        information-theory and frequency feature tables.

    Notes
    -----
    Side effect: in ``compounds``, ``modifiers`` and ``heads`` every column
    whose name contains ``'count'`` is renamed to ``'count'`` on the objects
    passed in (the ``.columns`` attribute is reassigned).
    """
    
    # Normalise the count column name on all three inputs. This is done on the
    # caller's objects, not on copies.
    mod_cols=modifiers.columns.tolist()
    mod_cols=['count' if 'count' in x else x for x in mod_cols]
    modifiers.columns=mod_cols

    head_cols=heads.columns.tolist()
    head_cols=['count' if 'count' in x else x for x in head_cols]
    heads.columns=head_cols

    comp_cols=compounds.columns.tolist()
    comp_cols=['count' if 'count' in x else x for x in comp_cols]
    compounds.columns=comp_cols

    print('Calculating productivity features')
    
    # Number of rows per time slice in each of the three frames.
    compound_types=compounds.groupby(['time']).size().to_frame()
    compound_types.columns=['comp_size']
    
    modifier_types=modifiers.groupby(['time']).size().to_frame()
    modifier_types.columns=['mod_size']
    
    head_types=heads.groupby(['time']).size().to_frame()
    head_types.columns=['head_size']

    # mod_prod: number of compound rows per (modifier, time).
    mod_prod=compounds.groupby(['modifier','time']).size().to_frame()
    mod_prod.columns=['mod_prod']
    mod_prod=pd.merge(mod_prod.reset_index(),compound_types.reset_index(),on=['time'])
    mod_prod=pd.merge(mod_prod,modifier_types.reset_index(),on=['time'])

    # Family size = log2 of mod_prod relative to the compound row count and,
    # in the "_new" variant, relative to the modifier row count of the slice.
    mod_prod['mod_family_size']=np.log2(mod_prod.mod_prod/mod_prod.comp_size)
    mod_prod['mod_family_size_new']=np.log2(mod_prod.mod_prod/mod_prod.mod_size)


    # Modifiers that were not found get 0 for the productivity and for both
    # family-size features (the log is not evaluated for them).
    not_found_mod_prod=not_found_modifiers_df.copy()
    not_found_mod_prod['mod_prod']=0
    not_found_mod_prod=pd.merge(not_found_mod_prod,compound_types.reset_index(),on=['time'])
    not_found_mod_prod=pd.merge(not_found_mod_prod,modifier_types.reset_index(),on=['time'])
    not_found_mod_prod['mod_family_size']=0
    not_found_mod_prod['mod_family_size_new']=0


    # Same construction for heads.
    head_prod=compounds.groupby(['head','time']).size().to_frame()
    head_prod.columns=['head_prod']
    head_prod=pd.merge(head_prod.reset_index(),compound_types.reset_index(),on=['time'])
    head_prod=pd.merge(head_prod,head_types.reset_index(),on=['time'])

    head_prod['head_family_size']=np.log2(head_prod.head_prod/head_prod.comp_size)
    head_prod['head_family_size_new']=np.log2(head_prod.head_prod/head_prod.head_size)


    not_found_head_prod=not_found_heads_df.copy()
    not_found_head_prod['head_prod']=0
    not_found_head_prod=pd.merge(not_found_head_prod,compound_types.reset_index(),on=['time'])
    not_found_head_prod=pd.merge(not_found_head_prod,head_types.reset_index(),on=['time'])

    not_found_head_prod['head_family_size']=0
    not_found_head_prod['head_family_size_new']=0

    
    mod_prod=pd.concat([mod_prod,not_found_mod_prod],ignore_index=True)
    head_prod=pd.concat([head_prod,not_found_head_prod],ignore_index=True)


    # comp_size is dropped from the modifier side only, so the merged table
    # carries a single comp_size column (the one from head_prod).
    prod1=pd.merge(mod_prod.drop(['mod_size','comp_size'],axis=1),all_comps,on=['modifier','time'])
    productivity=pd.merge(head_prod.drop('head_size',axis=1),prod1,on=['head','time'])


    print('Calculating information theory features')
    
    # N: total compound count per time slice.
    compound_time_counts=compounds.groupby('time')['count'].sum().to_frame()
    
    compound_time_counts.columns=['N']
    # a: count of the compound itself.
    XY=compounds.groupby(['modifier','head','time'])['count'].sum().to_frame()    

    XY.columns=['a']
    
    not_found_XY=not_found_compounds_df.copy()
    not_found_XY['count']=0
    not_found_XY=not_found_XY.groupby(['modifier','head','time'])['count'].sum().to_frame()
    not_found_XY.columns=['a']
    
    
    # x_star: count of all compounds sharing the modifier.
    X_star=compounds.groupby(['modifier','time'])['count'].sum().to_frame()
    X_star.columns=['x_star']
    
    not_found_X_star=not_found_modifiers_df.copy()
    not_found_X_star['count']=0
    not_found_X_star=not_found_X_star.groupby(['modifier','time'])['count'].sum().to_frame()
    not_found_X_star.columns=['x_star']

    # star_y: count of all compounds sharing the head.
    Y_star=compounds.groupby(['head','time'])['count'].sum().to_frame()
    Y_star.columns=['star_y']

    not_found_Y_star=not_found_heads_df.copy()
    not_found_Y_star['count']=0    
    not_found_Y_star=not_found_Y_star.groupby(['head','time'])['count'].sum().to_frame()
    not_found_Y_star.columns=['star_y']

    # Append the zero-count entries for the not-found items.
    XY=pd.concat([XY,not_found_XY])
    X_star=pd.concat([X_star,not_found_X_star])
    Y_star=pd.concat([Y_star,not_found_Y_star])

    merge1=pd.merge(XY.reset_index(),X_star.reset_index(),on=['modifier','time'])

    information_feat=pd.merge(merge1,Y_star.reset_index(),on=['head','time'])    

    # 2x2 contingency cells: b = same modifier with another head,
    # c = same head with another modifier, d = everything else in the slice.
    information_feat['b']=information_feat['x_star']-information_feat['a']
    information_feat['c']=information_feat['star_y']-information_feat['a']

    information_feat=pd.merge(information_feat,compound_time_counts.reset_index(),on=['time'])

    information_feat['d']=information_feat['N']-(information_feat['a']+information_feat['b']+information_feat['c'])
    information_feat['x_bar_star']=information_feat['N']-information_feat['x_star']
    information_feat['star_y_bar']=information_feat['N']-information_feat['star_y']
    # Log term of cell d, computed separately so that NaN results can be
    # replaced by 0 before it is multiplied by d.
    information_feat['overflow_check']=np.log2((information_feat['d']*information_feat['N']+1)/(information_feat['x_bar_star']*information_feat['star_y_bar']+1))
    information_feat['overflow_check'] = information_feat['overflow_check'].fillna(0)
    # log_ratio = 2 * sum over the four cells of cell * log2((cell*N+1)/(row*col+1)).
    # NOTE(review): this looks like a log-likelihood-ratio statistic with +1
    # smoothing inside the logs -- confirm the intended definition.
    information_feat['log_ratio']=2*(\
    information_feat['a']*np.log2((information_feat['a']*information_feat['N']+1)/(information_feat['x_star']*information_feat['star_y']+1))+\
    information_feat['b']*np.log2((information_feat['b']*information_feat['N']+1)/(information_feat['x_star']*information_feat['star_y_bar']+1))+\
    information_feat['c']*np.log2((information_feat['c']*information_feat['N']+1)/(information_feat['x_bar_star']*information_feat['star_y']+1))+\
    information_feat['d']*information_feat['overflow_check'])
    # local_mi is computed from the unclipped value; only afterwards are
    # non-positive ppmi values set to 0.
    information_feat['ppmi']=np.log2((information_feat['a']*information_feat['N']+1)/(information_feat['x_star']*information_feat['star_y']+1))
    information_feat['local_mi']=information_feat['a']*information_feat['ppmi']
    information_feat.loc[information_feat.ppmi<=0,'ppmi']=0
    information_feat.drop(['a','x_star','star_y','b','c','N','d','x_bar_star','star_y_bar','overflow_check'],axis=1,inplace=True)

    
    compound_features=pd.merge(productivity,information_feat,on=['modifier','head','time'])
    
    print('Frequency features')
            
    # Total counts per time slice in the modifier and head frames.
    modifier_time_counts=modifiers.groupby(['time'])['count'].sum().to_frame()
    modifier_time_counts.columns=['mod_time_count']
    
    head_time_counts=heads.groupby(['time'])['count'].sum().to_frame()
    head_time_counts.columns=['head_time_count']
    
    
    
    
    frequency_feat=pd.merge(XY.reset_index(),X_star.reset_index(),on=['modifier','time'])
    frequency_feat=frequency_feat.merge(Y_star.reset_index(),on=['head','time'])

    frequency_feat=frequency_feat.merge(compound_time_counts.reset_index(),on='time')
    frequency_feat=frequency_feat.merge(modifier_time_counts.reset_index(),on='time')
    frequency_feat=frequency_feat.merge(head_time_counts.reset_index(),on='time')

    frequency_feat.set_index(['modifier','head','time'],inplace=True)
    # Positional rename: relies on the merge order above producing the columns
    # a, x_star, star_y, N, mod_time_count, head_time_count in exactly this order.
    frequency_feat.columns=['comp_freq','mod_freq','head_freq','N','mod_time_count','head_time_count']
    frequency_feat['comp_tf']=np.log2(1+frequency_feat.comp_freq)
    
    # NOTE(review): for zero frequencies (the not-found rows) log2(0) is -inf,
    # and the fillna(0) below only replaces NaN, so -inf stays in the log_*
    # columns -- confirm this is intended.
    frequency_feat['log_comp_freq']=np.log2(frequency_feat.comp_freq/frequency_feat.N)

    frequency_feat['mod_tf']=np.log2(1+frequency_feat.mod_freq)
    frequency_feat['log_mod_freq']=np.log2(frequency_feat.mod_freq/frequency_feat.N)
    frequency_feat['log_mod_freq_new']=np.log2(frequency_feat.mod_freq/frequency_feat.mod_time_count)

    frequency_feat['head_tf']=np.log2(1+frequency_feat.head_freq)
    frequency_feat['log_head_freq']=np.log2(frequency_feat.head_freq/frequency_feat.N)
    frequency_feat['log_head_freq_new']=np.log2(frequency_feat.head_freq/frequency_feat.head_time_count)
    frequency_feat.fillna(0,inplace=True)
    frequency_feat.drop(['mod_time_count','head_time_count','N'],axis=1,inplace=True)

    
    compound_features=compound_features.merge(frequency_feat.reset_index(),on=['modifier','head','time'])
    
    return compound_features

In [10]:
def calculate_cosine_features(compounds,modifiers,heads,not_found_compounds_df):
    """Compute cosine-similarity features per (modifier, head, time).

    Each input frame is used as a sparse vector table: one row per
    (item, context, time) whose value is the count column. Vector norms are
    the square root of the summed squared counts; dot products are obtained by
    joining two tables on their shared context and summing the products.

    Parameters
    ----------
    compounds : pandas.DataFrame
        Uses ``modifier``, ``head``, ``context``, ``time`` and a count column.
    modifiers : pandas.DataFrame
        Uses ``modifier``, ``context``, ``time`` and a count column.
    heads : pandas.DataFrame
        Uses ``head``, ``context``, ``time`` and a count column.
    not_found_compounds_df : pandas.DataFrame
        Needs ``modifier``, ``head``, ``time``. For these only the similarity
        between the two constituents is computed.

    Returns
    -------
    pandas.DataFrame
        Columns ``modifier``, ``head``, ``time``, ``sim_with_modifier``,
        ``sim_with_head``, ``sim_bw_constituents``, ``beta``,
        ``geom_mean_sim``, ``arith_mean_sim`` and ``sim_cpf_0``, ``sim_cpf_25``,
        ``sim_cpf_50``, ``sim_cpf_75``, ``sim_cpf_100``, ``sim_cpf_beta``.

    Notes
    -----
    Side effect: the ``.columns`` of the three input frames are reassigned on
    the objects passed in. On return their count columns are named
    ``mod_count``, ``head_count`` and ``comp_count``.
    """
    
    # Normalise the count column name on all three inputs (on the caller's
    # objects, not on copies).
    mod_cols=modifiers.columns.tolist()
    mod_cols=['count' if 'count' in x else x for x in mod_cols]
    modifiers.columns=mod_cols

    head_cols=heads.columns.tolist()
    head_cols=['count' if 'count' in x else x for x in head_cols]
    heads.columns=head_cols

    comp_cols=compounds.columns.tolist()
    comp_cols=['count' if 'count' in x else x for x in comp_cols]
    compounds.columns=comp_cols
        
    # Euclidean norm of every compound / modifier / head vector per time slice.
    compound_denom=compounds.copy()
    compound_denom['count']=compound_denom['count']**2
    compound_denom=compound_denom.groupby(['modifier','head','time'])['count'].sum().to_frame()
    compound_denom['count']=np.sqrt(compound_denom['count'])
    compound_denom.columns=['compound_denom']

    modifier_denom=modifiers.copy()
    modifier_denom['count']=modifier_denom['count']**2
    modifier_denom=modifier_denom.groupby(['modifier','time'])['count'].sum().to_frame()
    modifier_denom['count']=np.sqrt(modifier_denom['count'])
    modifier_denom.columns=['modifier_denom']

    head_denom=heads.copy()
    head_denom['count']=head_denom['count']**2
    head_denom=head_denom.groupby(['head','time'])['count'].sum().to_frame()
    head_denom['count']=np.sqrt(head_denom['count'])
    head_denom.columns=['head_denom']

    # Give each count column a distinct name so the joins below do not produce
    # clashing column names. This again changes the caller's frames.
    mod_cols=modifiers.columns.tolist()
    mod_cols=['mod_count' if 'count' in x else x for x in mod_cols]
    modifiers.columns=mod_cols

    head_cols=heads.columns.tolist()
    head_cols=['head_count' if 'count' in x else x for x in head_cols]
    heads.columns=head_cols

    comp_cols=compounds.columns.tolist()
    comp_cols=['comp_count' if 'count' in x else x for x in comp_cols]
    compounds.columns=comp_cols
    
    print('Calculating cosine features')

    print('compound_modifier_sim')
    # cos(compound, modifier): dot product over shared contexts / product of norms.
    compound_modifier_sim=pd.merge(compounds,modifiers,on=["modifier","context",'time'])
    compound_modifier_sim['numerator']=compound_modifier_sim['comp_count']*compound_modifier_sim['mod_count']
    compound_modifier_sim=compound_modifier_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()
    compound_modifier_sim=pd.merge(compound_modifier_sim.reset_index(),compound_denom.reset_index(),on=["modifier","head",'time'])
    compound_modifier_sim=pd.merge(compound_modifier_sim,modifier_denom.reset_index(),on=['modifier','time'])
    compound_modifier_sim['sim_with_modifier']=compound_modifier_sim['numerator']/(compound_modifier_sim['compound_denom']*compound_modifier_sim['modifier_denom'])
    compound_modifier_sim.drop(['numerator','compound_denom','modifier_denom'],axis=1,inplace=True)

    print('compound_head_sim')
    # cos(compound, head), built the same way.
    compound_head_sim=pd.merge(compounds,heads,on=["head","context",'time'])
    compound_head_sim['numerator']=compound_head_sim['comp_count']*compound_head_sim['head_count']
    compound_head_sim=compound_head_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()
    compound_head_sim=pd.merge(compound_head_sim.reset_index(),compound_denom.reset_index(),on=["modifier","head",'time'])
    compound_head_sim=pd.merge(compound_head_sim,head_denom.reset_index(),on=['head','time'])
    compound_head_sim['sim_with_head']=compound_head_sim['numerator']/(compound_head_sim['compound_denom']*compound_head_sim['head_denom'])
    compound_head_sim.drop(['numerator','compound_denom','head_denom'],axis=1,inplace=True)
    
    cosine_sim_feat=pd.merge(compound_modifier_sim,compound_head_sim,on=['modifier','head','time'])
    
    print('constituent_sim')

    # cos(head, modifier) for compounds present in ``compounds``. Because the
    # join goes through the compound rows, only contexts in which the compound
    # itself occurs contribute to the numerator.
    constituent_sim=pd.merge(heads,compounds,on=["head","context","time"])
    #constituent_sim.drop('comp_count',axis=1,inplace=True)
    constituent_sim=pd.merge(constituent_sim,modifiers,on=["modifier","context","time"])
    constituent_sim['numerator']=constituent_sim['head_count']*constituent_sim['mod_count']
    constituent_sim=constituent_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()
    constituent_sim=pd.merge(constituent_sim.reset_index(),head_denom.reset_index(),on=["head","time"])
    constituent_sim=pd.merge(constituent_sim,modifier_denom.reset_index(),on=["modifier","time"])
    constituent_sim['sim_bw_constituents']=constituent_sim['numerator']/(constituent_sim['head_denom']*constituent_sim['modifier_denom'])
    constituent_sim.drop(['numerator','modifier_denom','head_denom'],axis=1,inplace=True)
    
    
    # cos(head, modifier) for the not-found compounds: here the head vector is
    # joined on (head, time) only, so all contexts of the head are considered.
    not_found_constituent_sim=pd.merge(not_found_compounds_df,heads,on=["head",'time'])
    not_found_constituent_sim=pd.merge(not_found_constituent_sim,modifiers,on=["modifier",'context','time'])
    not_found_constituent_sim['numerator']=not_found_constituent_sim['head_count']*not_found_constituent_sim['mod_count']
    not_found_constituent_sim=not_found_constituent_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()
    not_found_constituent_sim=pd.merge(not_found_constituent_sim.reset_index(),head_denom.reset_index(),on=["head",'time'])
    not_found_constituent_sim=pd.merge(not_found_constituent_sim,modifier_denom.reset_index(),on=["modifier",'time'])
    not_found_constituent_sim['sim_bw_constituents']=not_found_constituent_sim['numerator']/(not_found_constituent_sim['head_denom']*not_found_constituent_sim['modifier_denom'])
    not_found_constituent_sim.drop(['numerator','modifier_denom','head_denom'],axis=1,inplace=True)
    
    constituent_sim=pd.concat([constituent_sim,not_found_constituent_sim])

    
    # Right join: every row of constituent_sim is kept, so the not-found
    # compounds appear with NaN in sim_with_modifier / sim_with_head.
    cosine_sim_feat=pd.merge(cosine_sim_feat,constituent_sim,on=['modifier','head','time'],how='right')
    print('Cordeiro features')

    cosine_sim_feat['beta']=(cosine_sim_feat['sim_with_modifier']-cosine_sim_feat['sim_with_head']*cosine_sim_feat['sim_bw_constituents'])/\
    ((cosine_sim_feat['sim_with_modifier']+cosine_sim_feat['sim_with_head'])*(1-cosine_sim_feat['sim_bw_constituents']))
    # Infinite values anywhere in the frame become NaN; afterwards only the
    # ``beta`` column has its NaN filled (with 0.5).
    cosine_sim_feat.replace([np.inf, -np.inf], np.nan, inplace=True)
    na_values = {"beta": 0.5}
    cosine_sim_feat.fillna(value=na_values,inplace=True)

    cosine_sim_feat['geom_mean_sim']=np.sqrt(cosine_sim_feat['sim_with_modifier']*cosine_sim_feat['sim_with_head'])
    cosine_sim_feat['arith_mean_sim']=cosine_sim_feat[['sim_with_modifier', 'sim_with_head']].mean(axis=1)
    
    # Unit-normalised head and modifier vectors (count / norm).
    cpf_head_df=pd.merge(heads,head_denom.reset_index(),on=["head",'time'])
    cpf_head_df['head_value']=cpf_head_df['head_count']/cpf_head_df['head_denom']
    cpf_head_df.drop(['head_count','head_denom'],axis=1,inplace=True)

    cpf_modifier_df=pd.merge(modifiers,modifier_denom.reset_index(),on=["modifier",'time'])
    cpf_modifier_df['modifier_value']=cpf_modifier_df['mod_count']/cpf_modifier_df['modifier_denom']
    cpf_modifier_df.drop(['mod_count','modifier_denom'],axis=1,inplace=True)
    
    cpf_sim=pd.merge(cpf_head_df,compounds,on=["head","context","time"])
    cpf_sim=pd.merge(cpf_sim,cpf_modifier_df,on=["modifier","context","time"])
    cpf_sim=pd.merge(cpf_sim,cosine_sim_feat[['modifier','head','beta','time']],on=["modifier",'head','time'])

    # Composed vectors cp_* = beta*head_value + (1-beta)*modifier_value for
    # fixed weights 0, 0.25, 0.5, 0.75, 1 and for the per-compound beta.
    # NOTE(review): the cp_0 line is written differently from the others (it
    # divides by the sum of squared head values over the whole frame). Since
    # beta is 0.0 the first term is multiplied by zero, so the result equals
    # modifier_value unless that division yields NaN/inf -- confirm whether a
    # normalisation was intended here.
    beta=0.0
    cpf_sim['cp_0']=(beta*(cpf_sim['head_value'])/((cpf_sim['head_value']**2).sum()))+((1-beta)*cpf_sim['modifier_value'])

    beta=0.25
    cpf_sim['cp_25']=(beta*cpf_sim['head_value'])+((1-beta)*cpf_sim['modifier_value'])

    beta=0.5
    cpf_sim['cp_50']=(beta*cpf_sim['head_value'])+((1-beta)*cpf_sim['modifier_value'])

    beta=0.75
    cpf_sim['cp_75']=(beta*cpf_sim['head_value'])+((1-beta)*cpf_sim['modifier_value'])

    beta=1
    cpf_sim['cp_100']=(beta*cpf_sim['head_value'])+((1-beta)*cpf_sim['modifier_value'])

    cpf_sim['cp_beta']=(cpf_sim['beta']*cpf_sim['head_value'])+((1-cpf_sim['beta'])*cpf_sim['modifier_value'])

    # Norm of each composed vector per (modifier, head, time).
    temp_cdf_df=cpf_sim[['modifier','head','time','cp_0','cp_25','cp_50','cp_75','cp_100','cp_beta']].copy()
    temp_cdf_df['denom_cp_0']=temp_cdf_df['cp_0']**2
    temp_cdf_df['denom_cp_25']=temp_cdf_df['cp_25']**2
    temp_cdf_df['denom_cp_50']=temp_cdf_df['cp_50']**2
    temp_cdf_df['denom_cp_75']=temp_cdf_df['cp_75']**2
    temp_cdf_df['denom_cp_100']=temp_cdf_df['cp_100']**2
    temp_cdf_df['denom_cp_beta']=temp_cdf_df['cp_beta']**2

    cdf_denom=temp_cdf_df.groupby(['modifier','head','time'])[['denom_cp_0','denom_cp_25','denom_cp_50','denom_cp_75','denom_cp_100','denom_cp_beta']].sum()
    cdf_denom['denom_cp_0']=np.sqrt(cdf_denom['denom_cp_0'])
    cdf_denom['denom_cp_25']=np.sqrt(cdf_denom['denom_cp_25'])
    cdf_denom['denom_cp_50']=np.sqrt(cdf_denom['denom_cp_50'])
    cdf_denom['denom_cp_75']=np.sqrt(cdf_denom['denom_cp_75'])
    cdf_denom['denom_cp_100']=np.sqrt(cdf_denom['denom_cp_100'])
    cdf_denom['denom_cp_beta']=np.sqrt(cdf_denom['denom_cp_beta'])

    # Dot product of the compound vector with each composed vector.
    cpf_sim['num_cp_0']=cpf_sim['comp_count']*cpf_sim['cp_0']
    cpf_sim['num_cp_25']=cpf_sim['comp_count']*cpf_sim['cp_25']
    cpf_sim['num_cp_50']=cpf_sim['comp_count']*cpf_sim['cp_50']
    cpf_sim['num_cp_75']=cpf_sim['comp_count']*cpf_sim['cp_75']
    cpf_sim['num_cp_100']=cpf_sim['comp_count']*cpf_sim['cp_100']
    cpf_sim['num_cp_beta']=cpf_sim['comp_count']*cpf_sim['cp_beta']

    cpf_sim=cpf_sim.groupby(['modifier','head','time'])[['num_cp_0','num_cp_25','num_cp_50','num_cp_75','num_cp_100','num_cp_beta']].sum()
    # Both frames in the first merge are indexed by (modifier, head, time); the
    # keys are given as index level names. The column selection further down
    # expects modifier/head/time to be ordinary columns after the second merge.
    cpf_sim=pd.merge(cpf_sim,cdf_denom,on=['modifier','head','time'])
    cpf_sim=pd.merge(cpf_sim,compound_denom.reset_index(),on=["modifier","head","time"])
    cpf_sim['sim_cpf_0']=cpf_sim['num_cp_0']/(cpf_sim['denom_cp_0']*cpf_sim['compound_denom'])
    cpf_sim['sim_cpf_25']=cpf_sim['num_cp_25']/(cpf_sim['denom_cp_25']*cpf_sim['compound_denom'])
    cpf_sim['sim_cpf_50']=cpf_sim['num_cp_50']/(cpf_sim['denom_cp_50']*cpf_sim['compound_denom'])
    cpf_sim['sim_cpf_75']=cpf_sim['num_cp_75']/(cpf_sim['denom_cp_75']*cpf_sim['compound_denom'])
    cpf_sim['sim_cpf_100']=cpf_sim['num_cp_100']/(cpf_sim['denom_cp_100']*cpf_sim['compound_denom'])
    cpf_sim['sim_cpf_beta']=cpf_sim['num_cp_beta']/(cpf_sim['denom_cp_beta']*cpf_sim['compound_denom'])

    cpf_sim=cpf_sim[['modifier','head','time','sim_cpf_0','sim_cpf_25','sim_cpf_50','sim_cpf_75','sim_cpf_100','sim_cpf_beta']].copy()
    
    # Left join: rows without composed-vector similarities keep NaN there.
    cosine_sim_feat=cosine_sim_feat.merge(cpf_sim,on=["modifier",'head','time'],how='left')
    
    return cosine_sim_feat

In [11]:
def calculate_setting_similarity(compounds_aware,modifiers_aware,heads_aware,compounds_agnostic,modifiers_agnostic,heads_agnostic,compound_list_df):
    """Compare the "aware" and "agnostic" versions of the compound/modifier/head vectors.

    For compounds, modifiers and heads this computes (a) the ratio of summed
    counts and of row counts between the aware and the agnostic frame, and
    (b) the cosine similarity between the aware and the agnostic context
    vector of the same item in the same time slice.

    NOTE(review): what "aware" and "agnostic" refer to is not visible in this
    function -- presumably two extraction settings of the same corpus; confirm
    with the caller.

    Parameters
    ----------
    compounds_aware, compounds_agnostic : pandas.DataFrame
        Use ``modifier``, ``head``, ``context``, ``time`` and a count column.
    modifiers_aware, modifiers_agnostic : pandas.DataFrame
        Use ``modifier``, ``context``, ``time`` and a count column.
    heads_aware, heads_agnostic : pandas.DataFrame
        Use ``head``, ``context``, ``time`` and a count column.
    compound_list_df : pandas.DataFrame
        Outer-merged on ``modifier``, ``head``, ``time`` into the compound
        similarity table.

    Returns
    -------
    pandas.DataFrame
        ``modifier``, ``head``, ``time``, ``sim_bw_settings_comp``,
        ``sim_bw_settings_head``, ``sim_bw_settings_modifier`` plus the
        ``perc_token_*`` / ``perc_type_*`` ratio columns (and any further
        columns contributed by ``compound_list_df``).

    Notes
    -----
    Side effect: the ``.columns`` of all six input frames are reassigned on
    the objects passed in. On return their count columns are named
    ``mod_awr_count``, ``head_awr_count``, ``comp_awr_count``,
    ``mod_agn_count``, ``head_agn_count`` and ``comp_agn_count``.
    """
    
    # Normalise the count column name on all six inputs (on the caller's
    # objects, not on copies).
    mod_awr_cols=modifiers_aware.columns.tolist()
    mod_awr_cols=['count' if 'count' in x else x for x in mod_awr_cols]
    modifiers_aware.columns=mod_awr_cols

    head_awr_cols=heads_aware.columns.tolist()
    head_awr_cols=['count' if 'count' in x else x for x in head_awr_cols]
    heads_aware.columns=head_awr_cols

    comp_awr_cols=compounds_aware.columns.tolist()
    comp_awr_cols=['count' if 'count' in x else x for x in comp_awr_cols]
    compounds_aware.columns=comp_awr_cols

    
    mod_agn_cols=modifiers_agnostic.columns.tolist()
    mod_agn_cols=['count' if 'count' in x else x for x in mod_agn_cols]
    modifiers_agnostic.columns=mod_agn_cols
    
    head_agn_cols=heads_agnostic.columns.tolist()
    head_agn_cols=['count' if 'count' in x else x for x in head_agn_cols]
    heads_agnostic.columns=head_agn_cols
    
    comp_agn_cols=compounds_agnostic.columns.tolist()
    comp_agn_cols=['count' if 'count' in x else x for x in comp_agn_cols]
    compounds_agnostic.columns=comp_agn_cols
    
    print('Calculating setting frequency values')
    
    
    # Ratio aware/agnostic of the summed counts (perc_token_*) and of the
    # number of rows (perc_type_*). The division aligns the two aggregated
    # frames on their group index, so groups present on one side only give NaN.
    num_modifiers_features_df=(modifiers_aware.groupby(['modifier','time'])['count'].agg(perc_token_modifier='sum', perc_type_modifier='size')/modifiers_agnostic.groupby(['modifier','time'])['count'].agg(perc_token_modifier='sum', perc_type_modifier='size')).reset_index()
    num_heads_features_df=(heads_aware.groupby(['head','time'])['count'].agg(perc_token_head='sum', perc_type_head='size')/heads_agnostic.groupby(['head','time'])['count'].agg(perc_token_head='sum', perc_type_head='size')).reset_index()

    num_compounds_features_df=(compounds_aware.groupby(['modifier','head','time'])['count'].agg(perc_token_comp='sum', perc_type_comp='size')/compounds_agnostic.groupby(['modifier','head','time'])['count'].agg(perc_token_comp='sum', perc_type_comp='size')).reset_index()
    num_compounds_features_df=pd.merge(num_compounds_features_df,num_modifiers_features_df,on=['modifier','time'])
    num_compounds_features_df=pd.merge(num_compounds_features_df,num_heads_features_df,on=['head','time'])
    
    print('Calculating denominator values')

    # Euclidean norm (sqrt of summed squared counts) of every vector, for each
    # of the six frames.
    compound_aware_denom=compounds_aware.copy()
    compound_aware_denom['count']=compound_aware_denom['count']**2
    compound_aware_denom=compound_aware_denom.groupby(['modifier','head','time'])['count'].sum().to_frame()
    compound_aware_denom['count']=np.sqrt(compound_aware_denom['count'])
    compound_aware_denom.columns=['compound_awr_denom']
    
    compound_agnostic_denom=compounds_agnostic.copy()
    compound_agnostic_denom['count']=compound_agnostic_denom['count']**2
    compound_agnostic_denom=compound_agnostic_denom.groupby(['modifier','head','time'])['count'].sum().to_frame()
    compound_agnostic_denom['count']=np.sqrt(compound_agnostic_denom['count'])
    compound_agnostic_denom.columns=['compound_agn_denom']
    

    modifier_aware_denom=modifiers_aware.copy()
    modifier_aware_denom['count']=modifier_aware_denom['count']**2
    modifier_aware_denom=modifier_aware_denom.groupby(['modifier','time'])['count'].sum().to_frame()
    modifier_aware_denom['count']=np.sqrt(modifier_aware_denom['count'])
    modifier_aware_denom.columns=['modifier_awr_denom']
    
    modifier_agnostic_denom=modifiers_agnostic.copy()
    modifier_agnostic_denom['count']=modifier_agnostic_denom['count']**2
    modifier_agnostic_denom=modifier_agnostic_denom.groupby(['modifier','time'])['count'].sum().to_frame()
    modifier_agnostic_denom['count']=np.sqrt(modifier_agnostic_denom['count'])
    modifier_agnostic_denom.columns=['modifier_agn_denom']
    
    
    head_aware_denom=heads_aware.copy()
    head_aware_denom['count']=head_aware_denom['count']**2
    head_aware_denom=head_aware_denom.groupby(['head','time'])['count'].sum().to_frame()
    head_aware_denom['count']=np.sqrt(head_aware_denom['count'])
    head_aware_denom.columns=['head_awr_denom']
    
    head_agnostic_denom=heads_agnostic.copy()
    head_agnostic_denom['count']=head_agnostic_denom['count']**2
    head_agnostic_denom=head_agnostic_denom.groupby(['head','time'])['count'].sum().to_frame()
    head_agnostic_denom['count']=np.sqrt(head_agnostic_denom['count'])
    head_agnostic_denom.columns=['head_agn_denom'] 
    
    
    # Give each count column a distinct name so the aware/agnostic joins below
    # do not produce clashing column names. This again changes the caller's frames.
    mod_awr_cols=modifiers_aware.columns.tolist()
    mod_awr_cols=['mod_awr_count' if 'count' in x else x for x in mod_awr_cols]
    modifiers_aware.columns=mod_awr_cols

    head_awr_cols=heads_aware.columns.tolist()
    head_awr_cols=['head_awr_count' if 'count' in x else x for x in head_awr_cols]
    heads_aware.columns=head_awr_cols

    comp_awr_cols=compounds_aware.columns.tolist()
    comp_awr_cols=['comp_awr_count' if 'count' in x else x for x in comp_awr_cols]
    compounds_aware.columns=comp_awr_cols

    
    mod_agn_cols=modifiers_agnostic.columns.tolist()
    mod_agn_cols=['mod_agn_count' if 'count' in x else x for x in mod_agn_cols]
    modifiers_agnostic.columns=mod_agn_cols
    
    head_agn_cols=heads_agnostic.columns.tolist()
    head_agn_cols=['head_agn_count' if 'count' in x else x for x in head_agn_cols]
    heads_agnostic.columns=head_agn_cols
    
    comp_agn_cols=compounds_agnostic.columns.tolist()
    comp_agn_cols=['comp_agn_count' if 'count' in x else x for x in comp_agn_cols]
    compounds_agnostic.columns=comp_agn_cols

    print('Calculating cosine setting features')

    # cos(aware compound vector, agnostic compound vector).
    compound_setting_sim=pd.merge(compounds_aware,compounds_agnostic,on=["modifier",'head',"context",'time'])
    compound_setting_sim['numerator']=compound_setting_sim['comp_awr_count']*compound_setting_sim['comp_agn_count']
    compound_setting_sim=compound_setting_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()

    compound_setting_sim=pd.merge(compound_setting_sim.reset_index(),compound_aware_denom.reset_index(),on=["modifier","head",'time'])
    compound_setting_sim=pd.merge(compound_setting_sim,compound_agnostic_denom.reset_index(),on=["modifier","head",'time'])

    compound_setting_sim['sim_bw_settings_comp']=compound_setting_sim['numerator']/(compound_setting_sim['compound_awr_denom']*compound_setting_sim['compound_agn_denom'])
    
    # Outer join with the compound list, so listed compounds without a
    # similarity get a row with NaN.
    # NOTE(review): the merges that follow use the default inner join, so such
    # rows survive only if matching head, modifier and ratio rows exist --
    # confirm this is the intended filtering.
    compound_setting_sim=pd.merge(compound_setting_sim,compound_list_df,on=["modifier",'head','time'],how='outer')

    compound_setting_sim.set_index(['modifier','head','time'],inplace=True)
    compound_setting_sim.drop(['numerator','compound_awr_denom','compound_agn_denom'],axis=1,inplace=True)

    # cos(aware head vector, agnostic head vector).
    head_setting_sim=pd.merge(heads_aware,heads_agnostic,on=['head',"context",'time'])
    head_setting_sim['numerator']=head_setting_sim['head_awr_count']*head_setting_sim['head_agn_count']
    head_setting_sim=head_setting_sim.groupby(['head','time'])['numerator'].sum().to_frame()

    head_setting_sim=pd.merge(head_setting_sim.reset_index(),head_aware_denom.reset_index(),on=["head",'time'])
    head_setting_sim=pd.merge(head_setting_sim,head_agnostic_denom.reset_index(),on=["head",'time'])

    head_setting_sim['sim_bw_settings_head']=head_setting_sim['numerator']/(head_setting_sim['head_awr_denom']*head_setting_sim['head_agn_denom'])
    head_setting_sim.set_index(['head','time'],inplace=True)
    head_setting_sim.drop(['numerator','head_awr_denom','head_agn_denom'],axis=1,inplace=True)
    
    compound_setting_sim=pd.merge(compound_setting_sim.reset_index(),head_setting_sim.reset_index(),on=["head",'time'])


    # cos(aware modifier vector, agnostic modifier vector).
    modifier_setting_sim=pd.merge(modifiers_aware,modifiers_agnostic,on=['modifier',"context",'time'])
    modifier_setting_sim['numerator']=modifier_setting_sim['mod_awr_count']*modifier_setting_sim['mod_agn_count']
    modifier_setting_sim=modifier_setting_sim.groupby(['modifier','time'])['numerator'].sum().to_frame()

    modifier_setting_sim=pd.merge(modifier_setting_sim.reset_index(),modifier_aware_denom.reset_index(),on=["modifier",'time'])
    modifier_setting_sim=pd.merge(modifier_setting_sim,modifier_agnostic_denom.reset_index(),on=["modifier",'time'])

    modifier_setting_sim['sim_bw_settings_modifier']=modifier_setting_sim['numerator']/(modifier_setting_sim['modifier_awr_denom']*modifier_setting_sim['modifier_agn_denom'])
    modifier_setting_sim.set_index(['modifier','time'],inplace=True)
    modifier_setting_sim.drop(['numerator','modifier_awr_denom','modifier_agn_denom'],axis=1,inplace=True)

    compound_setting_sim=pd.merge(compound_setting_sim,modifier_setting_sim.reset_index(),on=["modifier",'time'])
    
    # Attach the count/row ratio features computed at the top.
    compound_setting_sim=pd.merge(compound_setting_sim,num_compounds_features_df,on=["modifier","head",'time'])
    
    return compound_setting_sim

In [12]:
def merge_comp_ratings(features_df):
    """Join the per-period compound features with the compositionality ratings.

    ``features_df`` holds one row per modifier/head/time. It is pivoted so that
    each modifier/head pair becomes a single row and every feature column is
    relabelled ``"<feature>:<time>"``. The wide table is right-merged with the
    module-level ``comp_ratings_df``, so every rated compound is kept even when
    no features were found for it.

    Returns:
        tuple: ``(ratings_with_gaps, ratings_imputed)``. The first table keeps
        missing feature values as NaN; the second is built from the wide table
        after each column was filled by a median ``SimpleImputer``.
    """
    wide_df=pd.pivot_table(features_df, index=['modifier','head'], columns=['time'])

    # Collapse the two-level (feature, time) header into flat "feature:time" labels.
    feature_level=wide_df.columns.get_level_values(0)
    time_level=wide_df.columns.get_level_values(1)
    wide_df.columns=[feature_level[pos]+":"+str(period) for pos,period in enumerate(time_level)]

    ratings_with_gaps=wide_df.reset_index().merge(comp_ratings_df,on=['modifier','head'],how='right')

    # The frame rebuilt from the imputer output has default labels, so the
    # column names and the modifier/head index are copied back onto it.
    median_filled=pd.DataFrame(SimpleImputer(strategy="median").fit_transform(wide_df))
    median_filled.columns=wide_df.columns
    median_filled.index=wide_df.index

    ratings_imputed=median_filled.reset_index().merge(comp_ratings_df,on=['modifier','head'],how='right')

    return ratings_with_gaps,ratings_imputed

In [13]:
def process_decades_compound(dec_list,input_dir,ctype='compound'):
    """Load the compound context counts for ``dec_list``, building the cache if needed.

    Three sources are tried in order:

    1. the cached file for the current ``args.temporal`` binning;
    2. the cached decade-level (``10_...``) file, which is re-binned to
       ``args.temporal`` (skipped when ``args.temporal`` is 10000);
    3. the individual ``<decade>.pkl.bz2`` files, which are read one by one.

    Sources 2 and 3 write their result to the cache path used by source 1.
    Afterwards the optional cut-off (``args.cutoff``) and PPMI (``args.ppmi``)
    steps are applied.

    Args:
        dec_list: decades to load; the first entry also names the cache file.
        input_dir: directory containing the ``<ctype>s`` sub-directory.
        ctype: file family to read (``'compound'`` by default).

    Returns:
        The DataFrame of counts after the optional cut-off/PPMI steps.
    """
    folder=f"{input_dir}/{ctype}s"
    binned_path=f"{folder}/{args.temporal}_{dec_list[0]}_{tag_str}.pkl.bz2"
    decade_path=f"{folder}/10_{dec_list[0]}_{tag_str}.pkl.bz2"
    group_keys=['modifier','head','time','context']

    needs_saving=True

    if os.path.exists(binned_path):
        print(f'Reading file {ctype}')
        complete_df=pd.read_pickle(binned_path)
        needs_saving=False

    elif os.path.exists(decade_path) and args.temporal!=10000:
        print(f'Reading decades file {ctype}s/10_{dec_list[0]}_{tag_str}.pkl.bz2')
        complete_df=pd.read_pickle(decade_path)

        print(f'Reducing to {args.temporal}')
        # Snap every time stamp down to the start of its bin, then merge the bins.
        complete_df['time']=complete_df['time']-complete_df['time']%args.temporal
        complete_df=complete_df.groupby(group_keys)['count'].sum().reset_index()

    else:
        per_decade=[]
        for dec in dec_list:
            print(dec)
            decade_df=pd.read_pickle(f'{folder}/{dec}.pkl.bz2')
            if not args.tag:
                decade_df=compound_tag_remover(decade_df)
            # Label the rows with the start of the bin this decade falls into.
            decade_df['time']=dec-dec%args.temporal
            per_decade.append(decade_df)

        print('Done reading compound dataframes')
        complete_df=pd.concat(per_decade,ignore_index=True)

        # With decade bins each input file is already its own bin, so the
        # aggregation is only run for other bin widths.
        if args.temporal!=10:
            complete_df=complete_df.groupby(group_keys)['count'].sum().reset_index()

    if needs_saving:
        print("Saving file")
        complete_df.to_pickle(binned_path)

    if args.cutoff!=0:
        print(f'Cut-off: {args.cutoff}')
        complete_df=process_cutoff_compound(complete_df)
    else:
        print('No cut-off applied')

    if args.ppmi:
        print('Applying PPMI')
        complete_df=ppmi(complete_df)

    return complete_df


def process_decades_constituent(dec_list,input_dir,ctype='word'):
    """Load the constituent context counts for ``dec_list``, building the cache if needed.

    Three sources are tried in order:

    1. the cached file for the current ``args.temporal`` binning;
    2. the cached decade-level (``10_...``) file, which is re-binned to
       ``args.temporal`` (skipped when ``args.temporal`` is 10000);
    3. the individual ``<decade>.pkl.bz2`` files, which are read one by one.

    Sources 2 and 3 write their result to the cache path that source 1 reads.
    The cache file name includes ``tag_str`` in every branch, so a cache built
    from the per-decade files is found again on the next call and tagged and
    untagged runs do not share a file.

    Afterwards the optional cut-off (``args.cutoff``) and PPMI (``args.ppmi``)
    steps are applied.

    Args:
        dec_list: decades to load; the first entry also names the cache file.
        input_dir: directory containing the ``<ctype>s`` sub-directory.
        ctype: constituent column / file family (``'word'`` by default; the
            visible callers also pass ``'modifier'`` and ``'head'``).

    Returns:
        The DataFrame of counts after the optional cut-off/PPMI steps.
    """
    folder=f"{input_dir}/{ctype}s"
    # Single definition of the cache location so lookup and save cannot diverge.
    binned_path=f"{folder}/{args.temporal}_{dec_list[0]}_{tag_str}.pkl.bz2"
    decade_path=f"{folder}/10_{dec_list[0]}_{tag_str}.pkl.bz2"
    group_keys=[ctype,'time','context']

    if os.path.exists(binned_path):
        print(f'Reading file {ctype}')
        complete_df=pd.read_pickle(binned_path)

    elif os.path.exists(decade_path) and args.temporal!=10000:
        print(f'Reading decades file {ctype}s/10_{dec_list[0]}_{tag_str}.pkl.bz2')
        complete_df=pd.read_pickle(decade_path)

        print(f'Reducing to {args.temporal}')
        # Snap every time stamp down to the start of its bin, then merge the bins.
        complete_df['time']=complete_df['time']-complete_df['time']%args.temporal
        complete_df=complete_df.groupby(group_keys)['count'].sum().to_frame().reset_index()

        print("Saving file")
        complete_df.to_pickle(binned_path)

    else:

        df_list=[]

        for dec in dec_list:
            cur_df=pd.read_pickle(f'{folder}/{dec}.pkl.bz2')
            if not args.tag:
                cur_df=constituent_tag_remover(cur_df,ctype)
            cur_df['time']=dec
            cur_df['time']=cur_df['time']-cur_df['time']%args.temporal
            df_list.append(cur_df)

        print(f'Done reading {ctype} dataframes')
        complete_df=pd.concat(df_list,ignore_index=True)

        # With decade bins each input file is already its own bin, so the
        # aggregation is only run for other bin widths.
        if args.temporal!=10:
            complete_df=complete_df.groupby(group_keys)['count'].sum().to_frame().reset_index()

        print("Saving file")
        # Previously saved without the tag suffix, so the lookup above never
        # found this file again.
        complete_df.to_pickle(binned_path)

    if args.cutoff==0:
        print('No cut-off applied')
    else:
        print(f'Cut-off: {args.cutoff}')
        # NOTE(review): the compound loader calls process_cutoff_compound without
        # a ctype, and older commented-out code used process_cutoff_constituent
        # for constituents - confirm this is the intended helper.
        complete_df=process_cutoff_compound(complete_df,ctype=ctype)

    if args.ppmi:
        print('Applying PPMI')
        complete_df=ppmi(complete_df)

    return complete_df

In [14]:
def _left_only_rows(expected_df,observed_df,keys):
    """Return the rows of ``expected_df`` whose ``keys`` never occur in ``observed_df``."""
    merged=expected_df.merge(observed_df, on=keys, how='outer', suffixes=['', '_'], indicator=True)
    # A non-mutating drop avoids modifying a slice of ``merged`` in place.
    return merged.loc[merged['_merge']=='left_only'].drop('_merge',axis=1)


def _rated_items_missing_from(compounds):
    """Find rated compounds/modifiers/heads that have no rows in ``compounds`` per time period.

    Every entry of the module-level ``comp_ratings_df`` is crossed with every
    time value present in ``compounds``; the combinations that do not occur in
    ``compounds`` are reported as "not found".

    Returns:
        tuple: ``(compound_list, all_comps, not_found_compounds,
        not_found_modifiers, not_found_heads)`` where ``compound_list`` is the
        rated modifier/head pairs crossed with the time values and
        ``all_comps`` is the distinct modifier/head/time triples observed.
    """
    timespans=pd.DataFrame(compounds.time.unique())
    timespans.columns=['time']

    compound_list=comp_ratings_df[['modifier','head']].copy().merge(timespans,how='cross')
    modifier_list=comp_ratings_df[['modifier']].drop_duplicates().merge(timespans,how='cross')
    head_list=comp_ratings_df[['head']].drop_duplicates().merge(timespans,how='cross')

    all_comps=compounds[['modifier','head','time']].drop_duplicates()
    all_mods=compounds[['modifier','time']].drop_duplicates()
    all_heads=compounds[['head','time']].drop_duplicates()

    not_found_compounds=_left_only_rows(compound_list,all_comps,['modifier','head','time'])
    not_found_modifiers=_left_only_rows(modifier_list,all_mods,['modifier','time'])
    not_found_heads=_left_only_rows(head_list,all_heads,['head','time'])

    return compound_list,all_comps,not_found_compounds,not_found_modifiers,not_found_heads


def _constituents_as(constituents,role):
    """Copy ``constituents`` with every column whose label contains 'word' renamed to ``role``."""
    renamed=constituents.copy()
    renamed.columns=[role if 'word' in label else label for label in renamed.columns]
    return renamed


def _setting_features(compounds,modifiers,heads,all_comps,not_found_compounds,not_found_modifiers,not_found_heads,rated_modifiers,rated_heads):
    """Compute compound and cosine features for one setting (aware or agnostic).

    The compound features are calculated on the full frames and then filtered
    to rated modifiers/heads; the cosine features are calculated on frames that
    were filtered to rated modifiers/heads first.

    Returns:
        tuple: ``(compound_features, cosine_features, reduced_compounds,
        reduced_modifiers, reduced_heads)``.
    """
    compound_features=calculate_compound_features(compounds,modifiers,heads,all_comps,not_found_compounds,not_found_modifiers,not_found_heads)
    compound_features=compound_features.loc[(compound_features.modifier.isin(rated_modifiers))&(compound_features['head'].isin(rated_heads))]

    reduced_compounds=compounds.loc[(compounds.modifier.isin(rated_modifiers))&(compounds['head'].isin(rated_heads))]
    reduced_modifiers=modifiers.loc[modifiers.modifier.isin(rated_modifiers)]
    reduced_heads=heads.loc[heads['head'].isin(rated_heads)]

    cosine_features=calculate_cosine_features(reduced_compounds,reduced_modifiers,reduced_heads,not_found_compounds)

    return compound_features,cosine_features,reduced_compounds,reduced_modifiers,reduced_heads


def feature_extractor_dec(dec_list):
    """Build the compound-aware and compound-agnostic feature tables for ``dec_list``.

    The agnostic setting uses the ``'phrase'`` and ``'word'`` files; the aware
    setting uses the ``'compound'``, ``'modifier'`` and ``'head'`` files. Cut-off
    and PPMI handling happen inside the ``process_decades_*`` loaders, so no
    further filtering is done here.

    For each setting the compound and cosine features are computed, then both
    are combined with the aware-vs-agnostic setting similarity.

    Args:
        dec_list: decades to load and process together.

    Returns:
        tuple: ``(features_aware_df, features_agnostic_df)``, each keyed by
        modifier/head/time.
    """
    print(f'Current dec list {dec_list}')

    compounds_agnostic=process_decades_compound(dec_list,f'{args.inputdir}',ctype="phrase")

    constituents=process_decades_constituent(dec_list,f'{args.inputdir}',ctype='word')

    compounds_aware=process_decades_compound(dec_list,f'{args.inputdir}',ctype="compound")

    modifiers_aware=process_decades_constituent(dec_list,f'{args.inputdir}',ctype='modifier')

    heads_aware=process_decades_constituent(dec_list,f'{args.inputdir}',ctype='head')

    (_,all_comps_aware,
     not_found_compounds_aware_df,
     not_found_modifiers_aware_df,
     not_found_heads_aware_df)=_rated_items_missing_from(compounds_aware)

    (compound_list_agnostic_df,all_comps_agnostic,
     not_found_compounds_agnostic_df,
     not_found_modifiers_agnostic_df,
     not_found_heads_agnostic_df)=_rated_items_missing_from(compounds_agnostic)

    # In the agnostic setting the same word table serves as both heads and modifiers.
    heads_agnostic=_constituents_as(constituents,'head')
    modifiers_agnostic=_constituents_as(constituents,'modifier')

    print('Calculating features')

    unique_mod_list=comp_ratings_df[['modifier']].drop_duplicates()['modifier'].to_list()
    unique_head_list=comp_ratings_df[['head']].drop_duplicates()['head'].to_list()

    print('CompoundAware features')

    (compound_features_aware,cosine_sim_feat_aware,
     reduced_compounds_aware,reduced_modifiers_aware,reduced_heads_aware)=_setting_features(
        compounds_aware,modifiers_aware,heads_aware,all_comps_aware,
        not_found_compounds_aware_df,not_found_modifiers_aware_df,not_found_heads_aware_df,
        unique_mod_list,unique_head_list)

    print('CompoundAgnostic features')

    (compound_features_agnostic,cosine_sim_feat_agnostic,
     reduced_compounds_agnostic,reduced_modifiers_agnostic,reduced_heads_agnostic)=_setting_features(
        compounds_agnostic,modifiers_agnostic,heads_agnostic,all_comps_agnostic,
        not_found_compounds_agnostic_df,not_found_modifiers_agnostic_df,not_found_heads_agnostic_df,
        unique_mod_list,unique_head_list)

    print('Setting features')
    compound_setting_sim=calculate_setting_similarity(reduced_compounds_aware,reduced_modifiers_aware,reduced_heads_aware,reduced_compounds_agnostic,reduced_modifiers_agnostic,reduced_heads_agnostic,compound_list_agnostic_df)

    print('Combining all compound aware features')

    features_aware_df=pd.merge(cosine_sim_feat_aware,compound_setting_sim,on=['modifier','head','time'],how='outer')
    features_aware_df=features_aware_df.merge(compound_features_aware,on=['modifier','head','time'],how='left')

    print('Combining all compound agnostic features')

    features_agnostic_df=pd.merge(cosine_sim_feat_agnostic,compound_setting_sim,on=['modifier','head','time'],how='outer')
    features_agnostic_df=features_agnostic_df.merge(compound_features_agnostic,on=['modifier','head','time'],how='left')

    return features_aware_df,features_agnostic_df

In [15]:
# Decades are processed in three groups: 1820-1890, 1900-1990 and 2000-2010.
total_dec_list=[list(range(1820,1900,10)),list(range(1900,2000,10)),list(range(2000,2020,10))]

In [16]:
# With temporal information removed (10000) all decades form a single group;
# otherwise they are processed in three consecutive groups.
if args.temporal==10000:
    total_dec_list=[list(range(1820,2020,10))]
else:
    total_dec_list=[list(range(1820,1900,10)),list(range(1900,2000,10)),list(range(2000,2020,10))]

# Labels derived from the command-line switches; tag_str is part of the cache
# file names used by the process_decades_* loaders.
ppmi_str="PPMI" if args.ppmi else "RAW"
tag_str='Tagged' if args.tag else 'UnTagged'

temp_cutoff_str=f'{args.temporal}_{args.cutoff}'
temp_cutoff_str

'10_0'

In [17]:
# Use the first decade group for the step-by-step cells that follow.
dec_list=total_dec_list[0]

In [18]:

    
    # Scratch copy of the loading step of feature_extractor_dec, run at cell
    # level so the loaded frames stay available as notebook globals.
    print(f'Current dec list {dec_list}')
    
    # Agnostic setting: compounds come from the 'phrase' files, constituents
    # from the 'word' files.
    compounds_agnostic=process_decades_compound(dec_list,f'{args.inputdir}',ctype="phrase")

    constituents=process_decades_constituent(dec_list,f'{args.inputdir}',ctype='word')
    
    
    # Aware setting: separate 'compound', 'modifier' and 'head' files.
    compounds_aware=process_decades_compound(dec_list,f'{args.inputdir}',ctype="compound")

    modifiers_aware=process_decades_constituent(dec_list,f'{args.inputdir}',ctype='modifier')

    heads_aware=process_decades_constituent(dec_list,f'{args.inputdir}',ctype='head')
    

Current dec list [1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890]
Reading file phrase
No cut-off applied
Reading file word
No cut-off applied
Reading file compound
No cut-off applied
Reading file modifier
No cut-off applied
Reading file head
No cut-off applied


In [20]:
    # Scratch copy of the "not found" bookkeeping from feature_extractor_dec,
    # run at cell level; every name bound here is read by later cells.

    # --- Aware setting ---
    # All time values present in the aware compound data.
    timespan_list_aware_df=pd.DataFrame(compounds_aware.time.unique())
    timespan_list_aware_df.columns=['time']

    # Cross every rated compound / modifier / head with every time value.
    compound_list_aware_df=comp_ratings_df[['modifier','head']].copy()
    compound_list_aware_df=compound_list_aware_df.merge(timespan_list_aware_df,how='cross')

    modifier_list_aware_df=comp_ratings_df[['modifier']].drop_duplicates().copy()
    modifier_list_aware_df=modifier_list_aware_df.merge(timespan_list_aware_df,how='cross')

    head_list_aware_df=comp_ratings_df[['head']].drop_duplicates().copy()
    head_list_aware_df=head_list_aware_df.merge(timespan_list_aware_df,how='cross')
            
    # Distinct key combinations that actually occur in the data.
    all_comps_aware=compounds_aware[['modifier','head','time']].copy()
    all_comps_aware.drop_duplicates(inplace=True)
           
    all_mods_aware=compounds_aware[['modifier','time']].copy()
    all_mods_aware.drop_duplicates(inplace=True)
            
    all_heads_aware=compounds_aware[['head','time']].copy()
    all_heads_aware.drop_duplicates(inplace=True)
            
    # Anti-join: keep the rated combinations that are absent from the data
    # (rows marked 'left_only' by the merge indicator).
    not_found_compounds_aware_df=compound_list_aware_df.merge(all_comps_aware, on=['modifier','head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_compounds_aware_df=not_found_compounds_aware_df.loc[not_found_compounds_aware_df['_merge']=='left_only']
    not_found_compounds_aware_df.drop('_merge',axis=1,inplace=True)
            
            
    not_found_modifiers_aware_df=modifier_list_aware_df.merge(all_mods_aware, on=['modifier','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_modifiers_aware_df=not_found_modifiers_aware_df.loc[not_found_modifiers_aware_df['_merge']=='left_only']
    not_found_modifiers_aware_df.drop('_merge',axis=1,inplace=True)
            
    not_found_heads_aware_df=head_list_aware_df.merge(all_heads_aware, on=['head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_heads_aware_df=not_found_heads_aware_df.loc[not_found_heads_aware_df['_merge']=='left_only']
    not_found_heads_aware_df.drop('_merge',axis=1,inplace=True)

    
    
    # --- Agnostic setting: same steps on the agnostic compound data ---
    timespan_list_agnostic_df=pd.DataFrame(compounds_agnostic.time.unique())
    timespan_list_agnostic_df.columns=['time']

    compound_list_agnostic_df=comp_ratings_df[['modifier','head']].copy()
    compound_list_agnostic_df=compound_list_agnostic_df.merge(timespan_list_agnostic_df,how='cross')

    modifier_list_agnostic_df=comp_ratings_df[['modifier']].drop_duplicates().copy()
    modifier_list_agnostic_df=modifier_list_agnostic_df.merge(timespan_list_agnostic_df,how='cross')

    head_list_agnostic_df=comp_ratings_df[['head']].drop_duplicates().copy()
    head_list_agnostic_df=head_list_agnostic_df.merge(timespan_list_agnostic_df,how='cross')
            
    all_comps_agnostic=compounds_agnostic[['modifier','head','time']].copy()
    all_comps_agnostic.drop_duplicates(inplace=True)
           
    all_mods_agnostic=compounds_agnostic[['modifier','time']].copy()
    all_mods_agnostic.drop_duplicates(inplace=True)
            
    all_heads_agnostic=compounds_agnostic[['head','time']].copy()
    all_heads_agnostic.drop_duplicates(inplace=True)
            
    not_found_compounds_agnostic_df=compound_list_agnostic_df.merge(all_comps_agnostic, on=['modifier','head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_compounds_agnostic_df=not_found_compounds_agnostic_df.loc[not_found_compounds_agnostic_df['_merge']=='left_only']
    not_found_compounds_agnostic_df.drop('_merge',axis=1,inplace=True)
                    
    not_found_modifiers_agnostic_df=modifier_list_agnostic_df.merge(all_mods_agnostic, on=['modifier','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_modifiers_agnostic_df=not_found_modifiers_agnostic_df.loc[not_found_modifiers_agnostic_df['_merge']=='left_only']
    not_found_modifiers_agnostic_df.drop('_merge',axis=1,inplace=True)
            
    not_found_heads_agnostic_df=head_list_agnostic_df.merge(all_heads_agnostic, on=['head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_heads_agnostic_df=not_found_heads_agnostic_df.loc[not_found_heads_agnostic_df['_merge']=='left_only']
    not_found_heads_agnostic_df.drop('_merge',axis=1,inplace=True)
    
    
    # The agnostic heads and modifiers are both copies of the word table, with
    # any column whose label contains 'word' renamed to the constituent role.
    heads_agnostic=constituents.copy()
    heads_agnostic_cols=heads_agnostic.columns
    heads_agnostic_cols=['head' if 'word' in x else x for x in heads_agnostic_cols]
    heads_agnostic.columns=heads_agnostic_cols

    modifiers_agnostic=constituents.copy()
    modifiers_agnostic_cols=modifiers_agnostic.columns
    modifiers_agnostic_cols=['modifier' if 'word' in x else x for x in modifiers_agnostic_cols]
    modifiers_agnostic.columns=modifiers_agnostic_cols

In [None]:
# Trial run of the compound feature calculation on the aware frames.
# NOTE(review): this cell sits above the cell that defines
# calculate_compound_features further down - confirm the definition is run first
# on a fresh kernel.
compound_features=calculate_compound_features(compounds_aware,modifiers_aware,heads_aware,all_comps_aware,not_found_compounds_aware_df,not_found_modifiers_aware_df,not_found_heads_aware_df)

Calculating productivity features


In [30]:
# Display a debugging frame.
# NOTE(review): temp_df is not assigned in the cells around this one - confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
temp_df

Unnamed: 0,head,time,head_prod,comp_size,head_family_size,head_family_size_new,modifier,mod_prod,mod_family_size,mod_family_size_new
0,!_NOUN,1880,2,52227618,-24.638310,-22.983156,zo_PROPN,10,-22.316381,-20.608826
1,!_PROPN,1880,14,52227618,-21.830955,-20.175801,del_PROPN,2007,-14.667485,-12.959929
2,!_PROPN,1880,14,52227618,-21.830955,-20.175801,hoy_PROPN,16,-21.638310,-19.930754
3,!_PROPN,1880,14,52227618,-21.830955,-20.175801,la_PROPN,22764,-11.163843,-9.456287
4,!_PROPN,1880,14,52227618,-21.830955,-20.175801,loe_PROPN,14,-21.830955,-20.123399
...,...,...,...,...,...,...,...,...,...,...
34315675,■_NOUN,1870,124,45351820,-18.480461,-16.890296,•_NOUN,49,-19.819947,-18.160276
34315676,■_NOUN,1870,124,45351820,-18.480461,-16.890296,■_NOUN,121,-18.515794,-16.856122
34315677,♦_PROPN,1870,1,45351820,-25.434657,-23.844493,♦_PROPN,14,-21.627302,-19.967631
34315678,ー_NOUN,1870,10,45351820,-22.112729,-20.522565,population_PROPN,236,-17.552014,-15.892342


In [29]:
# Number of rows in the aware compound data for each time period.
compounds_aware.groupby(['time']).size().to_frame()

Unnamed: 0_level_0,0
time,Unnamed: 1_level_1
1820,17244657
1830,25211029
1840,28052432
1850,34796621
1860,37951254
1870,45351820
1880,52227618
1890,60258710


In [31]:
def calculate_compound_features(compounds,modifiers,heads,all_comps,not_found_compounds_df,not_found_modifiers_df,not_found_heads_df):
    """Compute productivity, information-theory and frequency features per compound.

    Args:
        compounds: context counts with ``modifier``, ``head``, ``time`` and a
            count column.
        modifiers: modifier context counts with ``time`` and a count column.
        heads: head context counts with ``time`` and a count column.
        all_comps: distinct modifier/head/time triples to produce features for.
        not_found_compounds_df: modifier/head/time triples to add with count 0.
        not_found_modifiers_df: modifier/time pairs to add with count 0.
        not_found_heads_df: head/time pairs to add with count 0.

    Returns:
        DataFrame with one row per modifier/head/time holding the productivity
        columns, ``log_ratio``/``ppmi``/``local_mi`` and the frequency columns.

    Note:
        The columns of ``modifiers``, ``heads`` and ``compounds`` are relabelled
        on the passed objects themselves: any label containing ``'count'``
        becomes ``'count'``.
    """
    for frame in (modifiers,heads,compounds):
        frame.columns=['count' if 'count' in label else label for label in frame.columns.tolist()]

    print('Calculating productivity features')

    # Number of rows per time period in each input table.
    comp_size_by_time=compounds.groupby(['time']).size().to_frame('comp_size').reset_index()
    mod_size_by_time=modifiers.groupby(['time']).size().to_frame('mod_size').reset_index()
    head_size_by_time=heads.groupby(['time']).size().to_frame('head_size').reset_index()

    # Modifier productivity: rows per modifier and period, relative to the
    # compound table size and to the modifier table size.
    found_mods=compounds.groupby(['modifier','time']).size().to_frame('mod_prod').reset_index()
    found_mods=found_mods.merge(comp_size_by_time,on=['time']).merge(mod_size_by_time,on=['time'])
    found_mods['mod_family_size']=np.log2(found_mods['mod_prod']/found_mods['comp_size'])
    found_mods['mod_family_size_new']=np.log2(found_mods['mod_prod']/found_mods['mod_size'])

    # Modifiers without any rows get zero for every productivity value.
    absent_mods=not_found_modifiers_df.copy()
    absent_mods['mod_prod']=0
    absent_mods=absent_mods.merge(comp_size_by_time,on=['time']).merge(mod_size_by_time,on=['time'])
    absent_mods['mod_family_size']=0
    absent_mods['mod_family_size_new']=0

    # Same two steps for heads.
    found_heads=compounds.groupby(['head','time']).size().to_frame('head_prod').reset_index()
    found_heads=found_heads.merge(comp_size_by_time,on=['time']).merge(head_size_by_time,on=['time'])
    found_heads['head_family_size']=np.log2(found_heads['head_prod']/found_heads['comp_size'])
    found_heads['head_family_size_new']=np.log2(found_heads['head_prod']/found_heads['head_size'])

    absent_heads=not_found_heads_df.copy()
    absent_heads['head_prod']=0
    absent_heads=absent_heads.merge(comp_size_by_time,on=['time']).merge(head_size_by_time,on=['time'])
    absent_heads['head_family_size']=0
    absent_heads['head_family_size_new']=0

    mod_prod=pd.concat([found_mods,absent_mods],ignore_index=True)
    head_prod=pd.concat([found_heads,absent_heads],ignore_index=True)

    # comp_size is dropped on the modifier side only, so it appears once in the result.
    mods_per_compound=mod_prod.drop(['mod_size','comp_size'],axis=1).merge(all_comps,on=['modifier','time'])
    productivity=head_prod.drop('head_size',axis=1).merge(mods_per_compound,on=['head','time'])

    print('Calculating information theory features')

    total_by_time=compounds.groupby('time')['count'].sum().to_frame('N').reset_index()

    def summed_counts(keys,missing_df,label):
        """Sum ``count`` per ``keys`` and append zero-count rows for ``missing_df``."""
        observed=compounds.groupby(keys)['count'].sum().to_frame(label)
        zero_rows=missing_df.copy()
        zero_rows['count']=0
        zero_rows=zero_rows.groupby(keys)['count'].sum().to_frame(label)
        return pd.concat([observed,zero_rows])

    XY=summed_counts(['modifier','head','time'],not_found_compounds_df,'a').reset_index()
    X_star=summed_counts(['modifier','time'],not_found_modifiers_df,'x_star').reset_index()
    Y_star=summed_counts(['head','time'],not_found_heads_df,'star_y').reset_index()

    def joint_counts():
        """Pair count ``a`` next to the modifier total and the head total."""
        return XY.merge(X_star,on=['modifier','time']).merge(Y_star,on=['head','time'])

    info=joint_counts().merge(total_by_time,on=['time'])

    # Cells of the 2x2 contingency table and its margins.
    a=info['a']
    x_star=info['x_star']
    star_y=info['star_y']
    N=info['N']
    b=x_star-a
    c=star_y-a
    d=N-(a+b+c)
    x_bar_star=N-x_star
    star_y_bar=N-star_y

    # Undefined values of the d-cell log term are replaced by 0 before use.
    overflow_check=np.log2((d*N+1)/(x_bar_star*star_y_bar+1)).fillna(0)
    pmi=np.log2((a*N+1)/(x_star*star_y+1))

    info['log_ratio']=2*(
        a*pmi
        +b*np.log2((b*N+1)/(x_star*star_y_bar+1))
        +c*np.log2((c*N+1)/(x_bar_star*star_y+1))
        +d*overflow_check)
    # ppmi is clipped at zero; local_mi keeps the unclipped value.
    info['ppmi']=pmi.mask(pmi<=0,0)
    info['local_mi']=a*pmi
    information_feat=info[['modifier','head','time','log_ratio','ppmi','local_mi']]

    compound_features=productivity.merge(information_feat,on=['modifier','head','time'])

    print('Frequency features')

    mod_total_by_time=modifiers.groupby(['time'])['count'].sum().to_frame('mod_time_count').reset_index()
    head_total_by_time=heads.groupby(['time'])['count'].sum().to_frame('head_time_count').reset_index()

    frequency_feat=(
        joint_counts()
        .merge(total_by_time,on='time')
        .merge(mod_total_by_time,on='time')
        .merge(head_total_by_time,on='time')
        .set_index(['modifier','head','time'])
        .rename(columns={'a':'comp_freq','x_star':'mod_freq','star_y':'head_freq'})
    )

    frequency_feat['comp_tf']=np.log2(1+frequency_feat['comp_freq'])
    frequency_feat['log_comp_freq']=np.log2(frequency_feat['comp_freq']/frequency_feat['N'])

    frequency_feat['mod_tf']=np.log2(1+frequency_feat['mod_freq'])
    frequency_feat['log_mod_freq']=np.log2(frequency_feat['mod_freq']/frequency_feat['N'])
    frequency_feat['log_mod_freq_new']=np.log2(frequency_feat['mod_freq']/frequency_feat['mod_time_count'])

    frequency_feat['head_tf']=np.log2(1+frequency_feat['head_freq'])
    frequency_feat['log_head_freq']=np.log2(frequency_feat['head_freq']/frequency_feat['N'])
    frequency_feat['log_head_freq_new']=np.log2(frequency_feat['head_freq']/frequency_feat['head_time_count'])

    frequency_feat=frequency_feat.fillna(0).drop(['mod_time_count','head_time_count','N'],axis=1)

    return compound_features.merge(frequency_feat.reset_index(),on=['modifier','head','time'])

In [None]:
    # Scratch copy of the feature-calculation half of feature_extractor_dec, run
    # at cell level; it relies on the frames built by the earlier scratch cells.
    print('Calculating features')
    
    # Modifiers and heads that occur in the ratings; used to filter the tables.
    unique_mod_list=comp_ratings_df[['modifier']].drop_duplicates()['modifier'].to_list()
    unique_head_list=comp_ratings_df[['head']].drop_duplicates()['head'].to_list() 
    
    print('CompoundAware features')
    
    
    # Compound features are computed on the full frames and filtered afterwards.
    compound_features_aware=calculate_compound_features(compounds_aware,modifiers_aware,heads_aware,all_comps_aware,not_found_compounds_aware_df,not_found_modifiers_aware_df,not_found_heads_aware_df)
    compound_features_aware=compound_features_aware.loc[(compound_features_aware.modifier.isin(unique_mod_list))&(compound_features_aware['head'].isin(unique_head_list))]
    
    # Cosine features are computed on frames restricted to rated modifiers/heads.
    reduced_compounds_aware=compounds_aware.loc[(compounds_aware.modifier.isin(unique_mod_list))&(compounds_aware['head'].isin(unique_head_list))]
    reduced_modifiers_aware=modifiers_aware.loc[modifiers_aware.modifier.isin(unique_mod_list)]
    reduced_heads_aware=heads_aware.loc[heads_aware['head'].isin(unique_head_list)]
    
    cosine_sim_feat_aware=calculate_cosine_features(reduced_compounds_aware,reduced_modifiers_aware,reduced_heads_aware,not_found_compounds_aware_df)
  
    
    print('CompoundAgnostic features')

    # Same two steps for the agnostic setting.
    compound_features_agnostic=calculate_compound_features(compounds_agnostic,modifiers_agnostic,heads_agnostic,all_comps_agnostic,not_found_compounds_agnostic_df,not_found_modifiers_agnostic_df,not_found_heads_agnostic_df)
    compound_features_agnostic=compound_features_agnostic.loc[(compound_features_agnostic.modifier.isin(unique_mod_list))&(compound_features_agnostic['head'].isin(unique_head_list))]

    
    reduced_compounds_agnostic=compounds_agnostic.loc[(compounds_agnostic.modifier.isin(unique_mod_list))&(compounds_agnostic['head'].isin(unique_head_list))]
    reduced_modifiers_agnostic=modifiers_agnostic.loc[modifiers_agnostic.modifier.isin(unique_mod_list)]
    reduced_heads_agnostic=heads_agnostic.loc[heads_agnostic['head'].isin(unique_head_list)]
    
    cosine_sim_feat_agnostic=calculate_cosine_features(reduced_compounds_agnostic,reduced_modifiers_agnostic,reduced_heads_agnostic,not_found_compounds_agnostic_df)
    
    
    
    print('Setting features')
    # Similarity between the aware and agnostic representations.
    compound_setting_sim=calculate_setting_similarity(reduced_compounds_aware,reduced_modifiers_aware,reduced_heads_aware,reduced_compounds_agnostic,reduced_modifiers_agnostic,reduced_heads_agnostic,compound_list_agnostic_df)
    

    print('Combining all compound aware features')
    
    # Outer merge keeps rows present in either the cosine or the setting features;
    # the compound features are then attached where available.
    features_aware_df=pd.merge(cosine_sim_feat_aware,compound_setting_sim,on=['modifier','head','time'],how='outer')
    features_aware_df=features_aware_df.merge(compound_features_aware,on=['modifier','head','time'],how='left')

    
    print('Combining all compound agnostic features')
    
    features_agnostic_df=pd.merge(cosine_sim_feat_agnostic,compound_setting_sim,on=['modifier','head','time'],how='outer')
    features_agnostic_df=features_agnostic_df.merge(compound_features_agnostic,on=['modifier','head','time'],how='left')   

In [47]:
# Combine each feature table with the compound ratings. merge_comp_ratings is
# defined elsewhere in this notebook and returns two frames per call.
# NOTE(review): judging by the variable suffixes, the first frame keeps missing
# values ("na") and the second is median-imputed ("med") - confirm against the helper.
cur_ratings_aware_df_na,cur_ratings_aware_df_med=merge_comp_ratings(features_aware_df)
cur_ratings_agnostic_df_na,cur_ratings_agnostic_df_med=merge_comp_ratings(features_agnostic_df)

In [52]:
# Display the "med" CompoundAware frame (second value returned by merge_comp_ratings)
# to eyeball the merged feature and rating columns.
cur_ratings_aware_df_med

Unnamed: 0,modifier,head,arith_mean_sim:1800,arith_mean_sim:1850,beta:1800,beta:1850,comp_freq:1800,comp_freq:1850,comp_size:1800,comp_size:1850,...,sim_with_modifier:1850,avgModifier,stdevModifier,avgHead,stdevHead,compositionality,stdevHeadModifier,is_adj,compound,source
0,end_NOUN,user_NOUN,0.004930,0.013786,0.452272,0.706432,14.0,320.0,35014831.0,71789080.0,...,0.019442,3.866667,1.117537,4.866667,0.339935,4.250000,0.871165,False,end_user,reddy
1,firing_NOUN,line_NOUN,0.007932,0.072563,0.531743,0.549472,25.0,9748.0,35014831.0,71789080.0,...,0.079512,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,False,firing_line,reddy
2,game_NOUN,plan_NOUN,0.004623,0.052439,0.074804,0.036630,4.0,14.0,35014831.0,71789080.0,...,0.004040,2.821429,1.964935,4.862069,0.344828,3.827586,1.233693,False,game_plan,reddy
3,application_NOUN,form_NOUN,0.108794,0.055676,0.293081,0.417125,523.0,16116.0,35014831.0,71789080.0,...,0.046797,4.766667,0.422953,4.862069,0.344828,4.800000,0.476095,False,application_form,reddy
4,snail_NOUN,mail_NOUN,0.030280,0.032138,0.500000,0.500000,56.0,121.0,35014831.0,71789080.0,...,0.020605,0.600000,0.800000,4.586207,1.099129,1.310345,1.020596,False,snail_mail,reddy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295,radioactive_ADJ,waste_PROPN,0.030280,0.032138,0.500000,0.500000,56.0,121.0,35014831.0,71789080.0,...,0.020605,4.916700,0.288700,4.583300,0.668600,4.583300,0.793000,True,radioactive_waste,cordeiro100
1296,rainy_ADJ,season_PROPN,0.030280,0.068882,0.500000,0.146063,5.0,137.0,35014831.0,71789080.0,...,0.020250,4.590900,0.854100,3.772700,1.659900,4.227300,1.066000,True,rainy_season,cordeiro100
1297,social_ADJ,insurance_PROPN,0.030280,0.010655,0.500000,0.202462,56.0,5.0,35014831.0,71789080.0,...,0.004315,3.551700,1.325200,3.103400,1.819400,2.827600,1.691800,True,social_insurance,cordeiro100
1298,white_ADJ,noise_PROPN,0.030280,0.032138,0.500000,0.500000,56.0,121.0,35014831.0,71789080.0,...,0.020605,0.652200,1.112300,4.043500,1.429500,1.173900,1.230400,True,white_noise,cordeiro100


In [52]:
# Run the feature extractor once per entry of total_dec_list and stack the
# per-batch results into one CompoundAware and one CompoundAgnostic frame.

# Start from empty accumulators on every execution: without this, re-running
# the cell appends each batch a second time and duplicates rows in the
# concatenated frames.
features_aware_df_list=[]
features_agnostic_df_list=[]

for dec_list in total_dec_list:
    # feature_extractor_dec (defined elsewhere in this notebook) is indexed as
    # [0] -> CompoundAware features, [1] -> CompoundAgnostic features.
    dfs=feature_extractor_dec(dec_list)
    features_aware_df_list.append(dfs[0])
    features_agnostic_df_list.append(dfs[1])

features_aware_df=pd.concat(features_aware_df_list)
features_agnostic_df=pd.concat(features_agnostic_df_list)


# Combine each stacked feature table with the compound ratings.
# NOTE(review): "na"/"med" presumably mean missing-value-preserving vs.
# median-imputed - confirm against merge_comp_ratings.
cur_ratings_aware_df_na,cur_ratings_aware_df_med=merge_comp_ratings(features_aware_df)
cur_ratings_agnostic_df_na,cur_ratings_agnostic_df_med=merge_comp_ratings(features_agnostic_df)

Unnamed: 0,head,time,head_prod,comp_size,head_family_size,head_family_size_new,modifier,mod_prod,mod_family_size,mod_family_size_new,...,mod_freq,head_freq,comp_tf,log_comp_freq,mod_tf,log_mod_freq,log_mod_freq_new,head_tf,log_head_freq,log_head_freq_new
0,!_NOUN,1850,2,75790390,-25.175512,-25.295588,zo_PROPN,12,-22.590549,-22.710625,...,460,42,5.426265,-27.679950,8.848623,-24.226778,-27.751689,5.426265,-27.679950,-31.204862
1,!_PROPN,1850,20,75790390,-21.853583,-21.973660,del_PROPN,3817,-14.277288,-14.397364,...,242911,1135,5.209453,-27.902343,17.890074,-15.182200,-18.707111,10.149747,-22.923791,-26.448703
2,!_PROPN,1850,20,75790390,-21.853583,-21.973660,hoy_PROPN,17,-22.088049,-22.208125,...,981,1135,4.954196,-28.165377,9.939579,-23.134158,-26.659070,10.149747,-22.923791,-26.448703
3,!_PROPN,1850,20,75790390,-21.853583,-21.973660,la_PROPN,29568,-11.323763,-11.443839,...,2176000,1135,9.113742,-23.961132,21.053248,-12.019021,-15.543932,10.149747,-22.923791,-26.448703
4,!_PROPN,1850,20,75790390,-21.853583,-21.973660,loe_PROPN,28,-21.368157,-21.488233,...,777,1135,7.707359,-25.371828,9.603626,-23.470497,-26.995408,10.149747,-22.923791,-26.448703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12459787,■_NOUN,1800,115,36590933,-18.279493,-18.728950,■_NOUN,94,-18.570394,-19.019851,...,382,428,4.087463,-26.883861,8.581201,-22.306433,-25.965132,8.744834,-22.142394,-25.801094
12459788,□_NOUN,1800,2,36590933,-24.124983,-24.574440,press_PROPN,256,-17.124983,-17.574440,...,2423,2,1.584963,-29.883861,11.243174,-19.641283,-23.299982,1.584963,-29.883861,-33.542561
12459789,_NOUN,1800,1,36590933,-25.124983,-25.574440,_PROPN,2,-24.124983,-24.574440,...,2,1,1.000000,-30.883861,1.584963,-29.883861,-33.542561,1.000000,-30.883861,-34.542561
12459790,_NOUN,1800,1,36590933,-25.124983,-25.574440,_PROPN,2,-24.124983,-24.574440,...,2,1,1.000000,-30.883861,1.584963,-29.883861,-33.542561,1.000000,-30.883861,-34.542561


In [18]:
# Re-run of the ratings merge on the current feature frames; the same two calls
# also appear in other cells of this notebook.
# NOTE(review): candidate for removal once the notebook runs top to bottom - confirm
# nothing between the cells changes features_aware_df / features_agnostic_df.
cur_ratings_aware_df_na,cur_ratings_aware_df_med=merge_comp_ratings(features_aware_df)
cur_ratings_agnostic_df_na,cur_ratings_agnostic_df_med=merge_comp_ratings(features_agnostic_df)


In [69]:
# Read back a previously written tab-separated CompoundAgnostic "withSetting"
# "med" feature file (stored under the coha dataset directory) for inspection.
trial_df = pd.read_csv(
    '/data/dharp/compounds/datasets/coha/features_CompoundAgnostic_withSetting_PPMI_Tagged_10_0_med.csv',
    sep="\t",
)
trial_df

Unnamed: 0,modifier,head,arith_mean_sim:1820,arith_mean_sim:1830,arith_mean_sim:1840,arith_mean_sim:1850,arith_mean_sim:1860,arith_mean_sim:1870,arith_mean_sim:1880,arith_mean_sim:1890,...,sim_with_modifier:2010,avgModifier,stdevModifier,avgHead,stdevHead,compositionality,stdevHeadModifier,is_adj,compound,source
0,end_NOUN,user_NOUN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.016248,3.866667,1.117537,4.866667,0.339935,4.250000,0.871165,False,end_user,reddy
1,firing_NOUN,line_NOUN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.125177,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,False,firing_line,reddy
2,game_NOUN,plan_NOUN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.073576,2.821429,1.964935,4.862069,0.344828,3.827586,1.233693,False,game_plan,reddy
3,application_NOUN,form_NOUN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.017450,...,0.048546,4.766667,0.422953,4.862069,0.344828,4.800000,0.476095,False,application_form,reddy
4,snail_NOUN,mail_NOUN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.048883,0.600000,0.800000,4.586207,1.099129,1.310345,1.020596,False,snail_mail,reddy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295,radioactive_ADJ,waste_PROPN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.077075,4.916700,0.288700,4.583300,0.668600,4.583300,0.793000,True,radioactive_waste,cordeiro100
1296,rainy_ADJ,season_PROPN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.077075,4.590900,0.854100,3.772700,1.659900,4.227300,1.066000,True,rainy_season,cordeiro100
1297,social_ADJ,insurance_PROPN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.077075,3.551700,1.325200,3.103400,1.819400,2.827600,1.691800,True,social_insurance,cordeiro100
1298,white_ADJ,noise_PROPN,0.062477,0.078843,0.062821,0.062579,0.06032,0.066532,0.06286,0.064894,...,0.077075,0.652200,1.112300,4.043500,1.429500,1.173900,1.230400,True,white_noise,cordeiro100


In [71]:
# Read back the CompoundAware counterpart of the saved "withSetting" "med"
# feature file (tab-separated, under the coha dataset directory) for inspection.
trial2_df = pd.read_csv(
    '/data/dharp/compounds/datasets/coha/features_CompoundAware_withSetting_PPMI_Tagged_10_0_med.csv',
    sep="\t",
)
trial2_df

Unnamed: 0,modifier,head,arith_mean_sim:1820,arith_mean_sim:1830,arith_mean_sim:1840,arith_mean_sim:1850,arith_mean_sim:1860,arith_mean_sim:1870,arith_mean_sim:1880,arith_mean_sim:1890,...,sim_with_modifier:2010,avgModifier,stdevModifier,avgHead,stdevHead,compositionality,stdevHeadModifier,is_adj,compound,source
0,end_NOUN,user_NOUN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.074533,3.866667,1.117537,4.866667,0.339935,4.250000,0.871165,False,end_user,reddy
1,firing_NOUN,line_NOUN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.211662,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,False,firing_line,reddy
2,game_NOUN,plan_NOUN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.188230,2.821429,1.964935,4.862069,0.344828,3.827586,1.233693,False,game_plan,reddy
3,application_NOUN,form_NOUN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.180102,...,0.141449,4.766667,0.422953,4.862069,0.344828,4.800000,0.476095,False,application_form,reddy
4,snail_NOUN,mail_NOUN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.141833,0.600000,0.800000,4.586207,1.099129,1.310345,1.020596,False,snail_mail,reddy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295,radioactive_ADJ,waste_PROPN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.131885,4.916700,0.288700,4.583300,0.668600,4.583300,0.793000,True,radioactive_waste,cordeiro100
1296,rainy_ADJ,season_PROPN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.131885,4.590900,0.854100,3.772700,1.659900,4.227300,1.066000,True,rainy_season,cordeiro100
1297,social_ADJ,insurance_PROPN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.131885,3.551700,1.325200,3.103400,1.819400,2.827600,1.691800,True,social_insurance,cordeiro100
1298,white_ADJ,noise_PROPN,0.116038,0.132458,0.129674,0.135475,0.112251,0.127432,0.108375,0.130113,...,0.131885,0.652200,1.112300,4.043500,1.429500,1.173900,1.230400,True,white_noise,cordeiro100


In [21]:
# Combine each feature table with the compound ratings, then write all eight
# dataset variants (Aware/Agnostic x woSetting/withSetting x na/med) as TSV files.
# NOTE(review): "na"/"med" presumably mean missing-value-preserving vs.
# median-imputed - confirm against merge_comp_ratings (defined elsewhere).
cur_ratings_aware_df_na,cur_ratings_aware_df_med=merge_comp_ratings(features_aware_df)
cur_ratings_agnostic_df_na,cur_ratings_agnostic_df_med=merge_comp_ratings(features_agnostic_df)

print('Saving feature datasets')

# Create the target directory up front so a missing directory cannot abort the
# save after the expensive feature computation has already finished.
os.makedirs(args.outputdir,exist_ok=True)

# (variant label, imputation label, frame) in the order the files are written.
frames_to_save=[
    ('CompoundAware','na',cur_ratings_aware_df_na),
    ('CompoundAware','med',cur_ratings_aware_df_med),
    ('CompoundAgnostic','na',cur_ratings_agnostic_df_na),
    ('CompoundAgnostic','med',cur_ratings_agnostic_df_med),
]

for setting_label in ('woSetting','withSetting'):
    for variant_label,impute_label,save_df in frames_to_save:
        if setting_label=='woSetting':
            # The "woSetting" files drop every column whose name contains 'setting'.
            save_df=save_df.loc[:,~save_df.columns.str.contains('setting')]
        save_name=f'features_{variant_label}_{setting_label}_{ppmi_str}_{tag_str}_{temp_cutoff_str}_{impute_label}.csv'
        save_df.to_csv(os.path.join(args.outputdir,save_name),sep='\t',index=False)

Saving feature datasets


In [38]:
# Show the directory the feature files above are written to.
args.outputdir

'/data/dharp/compounds/datasets/features/'