In [54]:
import pandas as pd
import numpy as np
import argparse
import time
import pickle as pkl

from itertools import product
from functools import reduce
import glob
import os

import seaborn as sns
sns.set(style="whitegrid", font_scale = 2.5)
sns.set_context(rc={"lines.markersize": 17, "lines.linewidth": 2})

import matplotlib.pyplot as plt      
from sklearn.impute import SimpleImputer

In [2]:
# Notebook configuration expressed as command-line options. The options are
# parsed from a hard-coded argument string (see the last statement of this
# cell) rather than from sys.argv.
parser = argparse.ArgumentParser(description='Compute temporal variation features from sparse dataset for google version')

parser.add_argument('--inputdir',type=str,
                    help='Provide directory where features are located')
parser.add_argument('--outputdir',type=str,
                    help='Where should the output be stored?')
parser.add_argument('--tag', action='store_true',
                    help='Should the POS tag be kept?')
parser.add_argument('--ppmi', action='store_true',
                    help='Should co-occurence matrix be converted to PPMI values')
# --temporal has no default: it is None unless supplied in the argument string.
parser.add_argument('--temporal',  type=int,
                    help='Value to bin the temporal information: 10000 (remove temporal information), 1 (no binning), 10 (binning to decades), 20 (binning each 20 years) or 50 (binning each 50 years)')
parser.add_argument('--cutoff', type=int, default=0,
                    help='Cut-off frequency for each compound per time period : none (0), 20, 50 and 100')
# Flags that are not listed in the string keep their defaults
# (--tag and --ppmi are False, --cutoff is 0).
# NOTE(review): the directories are absolute, machine-specific paths — confirm
# them before re-running the notebook elsewhere.
args = parser.parse_args('--inputdir /datanaco/dharp/compounds/datasets/ --outputdir /data/dharp/compounds/datasets/google/ --temporal 10'.split())

In [3]:
# Read the three tab-separated rating files and label every row with the
# file it was read from.
reddy_df=pd.read_csv('data/reddy_90.txt',sep='\t').assign(source='reddy')
cordeiro90_df=pd.read_csv('data/cordeiro_90.txt',sep='\t').assign(source='cordeiro90')
cordeiro100_df=pd.read_csv('data/cordeiro_100.txt',sep='\t').assign(source='cordeiro100')

# Stack the three sources row-wise; each frame keeps its own row labels.
comp_ratings_df=pd.concat([reddy_df,cordeiro90_df,cordeiro100_df])
# De-duplication is currently disabled (line kept for reference):
#comp_ratings_df.drop_duplicates(inplace=True)

In [4]:
def testset_tagger(df):

    #### NOUN NOUN
    
    copy_df_1=df.copy()
    copy_df_1.modifier=copy_df_1.modifier+'_NOUN'
    copy_df_1['head']=copy_df_1['head']+'_NOUN'

    ### PROPN NOUN

    copy_df_2=df.copy()
    copy_df_2.modifier=copy_df_2.modifier+'_PROPN'
    copy_df_2['head']=copy_df_2['head']+'_NOUN'
    
    ### NOUN PROPN

    copy_df_3=df.copy()
    copy_df_3.modifier=copy_df_3.modifier+'_NOUN'
    copy_df_3['head']=copy_df_3['head']+'_PROPN'
    
    ### PROPN PROPN    

    copy_df_4=df.copy()
    copy_df_4.modifier=copy_df_4.modifier+'_PROPN'
    copy_df_4['head']=copy_df_4['head']+'_PROPN'
    
   
    ### ADJ/NOUN NOUN
    
    copy_df_5=df.copy()
    
    copy_df_5.loc[copy_df_5.is_adj==True,"modifier"]+="_ADJ"
    copy_df_5.loc[copy_df_5.is_adj==False,"modifier"]+="_NOUN"
    copy_df_5['head']=copy_df_5['head']+'_NOUN'   
    
    
    ### ADJ/NOUN PROPN
    
    copy_df_6=df.copy()
    copy_df_6.loc[copy_df_6.is_adj==True,"modifier"]+="_ADJ"
    copy_df_6.loc[copy_df_6.is_adj==False,"modifier"]+="_NOUN"
    copy_df_6['head']=copy_df_6['head']+'_PROPN'  

    
    #### ADJ/PROPN NOUN
    
    copy_df_7=df.copy()
    copy_df_7.loc[copy_df_7.is_adj==True,"modifier"]+="_ADJ"
    copy_df_7.loc[copy_df_7.is_adj==False,"modifier"]+="_PROPN"
    copy_df_7['head']=copy_df_7['head']+'_NOUN' 
    
    
    #### ADJ/PROPN PROPN
    
    copy_df_8=df.copy()
    copy_df_8.loc[copy_df_8.is_adj==True,"modifier"]+="_ADJ"
    copy_df_8.loc[copy_df_8.is_adj==False,"modifier"]+="_PROPN"
    copy_df_8['head']=copy_df_8['head']+'_PROPN' 
    
    
    complete_df=pd.concat([copy_df_1,copy_df_2,copy_df_3,copy_df_4,copy_df_5,copy_df_6,copy_df_7,copy_df_8],ignore_index=True)
                           
    return complete_df     

In [5]:
# Expand the ratings with POS-tag suffixes only when --tag is set; otherwise
# the ratings are used exactly as loaded.
# NOTE(review): this rebinds comp_ratings_df, so re-running the cell with
# --tag set would append suffixes to already-tagged rows.
if args.tag:
    comp_ratings_df=testset_tagger(comp_ratings_df)
comp_ratings_df

Unnamed: 0,modifier,head,avgModifier,stdevModifier,avgHead,stdevHead,compositionality,stdevHeadModifier,is_adj,compound,source
0,end,user,3.866667,1.117537,4.866667,0.339935,4.250000,0.871165,False,end_user,reddy
1,firing,line,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,False,firing_line,reddy
2,game,plan,2.821429,1.964935,4.862069,0.344828,3.827586,1.233693,False,game_plan,reddy
3,application,form,4.766667,0.422953,4.862069,0.344828,4.800000,0.476095,False,application_form,reddy
4,snail,mail,0.600000,0.800000,4.586207,1.099129,1.310345,1.020596,False,snail_mail,reddy
...,...,...,...,...,...,...,...,...,...,...,...
97,wedding,day,4.764700,0.562300,4.058800,1.434900,4.941200,0.242500,False,wedding_day,cordeiro100
98,white,noise,0.652200,1.112300,4.043500,1.429500,1.173900,1.230400,True,white_noise,cordeiro100
99,white,spirit,1.538500,1.240300,2.038500,1.949000,1.307700,1.257600,True,white_spirit,cordeiro100
100,winter,solstice,5.000000,0.000000,4.681800,1.086100,4.545500,1.335500,False,winter_solstice,cordeiro100


In [6]:
def process_decades_compound(dec_list,input_dir,unique_mod_list,unique_head_list,ctype='compound'):
    """Load, or build and cache, the context counts for one block of decades.

    Three sources are tried in order:

    1. a cached pickle for the requested temporal binning;
    2. a cached decade-level (``10_``) pickle, re-binned to ``args.temporal``
       (skipped when ``args.temporal`` is 10000);
    3. the per-decade pickles, read one by one and concatenated.

    Sources 2 and 3 write the resulting frame to the cache path of source 1.
    Reads the notebook-level ``args`` and ``tag_str``.

    Args:
        dec_list: decades of this block; the first one names the cache file.
        input_dir: directory holding the ``{ctype}s`` sub-directory.
        unique_mod_list: modifiers to keep.
        unique_head_list: heads to keep.
        ctype: dataset name, pluralised to form the sub-directory.

    Returns:
        Rows whose modifier and head are both in the given lists.
    """
    binned_path=f"{input_dir}/{ctype}s/{args.temporal}_{dec_list[0]}_{tag_str}.pkl"
    decade_path=f"{input_dir}/{ctype}s/10_{dec_list[0]}_{tag_str}.pkl"

    def _sum_counts(frame):
        # Collapse rows that share compound, time bin and context.
        return frame.groupby(['modifier','head','time','context'])['count'].sum().to_frame().reset_index()

    if os.path.exists(binned_path):
        print('Reading file')
        complete_df=pd.read_pickle(binned_path)

    elif os.path.exists(decade_path) and args.temporal!=10000:
        print(f'Reading decades file {ctype}s/10_{dec_list[0]}_{tag_str}.pkl')
        complete_df=pd.read_pickle(decade_path)

        print(f'Reducing to {args.temporal}')
        # Floor every time stamp to the start of its bin.
        binned_time=complete_df['time']-complete_df['time']%args.temporal
        complete_df['time']=binned_time
        complete_df=_sum_counts(complete_df)

        print("Saving file")
        complete_df.to_pickle(binned_path)

    else:
        df_list=[]
        for dec in dec_list:
            print(dec)
            cur_df=pd.read_pickle(f'{input_dir}/{ctype}s/{dec}.pkl')

            if not args.tag:
                cur_df=compound_tag_remover(cur_df)
            cur_df['time']=dec
            cur_df['time']=cur_df['time']-cur_df['time']%args.temporal
            df_list.append(cur_df)

        print('Done reading compound dataframes')
        complete_df=pd.concat(df_list,ignore_index=True)

        # With decade binning each decade is already its own bin, so the
        # re-aggregation is skipped.
        if args.temporal!=10:
            complete_df=_sum_counts(complete_df)

        print("Saving file")
        complete_df.to_pickle(binned_path)

    keep_rows=(complete_df.modifier.isin(unique_mod_list))&(complete_df['head'].isin(unique_head_list))
    return complete_df.loc[keep_rows]


def process_decades_constituent(dec_list,input_dir,unique_constituent_list,ctype='word'):
    """Load, or build and cache, constituent context counts for one block of decades.

    Uses the same three-step lookup as the compound loader: a cached pickle
    for the requested binning, then a cached decade-level (``10_``) pickle
    that is re-binned (skipped when ``args.temporal`` is 10000), then the
    per-decade pickles. The last two write the result to the cache path of
    the first. Reads the notebook-level ``args`` and ``tag_str``.

    Args:
        dec_list: decades of this block; the first one names the cache file.
        input_dir: directory holding the ``{ctype}s`` sub-directory.
        unique_constituent_list: constituents to keep.
        ctype: name of the constituent column and, pluralised, of the
            sub-directory.

    Returns:
        Rows whose constituent is in ``unique_constituent_list``. The
        ``modifier`` / ``head`` column is filtered for those ctypes; any other
        ctype filters on the ``word`` column.
    """
    binned_path=f"{input_dir}/{ctype}s/{args.temporal}_{dec_list[0]}_{tag_str}.pkl"
    decade_path=f"{input_dir}/{ctype}s/10_{dec_list[0]}_{tag_str}.pkl"

    def _sum_counts(frame):
        # Collapse rows that share constituent, time bin and context.
        return frame.groupby([ctype,'time','context'])['count'].sum().to_frame().reset_index()

    if os.path.exists(binned_path):
        print('Reading file')
        complete_df=pd.read_pickle(binned_path)

    elif os.path.exists(decade_path) and args.temporal!=10000:
        print(f'Reading decades file {ctype}s/10_{dec_list[0]}_{tag_str}.pkl')
        complete_df=pd.read_pickle(decade_path)

        print(f'Reducing to {args.temporal}')
        # Floor every time stamp to the start of its bin.
        binned_time=complete_df['time']-complete_df['time']%args.temporal
        complete_df['time']=binned_time
        complete_df=_sum_counts(complete_df)

        print("Saving file")
        complete_df.to_pickle(binned_path)

    else:
        df_list=[]
        for dec in dec_list:
            cur_df=pd.read_pickle(f'{input_dir}/{ctype}s/{dec}.pkl')
            if not args.tag:
                cur_df=constituent_tag_remover(cur_df,ctype)
            cur_df['time']=dec
            cur_df['time']=cur_df['time']-cur_df['time']%args.temporal
            df_list.append(cur_df)

        print(f'Done reading {ctype} dataframes')
        complete_df=pd.concat(df_list,ignore_index=True)

        # With decade binning each decade is already its own bin, so the
        # re-aggregation is skipped.
        if args.temporal!=10:
            complete_df=_sum_counts(complete_df)

        print("Saving file")
        complete_df.to_pickle(binned_path)

    filter_column=ctype if ctype in ('modifier','head') else 'word'
    return complete_df.loc[complete_df[filter_column].isin(unique_constituent_list)]

In [7]:
def compound_tag_remover(compounds):
    """Strip POS-tag suffixes from compounds and re-aggregate their counts.

    ``_NOUN`` / ``_PROPN`` are removed from ``head`` and ``_NOUN`` /
    ``_PROPN`` / ``_ADJ`` from ``modifier``. The patterns are not anchored, so
    a match anywhere in the string is removed. Rows that become identical in
    (modifier, head, context) have their ``count`` values summed.

    The input frame is left untouched: the untagged columns are built as
    separate Series and used as grouping keys, instead of being written back
    into ``compounds`` as the previous version did.

    Args:
        compounds: frame with ``modifier``, ``head``, ``context`` and
            ``count`` columns.

    Returns:
        A new frame with columns ``modifier``, ``head``, ``context``,
        ``count`` (one row per untagged compound and context).
    """
    print('Removing tags for compound dataset')
    untagged_head=compounds['head'].str.replace('_NOUN|_PROPN','',regex=True)
    untagged_modifier=compounds['modifier'].str.replace('_NOUN|_PROPN|_ADJ','',regex=True)

    # Grouping on the derived Series keeps their names as the key columns
    # without copying or mutating the caller's frame.
    summed=compounds['count'].groupby([untagged_modifier,untagged_head,compounds['context']]).sum()

    return summed.to_frame().reset_index()


def constituent_tag_remover(constituents,ctype='word'):
    """Strip POS-tag suffixes from a constituent column and re-aggregate counts.

    ``_NOUN`` / ``_PROPN`` / ``_ADJ`` are removed from the ``ctype`` column
    (the pattern is not anchored, so a match anywhere in the string is
    removed). Rows that become identical in (constituent, context) have
    their ``count`` values summed.

    The input frame is left untouched: the untagged column is built as a
    separate Series and used as a grouping key, instead of being written back
    into ``constituents`` as the previous version did.

    Args:
        constituents: frame with ``ctype``, ``context`` and ``count`` columns.
        ctype: name of the constituent column.

    Returns:
        A new frame with columns ``ctype``, ``context``, ``count``.
    """
    print(f'Removing tags for {ctype} dataset')
    untagged=constituents[ctype].str.replace('_NOUN|_PROPN|_ADJ','',regex=True)

    # The derived Series keeps the column name, so it labels the key column.
    summed=constituents['count'].groupby([untagged,constituents['context']]).sum()

    return summed.to_frame().reset_index()

In [8]:
def process_cutoff_compound(df,cutoff=None):
    """Keep compounds whose total count in a time bin exceeds the cut-off.

    Counts are summed per (modifier, head, time). Every row of a group is
    kept when that sum is strictly greater than the cut-off, so a group whose
    total equals the cut-off is dropped.

    Args:
        df: frame with ``modifier``, ``head``, ``time`` and ``count`` columns.
        cutoff: threshold to apply. When omitted, the notebook-level
            ``args.cutoff`` is used, which matches the previous behaviour.

    Returns:
        The surviving rows of ``df``.
    """
    if cutoff is None:
        cutoff=args.cutoff

    group_totals=df.groupby(['modifier','head','time'])['count'].transform('sum')

    return df.loc[group_totals.gt(cutoff)]


def process_cutoff_constituent(df,ctype='word',cutoff=None):
    """Keep constituents whose total count in a time bin exceeds the cut-off.

    Counts are summed per (``ctype``, time). Every row of a group is kept
    when that sum is strictly greater than the cut-off, so a group whose
    total equals the cut-off is dropped.

    Args:
        df: frame with ``ctype``, ``time`` and ``count`` columns.
        ctype: name of the constituent column.
        cutoff: threshold to apply. When omitted, the notebook-level
            ``args.cutoff`` is used, which matches the previous behaviour.

    Returns:
        The surviving rows of ``df``.
    """
    if cutoff is None:
        cutoff=args.cutoff

    group_totals=df.groupby([ctype,'time'])['count'].transform('sum')

    return df.loc[group_totals.gt(cutoff)]

In [9]:
def ppmi(ppmi_df):
    """Convert co-occurrence counts to positive pointwise mutual information.

    For each row the value ``log2((XY * N) / (X * Y))`` is computed, where

    * ``XY`` is the row's own count,
    * ``Y`` is the total count of the row's context within its time bin,
    * ``X`` is the total count of the row's item (all columns other than
      ``context`` and the count) within its time bin,
    * ``N`` is the total count of the time bin.

    Rows with a negative value are dropped; the value is returned in a
    ``count`` column.

    The input frame is left untouched. The previous version renamed the
    caller's ``count`` column to ``XY`` in place and dropped columns in place
    on a ``.loc`` slice.

    Args:
        ppmi_df: frame with ``time``, ``context``, a column whose name
            contains ``count``, and one or more item columns.

    Returns:
        A new frame with the item columns, ``time``, ``context`` and the
        PPMI value in ``count``.
    """
    # Rename on a new frame so the caller keeps its 'count' column.
    counts_df=ppmi_df.rename(columns=lambda col: 'XY' if 'count' in col else col)
    ppmi_cols=counts_df.columns.tolist()

    # N: total count per time bin.
    ppmi_time_counts=counts_df.groupby('time')['XY'].sum().to_frame()
    ppmi_time_counts.columns=['N']

    # Y: marginal count of each context per time bin.
    Y_star=counts_df.groupby(['context','time'])['XY'].sum().to_frame()
    Y_star.columns=['Y']
    counts_df=pd.merge(counts_df,Y_star.reset_index(),on=['context','time'])

    # X: marginal count of each item per time bin.
    X_cols=[x for x in ppmi_cols if x not in ['context','XY']]
    X_star=counts_df.groupby(X_cols)['XY'].sum().to_frame()
    X_star.columns=['X']
    counts_df=pd.merge(counts_df,X_star.reset_index(),on=X_cols)

    counts_df=pd.merge(counts_df,ppmi_time_counts.reset_index(),on=['time'])
    counts_df['count']=np.log2((counts_df['XY']*counts_df['N'])/(counts_df['X']*counts_df['Y']))

    # Keep only non-negative PMI values and remove the helper columns.
    positive_df=counts_df.loc[counts_df['count']>=0]

    return positive_df.drop(columns=['XY','X','Y','N'])

In [10]:
def merge_comp_ratings(features_df):
    """Reshape per-time features to wide format and attach the compound ratings.

    The features are pivoted so that each (modifier, head) pair is one row and
    each feature / time combination becomes a column named
    ``<feature>:<time>``. Two versions are merged with the notebook-level
    ``comp_ratings_df`` on (modifier, head):

    * one that keeps missing values as they are;
    * one where missing values are replaced by the column median.

    Args:
        features_df: long-format frame with ``modifier``, ``head``, ``time``
            and one or more feature columns.

    Returns:
        Tuple ``(ratings_with_na, ratings_median_imputed)``.
    """
    rating_keys=['modifier','head']

    wide_df=pd.pivot_table(features_df, index=rating_keys, columns=['time'])

    # Flatten the (feature, time) column levels into "feature:time" labels.
    feature_level=wide_df.columns.get_level_values(0)
    time_level=wide_df.columns.get_level_values(1)
    wide_df.columns=[feature+":"+str(year) for feature,year in zip(feature_level,time_level)]

    cur_ratings_df_na=wide_df.reset_index().merge(comp_ratings_df,on=rating_keys)

    # Median-impute, then restore the labels on the imputer's output.
    df_med=pd.DataFrame(SimpleImputer(strategy="median").fit_transform(wide_df))
    df_med.columns=wide_df.columns
    df_med.index=wide_df.index

    cur_ratings_df_med=df_med.reset_index().merge(comp_ratings_df,on=rating_keys)

    return cur_ratings_df_na,cur_ratings_df_med

In [11]:
# Decades are processed in three blocks: 1820-1890, 1900-1990 and 2000-2010.
total_dec_list=[list(range(1820,1900,10)),list(range(1900,2000,10)),[2000,2010]]

# Labels derived from the flags; the notebook uses them in file names.
ppmi_str="PPMI" if args.ppmi else "RAW"
tag_str='Tagged' if args.tag else 'UnTagged'

temp_cutoff_str='_'.join([str(args.temporal),str(args.cutoff)])
temp_cutoff_str

'10_0'

In [12]:
    # Distinct modifiers and heads in the ratings, plus their combined set.
    unique_mod_list=comp_ratings_df['modifier'].drop_duplicates().to_list()
    unique_head_list=comp_ratings_df['head'].drop_duplicates().to_list()
    unique_constituent_list=list(set(unique_mod_list+unique_head_list))

In [33]:
# One list per dataset; each receives one filtered frame per decade block.
compounds_agnostic_list,constituents_list=[],[]
compounds_aware_list,modifiers_aware_list,heads_aware_list=[],[],[]

for dec_list in total_dec_list:

    print(f'Current dec list {dec_list}')

    # Compound-agnostic data: phrases and plain words.
    cur_compounds_agnostic=process_decades_compound(
        dec_list,f'{args.inputdir}',unique_mod_list,unique_head_list,ctype="phrase")
    cur_constituents=process_decades_constituent(
        dec_list,f'{args.inputdir}',unique_constituent_list,ctype='word')

    # Compound-aware data: compounds and their modifiers / heads.
    cur_compounds_aware=process_decades_compound(
        dec_list,f'{args.inputdir}',unique_mod_list,unique_head_list,ctype="compound")
    cur_modifiers_aware=process_decades_constituent(
        dec_list,f'{args.inputdir}',unique_mod_list,ctype='modifier')
    cur_heads_aware=process_decades_constituent(
        dec_list,f'{args.inputdir}',unique_head_list,ctype='head')

    for block_list,block_df in (
        (compounds_agnostic_list,cur_compounds_agnostic),
        (constituents_list,cur_constituents),
        (compounds_aware_list,cur_compounds_aware),
        (modifiers_aware_list,cur_modifiers_aware),
        (heads_aware_list,cur_heads_aware),
    ):
        block_list.append(block_df)

Current dec list [1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890]
Reading file
Reading file
Reading file
Reading file
Reading file
Current dec list [1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990]
Reading file
Reading file
Reading file
Reading file
Reading file
Current dec list [2000, 2010]
Reading file
Reading file
Reading file
Reading file
Reading file


In [38]:
# Stitch the per-block frames of every dataset into one frame with a fresh index.
compounds_agnostic,constituents,compounds_aware,modifiers_aware,heads_aware=(
    pd.concat(block_frames,ignore_index=True)
    for block_frames in (
        compounds_agnostic_list,
        constituents_list,
        compounds_aware_list,
        modifiers_aware_list,
        heads_aware_list,
    )
)

In [39]:

    # Apply the frequency cut-off to all five datasets; a cut-off of 0 leaves
    # them untouched.
    if args.cutoff!=0:
        print(f'Cut-off: {args.cutoff}')

        # The compound frames are printed before and after filtering.
        print(compounds_aware)
        compounds_aware=process_cutoff_compound(compounds_aware)
        print(compounds_aware)

        print(compounds_agnostic)
        compounds_agnostic=process_cutoff_compound(compounds_agnostic)
        print(compounds_agnostic)

        constituents=process_cutoff_constituent(constituents,ctype='word')
        modifiers_aware=process_cutoff_constituent(modifiers_aware,ctype='modifier')
        heads_aware=process_cutoff_constituent(heads_aware,ctype='head')
    else:
        print('No cut-off applied')
    
    

No cut-off applied


In [40]:
    # Optionally replace the raw counts of all five datasets by PPMI values
    # (only when --ppmi is set).
    if args.ppmi:
        print('Applying PPMI')
        compounds_aware=ppmi(compounds_aware)
        modifiers_aware=ppmi(modifiers_aware)
        heads_aware=ppmi(heads_aware)
                        
        compounds_agnostic=ppmi(compounds_agnostic)
        constituents=ppmi(constituents)
    # ---- CompoundAware bookkeeping ----
    # Time bins that occur in the compound-aware data.
    timespan_list_aware_df=pd.DataFrame(compounds_aware.time.unique())
    timespan_list_aware_df.columns=['time']

    # Full grid: every rated compound / modifier / head crossed with every time bin.
    compound_list_aware_df=comp_ratings_df[['modifier','head']].copy()
    compound_list_aware_df=compound_list_aware_df.merge(timespan_list_aware_df,how='cross')

    modifier_list_aware_df=comp_ratings_df[['modifier']].drop_duplicates().copy()
    modifier_list_aware_df=modifier_list_aware_df.merge(timespan_list_aware_df,how='cross')

    head_list_aware_df=comp_ratings_df[['head']].drop_duplicates().copy()
    head_list_aware_df=head_list_aware_df.merge(timespan_list_aware_df,how='cross')
            
    # Combinations that are actually observed in the compound-aware data.
    all_comps_aware=compounds_aware[['modifier','head','time']].copy()
    all_comps_aware.drop_duplicates(inplace=True)
           
    # NOTE(review): observed modifiers and heads are taken from compounds_aware,
    # not from modifiers_aware / heads_aware — confirm this is intended.
    all_mods_aware=compounds_aware[['modifier','time']].copy()
    all_mods_aware.drop_duplicates(inplace=True)
            
    all_heads_aware=compounds_aware[['head','time']].copy()
    all_heads_aware.drop_duplicates(inplace=True)
            
    # Grid rows without an observed counterpart: the 'left_only' rows of an
    # outer merge between the grid and the observed combinations.
    # NOTE(review): the not_found_* frames are not referenced again in the
    # visible part of the notebook.
    not_found_compounds_aware_df=compound_list_aware_df.merge(all_comps_aware, on=['modifier','head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_compounds_aware_df=not_found_compounds_aware_df.loc[not_found_compounds_aware_df['_merge']=='left_only']
    not_found_compounds_aware_df.drop('_merge',axis=1,inplace=True)
            
            
    not_found_modifiers_aware_df=modifier_list_aware_df.merge(all_mods_aware, on=['modifier','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_modifiers_aware_df=not_found_modifiers_aware_df.loc[not_found_modifiers_aware_df['_merge']=='left_only']
    not_found_modifiers_aware_df.drop('_merge',axis=1,inplace=True)
            
    not_found_heads_aware_df=head_list_aware_df.merge(all_heads_aware, on=['head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_heads_aware_df=not_found_heads_aware_df.loc[not_found_heads_aware_df['_merge']=='left_only']
    not_found_heads_aware_df.drop('_merge',axis=1,inplace=True)

    
    
    # ---- CompoundAgnostic bookkeeping (same steps on the phrase data) ----
    timespan_list_agnostic_df=pd.DataFrame(compounds_agnostic.time.unique())
    timespan_list_agnostic_df.columns=['time']

    compound_list_agnostic_df=comp_ratings_df[['modifier','head']].copy()
    compound_list_agnostic_df=compound_list_agnostic_df.merge(timespan_list_agnostic_df,how='cross')

    modifier_list_agnostic_df=comp_ratings_df[['modifier']].drop_duplicates().copy()
    modifier_list_agnostic_df=modifier_list_agnostic_df.merge(timespan_list_agnostic_df,how='cross')

    head_list_agnostic_df=comp_ratings_df[['head']].drop_duplicates().copy()
    head_list_agnostic_df=head_list_agnostic_df.merge(timespan_list_agnostic_df,how='cross')
            
    all_comps_agnostic=compounds_agnostic[['modifier','head','time']].copy()
    all_comps_agnostic.drop_duplicates(inplace=True)
           
    all_mods_agnostic=compounds_agnostic[['modifier','time']].copy()
    all_mods_agnostic.drop_duplicates(inplace=True)
            
    all_heads_agnostic=compounds_agnostic[['head','time']].copy()
    all_heads_agnostic.drop_duplicates(inplace=True)
            
    not_found_compounds_agnostic_df=compound_list_agnostic_df.merge(all_comps_agnostic, on=['modifier','head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_compounds_agnostic_df=not_found_compounds_agnostic_df.loc[not_found_compounds_agnostic_df['_merge']=='left_only']
    not_found_compounds_agnostic_df.drop('_merge',axis=1,inplace=True)
                    
    not_found_modifiers_agnostic_df=modifier_list_agnostic_df.merge(all_mods_agnostic, on=['modifier','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_modifiers_agnostic_df=not_found_modifiers_agnostic_df.loc[not_found_modifiers_agnostic_df['_merge']=='left_only']
    not_found_modifiers_agnostic_df.drop('_merge',axis=1,inplace=True)
            
    not_found_heads_agnostic_df=head_list_agnostic_df.merge(all_heads_agnostic, on=['head','time'], how='outer', suffixes=['', '_'], indicator=True)
    not_found_heads_agnostic_df=not_found_heads_agnostic_df.loc[not_found_heads_agnostic_df['_merge']=='left_only']
    not_found_heads_agnostic_df.drop('_merge',axis=1,inplace=True)
    
    # Restrict both compound datasets to the rated (modifier, head) pairs via
    # an inner merge.
    # NOTE(review): comp_ratings_df is concatenated from three files without
    # de-duplication, so a pair listed in several files would repeat the
    # matching rows here — confirm whether that can happen.
    compounds_aware=compounds_aware.merge(comp_ratings_df[['modifier','head']],on=['modifier','head'])

    compounds_agnostic=compounds_agnostic.merge(comp_ratings_df[['modifier','head']],on=['modifier','head'])

    # Compound-agnostic heads: the word data with its 'word' column renamed to
    # 'head', limited to the rated heads.
    heads_agnostic=constituents.copy()
    heads_agnostic_cols=heads_agnostic.columns
    heads_agnostic_cols=['head' if 'word' in x else x for x in heads_agnostic_cols]
    heads_agnostic.columns=heads_agnostic_cols
    heads_agnostic=heads_agnostic.loc[heads_agnostic['head'].isin(unique_head_list)]


    # Compound-agnostic modifiers: same construction with 'modifier'.
    modifiers_agnostic=constituents.copy()
    modifiers_agnostic_cols=modifiers_agnostic.columns
    modifiers_agnostic_cols=['modifier' if 'word' in x else x for x in modifiers_agnostic_cols]
    modifiers_agnostic.columns=modifiers_agnostic_cols
    modifiers_agnostic=modifiers_agnostic.loc[modifiers_agnostic.modifier.isin(unique_mod_list)]
    
    print('Calculating features')
    

    
    print('CompoundAware features')

Calculating features
CompoundAware features


In [41]:
def cosine_bw_rows(df):
    """Cosine similarity between every row and the row directly above it.

    The first row has no predecessor; its similarity is undefined and is
    dropped, as is any other row whose similarity evaluates to NaN.

    Args:
        df: numeric frame with a MultiIndex. With three index levels the first
            two are removed from the result; otherwise only the first is.

    Returns:
        Single-column frame of similarities, indexed by the remaining index
        level(s).
    """
    previous_rows=df.shift()

    dot_products=(df*previous_rows).sum(axis=1)
    norm_products=np.sqrt((df**2).sum(axis=1)*(previous_rows**2).sum(axis=1))
    similarity=dot_products/norm_products

    # Remove the leading index level(s) so only the trailing one remains.
    leading_levels=[0,1] if df.index.nlevels==3 else [0]
    similarity=similarity.reset_index(level=leading_levels,drop=True)

    return similarity.dropna().to_frame()

In [42]:
def temporal_features(compounds,modifiers,heads,compound_list_df):
    """Compute change scores for compounds, modifiers and heads and join them.

    Each dataset is pivoted into one context-count vector per item and time
    bin (missing contexts filled with 0). Per item, the cosine similarity
    between consecutive time bins is computed with ``cosine_bw_rows``.

    The three score tables are then combined: compound scores are
    right-joined onto ``compound_list_df``, the result is right-joined onto
    the modifier scores, and finally inner-joined with the head scores.

    Args:
        compounds: long frame with ``modifier``, ``head``, ``time``,
            ``context``, ``count``.
        modifiers: long frame with ``modifier``, ``time``, ``context``, ``count``.
        heads: long frame with ``head``, ``time``, ``context``, ``count``.
        compound_list_df: frame with ``modifier``, ``head``, ``time`` rows to
            keep in the first join.

    Returns:
        Frame with ``change_comp``, ``change_mod`` and ``change_head`` columns
        alongside the join keys.
    """
    def _context_vectors(long_df,item_keys):
        # One row per item and time bin, one column per context.
        return pd.pivot_table(long_df, values='count', index=item_keys,
                       columns=['context'], aggfunc="sum",fill_value=0)

    compounds_pivot=_context_vectors(compounds,['modifier','head', 'time'])
    modifiers_pivot=_context_vectors(modifiers,['modifier','time'])
    heads_pivot=_context_vectors(heads,['head','time'])

    change_compounds_df=compounds_pivot.groupby(level=[0,1]).apply(cosine_bw_rows)
    change_compounds_df.columns=['change_comp']

    change_modifiers_df=modifiers_pivot.groupby(level=[0]).apply(cosine_bw_rows)
    change_modifiers_df.columns=['change_mod']

    change_heads_df=heads_pivot.groupby(level=[0]).apply(cosine_bw_rows)
    change_heads_df.columns=['change_head']

    changed_df=change_compounds_df.reset_index().merge(
        compound_list_df,on=['modifier','head','time'],how='right')
    changed_df=changed_df.merge(
        change_modifiers_df.reset_index(),on=['modifier','time'],how='right')

    return changed_df.merge(change_heads_df.reset_index(),on=['head','time'])

In [43]:
# Per modifier and time bin: compound-aware count sum and row count, each
# divided by the compound-agnostic equivalent.
num_modifiers_features_df=(
    modifiers_aware.groupby(['modifier','time'])['count']
    .agg(perc_token_modifier='sum', perc_type_modifier='size')
    .div(
        modifiers_agnostic.groupby(['modifier','time'])['count']
        .agg(perc_token_modifier='sum', perc_type_modifier='size')
    )
    .reset_index()
)
num_modifiers_features_df

Unnamed: 0,modifier,time,perc_token_modifier,perc_type_modifier
0,academy,1820,0.003118,0.034500
1,academy,1830,0.003979,0.041901
2,academy,1840,0.005512,0.052464
3,academy,1850,0.010834,0.073388
4,academy,1860,0.015582,0.086483
...,...,...,...,...
4832,zebra,1970,0.244970,0.297218
4833,zebra,1980,0.449932,0.419554
4834,zebra,1990,0.607710,0.491024
4835,zebra,2000,0.502937,0.453294


In [49]:
# Per head and time bin: compound-aware count sum and row count, each divided
# by the compound-agnostic equivalent.
num_heads_features_df=(
    heads_aware.groupby(['head','time'])['count']
    .agg(perc_token_head='sum', perc_type_head='size')
    .div(
        heads_agnostic.groupby(['head','time'])['count']
        .agg(perc_token_head='sum', perc_type_head='size')
    )
    .reset_index()
)
num_heads_features_df

Unnamed: 0,head,time,perc_token_head,perc_type_head
0,account,1820,0.273542,0.301059
1,account,1830,0.267762,0.317516
2,account,1840,0.272242,0.319656
3,account,1850,0.267778,0.329674
4,account,1860,0.269994,0.329282
...,...,...,...,...
4889,year,1970,0.399238,0.639785
4890,year,1980,0.381366,0.635348
4891,year,1990,0.363366,0.634835
4892,year,2000,0.351876,0.637860


In [55]:
# Per compound and time bin: aware / agnostic ratios of count sum and row
# count, joined with the modifier and head ratios for the same time bin.
num_compounds_features_df=(
    compounds_aware.groupby(['modifier','head','time'])['count']
    .agg(perc_token_comp='sum', perc_type_comp='size')
    .div(
        compounds_agnostic.groupby(['modifier','head','time'])['count']
        .agg(perc_token_comp='sum', perc_type_comp='size')
    )
    .reset_index()
    .merge(num_modifiers_features_df,on=['modifier','time'])
    .merge(num_heads_features_df,on=['head','time'])
)
# NOTE(review): the result is only displayed, not assigned to a name.
merge_comp_ratings(num_compounds_features_df)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(        modifier        head  perc_token_comp:1820  perc_token_comp:1830  \
 0        academy       award                   NaN              0.515152   
 1           acid        test              1.000000              0.909091   
 2          agony        aunt                   NaN                   NaN   
 3        ancient     history              0.997131              0.994727   
 4    application        form              1.000000              0.926680   
 ..           ...         ...                   ...                   ...   
 278        white      spirit              0.946360              0.834395   
 279       winter    solstice              1.000000              1.000000   
 280         word    painting                   NaN                   NaN   
 281        world  conference              1.000000                   NaN   
 282        zebra    crossing                   NaN                   NaN   
 
      perc_token_comp:1840  perc_token_comp:1850  perc_token_comp:1860  \


In [25]:
# Change scores for the compound-aware datasets.
change_aware_df=temporal_features(
    compounds=compounds_aware,
    modifiers=modifiers_aware,
    heads=heads_aware,
    compound_list_df=all_comps_aware,
)
change_aware_df

       modifier     head  time  change_comp  change_mod
0       academy    award  1830          NaN    0.310599
1       academy     land  1830          NaN    0.310599
2       academy      lot  1830          NaN    0.310599
3       academy  picture  1830          NaN    0.310599
4       academy     room  1830          NaN    0.310599
...         ...      ...   ...          ...         ...
283569    zebra    horse  2010          NaN    0.960327
283570    zebra     line  2010          NaN    0.960327
283571    zebra    print  2010          NaN    0.960327
283572    zebra    shark  2010          NaN    0.960327
283573    zebra    study  2010          NaN    0.960327

[283574 rows x 5 columns]
          modifier      head  time  change_comp  change_mod  change_head
0          academy     award  1830          NaN    0.310599     0.918114
1            blind     award  1830          NaN    0.995921     0.918114
2             case     award  1830          NaN    0.848085     0.918114
3        

Unnamed: 0,modifier,head,time,change_comp,change_mod,change_head
0,academy,award,1830,,0.310599,0.918114
1,blind,award,1830,,0.995921,0.918114
2,case,award,1830,,0.848085,0.918114
3,cash,award,1830,,0.907187,0.918114
4,disability,award,1830,,0.349618,0.918114
...,...,...,...,...,...,...
279397,winter,solstice,1900,0.970616,0.995596,0.999839
279398,word,solstice,1900,,0.960697,0.999839
279399,winter,solstice,1910,0.983041,0.985240,0.999717
279400,word,solstice,1910,,0.940345,0.999717


In [26]:
# Change scores for the compound-agnostic datasets.
change_agnostic_df=temporal_features(
    compounds=compounds_agnostic,
    modifiers=modifiers_agnostic,
    heads=heads_agnostic,
    compound_list_df=all_comps_agnostic,
)
change_agnostic_df

       modifier     head  time  change_comp  change_mod
0       academy    award  1830          NaN    0.910445
1       academy     land  1830          NaN    0.910445
2       academy      lot  1830          NaN    0.910445
3       academy  picture  1830          NaN    0.910445
4       academy     room  1830          NaN    0.910445
...         ...      ...   ...          ...         ...
290853    zebra    horse  2010          NaN    0.949693
290854    zebra     line  2010          NaN    0.949693
290855    zebra    print  2010          NaN    0.949693
290856    zebra    shark  2010          NaN    0.949693
290857    zebra    study  2010          NaN    0.949693

[290858 rows x 5 columns]
          modifier      head  time  change_comp  change_mod  change_head
0          academy     award  1830          NaN    0.910445     0.981156
1            blind     award  1830          NaN    0.993280     0.981156
2             case     award  1830          NaN    0.993940     0.981156
3        

Unnamed: 0,modifier,head,time,change_comp,change_mod,change_head
0,academy,award,1830,,0.910445,0.981156
1,blind,award,1830,,0.993280,0.981156
2,case,award,1830,,0.993940,0.981156
3,cash,award,1830,,0.821940,0.981156
4,disability,award,1830,,0.903540,0.981156
...,...,...,...,...,...,...
290781,winter,solstice,1900,0.969173,0.993283,0.999591
290782,word,solstice,1900,,0.991863,0.999591
290783,winter,solstice,1910,0.976609,0.981344,0.999290
290784,word,solstice,1910,,0.982114,0.999290


In [27]:
# Wide-format change features merged with the ratings: one version keeps the
# missing values, the other is median-imputed.
(cur_ratings_aware_df_na,
 cur_ratings_aware_df_med)=merge_comp_ratings(features_df=change_aware_df)
(cur_ratings_agnostic_df_na,
 cur_ratings_agnostic_df_med)=merge_comp_ratings(features_df=change_agnostic_df)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [30]:
print('Saving feature datasets')

# Output path -> frame; every frame is written tab-separated without its index.
feature_outputs={
    f'{args.outputdir}/temporal_CompoundAware_withSetting_{ppmi_str}_{tag_str}_{temp_cutoff_str}_na.csv': cur_ratings_aware_df_na,
    f'{args.outputdir}/temporal_CompoundAware_withSetting_{ppmi_str}_{tag_str}_{temp_cutoff_str}_med.csv': cur_ratings_aware_df_med,
    f'{args.outputdir}/temporal_CompoundAgnostic_withSetting_{ppmi_str}_{tag_str}_{temp_cutoff_str}_na.csv': cur_ratings_agnostic_df_na,
    f'{args.outputdir}/temporal_CompoundAgnostic_withSetting_{ppmi_str}_{tag_str}_{temp_cutoff_str}_med.csv': cur_ratings_agnostic_df_med,
}
for output_path,output_df in feature_outputs.items():
    output_df.to_csv(output_path,sep='\t',index=False)

Saving feature datasets


In [38]:
# Display the output directory currently held by the kernel.
# NOTE(review): the recorded output of this cell differs from the --outputdir
# value in the hard-coded parse_args string, which suggests the arguments were
# changed after that cell was last saved — confirm with Restart & Run All.
args.outputdir

'/data/dharp/compounds/datasets/features/'