In [53]:
# --- Numerical / data-frame stack -------------------------------------------
import numpy as np
import pandas as pd
import pickle as pkl
# Render floats with three decimals in displayed frames.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split,LeaveOneOut,KFold,RepeatedStratifiedKFold
# --- Plotting ----------------------------------------------------------------
%matplotlib inline
import matplotlib
#matplotlib.use('agg')
matplotlib.style.use('ggplot')

from matplotlib import pyplot as plt
from collections import Counter
from functools import reduce
# --- NLP helpers -------------------------------------------------------------
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by lemma_maker().
lemmatizer = WordNetLemmatizer()
import nltk
from nltk.tag.stanford import StanfordNERTagger
# Shared Stanford NER tagger used by ner_checker(); the model and jar paths are
# relative to the notebook's working directory.
st = StanfordNERTagger('unigrams/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz','unigrams/stanford-ner-2018-02-27/stanford-ner-3.9.1.jar',encoding='utf-8')


In [54]:
# British -> American spelling map built from the UK and US columns of Book.xlsx.
br_to_us = pd.read_excel("Book.xlsx")
br_to_us_dict = dict(zip(br_to_us.UK.tolist(), br_to_us.US.tolist()))

# Decade labels; the element order is kept exactly as originally written because
# old_decades is derived positionally (everything except the first entry, 2000).
decades = [2000, 1990, 1980, 1970, 1960, 1950, 1940, 1930, 1920, 1900, 1910,
           1890, 1880, 1870, 1850, 1860, 1840, 1830, 1820, 1810, 1800]
old_decades = decades[1:]

# Human-readable labels for the numeric class codes.
l1 = dict(zip([1, -1], ['Positive', 'Negative']))

# Per-column value replacements handed to DataFrame.replace() by the study cells.
replacements = {'modifier': br_to_us_dict, 'head': br_to_us_dict, 'Class': l1}

# Constituents excluded from the head / modifier / compound tables further down.
# The files are opened in `with` blocks so the handles are closed deterministically.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("new_heads.p", "rb") as heads_file:
    new_heads = pkl.load(heads_file)
with open("new_modifiers.p", "rb") as modifiers_file:
    new_modifiers = pkl.load(modifiers_file)

In [55]:
def tupleconverter(tup):
    """Group the second items of (word, decade) pairs by their word.

    Args:
        tup: iterable of 2-item pairs ``(word, decade)``.

    Returns:
        dict mapping each word to the list of decades it was paired with,
        in order of appearance (duplicates are kept).
    """
    grouped = {}
    for token, dec in tup:
        # setdefault creates the bucket on first sight of a word.
        grouped.setdefault(token, []).append(dec)
    return grouped


def decremover(dc, decade=2000):
    """Return the constituents that are attested in exactly one decade, `decade`.

    Args:
        dc: mapping of constituent -> collection of decades (for example the
            output of ``tupleconverter``).
        decade: the only decade a constituent may appear in to be selected.
            Defaults to 2000, which was previously hard-coded.

    Returns:
        list of constituents whose decade collection has length 1 and contains
        `decade`, in the mapping's iteration order.
    """
    return [
        constituent
        for constituent, seen_decades in dc.items()
        if len(seen_decades) == 1 and decade in seen_decades
    ]




def magnituder(x):
    """Return the Euclidean (L2) norm of `x`: sqrt of the sum of squared entries."""
    sum_of_squares = np.sum(np.square(x))
    return np.sqrt(sum_of_squares)

def new_compound_finder(df,decade=2000):
    """Return the keys of `compound_decades` attested only in `decade`.

    NOTE(review): `df` is accepted but never used; the function reads the
    module-level name `compound_decades`, which must exist when it is called.

    Args:
        df: unused.
        decade: the single decade a compound must be attested in.

    Returns:
        list of keys whose value has length 1 and contains `decade`.
    """
    return [
        key
        for key, val in compound_decades.items()
        if len(val) == 1 and decade in val
    ]

def lemma_maker(x):
    """Return the lemma of `x` as produced by the shared `lemmatizer` instance."""
    lemma = lemmatizer.lemmatize(x)
    return lemma

def ner_checker(stat):
    """Report whether the first two tokens of `stat` are both tagged PERSON.

    Args:
        stat: either a whitespace-separated string (e.g. ``"alice bell"``) or a
            sequence of tokens.

    Returns:
        tuple ``(stat, flag)`` where `stat` is returned unchanged and `flag` is
        True only when the tagger labels both of the first two tokens "PERSON".
        Inputs with fewer than two tokens yield ``(stat, False)``.
    """
    # The tagger works on a token sequence; a bare string would be consumed
    # character by character, so split it into words first.
    tokens = stat.split() if isinstance(stat, str) else list(stat)
    if len(tokens) < 2:
        # Nothing to compare; avoid both the tagger call and an IndexError.
        return stat, False
    tags = st.tag(tokens)
    both_person = (
        len(tags) >= 2
        and tags[0][1] == "PERSON"
        and tags[1][1] == "PERSON"
    )
    return stat, both_person


## Loading the individual datasets of Modifiers, Heads and Compounds

In [56]:
# Head/context co-occurrence counts: file columns 1, 2 and 4 of the tab-separated dump.
heads = pd.read_csv("heads.csv", sep="\t", header=None, usecols=[1, 2, 4])
heads.columns = ["head", "context", "count"]

# Grand total taken before any filtering.
total_head_count = heads['count'].sum()

# Collapse duplicate (head, context) rows, drop the heads listed in new_heads,
# and index the result by head.
heads = heads.groupby(['head', 'context'])['count'].sum().reset_index()
heads = heads[~heads['head'].isin(new_heads)].set_index('head')
heads

Unnamed: 0_level_0,context,count
head,Unnamed: 1_level_1,Unnamed: 2_level_1
a_n,abridge_v,59
a_n,abstract_n,569
a_n,accredited_a,74
a_n,acquire_v,116
a_n,activity_n,105
a_n,acute_a,44
a_n,alarm_n,46
a_n,almanac_n,899
a_n,always_r,509
a_n,amend_v,204


In [57]:
# Total count per head, summed over all of its contexts.
head_counts = heads.groupby('head')['count'].sum().to_frame(name='head_count')
head_counts

Unnamed: 0_level_0,head_count
head,Unnamed: 1_level_1
a_n,73116
aa_n,356
aaron_n,984
ab_n,8318
ababa_n,587
abandonment_n,432
abatement_n,350
abba_n,150
abbas_n,732
abbess_n,472


In [58]:
# Modifier/context co-occurrence counts: file columns 1, 2 and 4 of the tab-separated dump.
modifiers = pd.read_csv("modifiers.csv", sep="\t", header=None, usecols=[1, 2, 4])
modifiers.columns = ["modifier", "context", "count"]

# Grand total taken before any filtering.
total_modifier_count = modifiers['count'].sum()

# Collapse duplicate (modifier, context) rows, drop the modifiers listed in
# new_modifiers, and index the result by modifier.
modifiers = modifiers.groupby(['modifier', 'context'])['count'].sum().reset_index()
modifiers = modifiers[~modifiers['modifier'].isin(new_modifiers)].set_index('modifier')
modifiers

Unnamed: 0_level_0,context,count
modifier,Unnamed: 1_level_1,Unnamed: 2_level_1
a_n,almanac_n,100
a_n,b_n,55
a_n,be_v,185
a_n,bear_v,88
a_n,bell_n,173
a_n,best_a,159
a_n,book_n,44
a_n,broadway_a,88
a_n,c_n,55
a_n,cassell_n,194


In [59]:
# Total count per modifier, summed over all of its contexts.
modifier_counts = modifiers.groupby('modifier')['count'].sum().to_frame(name='modifier_count')
modifier_counts

Unnamed: 0_level_0,modifier_count
modifier,Unnamed: 1_level_1
a_n,6326
aa_n,2571
aaa_n,147
aaron_n,6724
ab_n,1514
aba_n,2560
ababa_n,224
abatement_n,2076
abbe_n,252
abbey_n,831


In [60]:
# Compound/context co-occurrence counts: file columns 1, 2, 3 and 5 of the tab-separated dump.
compounds = pd.read_csv("compounds.csv", sep="\t", header=None, usecols=[1, 2, 3, 5])
compounds.columns = ["modifier", "head", "context", "count"]

# Collapse duplicate (modifier, head, context) rows.
compounds = compounds.groupby(['modifier', 'head', 'context'])['count'].sum().reset_index()

# Drop compounds whose modifier is in new_modifiers AND whose head is in
# new_heads. The head was previously tested against new_modifiers, which looks
# like a copy-paste slip given that heads are filtered with new_heads elsewhere.
# NOTE(review): the conjunction (both constituents must be listed) is kept as
# written; confirm that "either constituent" was not intended.
both_new = compounds['modifier'].isin(new_modifiers) & compounds['head'].isin(new_heads)
compounds = compounds.loc[~both_new]

# Drop reduplications where modifier and head are the same token.
compounds = compounds.loc[compounds['modifier'] != compounds['head']].copy()

# Surface form: both constituents with their two-character suffix (e.g. "_n") removed.
compounds['compound'] = compounds['modifier'].str[:-2] + " " + compounds['head'].str[:-2]
#compounds['NER']=np.vectorize(ner_checker)(compounds['compound'])
compounds = compounds.set_index(['modifier', 'head'])
compounds

Unnamed: 0_level_0,Unnamed: 1_level_0,context,count,compound
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a_n,bell_n,international_a,173,a bell
a_n,bell_n,microfilm_n,173,a bell
a_n,bell_n,university_n,173,a bell
a_n,c_n,b_n,55,a c
a_n,c_n,be_v,55,a c
a_n,c_n,let_v,55,a c
a_n,cassell_n,house_n,106,a cassell
a_n,cassell_n,imprint_v,194,a cassell
a_n,cassell_n,press_n,88,a cassell
a_n,cassell_n,university_n,44,a cassell


In [61]:
# Total count per (modifier, head) pair, summed over all of its contexts.
compound_counts = (
    compounds.groupby(['modifier', 'head'])['count']
    .sum()
    .to_frame(name='compound_count')
)
compound_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,compound_count
modifier,head,Unnamed: 2_level_1
a_n,bell_n,519
a_n,c_n,165
a_n,cassell_n,582
a_n,chorus_n,264
a_n,colonial_n,126
a_n,man_n,129
a_n,publisher_n,477
a_n,sand_n,300
a_n,series_n,1224
a_n,star_n,264


In [62]:
# Association measures for every (modifier, head) pair, derived from the 2x2
# contingency table
#
#                 head      not head
#   modifier        a           b        | x_star
#   not modifier    c           d        | x_bar_star
#                 star_y    star_y_bar   | N
#
# Work on a copy so that renaming the column does not silently rename the
# column of compound_counts as well (the frames used to be aliases).
information_feat = compound_counts.copy()
information_feat.columns = ['a']

# Marginal totals per modifier and per head.
X_star = compounds.groupby(['modifier'])['count'].sum().to_frame()
X_star.columns = ['x_star']
Y_star = compounds.groupby(['head'])['count'].sum().to_frame()
Y_star.columns = ['star_y']

information_feat = pd.merge(information_feat.reset_index(), X_star.reset_index(), on=['modifier'])
information_feat = pd.merge(information_feat, Y_star.reset_index(), on=['head'])

information_feat['b'] = information_feat['x_star'] - information_feat['a']
information_feat['c'] = information_feat['star_y'] - information_feat['a']
# Grand total of the compound counts; selected by column position so it does
# not depend on the column's current name or on deprecated positional Series indexing.
information_feat['N'] = compound_counts.iloc[:, 0].sum()
information_feat['d'] = information_feat['N'] - (information_feat['a'] + information_feat['b'] + information_feat['c'])
information_feat['x_bar_star'] = information_feat['N'] - information_feat['x_star']
information_feat['star_y_bar'] = information_feat['N'] - information_feat['star_y']
information_feat = information_feat.set_index(['modifier', 'head'])

# Replace exact zeros with a small constant so the logarithms below stay finite.
information_feat = information_feat.replace(0, 0.001)

# Log-likelihood ratio: 2 * sum over the four cells of observed * ln(observed / expected).
information_feat['log_ratio'] = 2 * (
    information_feat['a'] * np.log((information_feat['a'] * information_feat['N']) / (information_feat['x_star'] * information_feat['star_y']))
    + information_feat['b'] * np.log((information_feat['b'] * information_feat['N']) / (information_feat['x_star'] * information_feat['star_y_bar']))
    + information_feat['c'] * np.log((information_feat['c'] * information_feat['N']) / (information_feat['x_bar_star'] * information_feat['star_y']))
    + information_feat['d'] * np.log((information_feat['d'] * information_feat['N']) / (information_feat['x_bar_star'] * information_feat['star_y_bar']))
)

# Pointwise mutual information (base 2). local_mi deliberately uses the
# unclipped value, so it is computed before the clipping step.
information_feat['ppmi'] = np.log2((information_feat['a'] * information_feat['N']) / (information_feat['x_star'] * information_feat['star_y']))
information_feat['local_mi'] = information_feat['a'] * information_feat['ppmi']

# Positive PMI: floor at zero. A chained `.ppmi.loc[...] = 0` assignment is a
# no-op under pandas copy-on-write, so assign the clipped column explicitly.
information_feat['ppmi'] = information_feat['ppmi'].clip(lower=0)

# Keep only the three derived measures.
information_feat = information_feat.drop(
    ['a', 'x_star', 'star_y', 'b', 'c', 'd', 'N', 'x_bar_star', 'star_y_bar'],
    axis=1,
)
information_feat

Unnamed: 0_level_0,Unnamed: 1_level_0,log_ratio,ppmi,local_mi
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a_n,bell_n,5378.361,8.823,4579.386
alarm_n,bell_n,42169.085,9.950,34475.197
alice_n,bell_n,2123.495,9.189,1791.797
anne_n,bell_n,970.274,6.480,894.268
baby_n,bell_n,850.718,4.977,850.987
bedroom_n,bell_n,906.043,6.722,826.820
brass_n,bell_n,2799.953,7.924,2448.612
canterbury_n,bell_n,1323.286,8.761,1130.172
castle_n,bell_n,1444.815,7.009,1303.627
chapel_n,bell_n,3560.192,9.252,2997.583


In [63]:
# Summary statistics (count, mean, std, quartiles, extremes) of each association-measure column.
information_feat.describe()

Unnamed: 0,log_ratio,ppmi,local_mi
count,78143.0,78143.0,78143.0
mean,13873.818,6.909,10926.867
std,198083.945,4.014,126064.993
min,0.0,0.0,-22997.543
25%,956.022,3.922,944.18
50%,2311.664,6.72,2001.628
75%,5751.565,9.638,4867.354
max,42809567.691,19.968,25489354.556


In [64]:
# PPMI of every (modifier, context) pair.
# NOTE: this rebinds `new_modifiers`, which until this cell held the object
# loaded from new_modifiers.p; cells that filter with that object must not be
# re-run after this one without reloading it.
new_modifiers = modifiers.groupby(['modifier', 'context'])['count'].sum().to_frame()
new_modifiers.columns = ['a']

# Marginal totals per modifier and per context.
mod_star = modifiers.groupby(['modifier'])['count'].sum().to_frame()
mod_star.columns = ['mod_star']
context_star = modifiers.groupby(['context'])['count'].sum().to_frame()
context_star.columns = ['context_star']

merge1 = pd.merge(new_modifiers.reset_index(), mod_star.reset_index(), on=['modifier'])
new_modifiers = pd.merge(merge1, context_star.reset_index(), on=['context'])
new_modifiers['N'] = new_modifiers['a'].sum()

# The context marginal is raised to 0.75 before entering the ratio.
# NOTE(review): N is left unsmoothed; confirm this asymmetry is intended.
new_modifiers['modifier_ppmi'] = np.log2(
    (new_modifiers['a'] * new_modifiers['N'])
    / (new_modifiers['mod_star'] * new_modifiers['context_star'] ** 0.75)
)
new_modifiers = new_modifiers.set_index(['modifier'])
new_modifiers = new_modifiers.drop(['a', 'mod_star', 'context_star', 'N'], axis=1)

# Floor at zero. A chained `.modifier_ppmi.loc[...] = 0` assignment is a no-op
# under pandas copy-on-write, so assign the clipped column explicitly.
new_modifiers['modifier_ppmi'] = new_modifiers['modifier_ppmi'].clip(lower=0)
new_modifiers

Unnamed: 0_level_0,context,modifier_ppmi
modifier,Unnamed: 1_level_1,Unnamed: 2_level_1
a_n,almanac_n,12.782
adam_n,almanac_n,10.748
almanac_n,almanac_n,12.156
county_n,almanac_n,9.637
data_n,almanac_n,7.221
employment_n,almanac_n,7.459
job_n,almanac_n,7.538
letter_n,almanac_n,9.402
pennsylvania_n,almanac_n,11.757
publishing_n,almanac_n,12.487


In [65]:
# PPMI of every (head, context) pair.
# NOTE: this rebinds `new_heads`, which until this cell held the object loaded
# from new_heads.p; cells that filter with that object must not be re-run
# after this one without reloading it.
new_heads = heads.groupby(['head', 'context'])['count'].sum().to_frame()
new_heads.columns = ['a']

# Marginal totals per head and per context.
head_star = heads.groupby(['head'])['count'].sum().to_frame()
head_star.columns = ['head_star']
context_star = heads.groupby(['context'])['count'].sum().to_frame()
context_star.columns = ['context_star']

merge1 = pd.merge(new_heads.reset_index(), head_star.reset_index(), on=['head'])
new_heads = pd.merge(merge1, context_star.reset_index(), on=['context'])
new_heads['N'] = new_heads['a'].sum()

# The context marginal is raised to 0.75 before entering the ratio.
# NOTE(review): N is left unsmoothed; confirm this asymmetry is intended.
new_heads['head_ppmi'] = np.log2(
    (new_heads['a'] * new_heads['N'])
    / (new_heads['head_star'] * new_heads['context_star'] ** 0.75)
)
new_heads = new_heads.set_index(['head'])
new_heads = new_heads.drop(['a', 'head_star', 'context_star', 'N'], axis=1)

# Floor at zero. A chained `.head_ppmi.loc[...] = 0` assignment is a no-op
# under pandas copy-on-write, so assign the clipped column explicitly.
new_heads['head_ppmi'] = new_heads['head_ppmi'].clip(lower=0)
new_heads

Unnamed: 0_level_0,context,head_ppmi
head,Unnamed: 1_level_1,Unnamed: 2_level_1
a_n,abridge_v,9.208
certificate_n,abridge_v,10.567
congress_n,abridge_v,8.340
freedom_n,abridge_v,13.575
interest_n,abridge_v,7.570
matter_n,abridge_v,9.490
right_n,abridge_v,8.134
state_n,abridge_v,3.836
a_n,abstract_n,11.966
b_n,abstract_n,8.631


In [66]:
# PPMI of every (modifier, head, context) triple, treating the compound as one unit.
new_compounds = compounds.groupby(['modifier', 'head', 'context'])['count'].sum().to_frame()
new_compounds.columns = ['a']

# Marginal totals per compound and per context.
compound_star = compounds.groupby(['modifier', 'head'])['count'].sum().to_frame()
compound_star.columns = ['compound_star']
context_star = compounds.groupby(['context'])['count'].sum().to_frame()
context_star.columns = ['context_star']

new_compounds = pd.merge(new_compounds.reset_index(), compound_star.reset_index(), on=['modifier', 'head'])
new_compounds = pd.merge(new_compounds, context_star.reset_index(), on=['context'])
new_compounds['N'] = new_compounds['a'].sum()

# The context marginal is raised to 0.75 before entering the ratio.
# NOTE(review): N is left unsmoothed; confirm this asymmetry is intended.
new_compounds['compound_ppmi'] = np.log2(
    (new_compounds['a'] * new_compounds['N'])
    / (new_compounds['compound_star'] * new_compounds['context_star'] ** 0.75)
)
new_compounds = new_compounds.set_index(['modifier', 'head'])
new_compounds = new_compounds.drop(['a', 'compound_star', 'context_star', 'N'], axis=1)

# Floor at zero. A chained `.compound_ppmi.loc[...] = 0` assignment is a no-op
# under pandas copy-on-write, so assign the clipped column explicitly.
new_compounds['compound_ppmi'] = new_compounds['compound_ppmi'].clip(lower=0)
new_compounds

Unnamed: 0_level_0,Unnamed: 1_level_0,context,compound_ppmi
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1
a_n,bell_n,international_a,13.227
academic_n,advisory_n,international_a,13.227
accounting_n,standard_n,international_a,8.200
acquisition_n,workshop_n,international_a,13.227
aid_n,agency_n,international_a,8.578
air_n,navigation_n,international_a,12.764
air_n,service_n,international_a,11.441
air_n,transport_n,international_a,10.742
arbitral_n,institution_n,international_a,13.227
arbitration_n,agreement_n,international_a,8.043


In [68]:
# L2 norm of each modifier's PPMI vector (used as a cosine-similarity denominator).
modifier_denom = (
    new_modifiers.groupby('modifier')['modifier_ppmi']
    .agg(lambda ppmi: np.sqrt(np.square(ppmi).sum()))
    .to_frame(name='modifier_denom')
)
modifier_denom

Unnamed: 0_level_0,modifier_denom
modifier,Unnamed: 1_level_1
a_n,77.354
aa_n,27.615
aaa_n,25.453
aaron_n,62.945
ab_n,50.903
aba_n,49.411
ababa_n,28.419
abatement_n,33.624
abbe_n,24.109
abbey_n,33.385


In [69]:
# L2 norm of each head's PPMI vector (used as a cosine-similarity denominator).
head_denom = (
    new_heads.groupby('head')['head_ppmi']
    .agg(lambda ppmi: np.sqrt(np.square(ppmi).sum()))
    .to_frame(name='head_denom')
)
head_denom

Unnamed: 0_level_0,head_denom
head,Unnamed: 1_level_1
a_n,119.832
aa_n,32.724
aaron_n,39.766
ab_n,67.987
ababa_n,38.792
abandonment_n,18.500
abatement_n,27.165
abba_n,25.134
abbas_n,33.879
abbess_n,27.137


In [70]:
# L2 norm of each compound's PPMI vector (used as a cosine-similarity denominator).
compound_denom = (
    new_compounds.groupby(['modifier', 'head'])['compound_ppmi']
    .agg(lambda ppmi: np.sqrt(np.square(ppmi).sum()))
    .to_frame(name='compound_denom')
)
compound_denom

Unnamed: 0_level_0,Unnamed: 1_level_0,compound_denom
modifier,head,Unnamed: 2_level_1
a_n,bell_n,27.114
a_n,c_n,21.745
a_n,cassell_n,33.788
a_n,chorus_n,29.180
a_n,colonial_n,21.125
a_n,man_n,28.819
a_n,publisher_n,26.288
a_n,sand_n,28.849
a_n,series_n,24.723
a_n,star_n,20.248


In [71]:
# Cosine similarity between each compound's PPMI vector and its modifier's PPMI vector.
compound_modifier_sim = (
    new_compounds.reset_index()
    # Pair up the contexts shared by the compound and its modifier.
    .merge(new_modifiers.reset_index(), on=["modifier", "context"])
    .assign(numerator=lambda t: t['compound_ppmi'] * t['modifier_ppmi'])
    # Dot product per compound.
    .groupby(['modifier', 'head'])['numerator']
    .sum()
    .reset_index()
    # Attach both vector norms.
    .merge(compound_denom.reset_index(), on=["modifier", "head"])
    .merge(modifier_denom.reset_index(), on=['modifier'])
    .assign(
        sim_with_modifier=lambda t: t['numerator'] / (t['compound_denom'] * t['modifier_denom'])
    )
    .set_index(['modifier', 'head'])
    # modifier_denom is intentionally left in place; only these two helpers go.
    .drop(['numerator', 'compound_denom'], axis=1)
)
compound_modifier_sim

Unnamed: 0_level_0,Unnamed: 1_level_0,modifier_denom,sim_with_modifier
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1
a_n,bell_n,77.354,0.265
a_n,c_n,77.354,0.173
a_n,cassell_n,77.354,0.291
a_n,chorus_n,77.354,0.274
a_n,colonial_n,77.354,0.152
a_n,man_n,77.354,0.245
a_n,publisher_n,77.354,0.249
a_n,sand_n,77.354,0.274
a_n,series_n,77.354,0.271
a_n,star_n,77.354,0.180


In [73]:
# Cosine similarity between each compound's PPMI vector and its head's PPMI vector.
compound_head_sim = (
    new_compounds.reset_index()
    # Pair up the contexts shared by the compound and its head.
    .merge(new_heads.reset_index(), on=["head", "context"])
    .assign(numerator=lambda t: t['compound_ppmi'] * t['head_ppmi'])
    # Dot product per compound.
    .groupby(['modifier', 'head'])['numerator']
    .sum()
    .reset_index()
    # Attach both vector norms.
    .merge(compound_denom.reset_index(), on=["modifier", "head"])
    .merge(head_denom.reset_index(), on=['head'])
    .assign(
        sim_with_head=lambda t: t['numerator'] / (t['compound_denom'] * t['head_denom'])
    )
    .set_index(['modifier', 'head'])
    # head_denom is intentionally left in place; only these two helpers go.
    .drop(['numerator', 'compound_denom'], axis=1)
)
compound_head_sim

Unnamed: 0_level_0,Unnamed: 1_level_0,head_denom,sim_with_head
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1
a_n,bell_n,116.903,0.106
alarm_n,bell_n,116.903,0.237
alice_n,bell_n,116.903,0.112
anne_n,bell_n,116.903,0.141
baby_n,bell_n,116.903,0.052
bedroom_n,bell_n,116.903,0.140
brass_n,bell_n,116.903,0.166
canterbury_n,bell_n,116.903,0.101
castle_n,bell_n,116.903,0.124
chapel_n,bell_n,116.903,0.149


In [76]:
# Cosine-style similarity between the modifier's and the head's PPMI vectors.
# NOTE(review): because the join goes through new_compounds, the dot product
# only covers contexts that also occur with the compound itself, while the
# norms cover all contexts of each constituent; confirm this is intended.
constituent_sim = (
    new_heads.reset_index()
    .merge(new_compounds.reset_index(), on=["head", "context"])
    .merge(new_modifiers.reset_index(), on=["modifier", "context"])
    .assign(numerator=lambda t: t['head_ppmi'] * t['modifier_ppmi'])
    # Dot product per compound.
    .groupby(['modifier', 'head'])['numerator']
    .sum()
    .reset_index()
    # Attach both vector norms.
    .merge(head_denom.reset_index(), on=["head"])
    .merge(modifier_denom.reset_index(), on=["modifier"])
    .assign(
        sim_bw_constituents=lambda t: t['numerator'] / (t['head_denom'] * t['modifier_denom'])
    )
    .set_index(['modifier', 'head'])
    .drop(['numerator', 'modifier_denom', 'head_denom'], axis=1)
)
constituent_sim

Unnamed: 0_level_0,Unnamed: 1_level_0,sim_bw_constituents
modifier,head,Unnamed: 2_level_1
a_n,bell_n,0.028
a_n,c_n,0.021
a_n,cassell_n,0.266
a_n,chorus_n,0.141
a_n,colonial_n,0.125
a_n,man_n,0.017
a_n,publisher_n,0.034
a_n,sand_n,0.058
a_n,series_n,0.024
a_n,star_n,0.018


In [85]:
# Join all per-compound feature tables on (modifier, head); inner joins keep
# only the compounds present in every table.
dfs = [
    frame.reset_index()
    for frame in (constituent_sim, compound_head_sim, compound_modifier_sim, information_feat)
]
compounds_final = reduce(
    lambda left, right: pd.merge(left, right, on=['modifier', 'head']),
    dfs,
)

# The vector norms were only needed to compute the similarities.
compounds_final = (
    compounds_final
    .drop(['head_denom', 'modifier_denom'], axis=1)
    .set_index(['modifier', 'head'])
    .fillna(0)
)

# Min-max scale every feature column: subtract the column minimum, then divide
# by the maximum of the shifted column.
compounds_final = compounds_final - compounds_final.min()
compounds_final = compounds_final / compounds_final.max()
compounds_final

Unnamed: 0_level_0,Unnamed: 1_level_0,sim_bw_constituents,sim_with_head,sim_with_modifier,log_ratio,ppmi,local_mi
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a_n,bell_n,0.028,0.106,0.263,0.000,0.442,0.001
a_n,c_n,0.021,0.124,0.170,0.000,0.284,0.001
a_n,cassell_n,0.266,0.911,0.289,0.000,0.731,0.001
a_n,chorus_n,0.141,0.513,0.272,0.000,0.642,0.001
a_n,colonial_n,0.125,0.820,0.150,0.000,0.731,0.001
a_n,man_n,0.017,0.070,0.243,0.000,0.262,0.001
a_n,publisher_n,0.034,0.135,0.247,0.000,0.406,0.001
a_n,sand_n,0.058,0.209,0.272,0.000,0.424,0.001
a_n,series_n,0.024,0.089,0.269,0.000,0.399,0.001
a_n,star_n,0.018,0.101,0.178,0.000,0.414,0.001


In [86]:
# Human ratings from the spreadsheet: the first four columns, of which `Stim`
# holds the compound written as "modifier-head".
graves13_study = pd.read_excel("13428_2012_256_MOESM1_ESM.xlsx", usecols=[0, 1, 2, 3])

# Split on the first hyphen only. `expand=True` returns one column per part;
# tuple-unpacking the `.str` accessor and passing `n` positionally are both
# rejected by pandas 2.x.
stim_parts = graves13_study['Stim'].str.split('-', n=1, expand=True)
graves13_study['modifier'] = stim_parts[0].map(lemma_maker)
graves13_study['head'] = stim_parts[1].map(lemma_maker)

# Apply the per-column value replacements before the suffix is attached, so
# the replacement keys are matched against bare lemmas.
graves13_study = graves13_study.replace(replacements)

# Append "_n" so the keys line up with the corpus tables.
graves13_study['modifier'] = graves13_study['modifier'] + "_n"
graves13_study['head'] = graves13_study['head'] + "_n"

graves13_study = graves13_study.drop(["Stim"], axis=1).set_index(['modifier', 'head'])
graves13_study

Unnamed: 0_level_0,Unnamed: 1_level_0,Class,Mean_rate,SD
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bus_n,bridge_n,Positive,1.345,1.396
bus_n,window_n,Positive,3.828,0.539
bus_n,home_n,Positive,1.828,1.605
bike_n,pant_n,Positive,2.897,1.472
bike_n,barn_n,Positive,1.429,1.425
bike_n,leg_n,Positive,1.138,1.356
bike_n,home_n,Positive,1.483,1.595
bike_n,headlight_n,Positive,3.448,0.985
bike_n,seat_n,Positive,3.793,0.774
police_n,dog_n,Positive,4.000,0.000


In [87]:
# Attach the corpus-derived features to the rated compounds of graves13_study;
# the inner join keeps only compounds present in both tables.
compounds_final_graves = (
    graves13_study.reset_index()
    .merge(compounds_final.reset_index(), on=['modifier', 'head'])
    .set_index(['modifier', 'head'])
)
compounds_final_graves

Unnamed: 0_level_0,Unnamed: 1_level_0,Class,Mean_rate,SD,sim_bw_constituents,sim_with_head,sim_with_modifier,log_ratio,ppmi,local_mi
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
police_n,car_n,Positive,4.000,0.000,0.047,0.363,0.125,0.000,0.315,0.001
police_n,tape_n,Positive,3.655,0.974,0.009,0.213,0.036,0.000,0.181,0.001
cab_n,home_n,Positive,1.724,1.533,0.015,0.100,0.158,0.000,0.380,0.001
beach_n,sand_n,Positive,3.250,1.206,0.041,0.147,0.277,0.000,0.486,0.001
television_n,picture_n,Positive,3.690,0.712,0.031,0.175,0.171,0.000,0.314,0.001
field_n,kitchen_n,Positive,0.793,1.177,0.016,0.255,0.061,0.000,0.324,0.001
field_n,hospital_n,Positive,2.138,1.663,0.012,0.168,0.067,0.000,0.263,0.001
field_n,mouse_n,Positive,3.621,1.147,0.026,0.379,0.065,0.000,0.291,0.001
field_n,army_n,Positive,2.690,1.285,0.005,0.128,0.027,0.000,0.051,0.001
dog_n,book_n,Positive,2.517,1.661,0.005,0.033,0.167,0.000,0.102,0.001


In [89]:
# Human ratings: a two-column tab-separated file. The first column is the
# compound, the second a space-separated run of rating statistics.
reddy11_study = pd.read_csv("ijcnlp_compositionality_data/MeanAndDeviations.clean.txt", sep="\t")
reddy11_study.columns = ['compund', 'to_divide']

# Split the statistics field and keep its first six parts. `expand=True`
# returns one column per part; tuple-unpacking the `.str` accessor and passing
# `n` positionally are both rejected by pandas 2.x.
# NOTE(review): the six rating columns stay as strings, as before; convert with
# pd.to_numeric if numeric operations are needed downstream.
rating_parts = reddy11_study['to_divide'].str.split(" ", n=7, expand=True)
rating_columns = ['modifier_mean', 'modifier_std', 'head_mean', 'head_std',
                  'compound_mean', 'compound_std']
for position, column in enumerate(rating_columns):
    reddy11_study[column] = rating_parts[position]

# Split the compound into its two constituents and strip the two-character
# suffix from each.
constituent_parts = reddy11_study['compund'].str.split(" ", n=2, expand=True)
reddy11_study['modifier'] = constituent_parts[0].str[:-2]
reddy11_study['head'] = constituent_parts[1].str[:-2]
reddy11_study = reddy11_study.drop(['compund', 'to_divide'], axis=1)

# Lemmatise, then apply the per-column value replacements before the suffix is
# attached, so the replacement keys are matched against bare lemmas.
reddy11_study['modifier'] = reddy11_study['modifier'].map(lemma_maker)
reddy11_study['head'] = reddy11_study['head'].map(lemma_maker)
reddy11_study = reddy11_study.replace(replacements)

# Append "_n" so the keys line up with the corpus tables.
reddy11_study['modifier'] = reddy11_study['modifier'] + "_n"
reddy11_study['head'] = reddy11_study['head'] + "_n"
reddy11_study

Unnamed: 0,modifier_mean,modifier_std,head_mean,head_std,compound_mean,compound_std,modifier,head
0,3.866667,1.117537,4.866667,0.339935,4.250000,0.871165,end_n,user_n
1,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,firing_n,line_n
2,2.821429,1.964935,4.862069,0.344828,3.827586,1.233693,game_n,plan_n
3,4.766667,0.422953,4.862069,0.344828,4.800000,0.476095,application_n,form_n
4,0.600000,0.800000,4.586207,1.099129,1.310345,1.020596,snail_n,mail_n
5,2.678571,1.691440,3.933333,1.181336,3.785714,1.205853,web_n,site_n
6,0.379310,0.805746,4.714286,0.839096,1.517241,1.133219,flea_n,market_n
7,0.428571,0.775913,5.000000,0.000000,2.642857,1.315139,grandfather_n,clock_n
8,1.066667,1.289272,3.407407,1.340516,1.703704,1.047566,diamond_n,wedding_n
9,4.714286,0.589015,0.862069,0.936764,2.185185,1.155888,guilt_n,trip_n


In [92]:
# Attach the corpus-derived features to the rated compounds of reddy11_study;
# the inner join keeps only compounds present in both tables.
compounds_final_reddy = (
    reddy11_study
    .merge(compounds_final.reset_index(), on=['modifier', 'head'])
    .set_index(['modifier', 'head'])
)
compounds_final_reddy

Unnamed: 0_level_0,Unnamed: 1_level_0,modifier_mean,modifier_std,head_mean,head_std,compound_mean,compound_std,sim_bw_constituents,sim_with_head,sim_with_modifier,log_ratio,ppmi,local_mi
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
end_n,user_n,3.866667,1.117537,4.866667,0.339935,4.25,0.871165,0.045,0.292,0.146,0.001,0.375,0.003
firing_n,line_n,1.607143,1.654848,1.892857,1.496169,1.703704,1.717337,0.032,0.068,0.429,0.0,0.353,0.001
application_n,form_n,4.766667,0.422953,4.862069,0.344828,4.8,0.476095,0.055,0.265,0.2,0.005,0.467,0.007
web_n,site_n,2.678571,1.69144,3.933333,1.181336,3.785714,1.205853,0.365,0.576,0.632,0.018,0.484,0.023
silver_n,spoon_n,1.592593,1.472193,1.444444,1.77082,1.518519,1.449658,0.048,0.7,0.067,0.0,0.548,0.001
rush_n,hour_n,3.107143,1.371633,2.862069,1.357588,3.333333,1.273665,0.138,0.169,0.76,0.0,0.583,0.001
mailing_n,list_n,4.666667,0.537484,4.933333,0.249444,4.666667,0.471405,0.099,0.213,0.465,0.0,0.497,0.001
spelling_n,bee_n,4.814815,0.77158,0.517241,1.037925,2.448276,1.248066,0.071,0.286,0.241,0.0,0.621,0.001
public_n,service_n,4.666667,0.649786,4.766667,0.61554,4.4,0.757188,0.025,0.101,0.228,0.0,0.099,0.001
video_n,game_n,4.5,0.718795,5.0,0.0,4.6,0.61101,0.159,0.322,0.494,0.001,0.536,0.002


In [93]:
# Write the Graves feature table as tab-separated text (the file keeps its
# .csv name). Passing the path lets pandas open the file itself, which avoids
# doubled line endings on Windows and the locale-dependent encoding of a
# plain open(..., 'w') handle.
compounds_final_graves.to_csv('compounds_final_graves.csv', header=True, sep="\t")

In [94]:
# Write the Reddy feature table as tab-separated text (the file keeps its
# .csv name). Passing the path lets pandas open the file itself, which avoids
# doubled line endings on Windows and the locale-dependent encoding of a
# plain open(..., 'w') handle.
compounds_final_reddy.to_csv('compounds_final_reddy.csv', header=True, sep="\t")