In [57]:
import pandas as pd
import numba
import numpy as np
import dask
import pickle as pkl
import dask.dataframe as dd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.mode.chained_assignment = None
import time
from scipy.stats.stats import pearsonr
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
from functools import reduce

In [55]:
@numba.jit(nopython=True)
def year_binner(year,val=10):
    return year - year%val

def lemma_maker(x, y):
    #print(lemmatizer(x,y)[0])
    return lemmatizer(x,y)[0]

In [53]:
br_to_us=pd.read_excel("Book.xlsx",skiprows=[0])
br_to_us_dict=dict(zip(br_to_us.UK.tolist(),br_to_us.US.tolist()))
spelling_replacement={'modifier':br_to_us_dict,'head':br_to_us_dict}

In [5]:
modifier_list=pkl.load( open("modifier_list_reduced.pkl",'rb'))
head_list=pkl.load( open("head_list_reduced.pkl",'rb'))

In [8]:
compounds=pd.read_pickle("/data/dharp/compounding/datasets/phrases.pkl")
compounds.reset_index(inplace=True)
#compounds = dd.from_pandas(compounds, npartitions=100)
compounds.year=compounds.year.astype("int32")
compounds=compounds.query('1800 <= year <= 2010').copy()
compounds['time']=year_binner(compounds['year'].values,10)
#compounds = dd.from_pandas(compounds, npartitions=100)
compounds=compounds.groupby(['modifier','head','context','time'])['count'].sum().to_frame()
compounds.reset_index(inplace=True)
compounds=compounds.loc[compounds.groupby(['modifier','head','time'])['count'].transform('sum').gt(50)]
compounds=compounds.loc[compounds['modifier'].isin(modifier_list) & compounds['head'].isin(head_list)]
compounds.info()
compounds.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47638659 entries, 610554 to 62228900
Data columns (total 5 columns):
modifier    object
head        object
context     object
time        int64
count       float64
dtypes: float64(1), int64(1), object(3)
memory usage: 2.1+ GB


Unnamed: 0,modifier,head,context,time,count
610554,a_noun,a_noun,a_noun,1920,156.0
610555,a_noun,a_noun,a_noun,1950,132.0
610556,a_noun,a_noun,a_noun,1960,324.0
610557,a_noun,a_noun,a_noun,1970,1056.0
610558,a_noun,a_noun,a_noun,1980,3456.0


In [9]:
constituents=pd.read_pickle("/data/dharp/compounding/datasets/words.pkl")
constituents.reset_index(inplace=True)
#constituents = dd.from_pandas(constituents, npartitions=30)
constituents.year=constituents.year.astype("int32")
constituents=constituents.query('1800 <= year <= 2010').copy()
constituents['time']=year_binner(constituents['year'].values,10)
#constituents = dd.from_pandas(constituents, npartitions=30)
constituents=constituents.groupby(['word','context','time'])['count'].sum().to_frame()
constituents.reset_index(inplace=True)
constituents.info()
constituents.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212720528 entries, 0 to 212720527
Data columns (total 4 columns):
word       object
context    object
time       int64
count      int64
dtypes: int64(2), object(2)
memory usage: 6.3+ GB


Unnamed: 0,word,context,time,count
0,a_noun,'_adv,1800,1
1,a_noun,'_adv,1820,10
2,a_noun,'_adv,1830,2
3,a_noun,'_adv,1840,1
4,a_noun,'_adv,1850,2


In [10]:
modifiers=constituents.loc[constituents.word.isin(modifier_list)]
modifiers.columns=['modifier','context','time','count']
modifiers.info()
modifiers.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200033385 entries, 0 to 212720527
Data columns (total 4 columns):
modifier    object
context     object
time        int64
count       int64
dtypes: int64(2), object(2)
memory usage: 7.5+ GB


Unnamed: 0,modifier,context,time,count
0,a_noun,'_adv,1800,1
1,a_noun,'_adv,1820,10
2,a_noun,'_adv,1830,2
3,a_noun,'_adv,1840,1
4,a_noun,'_adv,1850,2


In [11]:
heads=constituents.loc[constituents.word.isin(head_list)]
heads.columns=['head','context','time','count']
heads.info()
heads.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201032577 entries, 0 to 212720497
Data columns (total 4 columns):
head       object
context    object
time       int64
count      int64
dtypes: int64(2), object(2)
memory usage: 7.5+ GB


Unnamed: 0,head,context,time,count
0,a_noun,'_adv,1800,1
1,a_noun,'_adv,1820,10
2,a_noun,'_adv,1830,2
3,a_noun,'_adv,1840,1
4,a_noun,'_adv,1850,2


In [12]:
compound_decade_counts=compounds.groupby(['time'])['count'].sum().to_frame()
compound_decade_counts.columns=['N']
compound_decade_counts

Unnamed: 0_level_0,N
time,Unnamed: 1_level_1
1800,1708629.0
1810,3105745.0
1820,5007102.0
1830,7574135.0
1840,10188965.0
1850,16953355.0
1860,14841765.0
1870,18665161.0
1880,28630886.0
1890,37967677.0


In [13]:
compounds = dd.from_pandas(compounds, npartitions=30)
XY=compounds.groupby(['modifier','head','time'])['count'].sum().to_frame()
XY=XY.compute()
XY.columns=['a']

X_star=compounds.groupby(['modifier','time'])['count'].sum().to_frame()
X_star=X_star.compute()
X_star.columns=['x_star']

Y_star=compounds.groupby(['head','time'])['count'].sum().to_frame()
Y_star=Y_star.compute()
Y_star.columns=['star_y']


merge1=pd.merge(XY.reset_index(),X_star.reset_index(),on=['modifier','time'])
information_feat=pd.merge(merge1,Y_star.reset_index(),on=['head','time'])

information_feat=dd.from_pandas(information_feat, npartitions=30)
information_feat['b']=information_feat['x_star']-information_feat['a']
information_feat['c']=information_feat['star_y']-information_feat['a']


information_feat=information_feat.compute()
information_feat=pd.merge(information_feat,compound_decade_counts.reset_index(),on=['time'])
information_feat=dd.from_pandas(information_feat, npartitions=30)
information_feat['d']=information_feat['N']-(information_feat['a']+information_feat['b']+information_feat['c'])
information_feat['x_bar_star']=information_feat['N']-information_feat['x_star']
information_feat['star_y_bar']=information_feat['N']-information_feat['star_y']
#information_feat['LR']=-2*np.sum(information_feat['a']*np.log2((information_feat['a']*information_feat['N'])/(information_feat['x_star']*information_feat['star_y'])))

information_feat=information_feat.compute()
information_feat.set_index(['modifier','head','time'],inplace=True)

information_feat.replace(0,0.001,inplace=True)
information_feat['log_ratio']=2*(information_feat['a']*np.log((information_feat['a']*information_feat['N'])/(information_feat['x_star']*information_feat['star_y']))+\
information_feat['b']*np.log((information_feat['b']*information_feat['N'])/(information_feat['x_star']*information_feat['star_y_bar']))+\
information_feat['c']*np.log((information_feat['c']*information_feat['N'])/(information_feat['x_bar_star']*information_feat['star_y']))+\
information_feat['d']*np.log((information_feat['d']*information_feat['N'])/(information_feat['x_bar_star']*information_feat['star_y_bar'])))
information_feat['ppmi']=np.log2((information_feat['a']*information_feat['N'])/(information_feat['x_star']*information_feat['star_y']))
information_feat['local_mi']=information_feat['a']*information_feat['ppmi']
information_feat.ppmi.loc[information_feat.ppmi<=0]=0
information_feat.drop(['a','x_star','star_y','b','c','d','N','d','x_bar_star','star_y_bar'],axis=1,inplace=True)
information_feat.info()
information_feat.head()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3046187 entries, (a_noun, a_noun, 1920) to (zona_noun, glomerulosa_noun, 1940)
Data columns (total 3 columns):
log_ratio    float64
ppmi         float64
local_mi     float64
dtypes: float64(3)
memory usage: 96.9+ MB


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,log_ratio,ppmi,local_mi
modifier,head,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a_noun,a_noun,1920,590.682,4.079,636.312
aa_noun,a_noun,1920,2388.512,9.036,1969.888
aaa_noun,a_noun,1920,1594.993,9.779,1261.53
angle_noun,a_noun,1920,461.158,4.997,459.701
appendix_noun,a_noun,1920,14267.31,7.923,12281.175


In [14]:
modifiers=dd.from_pandas(modifiers, npartitions=30)
modifier_decade_counts=modifiers.groupby(['time'])['count'].sum().to_frame().compute()
modifier_decade_counts.columns=['N']
modifier_decade_counts

Unnamed: 0_level_0,N
time,Unnamed: 1_level_1
1800,74187121
1810,113969872
1820,188870342
1830,234680785
1840,297231032
1850,444818054
1860,373676721
1870,453730442
1880,629087313
1890,773254258


In [15]:
heads=dd.from_pandas(heads, npartitions=30)
head_decade_counts=heads.groupby(['time'])['count'].sum().to_frame().compute()
head_decade_counts.columns=['N']
head_decade_counts

Unnamed: 0_level_0,N
time,Unnamed: 1_level_1
1800,73825707
1810,113435614
1820,188074895
1830,233456015
1840,295479151
1850,442176687
1860,371179385
1870,450335687
1880,624345308
1890,767191702


In [17]:
modifier_denom=modifiers.groupby(['modifier','time'])['count'].apply(lambda x: np.sqrt(np.sum(np.square(x)))).to_frame().compute()
modifier_denom.columns=['modifier_denom']
modifier_denom

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,modifier_denom
modifier,time,Unnamed: 2_level_1
aa_noun,1990,34712.588
aalborg_noun,2000,82.183
aana_noun,1960,4.123
aana_noun,1970,1.414
aap_noun,1960,5.831
aar_noun,1860,11.225
aar_noun,1900,169.266
aardrijkskundig_noun,1950,59.841
aarp_noun,1990,640.073
aartsbisdom_noun,1950,1.414


In [23]:
head_denom=heads.groupby(['head',"time"])['count'].apply(lambda x: np.sqrt(np.sum(np.square(x)))).to_frame().compute()
head_denom.columns=['head_denom']
head_denom

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,head_denom
head,time,Unnamed: 2_level_1
aa_noun,1990,34712.588
aall_noun,2000,315.370
aandsliv_noun,1970,15.000
aap_noun,1960,5.831
aar_noun,1860,11.225
aar_noun,1900,169.266
aaraaf_noun,1960,33.853
aardrijkskundig_noun,1950,59.841
aba_noun,1870,3.162
ababa_noun,1960,8942.075


In [25]:
compounds = dd.from_pandas(compounds, npartitions=30)

In [29]:
compound_denom=compounds.groupby(['modifier','head',"time"])['count'].apply(lambda x: np.sqrt(np.sum(np.square(x)))).to_frame().compute()
compound_denom.columns=['compound_denom']
compound_denom

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,compound_denom
modifier,head,time,Unnamed: 3_level_1
a_noun,american_noun,1990,136.843
a_noun,ant_noun,1980,116.000
a_noun,asset_noun,2000,71.554
a_noun,bank_noun,2000,49.193
a_noun,boswell_noun,1930,144.187
a_noun,cat_noun,1990,234.009
a_noun,chaste_noun,1950,38.288
a_noun,chronicle_noun,1920,156.074
a_noun,circu_noun,1990,134.283
a_noun,corner_noun,2000,44.553


In [34]:
mod_cols=modifiers.columns.tolist()
mod_cols[-1]="mod_count"
modifiers.columns=mod_cols
#compounds.drop(['comp_count'],axis=1,inplace=True)
comp_cols=compounds.columns.tolist()
comp_cols[-1]="comp_count"
compounds.columns=comp_cols
compound_modifier_sim=pd.merge(compounds.compute(),modifiers.compute(),on=["modifier","context",'time'])
compound_modifier_sim['numerator']=compound_modifier_sim['comp_count']*compound_modifier_sim['mod_count']
compound_modifier_sim=compound_modifier_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()
compound_modifier_sim=pd.merge(compound_modifier_sim.reset_index(),compound_denom.reset_index(),on=["modifier","head",'time'])
compound_modifier_sim=pd.merge(compound_modifier_sim,modifier_denom.reset_index(),on=['modifier','time'])
compound_modifier_sim['sim_with_modifier']=compound_modifier_sim['numerator']/(compound_modifier_sim['compound_denom']*compound_modifier_sim['modifier_denom'])
compound_modifier_sim.set_index(['modifier','head','time'],inplace=True)
compound_modifier_sim.drop(['numerator','compound_denom'],axis=1,inplace=True)
compound_modifier_sim.info()
compound_modifier_sim.head()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2962296 entries, (a_noun, a_noun, 1920) to (zzzz_noun, best_noun, 2000)
Data columns (total 2 columns):
modifier_denom       float64
sim_with_modifier    float64
dtypes: float64(2)
memory usage: 71.6+ MB


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,modifier_denom,sim_with_modifier
modifier,head,time,Unnamed: 3_level_1,Unnamed: 4_level_1
a_noun,a_noun,1920,1213036.27,0.004
a_noun,b_noun,1920,1213036.27,0.061
a_noun,bill_noun,1920,1213036.27,0.007
a_noun,book_noun,1920,1213036.27,0.044
a_noun,boswell_noun,1920,1213036.27,0.005


In [39]:
head_cols=heads.columns.tolist()
head_cols[-1]="head_count"
heads.columns=head_cols

compound_head_sim=pd.merge(compounds.compute(),heads.compute(),on=["head","context",'time'])
compound_head_sim['numerator']=compound_head_sim['comp_count']*compound_head_sim['head_count']
compound_head_sim=compound_head_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()
compound_head_sim=pd.merge(compound_head_sim.reset_index(),compound_denom.reset_index(),on=["modifier","head",'time'])
compound_head_sim=pd.merge(compound_head_sim,head_denom.reset_index(),on=['head','time'])
compound_head_sim['sim_with_head']=compound_head_sim['numerator']/(compound_head_sim['compound_denom']*compound_head_sim['head_denom'])
compound_head_sim.set_index(['modifier','head','time'],inplace=True)
compound_head_sim.drop(['numerator','compound_denom'],axis=1,inplace=True)
compound_head_sim.info()
compound_head_sim.head()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2978161 entries, (a_noun, a_noun, 1920) to (zyl_noun, slabbert_noun, 2000)
Data columns (total 2 columns):
head_denom       float64
sim_with_head    float64
dtypes: float64(2)
memory usage: 72.0+ MB


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,head_denom,sim_with_head
modifier,head,time,Unnamed: 3_level_1,Unnamed: 4_level_1
a_noun,a_noun,1920,1213036.27,0.004
aa_noun,a_noun,1920,1213036.27,0.002
aaa_noun,a_noun,1920,1213036.27,0.004
angle_noun,a_noun,1920,1213036.27,0.072
appendix_noun,a_noun,1920,1213036.27,0.564


In [41]:
constituent_sim=pd.merge(heads.compute(),compounds.compute(),on=["head","context","time"])
#constituent_sim.drop('comp_count',axis=1,inplace=True)
constituent_sim=pd.merge(constituent_sim,modifiers.compute(),on=["modifier","context","time"])
constituent_sim['numerator']=constituent_sim['head_count']*constituent_sim['mod_count']
constituent_sim=constituent_sim.groupby(['modifier','head','time'])['numerator'].sum().to_frame()
constituent_sim=pd.merge(constituent_sim.reset_index(),head_denom.reset_index(),on=["head","time"])
constituent_sim=pd.merge(constituent_sim,modifier_denom.reset_index(),on=["modifier","time"])
constituent_sim['sim_bw_constituents']=constituent_sim['numerator']/(constituent_sim['head_denom']*constituent_sim['modifier_denom'])
constituent_sim.set_index(['modifier','head','time'],inplace=True)
constituent_sim.drop(['numerator','modifier_denom','head_denom'],axis=1,inplace=True)
constituent_sim

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sim_bw_constituents
modifier,head,time,Unnamed: 3_level_1
a_noun,a_noun,1920,0.000
a_noun,b_noun,1920,0.453
a_noun,bill_noun,1920,0.000
a_noun,book_noun,1920,0.001
a_noun,boswell_noun,1920,0.000
a_noun,c_noun,1920,0.487
a_noun,century_noun,1920,0.000
a_noun,certain_noun,1920,0.004
a_noun,chronicle_noun,1920,0.002
a_noun,company_noun,1920,0.001


In [45]:
dfs = [constituent_sim.reset_index(), compound_head_sim.reset_index(), compound_modifier_sim.reset_index(), information_feat.reset_index()]
compounds_final = reduce(lambda left,right: pd.merge(left,right,on=['modifier','head','time']), dfs)
compounds_final.drop(['head_denom','modifier_denom'],axis=1,inplace=True)
compounds_final=pd.pivot_table(compounds_final, index=['modifier','head'], columns=['time'])
compounds_final.fillna(0,inplace=True)
compounds_final -= compounds_final.min()
compounds_final /= compounds_final.max()
compounds_final_1=compounds_final.columns.get_level_values(0)
compounds_final_2=compounds_final.columns.get_level_values(1)

cur_year=0
new_columns=[]
for year in compounds_final_2:
    new_columns.append(str(year)+"_"+compounds_final_1[cur_year])
    cur_year+=1
compounds_final.columns=new_columns
compounds_final

Unnamed: 0_level_0,Unnamed: 1_level_0,1800_local_mi,1810_local_mi,1820_local_mi,1830_local_mi,1840_local_mi,1850_local_mi,1860_local_mi,1870_local_mi,1880_local_mi,1890_local_mi,...,1910_sim_with_modifier,1920_sim_with_modifier,1930_sim_with_modifier,1940_sim_with_modifier,1950_sim_with_modifier,1960_sim_with_modifier,1970_sim_with_modifier,1980_sim_with_modifier,1990_sim_with_modifier,2000_sim_with_modifier
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
a_noun,a_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.004,0.000,0.000,0.007,0.008,0.009,0.010,0.007,0.007
a_noun,aa_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.001,0.001,0.001,0.001,0.001,0.001
a_noun,aaa_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.004,0.005,0.005,0.004,0.003
a_noun,american_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.009,0.009,0.000
a_noun,anti_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.027
a_noun,appendix_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.000,0.017,0.017,0.014,0.013
a_noun,archive_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.001,0.001,0.000,0.000,0.000
a_noun,area_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.000,0.028,0.028,0.027,0.024
a_noun,asiapac_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.006
a_noun,asset_noun,0.002,0.004,0.003,0.002,0.003,0.003,0.002,0.003,0.002,0.001,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.016,0.016


In [46]:
compounds_final.to_pickle("/data/dharp/compounding/datasets/Dist_Features_Temporal_non_Contextual.pkl")

# Temporal Analysis

In [110]:
def cosine(row1,row2):
    if row1.name[:-1]!=row2.name[:-1]:
        return np.nan
    else:
        denom1=np.sqrt(np.sum(np.square(row1)))
        denom2=np.sqrt(np.sum(np.square(row2)))
        num=np.sum(row1*row2)
        return num/(denom1*denom2)

In [116]:
compounds_reduced=pd.read_pickle('/data/dharp/compounding/datasets/compounds_CompoundAgnostic_DecadeCentric_300.pkl')

In [117]:
compounds_reduced.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,common,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
modifier,head,decade,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
----_n,x_n,1920,----_n x_n,0.003,0.002,0.0,-0.007,0.008,-0.003,-0.003,0.001,-0.01,...,0.004,-0.001,-0.002,0.002,-0.0,0.001,-0.001,0.003,-0.004,-0.002
----_n,x_n,1930,----_n x_n,0.003,0.002,0.0,-0.007,0.008,-0.003,-0.003,0.001,-0.01,...,0.004,-0.001,-0.002,0.002,-0.0,0.001,-0.001,0.003,-0.004,-0.002
----_n,x_n,1940,----_n x_n,0.003,0.002,0.0,-0.007,0.008,-0.003,-0.003,0.001,-0.01,...,0.004,-0.001,-0.002,0.002,-0.0,0.001,-0.001,0.003,-0.004,-0.002
----_n,x_n,1950,----_n x_n,0.003,0.002,0.0,-0.007,0.008,-0.003,-0.003,0.001,-0.01,...,0.004,-0.001,-0.002,0.002,-0.0,0.001,-0.001,0.003,-0.004,-0.002
----_n,x_n,1960,----_n x_n,0.003,0.002,0.0,-0.007,0.008,-0.003,-0.003,0.001,-0.01,...,0.004,-0.001,-0.002,0.002,-0.0,0.001,-0.001,0.003,-0.004,-0.002


In [63]:
reddy11_study=pd.read_csv("/data/dharp/compounding/datasets/ijcnlp_compositionality_data/MeanAndDeviations.clean.txt",sep="\t")
#print(reddy11_study.columns)
reddy11_study.columns=['compound','to_divide']
reddy11_study['modifier_mean'],reddy11_study['modifier_std'],reddy11_study['head_mean'],reddy11_study['head_std'],reddy11_study['compound_mean'],reddy11_study['compound_std'],_=reddy11_study.to_divide.str.split(" ",7).str
reddy11_study['modifier'],reddy11_study['head']=reddy11_study['compound'].str.split(" ",2).str
reddy11_study.modifier=reddy11_study.modifier.str[:-2]
reddy11_study['head']=reddy11_study['head'].str[:-2]
reddy11_study.drop(['compound','to_divide'],axis=1,inplace=True)
reddy11_study['modifier']=np.vectorize(lemma_maker)(reddy11_study['modifier'],'noun')
reddy11_study['head']=np.vectorize(lemma_maker)(reddy11_study['head'],'noun')
reddy11_study.replace(spelling_replacement,inplace=True)
reddy11_study['modifier']=reddy11_study['modifier']+"_n"
reddy11_study['head']=reddy11_study['head']+"_n"
reddy11_study=reddy11_study.apply(pd.to_numeric, errors='ignore')
#reddy11_study.set_index(['modifier','head'],inplace=True)
reddy11_study.info()
reddy11_study.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 8 columns):
modifier_mean    90 non-null float64
modifier_std     90 non-null float64
head_mean        90 non-null float64
head_std         90 non-null float64
compound_mean    90 non-null float64
compound_std     90 non-null float64
modifier         90 non-null object
head             90 non-null object
dtypes: float64(6), object(2)
memory usage: 5.7+ KB


Unnamed: 0,modifier_mean,modifier_std,head_mean,head_std,compound_mean,compound_std,modifier,head
0,3.867,1.118,4.867,0.34,4.25,0.871,end_n,user_n
1,1.607,1.655,1.893,1.496,1.704,1.717,firing_n,line_n
2,2.821,1.965,4.862,0.345,3.828,1.234,game_n,plan_n
3,4.767,0.423,4.862,0.345,4.8,0.476,application_n,form_n
4,0.6,0.8,4.586,1.099,1.31,1.021,snail_n,mail_n


In [118]:
merge_df=reddy11_study[['modifier','head']].merge(compounds_reduced.drop(['common'],axis=1).reset_index(),on=['modifier','head'],how='inner')
merge_df.set_index(["modifier", "head",'decade'], inplace = True)
merge_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
modifier,head,decade,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
end_n,user_n,1880,0.01,0.003,0.0,-0.012,0.006,0.007,-0.007,0.004,-0.025,-0.003,...,0.098,0.134,0.133,-0.086,0.043,-0.049,0.061,-0.004,0.023,-0.064
end_n,user_n,1930,0.015,0.012,0.003,-0.039,0.013,-0.003,-0.008,0.004,-0.027,0.0,...,-0.257,-0.024,-0.078,-0.094,0.047,0.019,-0.045,-0.003,0.049,-0.069
end_n,user_n,1940,0.467,-0.142,-0.018,0.188,-0.103,0.103,-0.101,-0.078,0.234,-0.135,...,-0.0,0.03,0.002,-0.005,-0.011,-0.005,0.017,0.023,-0.035,0.025
end_n,user_n,1950,0.855,-0.265,-0.056,0.186,-0.068,-0.091,0.089,0.036,-0.051,-0.003,...,-0.024,-0.004,0.006,-0.011,0.011,-0.007,-0.005,0.001,-0.007,-0.016
end_n,user_n,1960,0.889,-0.257,-0.062,0.116,0.002,-0.131,0.08,0.035,-0.088,-0.005,...,0.017,-0.016,-0.027,0.012,-0.009,-0.006,-0.007,-0.007,-0.004,-0.003


In [121]:
merge_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
modifier,head,decade,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
end_n,user_n,1880,0.010,0.003,0.000,-0.012,0.006,0.007,-0.007,0.004,-0.025,-0.003,...,0.098,0.134,0.133,-0.086,0.043,-0.049,0.061,-0.004,0.023,-0.064
end_n,user_n,1930,0.015,0.012,0.003,-0.039,0.013,-0.003,-0.008,0.004,-0.027,0.000,...,-0.257,-0.024,-0.078,-0.094,0.047,0.019,-0.045,-0.003,0.049,-0.069
end_n,user_n,1940,0.467,-0.142,-0.018,0.188,-0.103,0.103,-0.101,-0.078,0.234,-0.135,...,-0.000,0.030,0.002,-0.005,-0.011,-0.005,0.017,0.023,-0.035,0.025
end_n,user_n,1950,0.855,-0.265,-0.056,0.186,-0.068,-0.091,0.089,0.036,-0.051,-0.003,...,-0.024,-0.004,0.006,-0.011,0.011,-0.007,-0.005,0.001,-0.007,-0.016
end_n,user_n,1960,0.889,-0.257,-0.062,0.116,0.002,-0.131,0.080,0.035,-0.088,-0.005,...,0.017,-0.016,-0.027,0.012,-0.009,-0.006,-0.007,-0.007,-0.004,-0.003
end_n,user_n,1970,0.878,-0.259,-0.059,0.133,-0.043,-0.115,0.102,0.047,-0.101,-0.002,...,0.002,0.003,-0.004,0.012,-0.004,-0.013,-0.009,-0.005,-0.004,-0.004
end_n,user_n,1980,0.867,-0.256,-0.057,0.142,-0.041,-0.098,0.074,0.033,-0.075,-0.035,...,-0.010,-0.001,-0.004,0.011,-0.008,-0.016,-0.011,0.002,0.004,0.003
end_n,user_n,1990,0.848,-0.245,-0.053,0.141,-0.053,-0.071,0.052,0.019,-0.044,-0.049,...,-0.012,-0.005,-0.006,0.020,-0.009,-0.019,-0.012,0.003,-0.003,0.009
firing_n,line_n,1890,0.683,-0.239,-0.051,0.187,-0.041,-0.019,0.064,0.054,-0.126,0.017,...,-0.002,0.024,0.009,-0.041,0.078,0.012,-0.046,-0.037,0.006,0.043
firing_n,line_n,1900,0.634,-0.232,-0.050,0.196,-0.020,0.051,0.016,0.048,-0.138,0.023,...,0.010,0.016,0.013,-0.034,0.074,0.017,-0.049,-0.031,0.003,0.039


In [119]:
cosine_compound_agnostic=[np.nan]
for i in range(1,merge_df.shape[0]):
    cosine_compound_agnostic.append(cosine(merge_df.iloc[i-1],merge_df.iloc[i]))

In [115]:
len(cosine_compound_aware)

711

In [122]:
merge_df['compound_cosine']=cosine_compound_agnostic

In [131]:
compound_df=pd.pivot_table(merge_df.reset_index(), values = 'compound_cosine', index=['modifier','head'], columns = 'decade')
compound_df

Unnamed: 0_level_0,decade,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960,1970,1980,1990
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
acid_n,test_n,,,,,1.000,,1.000,1.000,1.000,0.998,0.765,0.645,0.960,0.959,1.000,1.000,0.691,0.998,0.997
application_n,form_n,,,,,0.047,,0.070,0.837,0.957,0.873,0.945,0.990,0.986,0.985,0.996,0.993,0.995,0.994,0.969
balance_n,sheet_n,0.012,0.362,0.693,0.664,0.908,0.916,0.977,0.947,0.950,0.969,0.971,0.990,0.998,0.997,0.997,0.990,0.998,0.990,0.997
bank_n,account_n,,0.008,0.000,0.884,0.857,0.766,0.734,0.796,0.957,0.979,0.987,0.992,0.991,0.989,0.995,0.990,0.992,0.987,0.993
blame_n,game_n,,,,,,,,,,,,,,,,,,0.995,0.990
brick_n,wall_n,0.978,0.803,0.857,0.987,0.992,0.980,0.992,0.984,0.978,0.990,0.990,0.993,0.994,0.996,0.991,0.996,0.995,0.996,0.995
call_n,center_n,,,,,,,,,,,,,,,,,,,0.730
car_n,park_n,,,,,,,,,,-0.044,,,-0.046,0.745,0.928,0.950,0.984,0.986,0.994
case_n,study_n,,-0.019,-0.037,,,0.060,-0.024,0.015,-0.009,0.619,0.577,0.844,0.981,0.983,0.992,0.993,0.996,0.997,0.998
cash_n,cow_n,,,,,,,,,,,,,,,,,0.111,0.868,0.962


In [125]:
compounds_reduced_aware=pd.read_pickle('/data/dharp/compounding/datasets/compounds_CompoundCentric_DecadeCentric_300.pkl')

In [126]:
merge_df_aware=reddy11_study[['modifier','head']].merge(compounds_reduced_aware.drop(['common'],axis=1).reset_index(),on=['modifier','head'],how='inner')
merge_df_aware.set_index(["modifier", "head",'decade'], inplace = True)
merge_df_aware.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
modifier,head,decade,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
end_n,user_n,1940,0.66,-0.061,-0.147,-0.018,-0.011,-0.17,0.028,0.002,-0.065,0.108,...,-0.007,0.046,0.062,-0.023,-0.03,-0.016,0.013,0.032,0.005,-0.02
end_n,user_n,1950,0.883,-0.071,-0.124,-0.038,-0.039,0.206,-0.017,-0.04,-0.04,-0.205,...,0.022,0.025,0.037,-0.004,-0.001,0.001,-0.005,0.008,0.001,0.008
end_n,user_n,1960,0.877,-0.068,-0.111,-0.038,-0.042,0.266,-0.013,-0.043,-0.033,-0.238,...,-0.0,-0.005,0.03,0.002,-0.011,-0.006,0.011,-0.01,0.001,0.005
end_n,user_n,1970,0.872,-0.066,-0.112,-0.039,-0.041,0.274,-0.016,-0.046,-0.02,-0.251,...,0.002,0.005,0.035,0.002,-0.015,0.002,0.008,-0.007,0.004,0.008
end_n,user_n,1980,0.868,-0.067,-0.117,-0.037,-0.04,0.241,-0.012,-0.043,-0.017,-0.232,...,0.008,-0.003,0.039,0.007,-0.01,0.004,0.01,0.001,-0.003,0.008


In [130]:
merge_df_aware['compound_cosine']=cosine_compound_aware
compound_aware_df=pd.pivot_table(merge_df_aware.reset_index(), values = 'compound_cosine', index=['modifier','head'], columns = 'decade')
compound_aware_df

Unnamed: 0_level_0,decade,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960,1970,1980,1990
modifier,head,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
acid_n,test_n,,,,,,,,,1.000,1.000,0.949,0.317,,1.000,1.000,1.000,0.899,0.987,0.994
application_n,form_n,,,,,,,,,0.981,0.898,0.956,0.995,0.994,0.990,0.999,0.996,0.997,0.998,0.991
balance_n,sheet_n,,,0.644,0.706,0.945,0.946,0.980,0.951,0.963,0.978,0.977,0.990,0.998,0.999,0.999,0.991,0.998,0.993,0.999
bank_n,account_n,,,0.008,0.856,0.886,0.754,0.810,0.754,0.984,0.989,0.987,0.992,0.993,0.989,0.996,0.993,0.994,0.991,0.995
blame_n,game_n,,,,,,,,,,,,,,,,,,0.612,0.748
car_n,park_n,,,,,,,,,,,,,,0.841,0.943,0.967,0.989,0.990,0.996
case_n,study_n,,,,,,,,,0.958,0.695,0.739,0.905,0.984,0.984,0.992,0.993,0.997,0.997,0.998
cash_n,cow_n,,,,,,,,,,,,,,,,,,,0.978
chain_n,reaction_n,,,,,,,,,,,0.469,0.483,0.989,0.959,0.995,0.998,0.997,0.997,0.972
climate_n,change_n,,,,,,,1.000,,0.160,,,0.015,-0.005,0.644,0.762,0.900,0.963,0.937,0.914
