In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA,TruncatedSVD,NMF
from sklearn.preprocessing import Normalizer
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.mode.chained_assignment = None
import numba
from scipy.sparse import csr_matrix

In [2]:
@numba.jit(nopython=True)
def year_binner(year,val=10):
    return year - year%val

In [3]:
def dim_reduction(df,rows):
    df_svd = TruncatedSVD(n_components=300, n_iter=10, random_state=1991)
    print(f'Explained variance ratio {(df_svd.fit(df).explained_variance_ratio_.sum()):2.3f}')
    #df_list=df_svd.fit(df).explained_variance_ratio_
    df_reduced = df_svd.fit_transform(df)
    df_reduced = Normalizer(copy=False).fit_transform(df_reduced)
    df_reduced=pd.DataFrame(df_reduced,index=rows)
    #df_reduced.reset_index(inplace=True)
    if temporal!=0:
        df_reduced.index = pd.MultiIndex.from_tuples(df_reduced.index, names=['common', 'time'])
    return df_reduced

In [4]:
temporal=100
cutoff=100

In [5]:
        compounds=pd.read_pickle("../../datasets/phrases.pkl")
        compounds.reset_index(inplace=True)
        compounds=compounds.head(1000)
        compounds.year=compounds.year.astype("int32")
        compounds=compounds.query('1800 <= year <= 2010').copy()
        compounds['common']=compounds['modifier']+" "+compounds['head']


        if temporal==0:
            print('No temporal information is stored')
            compounds=compounds.groupby(['common','context'])['count'].sum().to_frame()
            compounds.reset_index(inplace=True)
            compounds=compounds.loc[compounds.groupby(['common'])['count'].transform('sum').gt(cutoff)]
            compounds=compounds.groupby(['common','context'])['count'].sum()
        else:
            compounds['time']=year_binner(compounds['year'].values,temporal)
            compounds=compounds.groupby(['common','context','time'])['count'].sum().to_frame()
            compounds=compounds.loc[compounds.groupby(['common','time'])['count'].transform('sum').gt(cutoff)]
            compounds=compounds.groupby(['common','time','context'])['count'].sum()



In [6]:
        constituents=pd.read_pickle("/data/dharp/compounding/datasets/words.pkl")
        constituents.reset_index(inplace=True)
        constituents=constituents.head(1000)

        constituents.year=constituents.year.astype("int32")
        constituents=constituents.query('1800 <= year <= 2010').copy()
        constituents.columns=['common','context','year','count']
        if temporal==0:
            print('No temporal information is stored')
            constituents=constituents.groupby(['common','context'])['count'].sum().to_frame()
            constituents.reset_index(inplace=True)
            constituents=constituents.loc[constituents.groupby(['common'])['count'].transform('sum').gt(cutoff)]
            constituents=constituents.groupby(['common','context'])['count'].sum()           
        else:
            constituents['time']=year_binner(constituents['year'].values,temporal)
            constituents=constituents.groupby(['common','context','time'])['count'].sum().to_frame()
            constituents.reset_index(inplace=True)
            constituents=constituents.loc[constituents.groupby(['common','time'])['count'].transform('sum').gt(cutoff)]
            constituents=constituents.groupby(['common','time','context'])['count'].sum()

        print('Concatenating all the datasets together')

Concatenating all the datasets together


In [7]:
df=pd.concat([constituents,compounds], sort=True)
dtype = pd.SparseDtype(np.float, fill_value=0)
df=df.astype(dtype)

In [12]:


    #df=df.to_sparse()
    if temporal!=0:    
        df, rows, _ = df.sparse.to_coo(row_levels=['common','time'],column_levels=['context'],sort_labels=False)

    else:
        df, rows, _ = df.spare.to_coo(row_levels=['common'],column_levels=['context'],sort_labels=False)

    print('Running SVD') 

Running SVD


In [13]:
  
    df_reduced=dim_reduction(df,rows)

    print('Splitting back into individual datasets are saving them')
    if temporal!=0:
        df_reduced.index.names = ['common','time']
    else:
        df_reduced.index.names = ['common']

ValueError: n_components must be < n_features; got 300 >= 23

In [31]:
    compounds_reduced=df_reduced.loc[df_reduced.index.get_level_values(0).str.contains(r'\w \w')]
    compounds_reduced.reset_index(inplace=True)
    #print(compounds_reduced.head())
    compounds_reduced['modifier'],compounds_reduced['head']=compounds_reduced['common'].str.split(' ', 1).str

In [36]:
    constituents_reduced=df_reduced.loc[~df_reduced.index.get_level_values(0).str.contains(r'\w \w')]
    constituents_reduced.reset_index(inplace=True)
    constituents_reduced['constituent']=constituents_reduced['common']
    constituents_reduced.drop(['common'],axis=1,inplace=True)
    
    if temporal!=0:
        compounds_reduced.set_index(['modifier','head','time'],inplace=True)
        constituents_reduced.set_index(['constituent','time'],inplace=True)
    else:
        compounds_reduced.set_index(['modifier','head'],inplace=True)
        constituents_reduced.set_index(['constituent'],inplace=True)

In [39]:
compounds_reduced

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,common,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
modifier,head,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
'_noun,a'isha_noun,1900,'_noun a'isha_noun,0.083,0.009,-0.079,-0.013,-0.024,0.015,-0.084,-0.042,0.021,...,0.048,0.044,-0.012,0.058,0.212,-0.006,-0.025,-0.019,-0.016,-0.007
'_noun,a'isha_noun,2000,'_noun a'isha_noun,0.091,0.008,-0.087,-0.018,-0.034,0.014,-0.081,-0.033,0.020,...,0.042,0.037,-0.024,0.059,0.221,-0.003,-0.016,-0.020,-0.015,-0.001
'_noun,a_noun,1800,'_noun a_noun,0.091,-0.029,0.015,0.015,-0.015,0.017,-0.012,-0.021,-0.017,...,0.015,0.025,0.031,-0.044,0.040,-0.014,-0.047,-0.040,-0.014,-0.013
'_noun,a_noun,1900,'_noun a_noun,0.206,-0.062,-0.013,-0.005,-0.043,-0.014,0.060,0.007,-0.016,...,0.079,-0.074,0.035,0.014,-0.048,0.054,0.061,0.010,-0.124,-0.100
'_noun,a_noun,2000,'_noun a_noun,0.295,-0.128,0.121,-0.019,-0.059,-0.013,0.034,-0.006,0.019,...,0.062,-0.060,0.019,0.013,-0.036,0.060,0.059,0.001,-0.080,-0.097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zyl_noun,slabbert_noun,1900,zyl_noun slabbert_noun,0.010,0.000,-0.005,0.011,-0.012,0.015,0.009,0.000,0.001,...,0.068,0.031,0.020,0.001,0.056,0.045,0.009,0.029,0.006,-0.019
zyl_noun,slabbert_noun,2000,zyl_noun slabbert_noun,0.010,0.000,-0.005,0.011,-0.012,0.015,0.009,0.001,0.002,...,0.064,0.026,0.019,0.005,0.056,0.042,0.010,0.028,0.004,-0.019
zymogen_noun,granule_noun,1900,zymogen_noun granule_noun,0.652,-0.348,0.513,-0.038,0.055,-0.028,-0.105,-0.098,-0.206,...,0.005,-0.007,-0.005,0.013,-0.004,0.004,0.011,0.013,0.003,-0.046
zysman_noun,american_noun,1900,zysman_noun american_noun,0.025,0.002,-0.022,0.005,-0.007,0.010,-0.007,-0.001,-0.007,...,-0.053,-0.016,-0.153,-0.030,0.150,-0.111,0.074,0.026,0.086,-0.015


In [6]:
        heads_reduced=df_reduced.loc[df_reduced.index.get_level_values(0).str.endswith(r'_h')]
        heads_reduced.reset_index(inplace=True)
        heads_reduced['head']=heads_reduced['common'].str.replace(r'_h$', r'_noun', regex=True)
        heads_reduced.drop(['common'],axis=1,inplace=True)

        modifiers_reduced=df_reduced.loc[df_reduced.index.get_level_values(0).str.endswith(r'_m')]
        modifiers_reduced.reset_index(inplace=True)   
        modifiers_reduced['modifier']=modifiers_reduced['common'].str.replace(r'_m$', r'_noun', regex=True)
        modifiers_reduced.drop(['common'],axis=1,inplace=True)

        if args.temporal!=0:
            compounds_reduced.set_index(['modifier','head','time'],inplace=True)
            heads_reduced.set_index(['head','time'],inplace=True)
            modifiers_reduced.set_index(['modifier','time'],inplace=True)
        else:
            compounds_reduced.set_index(['modifier','head'],inplace=True)
            heads_reduced.set_index(['head'],inplace=True)
            modifiers_reduced.set_index(['modifier'],inplace=True)

common               time  context      
'_noun '_noun        1840  allegro_noun      2.000
                           author_noun       8.000
                           be_verb           1.000
                           board_noun        2.000
                           case_noun         1.000
                                             ...  
zz_noun top_noun     2000  gibbon_noun      47.000
zzzz_noun best_noun  1980  re_adj           47.000
                           security_noun    47.000
                     2000  re_adj          100.000
                           security_noun   100.000
Name: count, Length: 32993706, dtype: float64

In [4]:

        
        
        
        modifiers['common']=modifiers['common'].str.replace(r'_noun$', r'_m', regex=True)


        modifiers.reset_index(inplace=True)
        if args.temporal==0:
            modifiers=modifiers.loc[modifiers.groupby(['common'])['count'].transform('sum').gt(args.cutoff)]
            modifiers=modifiers.groupby(['common','context'])['count'].sum()
        else:
            modifiers=modifiers.loc[modifiers.groupby(['common','time'])['count'].transform('sum').gt(args.cutoff)]
            modifiers=modifiers.groupby(['common','time','context'])['count'].sum()

In [7]:
constituents['decade'].value_counts()

2008    16068828
2007    15020386
2004    14933984
2006    14533186
2005    13388851
          ...   
1594          91
1614          79
1610          63
1597          42
1602          25
Name: decade, Length: 425, dtype: int64

In [7]:
constituents.reset_index(inplace=True)
constituents.columns=['joiner','context','decade','count']
display(constituents.shape)
#constituents=constituents.groupby(['joiner','decade','context'])['count'].sum()
display(constituents.shape)
constituents.head()

(1085627595, 4)

(1085627595,)

joiner  decade  context    
a_noun  1505    basi_noun      1
                be_verb        1
                bottom_noun    1
                come_verb      1
                copy_noun      1
Name: count, dtype: int64

In [3]:
compounds=pd.read_csv("/data/dharp/compounding/datasets/phrases.csv",nrows=1_000_000)
compounds.columns=['modifier','head','context','decade','count']
compounds=compounds.query('decade != 2000')
compounds['joiner']=compounds['modifier']+" "+compounds['head']
compounds=compounds.groupby(['joiner','decade','context'])['count'].sum()
display(compounds.shape)
compounds.head()

(850316,)

joiner    decade  context
a_n aa_n  1820    a_n        1
                  aa_n       1
          1870    a_n        1
                  aa_n       1
          1920    a_n        1
Name: count, dtype: int64

In [4]:
df=pd.concat([constituents,compounds])

In [5]:
df=df.to_sparse()

In [6]:
df, rows, columns = df.to_coo(row_levels=['joiner','decade'],column_levels=['context'],sort_labels=False)

In [7]:
len(columns)

39573

In [8]:
df_svd = TruncatedSVD(n_components=300, n_iter=10, random_state=1991)
display(df_svd.fit(df).explained_variance_ratio_.sum())
df_list=df_svd.fit(df).explained_variance_ratio_
df_reduced = df_svd.fit_transform(df)
df_reduced = Normalizer(copy=False).fit_transform(df_reduced)
df_reduced=pd.DataFrame(df_reduced,index=rows)
#df_reduced.reset_index(inplace=True)
df_reduced.shape

0.9999994049311242

(337167, 300)

In [9]:
df_reduced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
"(a_n, 1800)",0.957,-0.225,-0.104,-0.002,0.12,-0.033,0.037,-0.001,0.02,-0.006,...,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0
"(a_n, 1810)",0.96,-0.225,-0.1,-0.003,0.111,-0.033,0.033,-0.001,0.02,-0.005,...,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0
"(a_n, 1820)",0.96,-0.228,-0.103,-0.005,0.106,-0.029,0.028,-0.0,0.013,-0.004,...,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0
"(a_n, 1830)",0.965,-0.221,-0.089,-0.009,0.099,-0.035,0.028,-0.0,0.017,-0.001,...,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0
"(a_n, 1840)",0.967,-0.219,-0.084,-0.007,0.087,-0.031,0.022,-0.0,0.012,0.0,...,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0


In [10]:
df_reduced.index = pd.MultiIndex.from_tuples(df_reduced.index, names=['joiner', 'decade'])

In [12]:
co_occ_reduced.reset_index(inplace=True)

In [13]:
    compounds_reduced=df_reduced.loc[~df_reduced.index.get_level_values(0).str.contains(r'\w \w')]
    #compounds_reduced.reset_index(inplace=True)
    compounds_reduced

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
joiner,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
a_n,1800,0.957,-0.225,-0.104,-0.002,0.120,-0.033,0.037,-0.001,0.020,-0.006,...,0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,-0.000
a_n,1810,0.960,-0.225,-0.100,-0.003,0.111,-0.033,0.033,-0.001,0.020,-0.005,...,-0.000,-0.000,0.000,-0.000,0.000,0.000,0.000,0.000,-0.000,0.000
a_n,1820,0.960,-0.228,-0.103,-0.005,0.106,-0.029,0.028,-0.000,0.013,-0.004,...,-0.000,-0.000,0.000,-0.000,0.000,0.000,0.000,-0.000,0.000,0.000
a_n,1830,0.965,-0.221,-0.089,-0.009,0.099,-0.035,0.028,-0.000,0.017,-0.001,...,0.000,0.000,0.000,0.000,-0.000,-0.000,0.000,-0.000,-0.000,-0.000
a_n,1840,0.967,-0.219,-0.084,-0.007,0.087,-0.031,0.022,-0.000,0.012,0.000,...,0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,0.000
a_n,1850,0.969,-0.218,-0.084,-0.007,0.074,-0.023,0.016,0.000,0.004,0.001,...,-0.000,0.000,-0.000,-0.000,0.000,-0.000,-0.000,0.000,0.000,0.000
a_n,1860,0.972,-0.214,-0.082,-0.007,0.052,-0.009,0.007,0.000,-0.005,0.002,...,-0.000,-0.000,-0.000,0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000
a_n,1870,0.975,-0.205,-0.071,-0.009,0.036,-0.001,-0.002,-0.000,-0.008,0.001,...,-0.000,-0.000,0.000,0.000,0.000,0.000,0.000,-0.000,-0.000,-0.000
a_n,1880,0.977,-0.199,-0.065,-0.007,0.023,0.006,-0.006,-0.000,-0.007,0.001,...,0.000,-0.000,-0.000,-0.000,-0.000,-0.000,-0.000,0.000,0.000,0.000
a_n,1890,0.980,-0.191,-0.060,-0.007,0.007,0.011,-0.010,-0.000,-0.007,-0.000,...,0.000,-0.000,0.000,0.000,0.000,-0.000,0.000,0.000,0.000,-0.000


In [16]:
modifiers=co_occ_reduced.loc[co_occ_reduced.constituent.str.endswith(r'_m')]
modifiers['constituent']=modifiers['constituent'].str.replace(r'_m$', r'_n', regex=True)
modifiers.to_csv('/data/dharp/compounding/datasets/modifiers_context_decadal_svd.csv',index=False,header=False,sep='\t')