In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA,TruncatedSVD,NMF
from sklearn.preprocessing import Normalizer
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
# Notebook-wide presentation settings.
plt.style.use('ggplot')
# Render floats with three decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.3f}'.format)
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Load the word-level context counts (at most the first 1,000,000 data rows)
# and give the four columns their working names.
constituents = pd.read_csv(
    "/data/dharp/compounding/datasets/words.csv",
    nrows=1_000_000,
)
constituents.columns = ['joiner', 'context', 'decade', 'count']

# Drop the rows for decade 2000, then total the counts per
# (joiner, decade, context); the result is a Series with a 3-level index.
constituents = (
    constituents
    .query('decade != 2000')
    .groupby(['joiner', 'decade', 'context'])['count']
    .sum()
)
display(constituents.shape)
constituents.head()

(924870,)

joiner  decade  context
a_n     1800    a_a         15
                a_n        128
                aa_n        20
                aaa_n       16
                aaron_n      6
Name: count, dtype: int64

In [3]:
# Load the phrase-level context counts (at most the first 1,000,000 data rows)
# and give the five columns their working names.
compounds = pd.read_csv(
    "/data/dharp/compounding/datasets/phrases.csv",
    nrows=1_000_000,
)
compounds.columns = ['modifier', 'head', 'context', 'decade', 'count']

# Build the result in a single chain so the new 'joiner' column is created with
# .assign on a fresh frame instead of being written onto the output of .query()
# (a chained assignment that pandas would otherwise warn about).
compounds = (
    compounds
    .query('decade != 2000')
    # joiner = "<modifier> <head>", separated by a single space
    .assign(joiner=lambda frame: frame['modifier'] + " " + frame['head'])
    .groupby(['joiner', 'decade', 'context'])['count']
    .sum()
)
display(compounds.shape)
compounds.head()

(850316,)

joiner    decade  context
a_n aa_n  1820    a_n        1
                  aa_n       1
          1870    a_n        1
                  aa_n       1
          1920    a_n        1
Name: count, dtype: int64

In [4]:
# Stack the word-level and phrase-level count Series end to end; both are
# indexed by (joiner, decade, context).
df = pd.concat([constituents, compounds], axis=0)

In [5]:
# Sparse-encode the counts and pivot them into a SciPy COO matrix whose rows are
# (joiner, decade) pairs and whose columns are contexts.
#
# Series.to_sparse() and SparseSeries.to_coo() were removed in pandas 1.0; a
# sparse dtype plus the `.sparse` accessor (pandas >= 0.24) are the supported
# replacements. fill_value=0 mirrors the default to_sparse() used for integer data.
df = df.astype(pd.SparseDtype(df.dtype, fill_value=0))

# NOTE: `df` is rebound from a Series to the COO matrix here, so these statements
# cannot be re-run without first re-running the pd.concat cell that builds `df`.
df, rows, columns = df.sparse.to_coo(
    row_levels=['joiner', 'decade'],
    column_levels=['context'],
    sort_labels=False,
)

In [7]:
# Number of column labels returned by to_coo (one per 'context' value).
len(columns)

39573

In [8]:
# Reduce the sparse (joiner, decade) x context matrix to 300 components.
# Fit exactly once: every call to fit()/fit_transform() recomputes the whole
# decomposition, and with a fixed random_state repeated fits would return the
# same result anyway, so the fitted attributes are simply reused below.
df_svd = TruncatedSVD(n_components=300, n_iter=10, random_state=1991)
df_reduced = df_svd.fit_transform(df)

# Per-component share of explained variance, and its total across all components.
df_list = df_svd.explained_variance_ratio_
display(df_list.sum())

# Rescale every row vector to unit length (Normalizer's default norm), in place.
df_reduced = Normalizer(copy=False).fit_transform(df_reduced)

# Re-attach the row labels produced by to_coo so each vector is identifiable.
df_reduced = pd.DataFrame(df_reduced, index=rows)
df_reduced.shape

0.9999994049311242

(337167, 300)

In [9]:
# Preview the first rows of the reduced, row-normalised vectors.
df_reduced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
"(a_n, 1800)",0.957,-0.225,-0.104,-0.002,0.12,-0.033,0.037,-0.001,0.02,-0.006,...,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0
"(a_n, 1810)",0.96,-0.225,-0.1,-0.003,0.111,-0.033,0.033,-0.001,0.02,-0.005,...,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0
"(a_n, 1820)",0.96,-0.228,-0.103,-0.005,0.106,-0.029,0.028,-0.0,0.013,-0.004,...,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0
"(a_n, 1830)",0.965,-0.221,-0.089,-0.009,0.099,-0.035,0.028,-0.0,0.017,-0.001,...,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0
"(a_n, 1840)",0.967,-0.219,-0.084,-0.007,0.087,-0.031,0.022,-0.0,0.012,0.0,...,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0


In [10]:
# Convert the index of (joiner, decade) tuples into a named two-level MultiIndex
# so the levels can be addressed individually.
df_reduced.index = pd.MultiIndex.from_tuples(df_reduced.index, names=['joiner', 'decade'])

In [12]:
# Move the index of co_occ_reduced back into ordinary columns, modifying it in place.
# NOTE(review): co_occ_reduced is not defined in any cell visible here, and the
# execution counter jumps from 10 to 12 -- presumably it was created by a cell that
# has since been removed (or in another session). Confirm its origin; as written
# this cell looks like it would fail on Restart & Run All.
co_occ_reduced.reset_index(inplace=True)

In [13]:
    # Select the rows whose 'joiner' label (index level 0) does NOT match the
    # pattern r'\w \w', i.e. labels without a "<word char> <word char>" sequence.
    # NOTE(review): because of the `~`, space-separated (two-word) joiners are
    # excluded, so despite its name this frame holds the single-token rows --
    # confirm whether the negation or the variable name is the intended one.
    compounds_reduced=df_reduced.loc[~df_reduced.index.get_level_values(0).str.contains(r'\w \w')]
    #compounds_reduced.reset_index(inplace=True)
    compounds_reduced

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
joiner,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
a_n,1800,0.957,-0.225,-0.104,-0.002,0.120,-0.033,0.037,-0.001,0.020,-0.006,...,0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,-0.000
a_n,1810,0.960,-0.225,-0.100,-0.003,0.111,-0.033,0.033,-0.001,0.020,-0.005,...,-0.000,-0.000,0.000,-0.000,0.000,0.000,0.000,0.000,-0.000,0.000
a_n,1820,0.960,-0.228,-0.103,-0.005,0.106,-0.029,0.028,-0.000,0.013,-0.004,...,-0.000,-0.000,0.000,-0.000,0.000,0.000,0.000,-0.000,0.000,0.000
a_n,1830,0.965,-0.221,-0.089,-0.009,0.099,-0.035,0.028,-0.000,0.017,-0.001,...,0.000,0.000,0.000,0.000,-0.000,-0.000,0.000,-0.000,-0.000,-0.000
a_n,1840,0.967,-0.219,-0.084,-0.007,0.087,-0.031,0.022,-0.000,0.012,0.000,...,0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000,0.000
a_n,1850,0.969,-0.218,-0.084,-0.007,0.074,-0.023,0.016,0.000,0.004,0.001,...,-0.000,0.000,-0.000,-0.000,0.000,-0.000,-0.000,0.000,0.000,0.000
a_n,1860,0.972,-0.214,-0.082,-0.007,0.052,-0.009,0.007,0.000,-0.005,0.002,...,-0.000,-0.000,-0.000,0.000,0.000,-0.000,0.000,-0.000,0.000,-0.000
a_n,1870,0.975,-0.205,-0.071,-0.009,0.036,-0.001,-0.002,-0.000,-0.008,0.001,...,-0.000,-0.000,0.000,0.000,0.000,0.000,0.000,-0.000,-0.000,-0.000
a_n,1880,0.977,-0.199,-0.065,-0.007,0.023,0.006,-0.006,-0.000,-0.007,0.001,...,0.000,-0.000,-0.000,-0.000,-0.000,-0.000,-0.000,0.000,0.000,0.000
a_n,1890,0.980,-0.191,-0.060,-0.007,0.007,0.011,-0.010,-0.000,-0.007,-0.000,...,0.000,-0.000,0.000,0.000,0.000,-0.000,0.000,0.000,0.000,-0.000


In [16]:
# Export the vectors of all constituents tagged as modifiers.
# NOTE(review): co_occ_reduced is not defined in any cell visible here -- confirm
# where it comes from before relying on a clean top-to-bottom run.
#
# Rows are selected by the '_m' suffix on 'constituent', and the suffix is then
# rewritten to '_n'. The rewrite uses .assign so it produces a new frame instead
# of assigning a column onto a .loc slice of co_occ_reduced.
modifiers = (
    co_occ_reduced
    .loc[co_occ_reduced['constituent'].str.endswith(r'_m')]
    .assign(
        constituent=lambda frame: frame['constituent'].str.replace(r'_m$', r'_n', regex=True)
    )
)

# Tab-separated output without a header row or index column.
modifiers.to_csv(
    '/data/dharp/compounding/datasets/modifiers_context_decadal_svd.csv',
    index=False,
    header=False,
    sep='\t',
)