In [87]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA,TruncatedSVD,NMF
from sklearn.preprocessing import Normalizer
import argparse
import time
import pickle as pkl
import dask.array as da
import dask

In [78]:
def year_binner(year,val=10):
    return year - year%val

In [88]:
def dim_reduction(df,rows):
    df_svd = TruncatedSVD(n_components=300, n_iter=10, random_state=1991)
    print(f'Explained variance ratio {(df_svd.fit(df).explained_variance_ratio_.sum()):2.3f}')
    #df_list=df_svd.fit(df).explained_variance_ratio_
    df_reduced = df_svd.fit_transform(df)
    df_reduced = Normalizer(copy=False).fit_transform(df_reduced)
    df_reduced=pd.DataFrame(df_reduced,index=rows)
    #df_reduced.reset_index(inplace=True)

    df_reduced.index = pd.MultiIndex.from_tuples(df_reduced.index, names=['common', 'time'])
    return df_reduced

In [80]:
    print("CompoundCentric Model")
    print("Loading the constituent and compound vector datasets")


    compounds=pd.read_csv("/data/dharp/compounds/datasets/v3_aware/compounds.csv",sep="\t")

    compounds.year=compounds.year.astype("int32")
    compounds['count']=compounds.year.astype("float64")
    compounds=compounds.loc[compounds.year>=1800]
    compounds=compounds.loc[compounds['count']<1_000_000]
    
    compounds['common']=compounds['modifier']+" "+compounds['head']

        #head_list_reduced=compounds['head'].unique().tolist()
        #modifier_list_reduced=compounds['modifier'].unique().tolist()




    compounds['time']=year_binner(compounds['year'].values,10)
    compounds=compounds.groupby(['common','context','time'])['count'].sum()
    compounds

CompoundCentric Model
Loading the constituent and compound vector datasets


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


common             context      time
#_NOUN #_NOUN      date_VERB    1980     1989.0
                                1990     1999.0
                                2000    20045.0
                                2010    14091.0
#_NOUN cent_NOUN   hour_NOUN    1860     1869.0
                                         ...   
◦_NOUN c._PROPN    however_ADV  2010    20145.0
◦_NOUN c_NOUN      rise_VERB    2000    14041.0
                                2010    20145.0
◦_NOUN field_NOUN  view_NOUN    2000    14035.0
                                2010    18128.0
Name: count, Length: 2097957, dtype: float64

In [81]:
    modifiers=pd.read_csv("/data/dharp/compounds/datasets/v3_aware/modifiers.csv",sep="\t")

    modifiers.year=modifiers.year.astype("int32")
    modifiers['count']=modifiers.year.astype("float64")
    modifiers=modifiers.loc[modifiers.year>=1800]
    modifiers=modifiers.loc[modifiers['count']<1_000_000]
    modifiers.columns=['common','context','year','count']
    modifiers['common']=modifiers['common'].str.replace(r'_(NOUN|PROPN|ADJ)$', r'_m', regex=True)
    
    modifiers['time']=year_binner(modifiers['year'].values,10)
    modifiers=modifiers.groupby(['common','context','time'])['count'].sum()
    modifiers

common  context      time
"_m     year_NOUN    1890      1894.0
                     1900      1909.0
                     1920      1922.0
                     1940      5841.0
                     1950     19558.0
                               ...   
♦_m     dredge_NOUN  1900     13330.0
                     2000      2007.0
        email_NOUN   2000     14042.0
❯_m     step_NOUN    2000     58232.0
                     2010    590274.0
Name: count, Length: 2619407, dtype: float64

In [82]:
    heads=pd.read_csv("/data/dharp/compounds/datasets/v3_aware/heads.csv",sep="\t")

    heads.year=heads.year.astype("int32")
    heads['count']=heads.year.astype("float64")
    heads=heads.loc[heads.year>=1800]
    heads=heads.loc[heads['count']<1_000_000]
    heads.columns=['common','context','year','count']
    heads['common']=heads['common'].str.replace(r'_(NOUN|PROPN|ADJ)$', r'_h', regex=True)
    
    heads['time']=year_binner(heads['year'].values,10)
    heads=heads.groupby(['common','context','time'])['count'].sum()
    heads

common  context     time
#_h     #_NOUN      1830     9173.0
                    1840     3690.0
                    1850    12983.0
                    1860     7458.0
                    1870     7494.0
                             ...   
♦_h     unit_NOUN   1920     1927.0
                    1930     5813.0
                    1940    15548.0
一_h     add_VERB    1990    11979.0
        value_NOUN  1990    11979.0
Name: count, Length: 2570019, dtype: float64

In [83]:
df=pd.concat([heads,modifiers,compounds], sort=True)

In [84]:
type(df)

pandas.core.series.Series

In [85]:
dtype = pd.SparseDtype(float, fill_value=0)
df=df.astype(dtype)

In [86]:
type(df)

pandas.core.series.Series

In [89]:
df, rows, _ = df.sparse.to_coo(row_levels=['common','time'],column_levels=['context'],sort_labels=False)

IOStream.flush timed out


In [90]:
print('Running SVD')   
df_reduced=dim_reduction(df,rows)

Running SVD
Explained variance ratio 0.781


In [92]:
compounds_reduced=df_reduced.loc[df_reduced.index.get_level_values(0).str.contains(r'\w \w')]
compounds_reduced.reset_index(inplace=True)
    #print(compounds_reduced.head())
compounds_reduced[['modifier','head']]=compounds_reduced['common'].str.split(' ', n=1,expand=True).copy()
compounds_reduced

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,common,time,0,1,2,3,4,5,6,7,...,292,293,294,295,296,297,298,299,modifier,head
0,#_NOUN cent_NOUN,1860,0.094566,-0.052127,0.286609,-0.122367,-0.068207,0.062645,0.127152,-0.134290,...,-0.002582,-0.010485,-0.013443,0.020738,0.001167,0.007712,-0.001959,0.001110,#_NOUN,cent_NOUN
1,#_NOUN cent_NOUN,1900,0.101859,-0.052361,0.281479,-0.125681,-0.069403,0.064050,0.128603,-0.126002,...,-0.000953,-0.012016,-0.013635,0.022357,0.004108,0.008506,-0.003962,0.000574,#_NOUN,cent_NOUN
2,#_NOUN cent_NOUN,1910,0.105959,-0.052084,0.275655,-0.126883,-0.069636,0.064469,0.128516,-0.119329,...,0.000160,-0.012954,-0.013652,0.023270,0.006069,0.008974,-0.005289,0.000205,#_NOUN,cent_NOUN
3,#_NOUN cent_NOUN,1920,0.112453,-0.050240,0.256327,-0.126514,-0.068497,0.063863,0.125279,-0.101845,...,0.002601,-0.014708,-0.013364,0.024721,0.010232,0.009788,-0.008077,-0.000610,#_NOUN,cent_NOUN
4,#_NOUN cent_NOUN,1930,0.094541,-0.052125,0.286618,-0.122354,-0.068202,0.062639,0.127145,-0.134312,...,-0.002587,-0.010480,-0.013442,0.020733,0.001158,0.007709,-0.001953,0.001112,#_NOUN,cent_NOUN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946464,◦_NOUN c._PROPN,2010,0.047717,0.032979,0.020162,0.020614,0.160696,0.048546,0.052705,0.256397,...,-0.053334,-0.001947,-0.006811,-0.032499,0.027492,0.028426,-0.018280,-0.001175,◦_NOUN,c._PROPN
946465,◦_NOUN c_NOUN,2000,0.036595,0.004921,0.029418,0.043014,0.033974,0.019600,-0.002898,0.117485,...,-0.011691,-0.106389,0.174744,-0.004846,-0.079863,0.041529,0.018362,-0.095796,◦_NOUN,c_NOUN
946466,◦_NOUN c_NOUN,2010,0.036595,0.004921,0.029418,0.043014,0.033974,0.019600,-0.002898,0.117485,...,-0.011691,-0.106389,0.174744,-0.004846,-0.079863,0.041529,0.018362,-0.095796,◦_NOUN,c_NOUN
946467,◦_NOUN field_NOUN,2000,0.026877,-0.010657,0.002220,0.019800,0.069524,0.015960,-0.007309,0.019861,...,0.084099,-0.048537,0.013446,-0.075868,-0.059511,0.019224,-0.045717,-0.040587,◦_NOUN,field_NOUN


In [94]:
    heads_reduced=df_reduced.loc[df_reduced.index.get_level_values(0).str.endswith(r'_h')]
    heads_reduced.reset_index(inplace=True)
    heads_reduced['head']=heads_reduced['common'].str.replace(r'_h$', r'_NOUN', regex=True)
    heads_reduced.drop(['common'],axis=1,inplace=True)

    modifiers_reduced=df_reduced.loc[df_reduced.index.get_level_values(0).str.endswith(r'_m')]
    modifiers_reduced.reset_index(inplace=True)   
    modifiers_reduced['modifier']=modifiers_reduced['common'].str.replace(r'_m$', r'_NOUN', regex=True)
    modifiers_reduced.drop(['common'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heads_reduced['head']=heads_reduced['common'].str.replace(r'_h$', r'_NOUN', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  modifiers_reduced['modifier']=modifiers_reduced['common'].str.replace(r'_m$', r'_NOUN', regex=True)


In [96]:
        compounds_reduced.set_index(['modifier','head','time'],inplace=True)
        heads_reduced.set_index(['head','time'],inplace=True)
        modifiers_reduced.set_index(['modifier','time'],inplace=True)

In [104]:
compounds_reduced.drop('common',axis=1).to_csv("/data/dharp/compounds/datasets/v3_aware/compounds_reduced.csv",sep="\t",header=False)

KeyError: "['common'] not found in axis"

In [107]:
modifiers_reduced.to_csv("/data/dharp/compounds/datasets/v3_aware/modifiers_reduced.csv",sep="\t",header=False)
heads_reduced.to_csv("/data/dharp/compounds/datasets/v3_aware/heads_reduced.csv",sep="\t",header=False)