In [9]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA,TruncatedSVD,NMF
from sklearn.preprocessing import Normalizer
import argparse
import time
import pickle as pkl

In [2]:
def year_binner(year,val=10):
    return year - year%val

In [3]:
def dim_reduction(df,rows):
    df_svd = TruncatedSVD(n_components=300, n_iter=10, random_state=args.seed)
    print(f'Explained variance ratio {(df_svd.fit(df).explained_variance_ratio_.sum()):2.3f}')
    #df_list=df_svd.fit(df).explained_variance_ratio_
    df_reduced = df_svd.fit_transform(df)
    df_reduced = Normalizer(copy=False).fit_transform(df_reduced)
    df_reduced=pd.DataFrame(df_reduced,index=rows)
    #df_reduced.reset_index(inplace=True)
    if args.temporal!=0:
        df_reduced.index = pd.MultiIndex.from_tuples(df_reduced.index, names=['common', 'time'])
    return df_reduced

In [4]:
parser = argparse.ArgumentParser(description='Gather data necessary for performing Regression')

parser.add_argument('--inputdir',type=str,
                    help='Provide directory that has the files with the fivegram counts')
parser.add_argument('--outputdir',type=str,
                    help='Provide directory in that the output files should be stored')
parser.add_argument('--temporal',  type=int, default=0,
                    help='Value to bin the temporal information: 0 (remove temporal information), 1 (no binning), 10 (binning to decades), 20 (binning each 20 years) or 50 (binning each 50 years)')

parser.add_argument('--contextual', action='store_true',
                    help='Is the model contextual')

parser.add_argument('--cutoff', type=int, default=50,
                    help='Cut-off frequency for each compound per time period : none (0), 20, 50 and 100')


parser.add_argument('--seed', type=int, default=1991,
                    help='random seed')

parser.add_argument('--storedf', action='store_true',
                    help='Should the embeddings be saved')

parser.add_argument('--dims', type=int, default=300,
                    help='Desired number of reduced dimensions')

parser.add_argument('--input_format',type=str,default='csv',choices=['csv','pkl'],
                    help='In what format are the input files : csv or pkl')
parser.add_argument('--save_format', type=str,default='pkl',choices=['pkl','csv'],
                    help='In what format should the reduced datasets be saved : csv or pkl')


args = parser.parse_args('--inputdir ../Compounding/coha_compounds/ --outputdir ../Compounding/coha_compounds/ --cutoff 10 --storedf --input_format csv --save_format csv'.split())

In [5]:
print(f'Cutoff: {args.cutoff}')
print(f'Time span:  {args.temporal}')
print(f'Dimensionality: {args.dims}')

Cutoff: 10
Time span:  0
Dimensionality: 300


In [10]:
print("Creating dense embeddings")
if args.contextual:
    print("CompoundCentric Model")
    print("Loading the constituent and compound vector datasets")

    if args.input_format=="csv":
        compounds=pd.read_csv(args.inputdir+"/compounds.csv",sep="\t")
    elif args.input=="pkl":
        compounds=pd.read_pickle(args.inputdir+"/compounds.pkl")
        compounds.reset_index(inplace=True)
    compounds.year=compounds.year.astype("int32")
    compounds=compounds.query('1800 <= year <= 2010').copy()
    compounds['common']=compounds['modifier']+" "+compounds['head']

        #head_list_reduced=compounds['head'].unique().tolist()
        #modifier_list_reduced=compounds['modifier'].unique().tolist()

    if args.temporal==0:
        print('No temporal information is stored')
        compounds=compounds.groupby(['common','context'])['count'].sum().to_frame()
        compounds.reset_index(inplace=True)
        compounds=compounds.loc[compounds.groupby(['common'])['count'].transform('sum').gt(args.cutoff)]
        compounds=compounds.groupby(['common','context'])['count'].sum()

    else:
        compounds['time']=year_binner(compounds['year'].values,args.temporal)
        compounds=compounds.groupby(['common','context','time'])['count'].sum().to_frame()
        compounds.reset_index(inplace=True)
        compounds=compounds.loc[compounds.groupby(['common','time'])['count'].transform('sum').gt(args.cutoff)]
        compounds=compounds.groupby(['common','time','context'])['count'].sum()




    if args.input_format=="csv":
        modifiers=pd.read_csv(args.inputdir+"/modifiers.csv",sep="\t")
    elif args.input=="pkl":
        modifiers=pd.read_pickle(args.inputdir+"/modifiers.pkl")
        modifiers.reset_index(inplace=True)
    modifiers.year=modifiers.year.astype("int32")
    modifiers=modifiers.query('1800 <= year <= 2010').copy()
    modifiers.columns=['common','context','year','count']
    modifiers['common']=modifiers['common'].str.replace(r'_noun$', r'_m', regex=True)
        
    if args.temporal==0:
        print('No temporal information is stored')
        modifiers=modifiers.groupby(['common','context'])['count'].sum().to_frame()
        modifiers.reset_index(inplace=True)
        modifiers=modifiers.loc[modifiers.groupby(['common'])['count'].transform('sum').gt(args.cutoff)]
        modifiers=modifiers.groupby(['common','context'])['count'].sum()
    else:
        modifiers['time']=year_binner(modifiers['year'].values,args.temporal)
        modifiers=modifiers.groupby(['common','context','time'])['count'].sum().to_frame()
        modifiers=modifiers.loc[modifiers.groupby(['common','time'])['count'].transform('sum').gt(args.cutoff)]
        modifiers=modifiers.groupby(['common','time','context'])['count'].sum()

    if args.input_format=="csv":
        heads=pd.read_csv(args.inputdir+"/heads.csv",sep="\t")
    elif args.input_format=="pkl":
        heads=pd.read_pickle(args.inputdir+"/heads.pkl")
        heads.reset_index(inplace=True)
    heads.year=heads.year.astype("int32")
    heads=heads.query('1800 <= year <= 2010').copy()
    heads.columns=['common','context','year','count']
    heads['common']=heads['common'].str.replace(r'_noun$', r'_h', regex=True)
    if args.temporal==0:
        print('No temporal information is stored')
        heads=heads.groupby(['common','context'])['count'].sum().to_frame()
        heads.reset_index(inplace=True)
        heads=heads.loc[heads.groupby(['common'])['count'].transform('sum').gt(args.cutoff)]
        heads=heads.groupby(['common','context'])['count'].sum()
    else:
        heads['time']=year_binner(heads['year'].values,args.temporal)
        heads=heads.groupby(['common','context','time'])['count'].sum().to_frame()
        heads=heads.loc[heads.groupby(['common','time'])['count'].transform('sum').gt(args.cutoff)]
        heads=heads.groupby(['common','time','context'])['count'].sum()

    print('Concatenating all the datasets together')
    df=pd.concat([heads,modifiers,compounds], sort=True)

else:
    print("CompoundAgnostic Model")
    wordlist = pkl.load( open( "data/coha_wordlist.pkl", "rb" ) )
        
    if args.input_format=="csv":
        compounds=pd.read_csv(args.inputdir+"/phrases.csv",sep="\t")
    elif args.input_format=="pkl":
        compounds=pd.read_pickle(args.inputdir+"/phrases.pkl")
        compounds.reset_index(inplace=True)
    compounds.year=compounds.year.astype("int32")
    compounds=compounds.query('1800 <= year <= 2010').copy()
    compounds['common']=compounds['modifier']+" "+compounds['head']


    if args.temporal==0:
        print('No temporal information is stored')
        compounds=compounds.groupby(['common','context'])['count'].sum().to_frame()
        compounds.reset_index(inplace=True)
        compounds=compounds.loc[compounds.groupby(['common'])['count'].transform('sum').gt(args.cutoff)]
        compounds=compounds.groupby(['common','context'])['count'].sum()
    else:
        compounds['time']=year_binner(compounds['year'].values,args.temporal)
            #compounds = dd.from_pandas(compounds, npartitions=100)
        compounds=compounds.groupby(['common','context','time'])['count'].sum().to_frame()
        compounds=compounds.loc[compounds.groupby(['common','time'])['count'].transform('sum').gt(args.cutoff)]
        compounds=compounds.groupby(['common','time','context'])['count'].sum()
        
    if args.input_format=="csv":
        constituents=pd.read_csv(args.outputdir+"/words.csv",sep="\t")
    elif args.input_format=="pkl":
        constituents=pd.read_pickle(args.outputdir+"/words.pkl")
        constituents.reset_index(inplace=True)
    constituents.year=constituents.year.astype("int32")
    constituents=constituents.query('1800 <= year <= 2010').copy()
    constituents.columns=['common','context','year','count']
    constituents.query('common in @wordlist',inplace=True)
    if args.temporal==0:
        print('No temporal information is stored')
        constituents=constituents.groupby(['common','context'])['count'].sum().to_frame()
        constituents.reset_index(inplace=True)
        constituents=constituents.loc[constituents.groupby(['common'])['count'].transform('sum').gt(args.cutoff)]
        constituents=constituents.groupby(['common','context'])['count'].sum()           
    else:
        constituents['time']=year_binner(constituents['year'].values,args.temporal)
        constituents=constituents.groupby(['common','context','time'])['count'].sum().to_frame()
        constituents.reset_index(inplace=True)
        constituents=constituents.loc[constituents.groupby(['common','time'])['count'].transform('sum').gt(args.cutoff)]
        constituents=constituents.groupby(['common','time','context'])['count'].sum()

    print('Concatenating all the datasets together')
    df=pd.concat([constituents,compounds], sort=True)

Creating dense embeddings
CompoundAgnostic Model
No temporal information is stored
No temporal information is stored
Concatenating all the datasets together


In [13]:
dtype = pd.SparseDtype(np.float, fill_value=0)
df=df.astype(dtype)
if args.temporal!=0:    
    df, rows, _ = df.sparse.to_coo(row_levels=['common','time'],column_levels=['context'],sort_labels=False)

else:
    df, rows, _ = df.sparse.to_coo(row_levels=['common'],column_levels=['context'],sort_labels=False)

In [15]:
print('Running SVD')   
df_reduced=dim_reduction(df,rows)

print('Splitting back into individual datasets are saving them')
if args.temporal!=0:
    df_reduced.index.names = ['common','time']
else:
    df_reduced.index.names = ['common']

Running SVD
Explained variance ratio 0.971
Splitting back into individual datasets are saving them


In [18]:
compounds_reduced=df_reduced.loc[df_reduced.index.get_level_values(0).str.contains(r'\w \w')]
compounds_reduced.reset_index(inplace=True)
    #print(compounds_reduced.head())
#compounds_reduced['modifier'],compounds_reduced['head']=compounds_reduced['common'].str.split(' ', 1).str

In [30]:
compounds_reduced[['modifier','head']]=compounds_reduced['common'].str.split(' ', n=1,expand=True).copy()
compounds_reduced

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,common,0,1,2,3,4,5,6,7,8,...,292,293,294,295,296,297,298,299,modifier,head
0,--_noun ernment_noun,0.013156,0.010049,0.042117,0.015222,0.021992,0.035819,0.095712,-0.051622,-0.035909,...,0.020727,-0.000598,-0.007909,-0.011186,-0.034584,0.010278,-0.018117,-0.017537,--_noun,ernment_noun
1,.50-caliber_noun machine_noun,0.000255,0.000482,0.001424,0.003878,-0.000756,0.000926,0.003635,-0.004018,-0.000624,...,-0.003629,-0.000007,0.007167,-0.012708,0.004627,-0.009013,0.001364,-0.008112,.50-caliber_noun,machine_noun
2,18-hole_noun golf_noun,0.002611,0.001083,0.005383,0.001914,0.002936,0.008797,0.029849,-0.021480,-0.024156,...,-0.002405,0.012832,-0.023661,0.033115,-0.001968,-0.033928,0.003360,0.003674,18-hole_noun,golf_noun
3,<nul>_noun mead_noun,0.006032,0.013232,0.277146,-0.048300,-0.016587,-0.050491,-0.082149,0.030528,0.030586,...,-0.001800,0.032068,-0.054139,0.010141,-0.003791,0.032247,-0.050930,0.021734,<nul>_noun,mead_noun
4,<nul>_noun tuberculosis_noun,0.011637,0.008685,0.034091,0.022776,0.005681,0.032657,0.093642,-0.069827,-0.009551,...,-0.050107,0.138060,0.010479,0.021163,0.026493,0.013883,0.039912,-0.006302,<nul>_noun,tuberculosis_noun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10357,zinc_noun oxide_noun,0.005456,0.004083,0.016491,0.013181,0.000993,0.018088,0.055689,-0.053783,-0.003219,...,0.052466,-0.032312,0.027851,0.051536,-0.009092,0.001179,-0.021105,0.036009,zinc_noun,oxide_noun
10358,zinc_noun sulphate_noun,0.030771,0.010413,0.052327,0.036359,0.006845,0.043198,0.122054,-0.090039,-0.038553,...,-0.025569,0.150223,-0.175859,0.007048,-0.157568,0.122526,-0.121641,-0.150570,zinc_noun,sulphate_noun
10359,zintl_noun art_noun,0.001855,0.002240,0.016377,0.001576,0.002928,0.022535,0.022752,-0.015936,-0.012259,...,0.020603,-0.042375,0.039904,-0.013654,-0.062860,-0.003009,0.023237,0.000342,zintl_noun,art_noun
10360,zip_noun code_noun,0.012933,0.010429,0.027632,0.037465,-0.002563,0.027326,0.089923,-0.075393,-0.013152,...,-0.009517,0.018017,0.034409,0.046129,0.036112,-0.043758,0.064175,-0.014812,zip_noun,code_noun
