In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import ast
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse
import networkx as nx

In [3]:
import io
from contextlib import contextmanager


@contextmanager
def quote_brackets(filename):
    """Yield an in-memory text stream of ``filename`` with brackets quoted.

    Every ``[`` in the file is emitted as a double quote followed by ``[`` and
    every ``]`` as ``]`` followed by a double quote, so that a CSV parser reads
    a bracketed span such as ``[1, 2]`` as a single quoted field.

    The whole file is read into memory and handed out as an ``io.StringIO``;
    the underlying file is closed when the ``with`` block using this context
    manager exits.
    """
    # One-pass character translation; equivalent to replacing '[' and then ']'.
    bracket_quoting = str.maketrans({'[': '"[', ']': ']"'})
    with open(filename) as source:
        yield io.StringIO(source.read().translate(bracket_quoting))

def combine_features(df):
    """Collapse the eight single-feature columns of ``df`` into two code columns.

    The author columns (``verified``, ``activity``, ``defaultprofile``,
    ``userurl``) are stringified and concatenated, in that order, into a
    categorical ``author_feature`` column inserted at position 0. The tweet
    columns (``hashtag``, ``tweeturl``, ``mentions``, ``media``) are combined
    the same way into ``tweet_feature`` at position 1. The source columns are
    then dropped.

    ``df`` is modified in place; nothing is returned.
    """
    author_features = ['verified', 'activity', 'defaultprofile', 'userurl']
    tweet_features = ['hashtag', 'tweeturl', 'mentions', 'media']

    def str_cat_df(frame, cols):
        # str() each value, then glue the columns together row by row.
        first, *rest = [frame[name].apply(str) for name in cols]
        return first.str.cat(rest).astype('category')

    author_code = str_cat_df(df, author_features)
    tweet_code = str_cat_df(df, tweet_features)

    df.insert(0, 'author_feature', author_code)
    df.insert(1, 'tweet_feature', tweet_code)

    df.drop(columns=author_features + tweet_features, inplace=True)

def normalize_level_column(colname):
    """Map a level column label to its integer depth; leave other labels alone.

    ``'level3'`` and ``'3'`` both become ``3``. Any label that does not reduce
    to an integer (``'author'``, a bare ``'level'``, ``'levels_seen'``, ...) is
    returned unchanged, as is any label that is not a string (for example a
    column that has already been normalised to an ``int``).

    Parameters
    ----------
    colname : hashable
        A column label, typically a string read from a CSV header.

    Returns
    -------
    int or the original label
    """
    if not isinstance(colname, str):
        # Already-normalised labels (ints) have no string prefix to strip.
        return colname
    digits = colname.removeprefix('level') if colname.startswith('level') else colname
    try:
        return int(digits)
    except ValueError:
        # Covers 'author' as well as 'level'-prefixed names with no numeric tail,
        # which previously escaped the try block and raised.
        return colname

def retweet_csv(filename, literal_eval=('retweets','tree')):
    """Read a tweet CSV into a DataFrame with feature codes and parsed literals.

    ``author_feature`` and ``tweet_feature`` are read as strings, so codes such
    as ``'0010'`` stay text. If the file has no ``author_feature`` column, the
    individual feature columns are combined via ``combine_features``.

    Each column named in ``literal_eval`` that is present in the file is parsed
    element-wise with ``ast.literal_eval``.

    Parameters
    ----------
    filename : path or file-like
        Anything accepted by ``pd.read_csv``.
    literal_eval : iterable of str
        Column names whose cells hold Python literals.
    """
    feature_dtypes = dict.fromkeys(('author_feature', 'tweet_feature'), 'str')
    df = pd.read_csv(filename, dtype=feature_dtypes)

    if 'author_feature' not in df.columns:
        combine_features(df)

    present = [name for name in literal_eval if name in df.columns]
    for name in present:
        df[name] = df[name].apply(ast.literal_eval)
    return df


def normalized_csv(oldfile, newfile=None, max_depth=10):
    """Load a tweet CSV and normalise it to integer level columns ``0..max_depth``.

    Steps:
      1. read ``oldfile`` with ``retweet_csv`` without parsing any literal columns;
      2. rename columns through ``normalize_level_column``;
      3. drop ``author``, ``tree`` and ``retweets`` where present;
      4. add a zero-filled column for every level in ``0..max_depth`` that is
         missing (new columns are appended at the end);
      5. if ``newfile`` is given, write the result there without the index.

    Returns the normalised DataFrame.
    """
    unwanted = ['author', 'tree', 'retweets']

    df = retweet_csv(oldfile, ())
    df = df.rename(columns=normalize_level_column)
    df = df.drop(columns=unwanted, errors='ignore')

    absent_levels = [level for level in range(max_depth + 1) if level not in df]
    for level in absent_levels:
        df[level] = 0

    if newfile is not None:
        df.to_csv(newfile, index=False)
    return df

def by_feature(df):
    """Aggregate ``df`` per (author_feature, tweet_feature) pair.

    For every remaining column the sum, the mean and the list of raw values are
    computed. Groups with a missing aggregate are dropped. The result's columns
    are a two-level index with the statistic name (``'list'``, ``'mean'``,
    ``'sum'``) on the outer level and the original column label on the inner
    level, sorted.
    """
    group_keys = ['author_feature', 'tweet_feature']
    statistics = ['sum', 'mean', list]

    aggregated = df.groupby(group_keys).agg(statistics)
    complete = aggregated.dropna()
    # Put the statistic name on the outer column level, then order the columns.
    return complete.swaplevel(axis=1).sort_index(axis=1)

def retweeted_by_feature(df):
    """Apply ``by_feature`` to the rows of ``df`` whose largest numeric value is positive."""
    row_peak = df.max(numeric_only=True, axis=1)
    retweeted = df[row_peak > 0]
    return by_feature(retweeted)


In [45]:
# Normalise the analysed tables. The vegan export is read through
# quote_brackets, which wraps every [...] span in double quotes before the CSV
# parser sees it; the normalised frame is also written to 'vegan_analyzed.csv'.
with quote_brackets('vegan_20210729_analysed.csv') as f:
    vegan_a=normalized_csv(f, 'vegan_analyzed.csv')
    
# NOTE(review): the fpoe_a / neos_a loads below are commented out, yet later
# cells call table(fpoe_a, ...) and table(neos_a, ...); on a fresh kernel those
# names are undefined.
# fpoe_a=normalized_csv('fpoeanalysed.csv','fpoe_analyzed.csv')
# neos_a=normalized_csv('neos_20210729_analysed.csv','neos_analyzed.csv')
# NOTE(review): the *_w frames are written to the same '*_analyzed.csv' files
# that the commented-out *_a loads targeted - confirm the overwrite is intended.
fpoe_w=normalized_csv('fpoe_w.csv','fpoe_analyzed.csv')
neos_w=normalized_csv('neos_w.csv','neos_analyzed.csv')

In [43]:
# Select the rows of fpoe_w whose largest numeric value is 0.
# NOTE: this expression is not the last statement of the cell, so the notebook
# does not display it; only the loop's printed values appear in the output.
fpoe_w[fpoe_w.max(numeric_only=True, axis=1)==0]
for i in range(2):
    print(i)

0
1


In [19]:
# Load the 'generated' (_g) and 'with_params' (_p) tables for both data sets.
# No newfile argument is passed, so normalized_csv returns the frame without
# writing anything to disk.
fpoe_g=normalized_csv('fpoe_generated.csv')
fpoe_p=normalized_csv('fpoe_with_params.csv')
neos_g=normalized_csv('neos_generated.csv')
neos_p=normalized_csv('neos_with_params.csv')

In [20]:
# Per-feature aggregates restricted to rows with a positive numeric maximum
# (see retweeted_by_feature). The r*_w / r*_g / r*_p names feed the table(...)
# comparisons further down.
# NOTE(review): rfpoe_a and rneos_a, which later table(...) calls also use, are
# not created here or in any other visible cell.
rfpoe_w=retweeted_by_feature(fpoe_w)
rneos_w=retweeted_by_feature(neos_w)
rfpoe_g=retweeted_by_feature(fpoe_g)
rfpoe_p=retweeted_by_feature(fpoe_p)
rneos_g=retweeted_by_feature(neos_g)
rneos_p=retweeted_by_feature(neos_p)

In [21]:
# Rebind each name to its by_feature aggregate (all rows, not only retweeted ones).
# NOTE(review): this overwrites the per-tweet frames under the same names, so
# the cell is not idempotent and any cell that expects the un-aggregated frames
# (e.g. the retweeted_by_feature calls) has to run before this one.
fpoe_w=by_feature(fpoe_w)
neos_w=by_feature(neos_w)
fpoe_g=by_feature(fpoe_g)
fpoe_p=by_feature(fpoe_p)
neos_g=by_feature(neos_g)
neos_p=by_feature(neos_p)


In [32]:
rfpoe_w

Unnamed: 0_level_0,Unnamed: 1_level_0,list,list,list,list,list,list,list,list,list,list,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,3,4,5,6,7,8,9,10,...,1,2,3,4,5,6,7,8,9,10
author_feature,tweet_feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0000,0000,"[2, 1, 1, 0, 3, 1, 1, 1, 0, 1, 9, 9, 2, 1, 1, ...","[0, 2, 0, 1, 0, 0, 1, 0, 1, 1, 2, 22, 1, 0, 2,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,357,369,22,0,0,0,0,0,0,0
0000,0001,"[1, 5, 1, 0, 1, 5, 1, 2, 8, 1, 1, 9, 3, 12, 5,...","[0, 8, 0, 1, 2, 0, 1, 0, 0, 0, 1, 14, 6, 9, 2,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,64,48,2,0,0,0,0,0,0,0
0000,0010,"[2, 8, 2, 1, 2, 12, 14, 13, 0, 0, 8, 3, 8, 28,...","[0, 8, 13, 0, 4, 8, 8, 4, 2, 2, 59, 10, 6, 11,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,118,139,4,0,0,0,0,0,0,0
0000,0011,"[30, 4, 20, 2, 1, 1, 1, 1, 1, 1]","[42, 3, 27, 3, 0, 2, 0, 0, 12, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",...,62,89,1,0,0,0,0,0,0,0
0000,0100,"[2, 1, 4, 1, 6, 2, 5, 1, 1, 1, 11, 2, 2, 2, 1,...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 107, 0, 0, 3, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,171,208,11,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,1000,[9],[6],[0],[0],[0],[0],[0],[0],[0],[0],...,9,6,0,0,0,0,0,0,0,0
1110,1010,[5],[3],[0],[0],[0],[0],[0],[0],[0],[0],...,5,3,0,0,0,0,0,0,0,0
1110,1110,[1],[1],[0],[0],[0],[0],[0],[0],[0],[0],...,1,1,0,0,0,0,0,0,0,0
1111,1010,"[3, 4]","[0, 3]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]",...,7,3,0,0,0,0,0,0,0,0


In [31]:
fpoe_w

Unnamed: 0_level_0,Unnamed: 1_level_0,list,list,list,list,list,list,list,list,list,list,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,3,4,5,6,7,8,9,10,...,1,2,3,4,5,6,7,8,9,10
author_feature,tweet_feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0000,0000,"[2, 1, 1, 0, 3, 1, 1, 1, 0, 1, 9, 9, 2, 1, 1, ...","[0, 2, 0, 1, 0, 0, 1, 0, 1, 1, 2, 22, 1, 0, 2,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,357,369,22,0,0,0,0,0,0,0
0000,0001,"[1, 5, 1, 0, 1, 5, 1, 2, 8, 1, 1, 9, 3, 12, 5,...","[0, 8, 0, 1, 2, 0, 1, 0, 0, 0, 1, 14, 6, 9, 2,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,64,48,2,0,0,0,0,0,0,0
0000,0010,"[2, 8, 2, 1, 2, 12, 14, 13, 0, 0, 8, 3, 8, 28,...","[0, 8, 13, 0, 4, 8, 8, 4, 2, 2, 59, 10, 6, 11,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,118,139,4,0,0,0,0,0,0,0
0000,0011,"[30, 4, 20, 2, 1, 1, 1, 1, 1, 1]","[42, 3, 27, 3, 0, 2, 0, 0, 12, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",...,62,89,1,0,0,0,0,0,0,0
0000,0100,"[2, 1, 4, 1, 6, 2, 5, 1, 1, 1, 11, 2, 2, 2, 1,...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 107, 0, 0, 3, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,171,208,11,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,1000,[9],[6],[0],[0],[0],[0],[0],[0],[0],[0],...,9,6,0,0,0,0,0,0,0,0
1110,1010,[5],[3],[0],[0],[0],[0],[0],[0],[0],[0],...,5,3,0,0,0,0,0,0,0,0
1110,1110,[1],[1],[0],[0],[0],[0],[0],[0],[0],[0],...,1,1,0,0,0,0,0,0,0,0
1111,1010,"[3, 4]","[0, 3]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]",...,7,3,0,0,0,0,0,0,0,0


In [30]:
rfpoe_w.eq(fpoe_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,list,list,list,list,list,list,list,list,list,list,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,3,4,5,6,7,8,9,10,...,1,2,3,4,5,6,7,8,9,10
author_feature,tweet_feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0000,0000,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
0000,0001,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
0000,0010,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
0000,0011,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
0000,0100,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,1000,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1110,1010,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1110,1110,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1111,1010,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [39]:
len(fpoe_w.list[1].at['0000','0100']),len(rfpoe_w.list[1].at['0000','0100'])

(61, 59)

In [74]:
import scipy.stats

def observed_levels(df):
    """Return, per feature vector, the list of observed levels.

    Reads the ``'sum'`` block of ``df``. For each row, every level (column
    label) is repeated as many times as its summed count, in column order, so a
    row with counts ``{1: 2, 2: 1}`` yields ``[1, 1, 2]``. The result is a
    Series of lists indexed like ``df``.
    """
    counts = df['sum']
    levels = list(counts.columns)

    def expand(row):
        # Repeat each level label by its count for this row.
        observations = []
        for level in levels:
            observations.extend([level] * row[level])
        return observations

    return counts.apply(expand, axis=1)

def observed_levels_distance(a, b, distance=scipy.stats.wasserstein_distance):
    """Distance between the observed-level lists of ``a`` and ``b``, per feature vector.

    ``observed_levels`` is evaluated for both frames, feature vectors missing
    from either side are dropped, and ``distance(levels_a, levels_b)`` is
    computed for each remaining row. Returns a Series of distances.
    """
    paired = pd.DataFrame({'a': observed_levels(a), 'b': observed_levels(b)})
    paired = paired.dropna()
    return paired.apply(lambda row: distance(row['a'], row['b']), axis=1)

def mean_levels_distance(a, b, distance=scipy.stats.wasserstein_distance):
    """Per feature vector, distance between the mean-per-level profiles of ``a`` and ``b``.

    The level labels (columns of ``a['mean']``) serve as the sample values for
    both sides, and each frame's row of means supplies the weights, i.e. the
    call is ``distance(levels, levels, means_a, means_b)``. Feature vectors
    with a missing value on either side are dropped. Returns a Series.
    """
    means_a = a['mean']
    means_b = b['mean']
    levels = means_a.columns
    paired = pd.concat([means_a, means_b], keys=['a', 'b'], axis=1).dropna()

    def row_distance(row):
        return distance(levels, levels, row['a'], row['b'])

    return paired.apply(row_distance, axis=1)

#mean_levels_distance(fpoe_a,fpoe_p).mean()
#observed_levels_distance(fpoe_a,fpoe_p).mean()
#len(observed_levels(fpoe_a)[('0000','0000')])

def mean_error(a, b):
    """Signed difference of the ``'mean'`` blocks (``a`` minus ``b``), averaged over levels.

    Rows containing a missing difference are dropped. Returns one value per
    remaining feature vector.
    """
    difference = a['mean'].sub(b['mean'])
    return difference.dropna().mean(axis=1)

def mean_absolute_error(a, b):
    """Absolute difference of the ``'mean'`` blocks of ``a`` and ``b``, averaged over levels.

    Rows containing a missing difference are dropped. Returns one value per
    remaining feature vector.
    """
    difference = a['mean'].sub(b['mean'])
    return difference.dropna().abs().mean(axis=1)


def compare(a,b):
    """Summarise how far aggregate ``b`` is from aggregate ``a``.

    Returns a dict with:
      * ``mamem`` - mean of the absolute per-feature mean error;
      * ``mmaem`` - mean of the per-feature mean absolute error;
      * ``mmem``  - mean of the signed per-feature mean error;
      * ``mwasserstein_levels`` - mean ``mean_levels_distance`` using
        ``scipy.stats.wasserstein_distance``;
      * ``menergy_levels`` - the same using ``scipy.stats.energy_distance``.
    """
    signed = mean_error(a, b)
    absolute = mean_absolute_error(a, b)
    wasserstein = mean_levels_distance(a, b, scipy.stats.wasserstein_distance)
    energy = mean_levels_distance(a, b, scipy.stats.energy_distance)
    return {
        'mamem': signed.abs().mean(),
        'mmaem': absolute.mean(),
        'mmem': signed.mean(),
        # 'mywasserstein_levels': observed_levels_distance(a,b, lambda u,v:wasserstein_2d(np.array([u]).transpose(),np.array([v]).transpose())).mean(),
        'mwasserstein_levels': wasserstein.mean(),
        'menergy_levels': energy.mean(),
    }

def table(analyzed, without_params, with_params):
    """Tabulate ``compare(analyzed, ...)`` for the two candidates.

    Returns a DataFrame with one row labelled ``'without params'`` and one
    labelled ``'with params'``; the columns are the keys returned by ``compare``.
    """
    rows = {
        'without params': compare(analyzed, without_params),
        'with params': compare(analyzed, with_params),
    }
    return pd.DataFrame(list(rows.values()), index=list(rows))

In [100]:
table(fpoe_a,fpoe_g,fpoe_p) 

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.704789,0.717423,0.665919,0.553836,0.53679
with params,0.35763,0.555201,0.251749,0.524343,0.528336


In [49]:
table(rfpoe_a,rfpoe_g,rfpoe_p) 

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.664786,0.678986,0.62529,0.553836,0.53679
with params,0.32314,0.76052,-0.185203,0.524343,0.528336


In [50]:
table(neos_a, neos_g, neos_p)

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.5628,0.571218,0.463697,0.448518,0.457164
with params,0.308064,0.426988,0.220152,0.411395,0.430974


In [51]:
table(rneos_a, rneos_g, rneos_p)

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.552382,0.564451,0.452748,0.448518,0.457164
with params,0.293702,0.597478,-0.111128,0.411395,0.430974


In [12]:
# https://stackoverflow.com/a/57563383/
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment

def wasserstein_2d(U,V, normalize=True):
    """Total cost of the minimum-cost matching between the rows of ``U`` and ``V``.

    Pairwise distances between rows are computed with ``cdist`` (its default
    metric), rows are matched with ``linear_sum_assignment``, and the matched
    distances are summed. The sum is not divided by the number of matches.

    ``normalize`` is accepted but currently has no effect on the result.
    """
    pairwise = cdist(U, V)
    matched_rows, matched_cols = linear_sum_assignment(pairwise)
    return pairwise[matched_rows, matched_cols].sum()

def wasserstein_2d_row_ab(row, normalize=False):
    """Run ``wasserstein_2d`` on the ``'a'`` and ``'b'`` halves of one row.

    ``row['a']`` and ``row['b']`` are each converted to a list, stacked into an
    array and transposed before being compared.
    Presumably each half holds one list of observations per level, so the
    transposed arrays have one row per observation - TODO confirm with caller.
    """
    points_a = np.array(row['a'].to_list()).T
    points_b = np.array(row['b'].to_list()).T
    return wasserstein_2d(points_a, points_b, normalize)

def wasserstein_2d_df(a,b,normalize=False):
    """Per feature vector, ``wasserstein_2d_row_ab`` between the ``'list'`` blocks of ``a`` and ``b``.

    The two ``'list'`` blocks are placed side by side under the keys ``'a'`` and
    ``'b'``; feature vectors with a missing entry on either side are dropped.
    ``normalize`` is forwarded unchanged. Returns a Series of costs.
    """
    paired = pd.concat([a['list'], b['list']], keys=['a', 'b'], axis=1).dropna()
    return paired.apply(
        lambda row: wasserstein_2d_row_ab(row, normalize=normalize),
        axis=1,
    )

# display(observed_levels_distance(fpoe_a,fpoe_p))

# Per-feature matching cost between fpoe_a and fpoe_g.
# NOTE(review): fpoe_a is not assigned in any visible cell (its load is
# commented out), so this relies on earlier kernel state. The third argument
# (normalize=True) does not change the result: wasserstein_2d's normalize
# branch only reassigns U and V to themselves.
wasserstein_2d_df(fpoe_a,fpoe_g,True)
# mine=observed_levels_distance(fpoe_a,fpoe_g, lambda u,v:wasserstein_2d(np.array([u]).transpose(),np.array([v]).transpose()))
# orig=observed_levels_distance(fpoe_a,fpoe_g)
# mine.loc[('0000','0000')]/orig.loc[('0000','0000')]

#neos_a['list']
# neos_a['mean'].applymap(lambda x:[x])
#pd.DataFrame([a,b],index=['a','b']).fillna([0])
#pd.DataFrame(a.array)
#np.array(neos_a['list'].loc[('0000','0001')].apply(lambda x:[np.mean(x)]))
#neos_a['mean'].loc[('0000','0001')].apply(lambda x:[x]).to_list()
#by_feature(neos_g)['list']
#fpoe_a['mean'].loc[('0000','0000')].sum()
# np.array(neos_a['list'].loc[('0000','0001')].to_list()).transpose().shape[0]

author_feature  tweet_feature
0000            0000             361.613762
                0001              41.596804
                0010             139.113642
                0011              84.855543
                0100             173.800757
                                    ...    
1101            1000             243.124999
                1001              28.751161
                1010              30.694276
                1100             126.329168
                1110              17.302548
Length: 131, dtype: float64

In [13]:
# NOTE(review): `mine` and `orig` are only assigned in commented-out lines of
# the preceding cell, which is why this cell's recorded output is a NameError.
mine.loc[('0000','0010')]/orig.loc[('0000','0010')]

NameError: name 'mine' is not defined

In [None]:
len(fpoe_g.loc[('0000','0000')]['list'][0])

In [None]:
orig

In [None]:
mine

In [None]:
# Scratch check: compare a linear-assignment transport cost with scipy's 1-D
# wasserstein_distance on two random samples of different sizes.
from scipy.stats import wasserstein_distance

# The seed is commented out, so every run draws different samples and the value
# recorded in the trailing comment below is not reproducible.
# np.random.seed(0)
n = 100

Y1 = np.random.randn(n)
Y2 = np.random.randn(n-10) - 2
# d[i, j] = |Y1[j] - Y2[i]|, shape (n-10, n).
d =  np.abs(Y1 - Y2.reshape((n-10, 1)))

# linear_sum_assignment is not imported in this cell; it comes from the
# `from scipy.optimize import linear_sum_assignment` line in an earlier cell.
assignment = linear_sum_assignment(d)
# NOTE(review): 90 is n-10 written out by hand; it has to be updated if n changes.
print(d[assignment].sum()/90)       # 1.9777950447866477
print(wasserstein_distance(Y1, Y2))  #
#np.abs(Y1 - Y2.reshape((n, 1)))

In [None]:
# Parse the solver output, which is stored as a Python literal. The file is
# opened in a `with` block so the handle is closed as soon as it has been read.
with open('hawk-out/neos/solutions-neos-1338421.hawk-pbs5.pyon') as solutions_file:
    s=ast.literal_eval(solutions_file.read())

In [None]:
f0=pd.DataFrame([{**d, 'val':v} for v,d in s[('0000','0000')]])

In [None]:
f0.sort_values('discount_factor')

In [None]:
g=sp.sparse.load_npz('../kpm/pokec_full.npz')

In [None]:
g[0:990908//2, 0:990908//2]

# Shortest path trees

In [62]:
from snsim.propagation import read, simulation, tree
def read_graph(filename):
    """Load a labelled graph file and return it as a ``networkx.DiGraph``.

    ``read.labelled_graph(filename)`` returns a pair; the first element is
    converted to a directed graph and the second is discarded. The first
    element is presumably a SciPy sparse adjacency matrix - TODO confirm
    against ``snsim.propagation.read``.

    Parameters
    ----------
    filename : str
        Path passed straight to ``read.labelled_graph``.

    Returns
    -------
    networkx.DiGraph
    """
    adjacency, _ = read.labelled_graph(filename)
    # from_scipy_sparse_matrix was deprecated in NetworkX 2.7 and removed in
    # 3.0; use its replacement when available and fall back on older versions.
    to_graph = getattr(nx, 'from_scipy_sparse_array', None)
    if to_graph is None:
        to_graph = nx.from_scipy_sparse_matrix
    return to_graph(adjacency, create_using=nx.DiGraph)

def tree_csv(filename):
    """Read a CSV and replace its ``tree`` column with parsed tree objects.

    Each cell of ``tree`` is parsed with ``ast.literal_eval`` and the result is
    passed to ``tree.from_dict``. Returns the DataFrame.
    """
    def parse_tree(text):
        return tree.from_dict(ast.literal_eval(text))

    df = pd.read_csv(filename)
    df.tree = df.tree.apply(parse_tree)
    return df

def shortest_path_histogram(graph, retweets):
    """Apply ``tree.shortest_path_histogram(graph, item)`` to every item of ``retweets``.

    Missing entries in the combined result are filled with 0 and the columns
    are returned in sorted order.
    Assumes the per-item result is Series-like, so that ``apply`` produces a
    DataFrame with one column per key - TODO confirm against
    ``snsim.propagation.tree``.
    """
    def histogram_for(item):
        return tree.shortest_path_histogram(graph, item)

    hist = retweets.apply(histogram_for).fillna(0)
    ordered_columns = sorted(hist.columns)
    return hist[ordered_columns]

def analyze(graph, tweets):
    """Attach shortest-path histograms to the retweeted tweets of a CSV.

    Parameters
    ----------
    graph : str
        Path of the graph file, loaded with ``read_graph``.
    tweets : path or file-like
        Tweet CSV, loaded with ``retweet_csv``.

    Only tweets with a non-empty ``retweets`` list are kept. Each kept tweet is
    turned into an ``(author, retweets)`` tuple and passed to
    ``shortest_path_histogram``; the histogram columns are appended to the
    kept tweets and the combined DataFrame is returned.
    """
    network = read_graph(graph)
    all_tweets = retweet_csv(tweets)

    has_retweets = all_tweets.retweets.apply(len) > 0
    retweeted = all_tweets[has_retweets]

    pairs = retweeted[['author', 'retweets']].apply(tuple, axis=1)
    hist = shortest_path_histogram(network, pairs)
    return pd.concat([retweeted, hist], axis=1)


In [63]:
neos_w=analyze('../data/anon_graph_inner_neos_20201110.npz', 'retweeters_neos_20210729.csv')

No path from 7040 to 1697
No path from 7344 to 3283
No path from 4335 to 1735
No path from 4335 to 1859
No path from 4335 to 6881
No path from 4335 to 6881
No path from 7286 to 759
No path from 4463 to 5368
No path from 3175 to 5368
No path from 5670 to 1010
No path from 5670 to 1275
No path from 5670 to 6836
No path from 5238 to 6280
No path from 6328 to 7040
No path from 6328 to 1415
No path from 6328 to 5295
No path from 6328 to 3047
No path from 2582 to 1358
No path from 1936 to 1441
No path from 1036 to 1358
No path from 7040 to 5316
No path from 4979 to 683
No path from 95 to 7946
No path from 7040 to 5195
No path from 665 to 7813
No path from 2352 to 5867
No path from 2352 to 4304
No path from 7556 to 4117
No path from 4335 to 1410
No path from 1393 to 4442
No path from 1008 to 7946
No path from 8039 to 1233
No path from 7040 to 234
No path from 7556 to 7287
No path from 7556 to 7727
No path from 7556 to 2969
No path from 7556 to 7065
No path from 5011 to 1517
No path from 7147 

Unnamed: 0,author_feature,tweet_feature,author,retweets,0,1,2,3,4,5,6
3,0010,0100,7040,[996],0,1,0,0,0,0,0
4,0001,0000,5547,"[7301, 1059]",0,2,0,0,0,0,0
7,0001,1011,1598,[6809],0,1,0,0,0,0,0
11,0001,1111,1598,[4639],0,1,0,0,0,0,0
13,0001,1011,1598,[447],0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4466,0010,0010,3144,"[3979, 7535, 3905]",0,3,0,0,0,0,0
4472,0010,1010,3293,[3979],0,1,0,0,0,0,0
4475,0001,1001,5243,"[7205, 6180, 2118]",0,3,0,0,0,0,0
4480,0001,1011,3034,"[589, 437, 7687, 73]",0,2,2,0,0,0,0


In [67]:
fpoe_w=analyze('../data/anon_graph_inner_fpoe_20201110.npz', 'retweeters_fpoe_20210729.csv')

No path from 9187 to 14023
No path from 17926 to 4457
No path from 9187 to 14023
No path from 7977 to 13606
No path from 7409 to 16288
No path from 7343 to 5926
No path from 4742 to 3125
No path from 10980 to 18583
No path from 10597 to 3220
No path from 4199 to 1823
No path from 6882 to 8990
No path from 17462 to 19925
No path from 9870 to 3220
No path from 14397 to 20317
No path from 933 to 19925
No path from 11070 to 17535
No path from 9499 to 3220
No path from 18512 to 3220
No path from 18512 to 3238
No path from 18512 to 3125
No path from 18512 to 3220
No path from 5102 to 2339
No path from 1746 to 3220
No path from 620 to 3220
No path from 18512 to 3220
No path from 18317 to 13939
No path from 6131 to 15465
No path from 20782 to 13606
No path from 12841 to 890
No path from 12841 to 9220
No path from 12841 to 3332
No path from 12841 to 2237
No path from 12841 to 426
No path from 12841 to 14043
No path from 12841 to 1748
No path from 19205 to 13778
No path from 1127 to 344
No path 

Unnamed: 0,author_feature,tweet_feature,author,retweets,0,1,2,3,4,5,6
6,0001,1100,13214,"[21061, 3263, 16007]",0,2,1,0,0,0,0
7,0010,0000,8746,[7899],0,1,0,0,0,0,0
9,0100,0011,10539,"[5975, 7415, 18647]",0,3,0,0,0,0,0
11,0101,0100,14180,"[3006, 19425, 14096, 14850]",0,4,0,0,0,0,0
15,0101,0100,7742,[9701],0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
19389,0000,1001,6921,"[15226, 9817]",0,1,1,0,0,0,0
19404,1101,0101,18970,[20971],0,1,0,0,0,0,0
19407,0110,0110,20971,[19425],0,1,0,0,0,0,0
19408,1101,0101,6536,"[4719, 4308]",0,2,0,0,0,0,0


In [86]:
neos_w.to_csv('neos_w.csv', index=False)
fpoe_w.to_csv('fpoe_w.csv',index=False)

In [89]:
neos_w=normalized_csv('neos_w.csv')
fpoe_w=normalized_csv('fpoe_w.csv')

In [99]:
rneos_w=retweeted_by_feature(neos_w)
rfpoe_w=retweeted_by_feature(fpoe_w)

In [93]:
table(rneos_w, rneos_g, rneos_p)

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.636953,0.656945,0.542774,0.351309,0.417602
with params,0.270069,0.527344,-0.021102,0.27649,0.349736


In [94]:
table(rneos_a, rneos_g, rneos_p)

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.552382,0.564451,0.452748,0.448518,0.457164
with params,0.293702,0.597478,-0.111128,0.411395,0.430974


In [100]:
table(rfpoe_w,rfpoe_g,rfpoe_p)

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.740069,0.764529,0.703269,0.338771,0.390383
with params,0.305213,0.61722,-0.107224,0.281524,0.353729


In [97]:
table(rfpoe_a,rfpoe_g,rfpoe_p)

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.664786,0.678986,0.62529,0.553836,0.53679
with params,0.32314,0.76052,-0.185203,0.524343,0.528336


In [5]:
g=read_graph('../data/anon_graph_inner_neos_20201110.npz')

In [180]:
trees=retweet_csv('neos_generated.csv').tree.apply(tree.from_dict)

In [106]:
c=retweet_csv('neos_generated.csv')

In [183]:
r=neos_w[['author','retweets']].apply(tuple,axis=1)

In [31]:
neos_r=neos_w[neos_w.retweets.apply(len)>0]
r=neos_r[['author','retweets']].apply(tuple,axis=1)

In [8]:
inner_graph=read_graph('../data/anon_graph_inner_neos_20201110.npz')

In [6]:
neos_w=retweet_csv('retweeters_neos_20210729.csv')

In [13]:
r=neos_w[['author','retweets']].apply(tuple,axis=1)

0                             (6359, [])
1                             (1363, [])
2                             (4007, [])
3                          (7040, [996])
4                   (5547, [7301, 1059])
                      ...               
4482                          (5662, [])
4483    (6908, [6052, 2488, 8078, 7232])
4484                           (679, [])
4485                          (2451, [])
4486                          (6547, [])
Length: 4487, dtype: object

In [10]:
data=tree.shortest_path_histogram(inner_graph, (6908,[6052, 2488, 8078, 7232]))

In [41]:
inner_graph.has_edge(5547,7301)

False

In [45]:
inner_graph.has_edge(7040,996)

True

In [52]:
inner_graph.has_edge(7039,1058)

True

In [53]:
n_hist=shortest_path_histogram(inner_graph,r)

No path from 7040 to 1697
No path from 7344 to 3283
No path from 4335 to 1735
No path from 4335 to 1859
No path from 4335 to 6881
No path from 4335 to 6881
No path from 7286 to 759
No path from 4463 to 5368
No path from 3175 to 5368
No path from 5670 to 1010
No path from 5670 to 1275
No path from 5670 to 6836
No path from 5238 to 6280
No path from 6328 to 7040
No path from 6328 to 1415
No path from 6328 to 5295
No path from 6328 to 3047
No path from 2582 to 1358
No path from 1936 to 1441
No path from 1036 to 1358
No path from 7040 to 5316
No path from 4979 to 683
No path from 95 to 7946
No path from 7040 to 5195
No path from 665 to 7813
No path from 2352 to 5867
No path from 2352 to 4304
No path from 7556 to 4117
No path from 4335 to 1410
No path from 1393 to 4442
No path from 1008 to 7946
No path from 8039 to 1233
No path from 7040 to 234
No path from 7556 to 7287
No path from 7556 to 7727
No path from 7556 to 2969
No path from 7556 to 7065
No path from 5011 to 1517
No path from 7147 

In [60]:
n_hist

Unnamed: 0,0,1,2,3,4,5,6
3,0,1,0,0,0,0,0
4,0,2,0,0,0,0,0
7,0,1,0,0,0,0,0
11,0,1,0,0,0,0,0
13,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
4466,0,3,0,0,0,0,0
4472,0,1,0,0,0,0,0
4475,0,3,0,0,0,0,0
4480,0,2,2,0,0,0,0


In [61]:
pd.concat([neos_r,n_hist], axis=1)

Unnamed: 0,author_feature,tweet_feature,author,retweets,0,1,2,3,4,5,6
3,0010,0100,7040,[996],0,1,0,0,0,0,0
4,0001,0000,5547,"[7301, 1059]",0,2,0,0,0,0,0
7,0001,1011,1598,[6809],0,1,0,0,0,0,0
11,0001,1111,1598,[4639],0,1,0,0,0,0,0
13,0001,1011,1598,[447],0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4466,0010,0010,3144,"[3979, 7535, 3905]",0,3,0,0,0,0,0
4472,0010,1010,3293,[3979],0,1,0,0,0,0,0
4475,0001,1001,5243,"[7205, 6180, 2118]",0,3,0,0,0,0,0
4480,0001,1011,3034,"[589, 437, 7687, 73]",0,2,2,0,0,0,0


In [189]:
import importlib
importlib.reload(tree)

<module 'snsim.propagation.tree' from '/Users/ian/src/propagation/snsim/propagation/tree.py'>