In [1]:
import ast
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse


In [2]:
import io
from contextlib import contextmanager


@contextmanager
def quote_brackets(filename):
    with open(filename) as f:
        yield io.StringIO(f.read().replace('[','"[').replace(']',']"'))

def combine_features(df):
    def str_cat_df(df,cols):
        series = (df[c] for c in cols)
        series = list(map(lambda x: x.apply(str), series))
        return series[0].str.cat(series[1:]).astype('category')
    
    author_features=['verified', 'activity', 'defaultprofile', 'userurl']
    tweet_features=['hashtag', 'tweeturl', 'mentions', 'media']
    
    df.insert(0,'author_feature',str_cat_df(df, author_features))
    df.insert(1,'tweet_feature',str_cat_df(df, tweet_features))
    
    df.drop(columns=author_features, inplace=True)
    df.drop(columns=tweet_features, inplace=True)

def normalize_level_column(colname):
    if colname.startswith('level'):
        return int(colname.removeprefix('level'))
    try:
        return int(colname)
    except ValueError:
        return colname
    
def normalized_csv(oldfile, newfile=None, numlevels=11):
    df = pd.read_csv(oldfile, dtype={'author_feature':'str','tweet_feature':'str'})
    if 'author_feature' not in df:
        combine_features(df)
    df.rename(mapper=normalize_level_column, axis=1, inplace=True)
    df.drop(columns=['author','tree','retweets',0], inplace=True, errors='ignore')
    for l in range(1,numlevels):
        if l not in df:
            df[l]=0
    if newfile is not None:
        df.to_csv(newfile, index=False)
    return df

def by_feature(df):
    df=df.groupby(['author_feature', 'tweet_feature']).agg(['sum','mean',list])
    return df.dropna().swaplevel(axis=1).sort_index(axis=1)

def retweeted_by_feature(df):
    return by_feature(df[df.max(numeric_only=True, axis=1)>1])


In [3]:
with quote_brackets('vegan_20210729_analysed.csv') as f:
    vegan_a=normalized_csv(f, 'vegan_analyzed.csv')
    
fpoe_a=normalized_csv('fpoeanalysed.csv','fpoe_analyzed.csv')
neos_a=normalized_csv('neos_20210729_analysed.csv','neos_analyzed.csv')

In [4]:
fpoe_g=normalized_csv('fpoe_generated.csv')
fpoe_p=normalized_csv('fpoe_with_params.csv')
neos_g=normalized_csv('neos_generated.csv')
neos_p=normalized_csv('neos_with_params.csv')

In [5]:
rfpoe_a=retweeted_by_feature(fpoe_a)
rneos_a=retweeted_by_feature(neos_a)
rfpoe_g=retweeted_by_feature(fpoe_g)
rfpoe_p=retweeted_by_feature(fpoe_p)
rneos_g=retweeted_by_feature(neos_g)
rneos_p=retweeted_by_feature(neos_p)

In [6]:
fpoe_a=by_feature(fpoe_a)
neos_a=by_feature(neos_a)
fpoe_g=by_feature(fpoe_g)
fpoe_p=by_feature(fpoe_p)
neos_g=by_feature(neos_g)
neos_p=by_feature(neos_p)


In [7]:
import scipy.stats

def observed_levels(df):
    """Return list of level observations per feature vector."""
    df=df['sum']
    return df.apply(lambda x: [l for c in df.columns for l in [c]*x[c]], axis=1)

def observed_levels_distance(a, b, distance=scipy.stats.wasserstein_distance):
    df=pd.DataFrame({'a':observed_levels(a), 'b':observed_levels(b)}).dropna()
    return df.apply(lambda x:distance(*x),axis=1)

def mean_levels_distance(a, b, distance=scipy.stats.wasserstein_distance):
    df=pd.concat([a['mean'],b['mean']],keys=['a','b'],axis=1).dropna()
    columns=a['mean'].columns
    return df.apply(lambda x: distance(columns,columns,x['a'],x['b']),axis=1)

#mean_levels_distance(fpoe_a,fpoe_p).mean()
#observed_levels_distance(fpoe_a,fpoe_p).mean()
#len(observed_levels(fpoe_a)[('0000','0000')])

def mean_error(a, b):
    return (a['mean'] - b['mean']).dropna().mean(axis=1)

def mean_absolute_error(a, b):
    return (a['mean'] - b['mean']).dropna().abs().mean(axis=1)


def compare(a,b):
    m=mean_error(a,b)
    ma=mean_absolute_error(a,b)
    return {'mamem':m.abs().mean(),
            'mmaem':ma.mean(),
            'mmem':m.mean(),
            # 'mywasserstein_levels': observed_levels_distance(a,b, lambda u,v:wasserstein_2d(np.array([u]).transpose(),np.array([v]).transpose())).mean(),
            'mwasserstein_levels': mean_levels_distance(a,b, scipy.stats.wasserstein_distance).mean(),
            'menergy_levels': mean_levels_distance(a,b, scipy.stats.energy_distance).mean()
           }

def table(analyzed, without_params, with_params):
    return pd.DataFrame([compare(analyzed,without_params),compare(analyzed,with_params)],index=['without params', 'with params'])

In [8]:
table(fpoe_a,fpoe_g,fpoe_p) 

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.704789,0.717423,0.665919,0.553836,0.53679
with params,0.35763,0.555201,0.251749,0.524343,0.528336


In [9]:
table(rfpoe_a,rfpoe_g,rfpoe_p) 

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.998231,1.018543,0.959427,0.556859,0.5304
with params,0.462968,0.983141,0.001188,0.56951,0.572886


In [10]:
table(neos_a, neos_g, neos_p)

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.5628,0.571218,0.463697,0.448518,0.457164
with params,0.308064,0.426988,0.220152,0.411395,0.430974


In [11]:
table(rneos_a, rneos_g, rneos_p)   

Unnamed: 0,mamem,mmaem,mmem,mwasserstein_levels,menergy_levels
without params,0.752935,0.773736,0.65211,0.443173,0.441352
with params,0.375539,0.750151,-0.010862,0.431692,0.448682


In [12]:
# https://stackoverflow.com/a/57563383/
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment

def wasserstein_2d(U,V, normalize=True):
    if normalize:
        U = U#/U.shape[0]#U.sum().sum()
        V = V#/V.shape[0]#V.sum().sum()
    d = cdist(U,V)
    # display(d)

    assignment = linear_sum_assignment(d)
    return d[assignment].sum()#/(U.sum().sum()+V.sum().sum())

def wasserstein_2d_row_ab(row, normalize=False):
    a=np.array(row['a'].to_list()).transpose()
    b=np.array(row['b'].to_list()).transpose()
    return wasserstein_2d(a,b,normalize)

def wasserstein_2d_df(a,b,normalize=False):
    a=a['list']
    b=b['list']
#     a=pd.Series([observed_levels(a)])
#     b=pd.Series([observed_levels(b)])
    df=pd.concat([a,b], keys=['a','b'], axis=1).dropna()
#     display(df)
    return df.apply(wasserstein_2d_row_ab, axis=1, normalize=normalize)

# display(observed_levels_distance(fpoe_a,fpoe_p))

wasserstein_2d_df(fpoe_a,fpoe_g,True)
# mine=observed_levels_distance(fpoe_a,fpoe_g, lambda u,v:wasserstein_2d(np.array([u]).transpose(),np.array([v]).transpose()))
# orig=observed_levels_distance(fpoe_a,fpoe_g)
# mine.loc[('0000','0000')]/orig.loc[('0000','0000')]

#neos_a['list']
# neos_a['mean'].applymap(lambda x:[x])
#pd.DataFrame([a,b],index=['a','b']).fillna([0])
#pd.DataFrame(a.array)
#np.array(neos_a['list'].loc[('0000','0001')].apply(lambda x:[np.mean(x)]))
#neos_a['mean'].loc[('0000','0001')].apply(lambda x:[x]).to_list()
#by_feature(neos_g)['list']
#fpoe_a['mean'].loc[('0000','0000')].sum()
# np.array(neos_a['list'].loc[('0000','0001')].to_list()).transpose().shape[0]

author_feature  tweet_feature
0000            0000             361.613762
                0001              41.596804
                0010             139.113642
                0011              84.855543
                0100             173.800757
                                    ...    
1101            1000             243.124999
                1001              28.751161
                1010              30.694276
                1100             126.329168
                1110              17.302548
Length: 131, dtype: float64

In [13]:
mine.loc[('0000','0010')]/orig.loc[('0000','0010')]

NameError: name 'mine' is not defined

In [None]:
len(fpoe_g.loc[('0000','0000')]['list'][0])

In [None]:
orig

In [None]:
mine

In [None]:
from scipy.stats import wasserstein_distance

# np.random.seed(0)
n = 100

Y1 = np.random.randn(n)
Y2 = np.random.randn(n-10) - 2
d =  np.abs(Y1 - Y2.reshape((n-10, 1)))

assignment = linear_sum_assignment(d)
print(d[assignment].sum()/90)       # 1.9777950447866477
print(wasserstein_distance(Y1, Y2))  #
#np.abs(Y1 - Y2.reshape((n, 1)))

In [None]:
s=ast.literal_eval(open('hawk-out/neos/solutions-neos-1338421.hawk-pbs5.pyon').read())

In [None]:
f0=pd.DataFrame([{**d, 'val':v} for v,d in s[('0000','0000')]])

In [None]:
f0.sort_values('discount_factor')

In [None]:
g=sp.sparse.load_npz('../kpm/pokec_full.npz')

In [None]:
g[0:990908//2, 0:990908//2]