In [5]:
import ast
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse


In [97]:
import io
from contextlib import contextmanager


@contextmanager
def quote_brackets(filename):
    """Yield an in-memory text buffer of ``filename`` with bracketed spans quoted.

    Every ``[`` in the file becomes ``"[`` and every ``]`` becomes ``]"``.
    The result is handed out as an ``io.StringIO`` that can be passed to
    anything accepting a file-like object. The source file stays open for
    the duration of the ``with`` body.
    """
    with open(filename) as handle:
        raw_text = handle.read()
        quoted_text = raw_text.replace('[', '"[').replace(']', ']"')
        yield io.StringIO(quoted_text)

def combine_features(df):
    """Collapse the author and tweet columns of ``df`` into two code columns, in place.

    The string forms of ``verified``, ``activity``, ``defaultprofile`` and
    ``userurl`` are concatenated into a categorical ``author_feature``
    column (inserted at position 0). Likewise ``hashtag``, ``tweeturl``,
    ``mentions`` and ``media`` become ``tweet_feature`` (position 1).
    The eight source columns are then removed. Returns ``None``.
    """
    def _concat_as_category(frame, names):
        # Stringify each column, then glue them together row by row.
        as_text = [frame[name].apply(str) for name in names]
        first, *others = as_text
        return first.str.cat(others).astype('category')

    author_features = ['verified', 'activity', 'defaultprofile', 'userurl']
    tweet_features = ['hashtag', 'tweeturl', 'mentions', 'media']

    df.insert(0, 'author_feature', _concat_as_category(df, author_features))
    df.insert(1, 'tweet_feature', _concat_as_category(df, tweet_features))

    df.drop(columns=author_features + tweet_features, inplace=True)

def normalize_level_column(colname):
    """Rename a one-based ``level<N>`` column label to the zero-based string ``str(N - 1)``.

    Parameters
    ----------
    colname : hashable
        A column label, as passed by ``DataFrame.rename(mapper=...)``.

    Returns
    -------
    The string ``str(N - 1)`` when ``colname`` is ``'level'`` followed by
    an integer ``N``; otherwise ``colname`` unchanged.
    """
    if not isinstance(colname, str) or not colname.startswith('level'):
        return colname
    suffix = colname.removeprefix('level')
    try:
        level = int(suffix)
    except ValueError:
        # Labels such as 'level' or 'levels' merely share the prefix; they
        # are not numbered level columns, so leave them alone instead of
        # aborting the whole rename.
        return colname
    return str(level - 1)
    
def normalized_csv(oldfile, newfile=None):
    """Load a results CSV and bring it into the notebook's common layout.

    Parameters
    ----------
    oldfile : path or file-like
        Source handed to ``pd.read_csv``. ``author_feature`` and
        ``tweet_feature`` are read as strings when present.
    newfile : path, optional
        When given, the normalized frame is also written there, without
        the index.

    Returns
    -------
    pandas.DataFrame
        The frame with combined feature columns (built via
        ``combine_features`` if the file lacks ``author_feature``), with
        ``author``, ``tree`` and ``retweets`` dropped when present, and
        with column labels passed through ``normalize_level_column``.
    """
    frame = pd.read_csv(
        oldfile,
        dtype={'author_feature': 'str', 'tweet_feature': 'str'},
    )
    if 'author_feature' not in frame:
        combine_features(frame)

    frame = frame.drop(columns=['author', 'tree', 'retweets'], errors='ignore')
    frame = frame.rename(mapper=normalize_level_column, axis=1)

    if newfile is not None:
        frame.to_csv(newfile, index=False)
    return frame
    


In [109]:
# Load the analysed data sets and save a normalized copy of each.
# The vegan file goes through quote_brackets before it is parsed.
with quote_brackets('vegan_20210729_analysed.csv') as f:
    vegan_a = normalized_csv(f, newfile='vegan_analyzed.csv')

fpoe_a = normalized_csv('fpoeanalysed.csv', newfile='fpoe_analyzed.csv')
neos_a = normalized_csv('neos_20210729_analysed.csv', newfile='neos_analyzed.csv')

In [141]:
# Load the generated result sets (nothing is written back to disk).
# `_g` names come from the plain generated files, `_p` names from the
# "with_params" files.
fpoe_g, fpoe_p, neos_g, neos_p = map(normalized_csv, (
    'fpoe_generated.csv',
    'fpoe_with_params.csv',
    'results-trees-neos_20201110-2021-12-10T19:58:39.150768.csv',
    'neos_with_params.csv',
))

In [46]:
# Mean of every remaining column per (author_feature, tweet_feature) pair;
# groups containing NaN are dropped.
(
    fpoe_a
    .groupby(['author_feature', 'tweet_feature'])
    .agg('mean')
    .dropna()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4
author_feature,tweet_feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000,0000,4.427083,1.270833,1.343750,0.062500,0.0
0000,0001,2.363636,0.454545,1.500000,0.000000,0.0
0000,0010,5.833333,5.000000,1.611111,0.333333,0.0
0000,0011,9.500000,2.400000,1.900000,0.000000,0.0
0000,0100,3.360656,1.475410,0.885246,0.147541,0.0
...,...,...,...,...,...,...
1110,1000,1.000000,7.000000,2.000000,4.000000,0.0
1110,1010,4.000000,1.000000,2.000000,0.000000,0.0
1110,1110,1.000000,1.000000,0.000000,0.000000,0.0
1111,1010,3.500000,1.000000,0.000000,0.000000,0.0


In [195]:
def mean(a, b):
    """Return the row-wise mean of the difference between two frames' group means.

    Both frames are averaged per ``(author_feature, tweet_feature)`` group
    and rows containing NaN are dropped. ``b``'s group means are then
    subtracted from ``a``'s, with entries present on only one side treated
    as 0. The result is averaged across columns, giving one value per group.
    """
    group_keys = ['author_feature', 'tweet_feature']

    def _group_means(frame):
        return frame.groupby(group_keys).agg('mean').dropna()

    difference = _group_means(a).sub(_group_means(b), fill_value=0)
    return difference.dropna().mean(axis=1)

def compare(a, b):
    """Summarize ``mean(a, b)`` as a dict with two scalar error measures.

    ``'mae'`` is the mean of the absolute per-group differences and
    ``'me'`` is the mean of the signed per-group differences.
    """
    per_group = mean(a, b)
    mean_abs_error = per_group.abs().mean()
    mean_error = per_group.mean()
    return {'mae': mean_abs_error, 'me': mean_error}

def table(analyzed, without_params, with_params):
    """Build a two-row frame comparing ``analyzed`` against each candidate.

    Row ``'without params'`` holds ``compare(analyzed, without_params)``
    and row ``'with params'`` holds ``compare(analyzed, with_params)``.
    """
    rows = {
        'without params': compare(analyzed, without_params),
        'with params': compare(analyzed, with_params),
    }
    return pd.DataFrame(list(rows.values()), index=list(rows))

# Per-group mean difference: neos "with params" minus neos analysed.
mean(a=neos_p, b=neos_a)

author_feature  tweet_feature
0000            0000            -0.687858
                0010            -0.276137
                0011             0.051860
                0100            -0.640568
                0110            -0.156569
                                   ...   
1101            0110             0.053814
                0111             0.352887
                1010             0.126102
                1011             0.163213
                1100             0.042303
Length: 91, dtype: float64

In [191]:
# Error summary for the fpoe data set.
table(analyzed=fpoe_a, without_params=fpoe_g, with_params=fpoe_p)

Unnamed: 0,mae,me
without params,0.550406,0.509885
with params,0.327014,0.166491


In [192]:
# Error summary for the neos data set.
table(analyzed=neos_a, without_params=neos_g, with_params=neos_p)

Unnamed: 0,mae,me
without params,0.433909,0.326001
with params,0.245927,0.10703


In [5]:
# Parse the solutions file, which holds a single Python literal.
# The `with` block makes sure the file handle is closed after reading.
with open('hawk-out/neos/solutions-neos-1338421.hawk-pbs5.pyon') as solutions_file:
    s = ast.literal_eval(solutions_file.read())

In [10]:
# One row per entry stored under the ('0000', '0000') key. Each entry is
# unpacked as a (value, mapping) pair; the mapping's items become columns
# and the value is stored in 'val'.
f0 = pd.DataFrame([
    {**params, 'val': value}
    for value, params in s[('0000', '0000')]
])

In [16]:
# Show the entries ordered by ascending discount_factor.
f0.sort_values(by='discount_factor')

Unnamed: 0,discount_factor,corr,val
69,0.900000,0.004300,1.406988
66,0.900000,0.004500,1.880012
71,0.910000,0.004200,3.227481
67,0.910000,0.004300,1.148491
31,0.910000,0.004500,1.077906
...,...,...,...
57,0.996094,0.004300,1.138493
42,0.996094,0.003700,1.384988
11,1.000000,0.004648,0.478144
62,1.000000,0.004700,2.301293


In [3]:
# Load the sparse matrix saved in .npz format.
g = scipy.sparse.load_npz('../kpm/pokec_full.npz')

In [9]:
# Upper-left square block of g, with side length 990908 // 2.
g[:990908 // 2, :990908 // 2]

<495454x495454 sparse matrix of type '<class 'numpy.float64'>'
	with 20002868 stored elements in Compressed Sparse Row format>