### Definitions

In [1]:
from collections import defaultdict
from itertools import combinations

import igraph
import pandas as pd

# NOTE(review): `tn` (presumably a tqdm notebook progress wrapper) and `mc`
# (presumably a MongoDB client) are used in later cells but are not defined in
# any visible cell - they must be provided before "Run All" can succeed.

# Paths of the concept co-occurrence edge lists. Each file holds one edge per
# line in the form "<concept_a> <concept_b> <weight>".

# All articles, regardless of year.
EDGELIST_FULL = "./data/concepts_full.edgelist"

# 2013: single-year and cumulative (articles up to and including that year).
EDGELIST_2013 = "./data/concepts_2013.edgelist"
EDGELIST_2013_CUM = "./data/concepts_2013_cum.edgelist"

# 2015: single-year and cumulative.
EDGELIST_2015 = "./data/concepts_2015.edgelist"
EDGELIST_2015_CUM = "./data/concepts_2015_cum.edgelist"

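Disparity filter for link significance (Serrano, Boguñá & Vespignani, "Extracting the multiscale backbone of complex weighted networks", PNAS 2009):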

[www.pnas.org/content/106/16/6483](https://www.pnas.org/content/106/16/6483)

In [2]:
def disparity_integral(x, k):
    """
    Antiderivative of (1 - x)**(k - 2) used by the disparity filter.

    Evaluates F(x) = -(1 - x)**(k - 1) / (k - 1). This is algebraically the
    same as the previous form (1 - x)**k / ((k - 1) * (x - 1)), but it does not
    divide by (x - 1), so it is well defined at x == 1.0 (where it is 0).

    Parameters
    ----------
    x : float
        Normalized edge weight, expected in [0, 1].
    k : int or float
        Degree of the node; must differ from 1.

    Returns
    -------
    float
        Value of the antiderivative at ``x``.

    Raises
    ------
    ZeroDivisionError
        If ``k == 1`` (the integral is not defined for degree-1 nodes).
    """
    return -((1.0 - x) ** (k - 1)) / (k - 1.0)

def get_disparity_significance(norm_weight, degree):
    """
    Disparity-filter significance (alpha) of one edge as seen from one node.

    Computes 1 - (degree - 1) * (F(norm_weight) - F(0)), where F is
    ``disparity_integral``. Smaller values mean a more significant edge.

    Parameters
    ----------
    norm_weight : float
        Edge weight divided by the node's strength.
    degree : int or float
        Degree of the node; the underlying integral is undefined for 1.

    Returns
    -------
    float
        The alpha value for this (edge, node) pair.
    """
    upper = disparity_integral(norm_weight, degree)
    lower = disparity_integral(0.0, degree)
    return 1.0 - ((degree - 1.0) * (upper - lower))

## Form the co-occurrence edge lists

In [2]:
# Co-occurrence counters. Key: (concept_a, concept_b) taken from the sorted
# concept list of an article; value: number of articles listing both concepts.
d_full = defaultdict(int)
d_2015_cum = defaultdict(int)
d_2015 = defaultdict(int)
d_2013_cum = defaultdict(int)
d_2013 = defaultdict(int)

# The full and cumulative counters are switched off, so by default only the
# single-year counters are filled and d_full / d_*_cum stay empty.
# Set to True to build them as well.
BUILD_CUMULATIVE = False

# Same filter for the query and for the progress-bar total.
article_filter = {'concepts_len': {'$gte': 2}}
find = mc['sw']['articles'].find(article_filter, ['arxiv_id', 'concepts'])

for a in tn(find, total=mc['sw']['articles'].count_documents(article_filter)):
    # Assumes arxiv_id starts with the two-digit year (YYMM...), with no
    # archive prefix such as "hep-th/" - TODO confirm against the collection.
    yy = int(a['arxiv_id'][:2])
    # arXiv starts in 1991, so 91-99 are 19xx and everything else is 20xx.
    # Comparing raw two-digit years would put the 1990s *after* 2015.
    year = 1900 + yy if yy >= 91 else 2000 + yy

    # Decide once per article which counters it contributes to.
    targets = []
    if year == 2015:
        targets.append(d_2015)
    elif year == 2013:
        targets.append(d_2013)
    if BUILD_CUMULATIVE:
        targets.append(d_full)
        if year <= 2015:
            targets.append(d_2015_cum)
        if year <= 2013:
            targets.append(d_2013_cum)
    if not targets:
        continue  # nothing to count: skip the quadratic pair enumeration

    # Every unordered pair of concepts, in sorted order so (a, b) is canonical.
    for pair in combinations(sorted(a['concepts']), 2):
        for counter in targets:
            counter[pair] += 1
len(d_full), len(d_2015_cum), len(d_2015), len(d_2013_cum), len(d_2013)

HBox(children=(IntProgress(value=0, max=1114761), HTML(value='')))




(0, 0, 0, 12658007, 11139356)

In [None]:
# Write the full co-occurrence network as "<concept_a> <concept_b> <weight>" lines.
# d_full stays empty unless the forming cell above actually fills it; in that
# case skip the write so an existing edge list is not truncated to an empty file.
if d_full:
    with open(EDGELIST_FULL, 'w') as f:
        for e, w in tn(d_full.items()):
            f.write(f"{e[0]} {e[1]} {w}\n")
else:
    print(f"d_full is empty - not overwriting {EDGELIST_FULL}")

In [None]:
# Write the cumulative-2015 network as "<concept_a> <concept_b> <weight>" lines.
# d_2015_cum stays empty unless the forming cell above actually fills it; in that
# case skip the write so an existing edge list is not truncated to an empty file.
if d_2015_cum:
    with open(EDGELIST_2015_CUM, 'w') as f:
        for e, w in tn(d_2015_cum.items()):
            f.write(f"{e[0]} {e[1]} {w}\n")
else:
    print(f"d_2015_cum is empty - not overwriting {EDGELIST_2015_CUM}")

In [None]:
# Dump the 2015 co-occurrence counts, one "<concept_a> <concept_b> <weight>"
# line per concept pair.
with open(EDGELIST_2015, 'w') as f:
    f.writelines(
        f"{source} {target} {weight}\n"
        for (source, target), weight in tn(d_2015.items())
    )

In [None]:
# Write the cumulative-2013 network as "<concept_a> <concept_b> <weight>" lines.
# d_2013_cum stays empty unless the forming cell above actually fills it; in that
# case skip the write so an existing edge list is not truncated to an empty file.
if d_2013_cum:
    with open(EDGELIST_2013_CUM, 'w') as f:
        for e, w in tn(d_2013_cum.items()):
            f.write(f"{e[0]} {e[1]} {w}\n")
else:
    print(f"d_2013_cum is empty - not overwriting {EDGELIST_2013_CUM}")

In [None]:
# Dump the 2013 co-occurrence counts, one "<concept_a> <concept_b> <weight>"
# line per concept pair.
with open(EDGELIST_2013, 'w') as f:
    f.writelines(
        f"{source} {target} {weight}\n"
        for (source, target), weight in tn(d_2013.items())
    )

### 2013

In [7]:
# Load the 2013 co-occurrence network as an undirected graph.
g_2013 = igraph.Graph.Read_Ncol(
    EDGELIST_2013,
    directed=False,
)

# Store each vertex's strength computed with the 'weight' edge attribute;
# without `weights`, strength() is just the plain degree.
g_2013.vs['strength'] = g_2013.strength(weights='weight')

# Show (number of vertices, number of edges).
(g_2013.vcount(), g_2013.ecount())

(16229, 11139356)

In [87]:
# Disparity filter on the 2013 network.
#
# Every edge is seen once from each of its endpoints. Per edge we keep
#   alpha       = min over both endpoints of the disparity significance
#   norm_weight = max over both endpoints of weight / strength
# Results are collected in plain lists indexed by edge id and written to the
# graph once at the end.
#
# NOTE: the 2015 cell below repeats this procedure for g_2015; a shared helper
# function would remove the duplication.
edge_weights = g_2013.es['weight']
edge_count = g_2013.ecount()
alphas = [float('inf')] * edge_count   # neutral start value for min()
norm_weights = [0.0] * edge_count      # neutral start value for max()

for v in tn(g_2013.vs):
    degree = v.degree()
    strength = v['strength']  # weighted strength stored at load time
    for edge_id in g_2013.incident(v.index):
        norm_weight = edge_weights[edge_id] / strength
        if degree > 1:
            # Keep the argument strictly below 1.0, where the original form of
            # disparity_integral divides by zero. Only the value passed to the
            # significance function is nudged; the stored norm_weight is not.
            x = norm_weight - 0.0001 if norm_weight == 1.0 else norm_weight
            alpha = get_disparity_significance(x, degree)
        else:
            # NOTE(review): an edge at a degree-1 vertex is forced to alpha = 0.0
            # (always most significant). The cited paper appears to defer to the
            # other endpoint in this case - confirm this is intended.
            alpha = 0.0
        if alpha < alphas[edge_id]:
            alphas[edge_id] = alpha
        if norm_weight > norm_weights[edge_id]:
            norm_weights[edge_id] = norm_weight

g_2013.es['alpha'] = alphas
g_2013.es['norm_weight'] = norm_weights

# Empirical p-value per alpha: 1 - (number of edges with a strictly smaller
# alpha) / (number of edges). The min index within each group of the sorted
# series is exactly the number of strictly smaller alphas.
alpha_s = pd.Series(alphas).sort_values().reset_index()[[0]]
alpha_to_p_value = 1 - alpha_s.groupby(0).apply(lambda x: x.index.min()) / len(alpha_s)
# Vectorized lookup instead of one Python-level attribute write per edge.
g_2013.es['alpha_p_value'] = pd.Series(alphas).map(alpha_to_p_value).tolist()
pd.to_pickle(g_2013, './data/g_2013.pkl')

HBox(children=(IntProgress(value=0, max=11139356), HTML(value='')))




### 2015

In [3]:
# Load the 2015 co-occurrence network as an undirected graph.
g_2015 = igraph.Graph.Read_Ncol(
    EDGELIST_2015,
    directed=False,
)

# Store each vertex's strength computed with the 'weight' edge attribute;
# without `weights`, strength() is just the plain degree.
g_2015.vs['strength'] = g_2015.strength(weights='weight')

# Show (number of vertices, number of edges).
(g_2015.vcount(), g_2015.ecount())

(16660, 12658007)

In [4]:
# Disparity filter on the 2015 network.
#
# Every edge is seen once from each of its endpoints. Per edge we keep
#   alpha       = min over both endpoints of the disparity significance
#   norm_weight = max over both endpoints of weight / strength
# Results are collected in plain lists indexed by edge id and written to the
# graph once at the end.
#
# NOTE: the 2013 cell above runs the same procedure for g_2013; a shared helper
# function would remove the duplication.
edge_weights = g_2015.es['weight']
edge_count = g_2015.ecount()
alphas = [float('inf')] * edge_count   # neutral start value for min()
norm_weights = [0.0] * edge_count      # neutral start value for max()

for v in tn(g_2015.vs):
    degree = v.degree()
    strength = v['strength']  # weighted strength stored at load time
    for edge_id in g_2015.incident(v.index):
        norm_weight = edge_weights[edge_id] / strength
        if degree > 1:
            # Keep the argument strictly below 1.0, where the original form of
            # disparity_integral divides by zero. Only the value passed to the
            # significance function is nudged; the stored norm_weight is not.
            x = norm_weight - 0.0001 if norm_weight == 1.0 else norm_weight
            alpha = get_disparity_significance(x, degree)
        else:
            # NOTE(review): an edge at a degree-1 vertex is forced to alpha = 0.0
            # (always most significant). The cited paper appears to defer to the
            # other endpoint in this case - confirm this is intended.
            alpha = 0.0
        if alpha < alphas[edge_id]:
            alphas[edge_id] = alpha
        if norm_weight > norm_weights[edge_id]:
            norm_weights[edge_id] = norm_weight

g_2015.es['alpha'] = alphas
g_2015.es['norm_weight'] = norm_weights

# Empirical p-value per alpha: 1 - (number of edges with a strictly smaller
# alpha) / (number of edges). The min index within each group of the sorted
# series is exactly the number of strictly smaller alphas.
alpha_s = pd.Series(alphas).sort_values().reset_index()[[0]]
alpha_to_p_value = 1 - alpha_s.groupby(0).apply(lambda x: x.index.min()) / len(alpha_s)
# Vectorized lookup instead of one Python-level attribute write per edge.
g_2015.es['alpha_p_value'] = pd.Series(alphas).map(alpha_to_p_value).tolist()
pd.to_pickle(g_2015, './data/g_2015.pkl')

HBox(children=(IntProgress(value=0, max=16660), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12658007), HTML(value='')))


