# Setting Up Space


In [None]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp
from matplotlib.colors import Normalize
from matplotlib.legend_handler import HandlerTuple
from tqdm import tqdm
from itertools import combinations

import Analysis as lyze
import Archive as arch
import GlobalSim
import LocalSim
import dataHandling as data

# Real World Data 

## Directory & File Name Patterns

In [None]:
# Folder that is scanned for the hashtag entries
# (arch.list_folder presumably returns the names of its sub-folders -- verify).
main_directory = os.getcwd()  # Replace with your own directory path
hashtags = arch.list_folder(main_directory)

# Regular expression capturing the date span embedded in the file names.
pattern = r'_(\d+-+\d+_+\d+-+\d+)'

# The date spans are extracted from the first hashtag's folder only
# (presumably every hashtag covers the same timespans -- verify).
first_hashtag_dir = os.path.join(main_directory, hashtags[0])
dates = arch.extract_patterns_from_filenames(first_hashtag_dir, pattern)

# A single set, labelled 'all', holding the index of every hashtag.
hashtag_sets = [dict(hashtag=np.arange(len(hashtags)), label='all')]


## *Combining temporal networks
*(This section can be skipped if already done once.)*


In this section, we combine the networks temporally: from the 9 initial timespans of the retweet data we build only 2 timespans, the first being the aggregation of the first 4 timespans and the second the aggregation of the remaining 5 timespans.

In [None]:
# Skip this part if the combined networks have already been saved
# and jump to the next cell to read those networks.
#
# Every hashtag's timespans are merged into two unions: the first
# `first_half_len` entries of `dates` and all the remaining ones.
# The second slice must start where the first one ends; starting one
# index later would silently leave a timespan out of both halves.
first_half_len = 4
for hashtag in hashtags:
    print(f"{hashtag} is under process:")
    data.compose_temporal_union(main_directory, "1sthalf", [hashtag],
                                dates[:first_half_len], save=True)
    print(f"First half of {hashtag} completed.")
    data.compose_temporal_union(main_directory, "2ndhalf", [hashtag],
                                dates[first_half_len:], save=True)
    print(f"Second half of {hashtag} completed.")

## Loading Combined Data

In [None]:
# Read back the two temporally combined networks of every hashtag,
# first the '1sthalf' files and then the '2ndhalf' files.
first_half_data, second_half_data = (
    arch.mass_read_graphs(hashtags, main_directory, half_label)
    for half_label in ('1sthalf', '2ndhalf')
)

## Preference Existence Measures

In [None]:

# Compare the first-half networks with the second-half networks.
# NOTE(review): `similarity=False` is passed although a similarity result is
# still unpacked below -- confirm what the function returns in that case.
measure_options = dict(mode='both', weighted_overlap=True,
                       overlap=True, similarity=False)
overlap_results, similarity_results = lyze.parallel_similarity_overlap(
    first_half_data, second_half_data, **measure_options)


# Global Preferential Model

## The Underlying Network

### Simulation
Start a network from scratch and evolve it.

In [None]:
# Parent folder that collects the output of every simulation run.
directory = os.path.join(os.getcwd(), 'Sim_results')

# ---- model parameters ----
N = 400  # Number of nodes
T = 5000  # Number of timesteps
m = 10  # Number of new links added at each timestep
m0 = 20  # Number of nodes with non-zero degree at t=0
x = 3  # Link increment-proportion percentage
steps = 25  # An adjacency matrix is saved every `steps` timesteps
save_steps = np.arange(0, T, steps)  # Timesteps at which adjacency matrices are saved
init_type = 'random'  # could also be 'path', see GlobalSim package
update_mode = 'increment'  # could also be 'proportional', see GlobalSim package

# The run folder is named after the abbreviated settings plus a generated suffix.
run_name = f"global2_{init_type[0:4]}_{update_mode[0:3]}_{arch.generate_magic_number()}"
output_dir = arch.create_folder_in_wd(run_name, directory)  # Directory to save snapshots

# Parameters written next to the snapshots. String values are wrapped in
# quotes so that the saved text can be pasted back as Python assignments.
sim_params = dict(
    N=N, T=T, m0=m0, m=m, x=x, steps=steps,
    save_steps=np.array(save_steps),
    output_dir=f"r'{output_dir}'",
    update_mode=f"'{update_mode}'",
    init_type=f"'{init_type}'",
)

# NOTE(review): `m0` is recorded in sim_params but is not passed to the
# simulator -- confirm whether the simulator is expected to receive it.
GlobalSim.simulate_global_preference(N, T, m, x, save_steps, output_dir, update_mode, init_type)

parameters_file = os.path.join(output_dir, 'Parameters.txt')
arch.save_function_arguments(parameters_file, sim_params)
print(f"Saved {len(save_steps)} adjacency matrices.")

### Continue Simulation
Continue evolving the network from a saved adjacency matrix

In [None]:
"""manually input the properties of the adjacency matrices 
or copy from 'parameter' text file in the simulation folder"""

t0 = 1990  # timestep of the saved snapshot from which the simulation continues
T = 2000  # timesteps to continue
m = 3  # number of links added to the graph at each timestep
x = 2  # update parameter, interpreted according to update_mode
update_mode = 'increment'
steps = 10
# NOTE(review): with the values above this holds the single entry t0
# (np.arange(1990, 2000, 10)) -- confirm whether T is meant as an end
# timestep or as a duration.
save_steps = np.arange(t0, T, steps)
output_dir = os.getcwd()  # change this directory for your purpose

# Load the stored snapshot and keep evolving the network from it.
snapshot_path = os.path.join(output_dir, f'adjacency_matrix_t{t0}.npy')
adj_matrix = np.load(snapshot_path)
GlobalSim.continue_global_simulation(adj_matrix, t0, T, m, x, save_steps, output_dir, update_mode)

### Loading saved data
If the underlying network is already simulated, just load the data.

In [None]:
# Parameters of the stored global run
# (presumably copied from its 'Parameters.txt' -- verify against the run folder).
N = 400
T = 5000
m0 = 20
m = 10
x = 3
steps = 25
update_mode = 'increment'
init_type = 'random'
# Machine-specific absolute path: replace with the folder of your own run.
output_dir = r'c:\Users\ROZHIN\Desktop\New folder (3)\Code\00 RE\Sim_results\global2_rand_inc_58-10-03-06-25'


#::::::::::::::::::::::::::::: select the evolution timestep you wish to analyze
t = 2000
directory1 = os.path.join(output_dir, f'adjacency_matrix_t{t}.npy')
global_adjacency_matrix = np.load(directory1)

## Retweet Simulation

In [None]:
# Ensemble of retweet simulations on the global-preference network.
N = len(global_adjacency_matrix)
global_rt_sim_ens = 120  # the number of simulation repetitions (ensemble size)
global_rt_noise_base = 0.3  # centre of the per-run noise level

# In simulation, since no evolution is happening,
# only those with non-zero degree can participate in sharing.
global_init_nodes = [i for i in range(len(global_adjacency_matrix))
                     if np.sum(global_adjacency_matrix[i, :]) != 0]

global_rt_sim_paths = []
for i in tqdm(range(global_rt_sim_ens)):
    # optional: add a small Gaussian deviation to the noise of every run
    global_rt_noise = np.abs(global_rt_noise_base + np.random.normal(0, 0.05))
    # random coefficient to vary the retweet-network size ratio
    rand_coeff = np.random.choice(np.arange(1, 20))
    # only choose from those nodes who can be initiators
    init = np.random.choice(global_init_nodes)
    paths, _ = lyze.simulate_retweet(global_adjacency_matrix, N*rand_coeff, global_rt_noise, init_node=init)
    global_rt_sim_paths.append(paths)

# Saving the results. The path is assembled with os.path.join so that it is
# valid on every platform (a literal backslash is only a separator on
# Windows), and the target folder is created when it does not exist yet.
os.makedirs('pickles', exist_ok=True)
file_name = os.path.join(
    'pickles',
    f'global_retweet_simulation_N_{N}_t{t}_{global_rt_sim_ens}ens_{global_rt_noise_base}noise_'
    + arch.generate_magic_number() + '.pkl')
with open(file_name, 'wb') as f:
    pickle.dump(global_rt_sim_paths, f)

# Local Preferential Model

## The underlying network

### Simulation
Start a network from scratch and evolve it.

In [None]:

# Parameters of the local-preference simulation. Every value is defined
# exactly once in this cell; in particular `save_steps` is derived from the
# `T` set here and not from a value left over by an earlier cell.
N = 400  # Number of nodes
T = 10000  # Number of timesteps
m = 5  # Number of new links added at each timestep
x = 50  # Link increment-proportion percentage
steps = 5  # An adjacency matrix is saved every `steps` timesteps
save_steps = np.arange(0, T, steps)  # Timesteps at which adjacency matrices are saved
update_mode = 'increment'  # could also be 'proportional', see LocalSim package
distribution = "gaussian"
params = {"mean": 6, "std": 2}  # parameters of `distribution`

# NOTE(review): create_folder_in_wd is called here with one full path, while
# the global-model cell calls it with (name, directory) -- confirm both forms
# are supported.
output_dir = arch.create_folder_in_wd(os.path.join(main_directory,'Sim_results',
                            'local_'+distribution[0:3]+'_'+update_mode[0:3]+'_'+arch.generate_magic_number())) # Directory to save snapshots

# Parameters written next to the snapshots. String values are wrapped in
# quotes so that the saved text can be pasted back as Python assignments.
sim_params = {'N': N, 'T': T, 'm': m, 'x': x, 'steps': steps,
              'save_steps': np.array(save_steps),
              'output_dir': str("r'")+output_dir+str("'"),
              'update_mode': str("'")+update_mode+str("'"),
              'distribution': str("'")+distribution+str("'"),
              'params': params}

LocalSim.simulate_local_preference(N, T, m, x, save_steps, output_dir, update_mode, distribution, params)

arch.save_function_arguments(os.path.join(output_dir,'Parameters.txt'), sim_params)
print(f"Saved {len(save_steps)} adjacency matrices.")


### Loading saved data

In [None]:
#:::::::::::::::::::::::::::::: insert the 'parameters' file's contents here
N = 400
T = 10000
m = 5
x = 50
steps = 5
# save_steps= [   0    5   10 ... 9985 9990 9995]
update_mode = 'increment'
distribution = 'gaussian'
params = dict(mean=6, std=2)
# Machine-specific absolute path: replace with the folder of your own run.
output_dir = r'C:\Users\ROZHIN\Desktop\New folder (3)\Code\00 RE\Sim_results\local'


#:::::::::::::::::::::::::::: select the evolution timestep you wish to analyze
t = 800
directory1 = os.path.join(output_dir, f'adjacency_matrix_t{t}.npy')
local_adjacency_matrix = np.load(directory1)


## Retweet Simulation

In [None]:
# Ensemble of retweet simulations on the local-preference network.
N = len(local_adjacency_matrix)
local_rt_sim_ens = 120  # the number of simulation repetitions (ensemble size)
local_rt_noise_base = 0.1  # centre of the per-run noise level

# since in local simulation, all nodes have non-zero degree, we randomly pick almost a quarter of
# the present nodes as the initiating nodes of a retweet thread
local_init_nodes = np.random.choice(N, int(N/4), replace=False)

steps_taken = []
local_rt_sim_paths = []
for i in tqdm(range(local_rt_sim_ens)):
    # noise of this run: the base value plus a small Gaussian deviation
    local_rt_noise = np.abs(local_rt_noise_base + np.random.normal(0, 0.05))
    # random coefficient to vary the retweet-network size ratio
    rand_coeff = np.random.choice(np.arange(10, 200))
    init = np.random.choice(local_init_nodes)
    # The second return value gets its own name so that the configuration
    # variable `steps` is not overwritten by the loop.
    paths, n_steps = lyze.simulate_retweet(local_adjacency_matrix, N*rand_coeff, local_rt_noise, init_node=init)
    local_rt_sim_paths.append(paths)
    steps_taken.append(n_steps)


# Saving the results. The file name records the base noise level (the
# per-run value differs on every iteration, so the last one would make the
# name arbitrary), and the target folder is created when missing.
os.makedirs('pickles', exist_ok=True)
file_name = os.path.join(
    'pickles',
    f'local_retweet_simulation_t{t}_{local_rt_sim_ens}ens_{local_rt_noise_base}noise_'
    + arch.generate_magic_number() + '.pkl')
with open(file_name, 'wb') as f:
    pickle.dump(local_rt_sim_paths, f)

# No Preference Simulation

## Random Selection

### Retweet Simulation


In [None]:
# No-preference baseline: every node is linked to every other node with the
# same weight 1/(N-1), and there are no self-links.
N = 50
rand_adjacency_matrix = (1.0 - np.eye(N)) / (N - 1)

rand_rt_sim_ens = 120  # ensemble size
rand_rt_noise = 0

# A random quarter of the nodes may start a retweet thread.
init_nodes = np.random.choice(N, N // 4, replace=False)

rand_rt_sim_paths, steps_taken = [], []
for run in tqdm(range(rand_rt_sim_ens)):
    rand_coeff = np.random.randint(1, 5)
    init = np.random.choice(init_nodes)
    paths, steps = lyze.simulate_retweet(rand_adjacency_matrix, N*rand_coeff,
                                         rand_rt_noise, init_node=init)
    rand_rt_sim_paths.append(paths)
    steps_taken.append(steps)

# Analysing All Cases together
In this section, we apply the **Modified Weighted Jaccard Index** and **Cosine Similarity** measures to the four cases present:
1. Real Data
2. Global Preference Model
3. Local Preference Model
4. No Preference Model

And plot the results, before assessing the distribution of the results of these measures on pairs of networks using KS test.

## Modified Weighted Jaccard Index

### Loading Data

In [None]:
# Real data
# Previously calculated Modified Weighted Jaccard Index (referred to as
# "overlap" for convenience).
# NOTE: pickle.load can execute arbitrary code -- only open files that you
# produced yourself.
with open(r'New_1sthalf_2ndhalf_pair_overlap_coeff_all_both_intersection_weighted_23-15-30-01-25.pkl', 'rb') as f:
    overlap_results = pickle.load(f)

# Only the stored values are needed for the histograms and the KS tests;
# the dictionary keys are not used any further.
overlaps = list(overlap_results.values())


In [None]:
# global simulation

# To read the simulation results from a file instead, uncomment this part
# and point `simulation_file` at the saved pickle:
# simulation_file = os.path.join(os.getcwd(), 'pickles', '<file name>.pkl')
# with open(simulation_file, 'rb') as f:
#     global_rt_sim_paths = pickle.load(f)

# The ensemble is cut into `group_sims` consecutive groups of equal size and
# the overlap is evaluated for every pair of simulations inside a group.
group_sims = 6
len_group = len(global_rt_sim_paths) // group_sims
global_overlap_results = []
for group in range(group_sims):
    members = global_rt_sim_paths[group*len_group:(group + 1)*len_group]
    global_overlap_results.append(
        [lyze.overlap_coefficient(first, second)
         for first, second in combinations(members, 2)])
        

In [None]:
# local simulation

# To read the simulation results from a file instead, uncomment this part
# and point `simulation_file` at the saved pickle:
# simulation_file = os.path.join(os.getcwd(), 'pickles', '<file name>.pkl')
# with open(simulation_file, 'rb') as f:
#     local_rt_sim_paths = pickle.load(f)

# The ensemble is cut into `group_sims` consecutive groups of equal size and
# the overlap is evaluated for every pair of simulations inside a group.
group_sims = 6
len_group = len(local_rt_sim_paths) // group_sims
local_overlap_results = []
for group in range(group_sims):
    members = local_rt_sim_paths[group*len_group:(group + 1)*len_group]
    local_overlap_results.append(
        [lyze.overlap_coefficient(first, second)
         for first, second in combinations(members, 2)])
        

In [None]:
# random data

#---------------- Simulation of retweet
# Uniform network: every node is linked to every other node with the same
# weight 1/(N-1), and there are no self-links.
N = 50
rand_adjacency_matrix = (np.ones((N,N)) - np.eye(N))/(N-1)

node_num = len(rand_adjacency_matrix)
rand_rt_sim_ens = 120  # ensemble size
rand_rt_noise = 0
init_nodes = np.arange(0,node_num,5)  # every fifth node may start a thread

rand_rt_sim_paths = []
steps_taken = []
for i in tqdm(range(rand_rt_sim_ens)):
    rand_coeff = np.random.randint(1,5)
    init = np.random.choice(init_nodes)
    paths, steps = lyze.simulate_retweet(rand_adjacency_matrix, node_num*rand_coeff, rand_rt_noise, init_node = init)
    rand_rt_sim_paths.append(paths)
    steps_taken.append(steps)

#---------------- calculating the modified jaccard index
# The group count is called `group_sims`, as in the global and local cells,
# so that the simulation parameter `m` is not overwritten here.
rand_overlap_results = []
group_sims = 6
len_group = int(len(rand_rt_sim_paths)/group_sims)
for i in range(group_sims):
    group_overlap_results = []
    taken_sims = rand_rt_sim_paths[i*len_group:(i+1)*len_group]
    for path1, path2 in combinations(taken_sims, 2):
        group_overlap_results.append(lyze.overlap_coefficient(path1, path2))
    rand_overlap_results.append(group_overlap_results)

In [None]:
def sturges(n):
    """Return the number of histogram bins for ``n`` samples: floor(log2(n)) + 1.

    The value is computed with exact integer arithmetic: for an integer
    n >= 1, ``n.bit_length()`` equals ``floor(log2(n)) + 1``. This avoids
    the rounding error that a ratio of floating-point logarithms can
    introduce when ``n`` is close to a power of two.

    Parameters
    ----------
    n : int
        Number of samples. A non-integer value is truncated toward zero.

    Returns
    -------
    int
        The bin count (always >= 1).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, for which no bin count is defined.
    """
    if n < 1:
        raise ValueError(f"sturges() needs at least one sample, got n={n}")
    return int(n).bit_length()

### Plotting the results

In [None]:
# Distribution of the modified weighted Jaccard index for the three models
# and the real data. For every model the central curve is a histogram
# rescaled so that its bins sum to one, and the shaded band is the standard
# deviation of the per-group histograms, rescaled by the same factor.
# Each per-group histogram is evaluated on the same bin edges as the curve
# it decorates, so the i-th error value refers to the i-th plotted bin.
colors = ["#1f77b4","#ff7f0e","#2ca02c","#d62728"]

probabilities = []  # sum of each normalised curve, kept as a sanity check
rand_bin = 10  # bin count of the no-preference case
bin_num = sturges(len(overlaps))
fig, ax = plt.subplots(figsize=(8, 5))


# global:::::::::::::::::::::::::::::::::
# central curve: all groups pooled together
y_data, bins = np.histogram(np.array(global_overlap_results).flatten(), bins=bin_num, density=True)
# one histogram per group, on the edges of the pooled histogram
points = np.zeros((len(bins) - 1, len(global_overlap_results)))
for i in range(len(global_overlap_results)):
    points[:, i], _ = np.histogram(global_overlap_results[i], bins=bins, density=True)
y_err = np.std(points, axis=1)  # spread between the groups
y_err /= np.sum(y_data)  # same rescaling as the curve
y_data /= np.sum(y_data)  # bins now sum to one
probabilities.append(np.sum(y_data))

ax.fill_between(bins[:-1], y_data - y_err, y_data + y_err, alpha=0.6, color=colors[0],
                label="Global Model ± Err", zorder=3)
ax.plot(bins[:-1], y_data, lw=3, alpha=0.6, color=colors[0], zorder=3)
ax.plot(bins[:-1], y_data, 'k', marker="v", ls='dashed', ms=8, mec="white", linewidth=1,
        mfc=colors[0], zorder=5, label="Global Model ± Err")


# local:::::::::::::::::::::::::::::::::::::::::::::::::::
y_data, bins = np.histogram(np.array(local_overlap_results).flatten(), bins=bin_num, density=True)
points = np.zeros((len(bins) - 1, len(local_overlap_results)))
for i in range(len(local_overlap_results)):
    points[:, i], _ = np.histogram(local_overlap_results[i], bins=bins, density=True)
y_err = np.std(points, axis=1)
y_err /= np.sum(y_data)
y_data /= np.sum(y_data)
probabilities.append(np.sum(y_data))

ax.fill_between(bins[:-1], y_data - y_err, y_data + y_err, color=colors[1], hatch="XX", edgecolor="k",
                alpha=0.3, label="Local Model ± Err", zorder=4)
ax.plot(bins[:-1], y_data, lw=3, alpha=0.6, color=colors[1], zorder=4)
ax.plot(bins[:-1], y_data, 'k', marker="^", ls='dashdot', ms=8, mec="white",
        linewidth=1, mfc=colors[1], zorder=5, label="Local Model ± Err")


# random:::::::::::::::::::::::::::::::::::::::::::::::
# NOTE(review): the central curve here is the histogram of the element-wise
# mean over the groups, whereas the two models above pool all groups --
# confirm that this difference is intended.
y_data, bins = np.histogram(np.mean(rand_overlap_results, axis=0), bins=rand_bin, density=True)
points = np.zeros((len(bins) - 1, len(rand_overlap_results)))
for i in range(len(rand_overlap_results)):
    points[:, i], _ = np.histogram(rand_overlap_results[i], bins=bins, density=True)
y_err = np.std(points, axis=1)
y_err /= np.sum(y_data)
y_data /= np.sum(y_data)
probabilities.append(np.sum(y_data))

ax.fill_between(bins[:-1], y_data - y_err, y_data + y_err, alpha=0.3, color=colors[3],
                label="No Preference ± Err", zorder=2)
ax.plot(bins[:-1], y_data, lw=4, zorder=10, alpha=0.6, color=colors[3])
ax.plot(bins[:-1], y_data, 'k', marker="d", ls='dotted', ms=8, mec="white",
        linewidth=1, mfc=colors[3], zorder=10, label="No Preference ± Err")


# data:::::::::::::::::::::::::::::::::
y_data, bins = np.histogram(overlaps, bins=bin_num, density=True)
y_data /= np.sum(y_data)
probabilities.append(np.sum(y_data))
ax.plot(bins[:-1], y_data, lw=4, color='k', zorder=10, alpha=0.6, label='Retweet Data')
ax.plot(bins[:-1], y_data, color='white', marker="o", ls=(0,(1,5)), ms=8, mec="white",
        linewidth=1, mfc='k', zorder=10, label="Retweet Data")

# Titles and Labels
ax.set_xlim(left=0, right=1)
ax.set_ylim(bottom=0)
ax.set_xlabel(r"$\tilde{J}_w$", fontsize=12)
ax.set_ylabel(r"$P(\tilde{J}_w)$", fontsize=12)

# Every case contributes two labelled artists; they are merged pairwise so
# that the legend shows a single combined entry per case.
handles, labels = ax.get_legend_handles_labels()
ax.legend([tuple(handles[0:2]), tuple(handles[2:4]), tuple(handles[4:6]),
           tuple(handles[6:8])], labels[::2], handlelength=3,
          handler_map={tuple: HandlerTuple(ndivide=1)})

# plt.savefig("Jaccard_index1"+arch.generate_magic_number()+".png",dpi=200)
plt.show()


## Cosine Similarity

### Loading Data

In [None]:
# Real data: previously calculated cosine similarities.
# NOTE: pickle.load can execute arbitrary code -- only open files that you
# produced yourself.
with open(r'New_1sthalf_2ndhalf_pair_similarity_all_both_intersection_weighted_08-01-31-01-25.pkl', 'rb') as f:
    similarity_results = pickle.load(f)

# Keep the stored values only, in the order of the dictionary.
similarities = [stored for stored in similarity_results.values()]
        

In [None]:
# global

# To read the simulation results from a file instead, uncomment this part
# and point `simulation_file` at the saved pickle:
# simulation_file = os.path.join(os.getcwd(), 'pickles', '<file name>.pkl')
# with open(simulation_file, 'rb') as f:
#     global_rt_sim_paths = pickle.load(f)

# The ensemble is cut into `group_sims` consecutive groups of equal size; for
# every pair of simulations inside a group the mean returned by
# lyze.analyze_all_simulation_nodes is stored.
group_sims = 6
len_group = len(global_rt_sim_paths) // group_sims
global_similarity_results = []
for group in range(group_sims):
    members = global_rt_sim_paths[group*len_group:(group + 1)*len_group]
    group_similarity_results = []
    for pair in tqdm(combinations(members, 2)):
        mean, std = lyze.analyze_all_simulation_nodes(list(pair))[1]
        group_similarity_results.append(mean)
    global_similarity_results.append(group_similarity_results)

In [None]:
# local

# To read the simulation results from a file instead, uncomment this part
# and point `simulation_file` at the saved pickle:
# simulation_file = os.path.join(os.getcwd(), 'pickles', '<file name>.pkl')
# with open(simulation_file, 'rb') as f:
#     local_rt_sim_paths = pickle.load(f)

# The ensemble is cut into `group_sims` consecutive groups of equal size; for
# every pair of simulations inside a group the mean returned by
# lyze.analyze_all_simulation_nodes is stored.
group_sims = 6
len_group = len(local_rt_sim_paths) // group_sims
local_similarity_results = []
for group in range(group_sims):
    members = local_rt_sim_paths[group*len_group:(group + 1)*len_group]
    group_similarity_results = []
    for pair in tqdm(combinations(members, 2)):
        mean, std = lyze.analyze_all_simulation_nodes(list(pair))[1]
        group_similarity_results.append(mean)
    local_similarity_results.append(group_similarity_results)
        

In [None]:
# random data

#-------------------- Simulating Retweet
# The network size is set explicitly, as in the other no-preference cells,
# so that this cell does not depend on whichever `N` an earlier cell left
# behind.
N = 50
# Uniform network: every node is linked to every other node with the same
# weight 1/(N-1), and there are no self-links.
rand_adjacency_matrix = (np.ones((N,N)) - np.eye(N))/(N-1)

node_num = len(rand_adjacency_matrix)
rand_rt_sim_ens = 120  # ensemble size
rand_rt_noise = 0
init_nodes = np.arange(0,node_num,5)  # every fifth node may start a thread

rand_rt_sim_paths = []
steps_taken = []
for i in tqdm(range(rand_rt_sim_ens)):
    rand_coeff = np.random.randint(1,5)
    init = np.random.choice(init_nodes)
    paths, steps = lyze.simulate_retweet(rand_adjacency_matrix, node_num*rand_coeff, rand_rt_noise, init_node = init)
    rand_rt_sim_paths.append(paths)
    steps_taken.append(steps)

# ------------------- Calculating the cosine similarities
# The ensemble is cut into `group_sims` consecutive groups of equal size; for
# every pair of simulations inside a group the mean returned by
# lyze.analyze_all_simulation_nodes is stored.
rand_similarity_results = []
group_sims = 6
len_group = int(len(rand_rt_sim_paths)/group_sims)
for i in range(group_sims):
    group_similarity_results = []
    for path1, path2 in tqdm(combinations(rand_rt_sim_paths[i*len_group:(i+1)*len_group], 2)):
        _, (mean, std) = lyze.analyze_all_simulation_nodes([path1,path2])
        group_similarity_results.append(mean)

    rand_similarity_results.append(group_similarity_results)

### Plotting the results

In [None]:
# Distribution of the mean cosine similarity for the three models and the
# real data. For every model the central curve is a histogram rescaled so
# that its bins sum to one, and the shaded band is the standard deviation of
# the per-group histograms, rescaled by the same factor.
# Each per-group histogram is evaluated on the same bin edges as the curve
# it decorates, and every bin count is derived from the similarity data
# itself, so this cell does not need the Jaccard cells to have been run.
colors = ["#1f77b4","#ff7f0e","#2ca02c","#d62728"]

bin_num = sturges(len(similarities))
rand_bin = 10  # bin count of the no-preference case
fig, ax = plt.subplots(figsize=(8, 5))


# global:::::::::::::::::::::::::::::::::
# central curve: all groups pooled together
y_data, bins = np.histogram(np.array(global_similarity_results).flatten(), bins=bin_num, density=True)
# one histogram per group, on the edges of the pooled histogram
points = np.zeros((len(bins) - 1, len(global_similarity_results)))
for i in range(len(global_similarity_results)):
    points[:, i], _ = np.histogram(global_similarity_results[i], bins=bins, density=True)
y_err = np.std(points, axis=1)  # spread between the groups
y_err /= np.sum(y_data)  # same rescaling as the curve
y_data /= np.sum(y_data)  # bins now sum to one

ax.fill_between(bins[:-1], y_data - y_err, y_data + y_err, alpha=0.4, color=colors[0],
                label="Global Model ± Err", zorder=3)
ax.plot(bins[:-1], y_data, lw=3, alpha=0.6, color=colors[0], zorder=3)
ax.plot(bins[:-1], y_data, 'k', marker="v", ls='dashed', ms=8, mec="white", linewidth=1,
        mfc=colors[0], zorder=5, label="Global Model ± Err")

# local:::::::::::::::::::::::::::::::::::::::::::::::::::
# the local case takes its bin count from the size of one group
bin_num = sturges(len(local_similarity_results[0]))
y_data, bins = np.histogram(np.array(local_similarity_results).flatten(), bins=bin_num, density=True)
points = np.zeros((len(bins) - 1, len(local_similarity_results)))
for i in range(len(local_similarity_results)):
    points[:, i], _ = np.histogram(local_similarity_results[i], bins=bins, density=True)
y_err = np.std(points, axis=1)
y_err /= np.sum(y_data)
y_data /= np.sum(y_data)

ax.fill_between(bins[:-1], y_data - y_err, y_data + y_err, color=colors[1], hatch="XX", edgecolor="k",
                alpha=0.3, label="Local Model ± Err", zorder=4)
ax.plot(bins[:-1], y_data, lw=3, alpha=0.6, color=colors[1], zorder=4)
ax.plot(bins[:-1], y_data, 'k', marker="^", ls='dashdot', ms=8, mec="white",
        linewidth=1, mfc=colors[1], zorder=5, label="Local Model ± Err")


# random:::::::::::::::::::::::::::::::::::::::::::::::
# NOTE(review): the central curve here is the histogram of the element-wise
# mean over the groups, whereas the two models above pool all groups --
# confirm that this difference is intended.
y_data, bins = np.histogram(np.mean(rand_similarity_results, axis=0), bins=rand_bin, density=True)
points = np.zeros((len(bins) - 1, len(rand_similarity_results)))
for i in range(len(rand_similarity_results)):
    points[:, i], _ = np.histogram(rand_similarity_results[i], bins=bins, density=True)
y_err = np.std(points, axis=1)
y_err /= np.sum(y_data)
y_data /= np.sum(y_data)

ax.fill_between(bins[:-1], y_data - y_err, y_data + y_err, alpha=0.3, color=colors[3],
                label="No Preference ± Err", zorder=2)
ax.plot(bins[:-1], y_data, lw=4, zorder=10, alpha=0.6, color=colors[3])
ax.plot(bins[:-1], y_data, 'k', marker="d", ls='dotted', ms=8, mec="white",
        linewidth=1, mfc=colors[3], zorder=10, label="No Preference ± Err")


# data:::::::::::::::::::::::::::::::::
bin_num = sturges(len(similarities))
y_data, bins = np.histogram(similarities, bins=bin_num, density=True)
y_data /= np.sum(y_data)
ax.plot(bins[:-1], y_data, lw=4, color='k', zorder=10, alpha=0.6, label='Retweet Data')
ax.plot(bins[:-1], y_data, color='white', marker="o", ls=(0,(1,5)), ms=8, mec="white",
        linewidth=1, mfc="black", zorder=10, label="Retweet Data")


# Titles and Labels
ax.set_xlim(left=0, right=1)
ax.set_ylim(bottom=0)
ax.set_xlabel(r"$\langle S_i^{h,g} \rangle$", fontsize=12)
ax.set_ylabel(r"$P(\langle S_i^{h,g} \rangle)$", fontsize=12)

# Every case contributes two labelled artists; they are merged pairwise so
# that the legend shows a single combined entry per case.
handles, labels = ax.get_legend_handles_labels()
ax.legend([tuple(handles[0:2]), tuple(handles[2:4]), tuple(handles[4:6]), tuple(handles[6:8])],
          labels[::2], handlelength=3,
          handler_map={tuple: HandlerTuple(ndivide=1)})

# plt.savefig("Similarities"+arch.generate_magic_number()+".png",dpi=200)
plt.show()


# Statistical Tests

In [None]:
# Samples compared by the statistical tests, keyed '0'..'3' in the order
# Data, Global, Local, No Preference. For each simulated case only the
# first group of results is used.
case_keys = '0123'
names = dict(zip(case_keys, ('Data', 'Global', 'Local', 'No Preference')))
sims = dict(zip(case_keys, (similarities, global_similarity_results[0],
                            local_similarity_results[0], rand_similarity_results[0])))
overalp = dict(zip(case_keys, (overlaps, global_overlap_results[0],
                               local_overlap_results[0], rand_overlap_results[0])))

## For Modified Weighted Jaccard Index

In [None]:
# stat test of overlaps
# Two-sample Kolmogorov-Smirnov test between every pair of cases, shown as
# two annotated heatmaps (KS statistic on the left, p-value on the right).

# One sample per case; for each simulated case only the first group is used.
names = dict(zip('0123', ('Data', 'Global', 'Local', 'No Preference')))
overalp = dict(zip('0123', (overlaps, global_overlap_results[0],
                            local_overlap_results[0], rand_overlap_results[0])))

n_cases = 4
stat_matrix = np.zeros((n_cases, n_cases))
p_value_matrix = np.zeros((n_cases, n_cases))

# Fill both matrices, including the diagonal (every case against itself).
for row, col in np.ndindex(n_cases, n_cases):
    stat, p_value = ks_2samp(overalp[str(row)], overalp[str(col)])
    stat_matrix[row, col] = stat
    p_value_matrix[row, col] = p_value

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# (axes, matrix, title, number format of the cell annotations)
panels = (
    (ax1, stat_matrix, 'Modified weighted Jaccard index', '.3f'),
    (ax2, p_value_matrix, 'P-Values', '.3e'),
)
for panel_ax, matrix, title, number_format in panels:
    image = panel_ax.imshow(matrix, cmap='Blues', aspect='auto',
                            norm=Normalize(vmin=0, vmax=matrix.max()))
    panel_ax.set_title(title)
    fig.colorbar(image, ax=panel_ax)
    # white text on the dark half of the colour scale, black on the light half
    half_max = matrix.max() / 2
    for (row, col), value in np.ndenumerate(matrix):
        panel_ax.text(col, row, format(value, number_format), ha='center', va='center',
                      color='white' if value > half_max else 'black')

# Labels and layout
tick_labels = [names[str(k)] for k in range(n_cases)]
for panel_ax in (ax1, ax2):
    panel_ax.set_xticks(range(n_cases))
    panel_ax.set_yticks(range(n_cases))
    panel_ax.set_xticklabels(tick_labels)
    panel_ax.set_yticklabels(tick_labels)

plt.tight_layout()
plt.savefig('KS_overlap.png', dpi=200)
plt.show()



## For Cosine Similarities

In [None]:
# stat test of cosine similarities
# Two-sample Kolmogorov-Smirnov test between every pair of cases, shown as
# two annotated heatmaps (KS statistic on the left, p-value on the right).

# One sample per case; for each simulated case only the first group is used.
names = dict(zip('0123', ('Data', 'Global', 'Local', 'No Preference')))
sims = dict(zip('0123', (similarities, global_similarity_results[0],
                         local_similarity_results[0], rand_similarity_results[0])))

n_cases = 4
stat_matrix = np.zeros((n_cases, n_cases))
p_value_matrix = np.zeros((n_cases, n_cases))

# Fill both matrices, including the diagonal (every case against itself).
for row, col in np.ndindex(n_cases, n_cases):
    stat, p_value = ks_2samp(sims[str(row)], sims[str(col)])
    stat_matrix[row, col] = stat
    p_value_matrix[row, col] = p_value

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# (axes, matrix, title, number format of the cell annotations)
panels = (
    (ax1, stat_matrix, 'Cosine similarities', '.3f'),
    (ax2, p_value_matrix, 'P-Values', '.3e'),
)
for panel_ax, matrix, title, number_format in panels:
    image = panel_ax.imshow(matrix, cmap='Blues', aspect='auto',
                            norm=Normalize(vmin=0, vmax=matrix.max()))
    panel_ax.set_title(title)
    fig.colorbar(image, ax=panel_ax)
    # white text on the dark half of the colour scale, black on the light half
    half_max = matrix.max() / 2
    for (row, col), value in np.ndenumerate(matrix):
        panel_ax.text(col, row, format(value, number_format), ha='center', va='center',
                      color='white' if value > half_max else 'black')

# Labels and layout
tick_labels = [names[str(k)] for k in range(n_cases)]
for panel_ax in (ax1, ax2):
    panel_ax.set_xticks(range(n_cases))
    panel_ax.set_yticks(range(n_cases))
    panel_ax.set_xticklabels(tick_labels)
    panel_ax.set_yticklabels(tick_labels)

plt.tight_layout()
plt.savefig('KS_sim.png',dpi=200)
plt.show()