In [None]:
import networkx as nx
import numpy as np
import os
import re
import pickle

from ShortestPathAlgorithms import seedSets
from TestTransferability import revise_random_dists, reviseSeedSets

def extract_integers_from_line(line):
    """Pull an integer pair out of a single line of text.

    The line is scanned for standalone runs of digits (delimited by word
    boundaries). Only when exactly two such runs are present are they
    converted and returned.

    Args:
        line: The text line to scan.

    Returns:
        ``(first, second)`` as ints when the line holds exactly two
        standalone digit runs, otherwise ``(None, None)``.
    """
    tokens = re.findall(r'\b\d+\b', line)
    # Any count other than two (headers, comments, extra columns) is rejected.
    if len(tokens) != 2:
        return None, None
    first, second = map(int, tokens)
    return first, second

def process_text_file(file_path):
    """Collect every integer pair found in a text file, line by line.

    Each line is handed to ``extract_integers_from_line``; lines for which
    that helper does not return two integers are ignored.

    Args:
        file_path: Path of the file to read (opened in text mode).

    Returns:
        A list of ``(int, int)`` tuples in file order.
    """
    pairs = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            first, second = extract_integers_from_line(raw_line)
            # The helper signals "no pair on this line" with None values.
            if first is None or second is None:
                continue
            pairs.append((first, second))
    return pairs

def get_files_with_substring(directory_path, substring):
    """Recursively list files whose name contains ``substring``.

    Args:
        directory_path: Root directory handed to ``os.walk``.
        substring: Text that must occur somewhere in the file name
            (the name only, not the containing folder).

    Returns:
        A list of paths (folder joined with file name) in ``os.walk`` order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory_path)
        for name in names
        if substring in name
    ]

def create_graph_from_edges(edge_list):
    """Build a directed graph from an iterable of node pairs.

    The pairs are first loaded into an undirected ``nx.Graph`` and the result
    is then converted with ``to_directed()``.

    Args:
        edge_list: Iterable of edges accepted by ``Graph.add_edges_from``.

    Returns:
        The directed graph produced by ``to_directed()``.
    """
    undirected = nx.Graph()
    undirected.add_edges_from(edge_list)
    directed = undirected.to_directed()
    return directed

def can_convert_to_undirected(directed_graph):
    """Report whether every edge of the graph also exists in reverse.

    Args:
        directed_graph: Object exposing ``edges()`` (yielding node pairs) and
            ``has_edge(u, v)``.

    Returns:
        ``True`` when each edge ``(u, v)`` has a matching ``(v, u)`` (also for
        a graph without edges), ``False`` as soon as one reverse edge is
        missing.
    """
    return all(
        directed_graph.has_edge(target, source)
        for source, target in directed_graph.edges()
    )

# Numbered sub-folders of the real-graph collection that are scanned for edge lists.
dir_nums = [1,2,3,4,5,6,7,8,16,17,18,19,20,21,25,27,28,29,30,31]
# NOTE(review): absolute, machine-specific input location; adjust before running elsewhere.
path = '/home/myl/notebooks/shortest-path/version14/samples/real_graphs/'
dirs = [path+str(num) for num in dir_nums]

# Output location: <parent of the current working directory>/samples/real_graphs.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# other cells may reference it.
dir = os.path.dirname(os.getcwd())+'/samples'
sample_dir = dir + '/real_graphs'
# One call creates both directory levels; exist_ok=True replaces the separate
# exists-then-create checks and fails early if the path exists as a non-directory.
os.makedirs(sample_dir, exist_ok=True)

# The sampling cell calls seedSets(graph_info, k+1) for k in range(k_max),
# i.e. with second arguments 1..k_max (presumably the seed-set size).
k_max = 4

In [None]:
# Build one graph per edge-list file, keep the ones below the size limit and
# generate seed sets for each kept graph.

# Graphs with at least this many directed edges are reported but not sampled.
MAX_EDGES = 250000

all_graphs = []        # one list of kept graphs per input directory
all_graph_info = []    # flat list of (graph, False, False) tuples, one per kept graph
all_random_seeds = []  # parallel to all_graph_info: k_max seedSets results per graph
i = 0                  # running count of kept graphs (printed as progress)
for dir_path in dirs:
    paths1 = get_files_with_substring(dir_path,'txt')
    paths2 = get_files_with_substring(dir_path,'csv')
    # A file name containing both substrings would be listed twice and yield a
    # duplicate graph; dict.fromkeys drops repeats while preserving order.
    paths = list(dict.fromkeys(paths1+paths2))
    paths = [p for p in paths if 'README' not in p and 'descriptions' not in p]
    # Directory 25 contributes only its first file. Compare the folder name
    # exactly: a substring test on the whole path would also match e.g. 125,
    # 250, or any parent folder containing "25".
    if os.path.basename(os.path.normpath(dir_path)) == '25':
        paths = paths[:1]
    print(paths)
    graphs = []
    targets = []  # not used in this cell; kept in case other cells read it
    for file_path in paths:
        extracted_integers = process_text_file(file_path)
        if not extracted_integers:
            # Without edges there are no components and max() below would
            # raise ValueError; report the file and move on.
            print('skipped (no edges found)', file_path)
            continue
        G = create_graph_from_edges(extracted_integers)
        # Restrict to the largest strongly connected component and relabel its
        # nodes to consecutive integers starting at 0.
        largest_component = max(nx.strongly_connected_components(G), key=len)
        G = G.subgraph(largest_component)
        G = nx.relabel_nodes(G, {node: index for index, node in enumerate(G.nodes())})
        n_nodes, n_edges = len(G.nodes()), len(G.edges())
        if n_edges < MAX_EDGES:
            i += 1
            print(i)
            print(n_nodes,n_edges,can_convert_to_undirected(G))
            graphs.append(G)
            # NOTE(review): the meaning of the two False flags is defined by
            # seedSets / downstream consumers and is not visible here.
            graph_info = G, False, False
            all_graph_info.append(graph_info)
            random_seeds = []
            for k in range(k_max):
                random_seeds.append(seedSets(graph_info,k+1))
            all_random_seeds.append(random_seeds)
        else:
            print('not selected',n_nodes,n_edges,can_convert_to_undirected(G))
    all_graphs.append(graphs)

samples = all_graph_info, all_random_seeds

In [None]:
# Destination pickle for the assembled `samples` tuple.
p = f'{sample_dir}/all_graphs.pkl'
# Alternative workflow (disabled): load previously saved samples instead of
# using the ones built in the cell above.
# if os.path.exists(p):
#     with open(p, 'rb') as file:
#         samples = pickle.load(file)
# else:
#     raise AssertionError('Data not found.')

# With fewer than four components, hand the tuple to revise_random_dists.
# The code below indexes samples[2] and samples[3], so that call is expected
# to return at least four components.
if len(samples) < 4:
    samples = revise_random_dists(samples)
# Insert a six-slot placeholder list at position 2 unless position 2 already
# has length six.
# NOTE(review): this length-based detection would misfire if the existing
# samples[2] happened to contain exactly six items — confirm against
# revise_random_dists.
if len(samples[2]) != 6:
    empty_slots = [None] * 6
    samples = samples[0], samples[1], empty_slots, samples[2], samples[3]
with open(p, 'wb') as pkl_out:
    pickle.dump(samples, pkl_out)
# if samples[2][0] == None:
#     print('Calculating degree centrality...')
#     samples = reviseSeedSets(samples,centralities=['degree'])
#     with open(p, 'wb') as file:
#         pickle.dump(samples, file)
# if samples[2][1] == None:
#     print('Calculating closeness centrality...')
#     samples = reviseSeedSets(samples,centralities=['closeness'])
#     with open(p, 'wb') as file:
#         pickle.dump(samples, file)
# if samples[2][2] == None:
#     print('Calculating betweenness centrality...')
#     samples = reviseSeedSets(samples,centralities=['betweenness'])
#     with open(p, 'wb') as file:
#         pickle.dump(samples, file)
# if samples[3][3] == None:
#     print('Calculating harmonic centrality...')
#     samples = reviseSeedSets(samples,centralities=['harmonic'])
#     with open(p, 'wb') as file:
#         pickle.dump(samples, file)
# if samples[3][4] == None:
#     print('Calculating laplacian centrality...')
#     samples = reviseSeedSets(samples,centralities=['laplacian'])
#     with open(p, 'wb') as file:
#         pickle.dump(samples, file)
# if samples[2][5] == None:
#     print('Calculating pagerank centrality...')
#     samples = reviseSeedSets(samples,centralities=['pagerank'])
# with open(p, 'wb') as file:
#     pickle.dump(samples, file)