In [216]:
from collections import Counter
from tqdm import tqdm
import random
import copy

import networkx as nx
import pandas as pd
import numpy as np

In [7]:
def get_nx_graph(edge_list_path):
    edge_list = pd.read_csv(edge_list_path, index_col = 0)
    aev = edge_list.values
    edges_list_t = [(aev[i][0], aev[i][1]) for i in tqdm(range(len(aev)))]
    edge_to_index = {(aev[i][0], aev[i][1]):i for i in tqdm(range(len(aev)))}
    return edge_to_index, nx.DiGraph((x, y) for (x, y) in tqdm(Counter(edges_list_t)))    

def get_subraph(N, source: int, depth_limit: int = 4):
    nodes = list(nx.dfs_preorder_nodes(N, source = source, depth_limit = depth_limit))
    H = N.subgraph(nodes)
    print("Nodes in subgraph: ", len(nodes), "\nEdges in subgraph: ", len(H.edges()))
    return H

### Notebook navigation


[Co-authorship subgraph extraction](#ca_subgraph_extraction)

[Citation subgraph extraction](#citation_graph_extraction)

[Dataset clear](#dataset_clear)

[Final check](#final_check)

[Save data](#save_data)

### Co-authorship subgraph extraction
<a id='ca_subgraph_extraction'></a>

In [8]:
edge_to_index_A, A = get_nx_graph("processed_data/SSORC_CS_2010_2021_authors_edge_list.csv")

  mask |= (ar1 == a)
100%|██████████| 30796749/30796749 [00:22<00:00, 1388407.39it/s]
100%|██████████| 30796749/30796749 [00:41<00:00, 745457.94it/s]
100%|██████████| 30796749/30796749 [02:41<00:00, 191092.78it/s]


In [9]:
edge_to_index_G, G = get_nx_graph("processed_data/SSORC_CS_2010_2021_papers_edge_list_indexed.csv")

100%|██████████| 17921409/17921409 [00:12<00:00, 1431800.73it/s]
100%|██████████| 17921409/17921409 [00:22<00:00, 799091.70it/s]
100%|██████████| 17921409/17921409 [01:45<00:00, 169576.13it/s]


In [11]:
print("Number of connected components in the initial graphs: ", \
      nx.number_connected_components(A.to_undirected()), \
      nx.number_connected_components(G.to_undirected()))

Number of connected components in the initial graphs:  99041 12235


Co-authorship network edge list:

In [12]:
authors_edges = pd.read_csv("processed_data/SSORC_CS_2010_2021_authors_edge_list.csv", index_col = 0)

Co-authorship network edges papers:

In [180]:
authors_edges_papers = pd.read_csv("processed_data/SSORC_CS_2010_2021_authors_edges_papers_indices.csv", index_col = 0, \
                                   converters={"papers_indices": lambda x: x.strip("[]").replace("'","").split(", ")})

  mask |= (ar1 == a)


In [19]:
source = random.choice(list(A.nodes()))
sub_A = get_subraph(A, source, depth_limit = 5)

Nodes in subgraph:  25474 
Edges in subgraph:  255001


In [20]:
print("Subgraph A connected components check: ", nx.number_connected_components(sub_A.to_undirected()))

Subgraph A connected components check:  1


In [56]:
sub_A_edges = list(sub_A.edges())
len(sub_A_edges)

255001

Obtaining papers corresponding to co-authorship graph edges:

In [211]:
authors_edges_papers_sub = [authors_edges_papers["papers_indices"][edge_to_index_A[sub_A_edges[i]]] for i in tqdm(range(len(sub_A_edges)))]

100%|██████████| 255001/255001 [00:03<00:00, 66127.24it/s]


Extracting unique papers:

In [212]:
authors_edges_papers_sub_flat = [int(item) for subarray in authors_edges_papers_sub for item in subarray]
unique_papers = list(set(authors_edges_papers_sub_flat))

In [213]:
print("Total amount of mentioned papers: ", len(authors_edges_papers_sub_flat), \
      "\nUnique papers number: ", len(unique_papers))

Total amount of mentioned papers:  476373 
Unique papers number:  142423


### Citation subgraph extraction
<a id='citation_graph_extraction'></a>

In [146]:
G_sub = G.subgraph(unique_papers)

In [147]:
G_sub_nodes = list(G_sub.nodes())

### Dataset clear
<a id='dataset_clear'></a>

Get connected components of initial citation graph:

In [148]:
Gcc = sorted(nx.connected_components(G_sub.to_undirected()), key=len, reverse=True)

In [149]:
print("Some CC sizes: ", len(Gcc[0]), len(Gcc[1]), len(Gcc[2]))

Some CC sizes:  115907 29 25


In [150]:
remnants = 0
for i in range(1, len(Gcc)):
    remnants += len(Gcc[i])
print ("Number of papers out of GCC: ", remnants)

Number of papers out of GCC:  24356


Get id's of papers that are not presented in the largest connected component:

In [151]:
papers_out_lcc = []
for i in tqdm(range(len(G_sub_nodes))):
    if G_sub_nodes[i] not in Gcc[0]:
        papers_out_lcc.append(G_sub_nodes[i])

100%|██████████| 140263/140263 [00:00<00:00, 1710737.92it/s]


In [152]:
print("Total number of papers to delete: ", len(papers_out_lcc))

Total number of papers to delete:  24356


Removing unwanted papers from G_sub:

In [153]:
G_sub_clear = nx.DiGraph(G_sub)

In [154]:
for i in range(len(papers_out_lcc)):
    node = papers_out_lcc[i]
    G_sub_clear.remove_node(node) 

Obtaining list of disappeared collaborations after papers removing:

In [214]:
collabs_indices_to_delete = []
for i in tqdm(range(len(papers_out_lcc))):
    for j in range(len(authors_edges_papers_sub)):
        if str(papers_out_lcc[i]) in authors_edges_papers_sub[j]:
            del authors_edges_papers_sub[j][authors_edges_papers_sub[j].index(str(papers_out_lcc[i]))]
            if len(authors_edges_papers_sub[j]) == 0:
                collabs_indices_to_delete.append(j)

100%|██████████| 24356/24356 [57:32<00:00,  7.06it/s] 


In [217]:
collabs_indices_to_delete_copy = copy.deepcopy(collabs_indices_to_delete)

In [215]:
len(collabs_indices_to_delete)

31567

In [219]:
A_sub_clear = nx.DiGraph(sub_A)

In [220]:
A_sub_clear_edges = list(A_sub_clear.edges())

In [221]:
for i in range(len(collabs_indices_to_delete)):
    edge = A_sub_clear_edges[collabs_indices_to_delete[i]]
    A_sub_clear.remove_edge(*edge) 

In [228]:
authors_edges_papers_sub_clear = [authors_edges_papers_sub[i] for i in range(len(authors_edges_papers_sub)) if len(authors_edges_papers_sub[i]) > 0]

In [285]:
len(A_sub_clear_edges) -  len(collabs_indices_to_delete), len(authors_edges_papers_sub_clear)

(223434, 223434)

Check if some edge in co-authorship network remains without any papers in citation network:

In [222]:
A_sub_clear_edges_check = list(A_sub_clear.edges())

In [231]:
for edge in tqdm(A_sub_clear_edges_check):
    if len(authors_edges_papers["papers_indices"][edge_to_index_A[edge]]) == 0:
        print(edge)

100%|██████████| 223434/223434 [00:03<00:00, 62042.73it/s]


Delete unwanted authors that remain separated after collaboration removing:

In [232]:
Gcc_a = sorted(nx.connected_components(A_sub_clear.to_undirected()), key=len, reverse=True)

In [234]:
print("Connected compontents in the clear graph: ", [len(Gcc_a[i]) for i in range(len(Gcc_a))])

Connected compontents in clear graph:  [22306, 12, 6, 6, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [235]:
for i in range(1, len(Gcc_a)):
    authors_2_delete = list(Gcc_a[i])
    for j in range(len(authors_2_delete)):
        node = authors_2_delete[j]
        A_sub_clear.remove_node(node) 

In [236]:
Gcc_a_f = sorted(nx.connected_components(A_sub_clear.to_undirected()), key=len, reverse=True)

In [237]:
print("Connected compontents in the final graph: ", [len(Gcc_a_f[i]) for i in range(len(Gcc_a_f))])

Connected compontents in final graph:  [22306]


### Final check
<a id='final_check'></a>

In [253]:
A_nc, G_nc = nx.number_connected_components(A_sub_clear.to_undirected()),\
nx.number_connected_components(G_sub_clear.to_undirected()) 
print("Number of connected components in the final graphs: ", \
      A_nc, \
      G_nc)
assert A_nc == 1 and G_nc == 1, "The number of connected components is not equal to 1 in the graphs"

Number of connected components in the final graphs:  1 1


In [263]:
def get_graph_properties(H):
    print("Nodes in in the final subgraph: ", len(H.nodes()), "\nEdges in the final subgraph: ", len(H.edges()))
#    print("Diameter: ", nx.diameter(H.to_undirected()))
    print("Average clustering coefficient: ", nx.average_clustering(H.to_undirected()))

In [264]:
print("Co-authorship graph properties: ")
get_graph_properties(A_sub_clear)
print("\nCitation graph properties: ")
get_graph_properties(G_sub_clear)

Co-authorship graph properties: 
Nodes in in the final subgraph:  22306 
Edges in the final subgraph:  223268
Average clustering coefficient:  0.5843096324886987

Citation graph properties: 
Nodes in in the final subgraph:  115907 
Edges in the final subgraph:  521901
Average clustering coefficient:  0.19744282542548652


### Save data
<a id='save_data'></a>

In [266]:
dataset_name = "SSORC_CS_10_21_22306_115907_primus" 

_Recommended format:_ name_of_SSORC_subset (e. g., SSORC_CS_10_21) + _ + co-authorsip graph node number + _ + citation graph node number + _ + fancy latin word

In [267]:
!mkdir datasets/{dataset_name}

In [274]:
pd.DataFrame(list(A_sub_clear.edges()), columns = ["from", "to"]).to_csv("datasets/" + dataset_name + "/"\
                                                                         + dataset_name + "_" + "authors_edge_list.csv")

In [275]:
pd.DataFrame(list(G_sub_clear.edges()), columns = ["from", "to"]).to_csv("datasets/" + dataset_name + "/"\
                                                                         + dataset_name + "_" + "papers_edge_list.csv")

In [280]:
pd.DataFrame(list(A_sub_clear.nodes()), columns = ["node_id"]).to_csv("datasets/" + dataset_name + "/"\
                                                                         + dataset_name + "_" + "authors_nodes.csv")

In [279]:
pd.DataFrame(list(G_sub_clear.nodes()), columns = ["node_id"]).to_csv("datasets/" + dataset_name + "/"\
                                                                         + dataset_name + "_" + "papers_nodes.csv")