## Part 1: Network Construction
Nodes represent authors of academic papers.

An (undirected) edge between nodes A and B indicates that the two authors have written at least one paper together.

Edge weights are the number of papers they have written together.

In [8]:
import ast # to parse the stringified author-id lists stored in the papers CSV
import json
from collections import Counter # to count each co-author pair
from itertools import chain # to flatten the list of lists of co-author pairs efficiently
from itertools import combinations # to create unique co-author pairs from the list of authors for each paper

import networkx as nx
import numpy as np
import pandas as pd
from networkx.readwrite import json_graph
from scipy import stats


##### Create a weighted edgelist

In [9]:
# Read the data.
# The 'author_ids' column is stored in the CSV as the string repr of a Python list
# (e.g. "['https://openalex.org/A...', ...]"). Parse it back into a real list while
# loading so that later cells iterate over author IDs rather than over characters.
# An empty cell is treated as "no authors".
df_papers = pd.read_csv(
    'IC2S2_combined_papers.csv',
    converters={'author_ids': lambda raw: ast.literal_eval(raw) if raw else []},
)
df_authors = pd.read_csv('IC2S2_combined_authors.csv')

In [10]:
# Preview the first rows of the papers table to check which columns were loaded.
# NOTE(review): in the saved output, 'author_ids' looks like a stringified list rather
# than a real list -- confirm it is parsed before iterating over it.
df_papers.head()

Unnamed: 0,id,publication_year,cited_by_count,author_ids
0,https://openalex.org/W3124321814,2013,2060,"['https://openalex.org/A5087421071', 'https://..."
1,https://openalex.org/W2137375941,2008,586,"['https://openalex.org/A5003697141', 'https://..."
2,https://openalex.org/W2178843456,2015,581,"['https://openalex.org/A5027036879', 'https://..."
3,https://openalex.org/W2072514260,2012,539,"['https://openalex.org/A5063229424', 'https://..."
4,https://openalex.org/W2972768705,2015,500,"['https://openalex.org/A5043002180', 'https://..."


In [11]:
# Preview the first rows of the authors table to check which columns were loaded.
df_authors.head()

Unnamed: 0,id,display_name,works_api_url,h_index,works_count,country_code
0,https://openalex.org/A5101854927,Hazem Ibrahim,https://api.openalex.org/works?filter=author.i...,12,25,FI
1,https://openalex.org/A5037300629,Nouar AlDahoul,https://api.openalex.org/works?filter=author.i...,14,88,AE
2,https://openalex.org/A5007282319,Talal Rahwan,https://api.openalex.org/works?filter=author.i...,32,167,US
3,https://openalex.org/A5018129441,Yasir Zaki,https://api.openalex.org/works?filter=author.i...,16,127,US
4,https://openalex.org/A5058939619,Alvin Zhou,https://api.openalex.org/works?filter=author.i...,14,59,US


In [12]:
# A CSV round-trip turns the author list into its string repr; parse it back into a
# list when that is the case (values that are already lists are left untouched).
author_lists = df_papers['author_ids'].apply(
    lambda ids: ast.literal_eval(ids) if isinstance(ids, str) else ids
)

# Get the co-author pairs for each paper (each row in dataframe).
# The IDs are de-duplicated and sorted first so that every pair has one canonical
# orientation: (a, b) and (b, a) are the same collaboration in an undirected graph and
# must be counted together, and an author listed twice must not produce a self-loop.
coauthor_pairs = author_lists.apply(
    lambda ids: list(combinations(sorted(set(ids)), 2))
)

# Flatten list of lists into a single list of tuples --> using chain from itertools for efficiency
flattened_pairs = list(chain.from_iterable(coauthor_pairs))

# Count how many papers each co-author pair has written together
coauthor_count = Counter(flattened_pairs)

# Make edgelist: one (author_a, author_b, number_of_joint_papers) triple per pair
edgelist = [(a, b, count) for (a, b), count in coauthor_count.items()]

: 

##### Graph construction

In [None]:
# Undirected graph: every (node_a, node_b, w) triple in `edgelist` becomes one edge
# whose 'weight' attribute is set to w.
Graph = nx.Graph()
Graph.add_weighted_edges_from(edgelist, weight='weight')

##### Node attributes

In [None]:
# First add author attributes: display_name, country.
# add_node() creates the node when it is not in the graph yet and otherwise only
# updates its attributes, so existing edges are left untouched.
for author_id, display_name, country_code in zip(
    df_authors['id'], df_authors['display_name'], df_authors['country_code']
):
    Graph.add_node(author_id, display_name=display_name, country=country_code)

# Build one row per (paper, author). A CSV round-trip turns the author list into its
# string repr, which explode() would leave as a single value, so parse it back first
# (values that are already lists are left untouched).
author_lists = df_papers['author_ids'].apply(
    lambda ids: ast.literal_eval(ids) if isinstance(ids, str) else ids
)
papers_by_author = df_papers.assign(author_ids=author_lists).explode('author_ids')
per_author = papers_by_author.groupby('author_ids')

# Total citations per author; the citation column in df_papers is 'cited_by_count'
author_citation_counts = per_author['cited_by_count'].sum()

# First publication year per author
first_pub_year = per_author['publication_year'].min()

# Attach both as node attributes. set_node_attributes skips IDs that are not nodes of
# the graph instead of raising KeyError.
nx.set_node_attributes(Graph, author_citation_counts.to_dict(), name='citation')
nx.set_node_attributes(Graph, first_pub_year.to_dict(), name='first_pub_year')

In [None]:
# Convert the graph to node-link form and write it to disk as JSON
graph_data = json_graph.node_link_data(Graph)
serialized_graph = json.dumps(graph_data, indent = 4) # indent = 4 keeps the file human-readable
with open("network.json", "w") as json_file:
    json_file.write(serialized_graph)

## Part 2: Preliminary Network Analysis


##### Network Metrics

In [None]:
# Size of the network: nodes are authors, edges are co-authorship links
num_nodes, num_edges = Graph.number_of_nodes(), Graph.number_of_edges()

summary_line = f"Total number of authors is {num_nodes} and total number of collaborations is {num_edges}"
print(summary_line)

In [None]:
# An undirected graph with n nodes can have at most "n choose 2" = n(n-1)/2 edges
max_possible_edges = num_nodes * (num_nodes - 1) / 2
# Density = fraction of those possible edges that actually exist
density = num_edges / max_possible_edges
print(f"Density of the network is {density}")

ANSWER THE QUESTION: Would you say that the network is sparse? Justify your answer.
(A density close to 0 indicates a sparse network; a density close to 1 indicates a dense one.)

Is the network fully connected (i.e., is there a direct or indirect path between every pair of nodes within the network), or is it disconnected?

In [None]:
# Count the connected components; a fully connected network has exactly one
num_cc = nx.number_connected_components(Graph)
cc_message = f"Number of connected components in the network is {num_cc}"
print(cc_message)

In [None]:
# Find number of isolated nodes (nodes with no edges); count them lazily instead of
# materialising a throwaway list
num_isolated = sum(1 for _ in nx.isolates(Graph))
# Show the result so it can be discussed alongside the density and component count
print(f"Number of isolated nodes in the network is {num_isolated}")

Discuss the results above on network density, and connectivity. Are your findings in line with what you expected? Why?

##### Degree Analysis

In [None]:
# Compute the average, median, mode, minimum, and maximum degree of the nodes

# Get the degrees of all the nodes
degrees = [degree for _, degree in Graph.degree()]

avg_deg = np.mean(degrees)
med_deg = np.median(degrees)
# stats.mode returns a length-1 array on older SciPy but a plain scalar on
# SciPy >= 1.11, where `[0][0]` indexing fails. np.atleast_1d handles both.
mode_deg = np.atleast_1d(stats.mode(degrees).mode)[0]
min_deg = min(degrees)
max_deg = max(degrees)

# Display the statistics so they can be interpreted in the discussion below
print(
    f"Degree: mean = {avg_deg:.2f}, median = {med_deg}, mode = {mode_deg}, "
    f"min = {min_deg}, max = {max_deg}"
)

In [None]:
# Compute the average, median, mode, minimum, and maximum of node strength

# Get the strength of all the nodes (for each node, strength is the sum of the weights of the edges incident to that node)
strengths = [strength for _, strength in Graph.degree(weight='weight')]

avg_str = np.mean(strengths)
med_str = np.median(strengths)
# stats.mode returns a length-1 array on older SciPy but a plain scalar on
# SciPy >= 1.11, where `[0][0]` indexing fails. np.atleast_1d handles both.
mode_str = np.atleast_1d(stats.mode(strengths).mode)[0]
min_str = min(strengths)
max_str = max(strengths)

# Display the statistics so they can be interpreted in the discussion below
print(
    f"Strength: mean = {avg_str:.2f}, median = {med_str}, mode = {mode_str}, "
    f"min = {min_str}, max = {max_str}"
)

What do these metrics tell us about the network?

##### Top Authors

In [None]:
# Rank all (node, degree) pairs by degree, highest first, and keep the top five
sorted_by_deg = sorted(Graph.degree(), key=lambda pair: pair[1], reverse=True)
top_5_deg = sorted_by_deg[:5]

for node, degree in top_5_deg:
    # A node that entered the graph only through the edgelist (i.e. an author missing
    # from df_authors) has no 'display_name'; fall back to its ID instead of raising
    # KeyError.
    print(Graph.nodes[node].get('display_name', node))

What role do these nodes play in the network?

Research these authors online. What areas do they specialize in? Do you think that their work aligns with the themes of Computational Social Science? If not, what could be possible reasons?