In [22]:
# importing required library
import pymongo  #pip install pymongo

In [23]:
# Local server connection url
mongo_url = "mongodb://localhost:27017/"

# Connecting to the MongoDB local server
client = pymongo.MongoClient(mongo_url)

# Accessing required database and collection
db = client["sample"]
collection = db["research"]

### Task 1

In [24]:
# Calculating number of articles present in the collection
num_articles = collection.count_documents({})

print(f"Number of Articles: {num_articles}")

Number of Articles: 501629


In [25]:
# Calculating number of unique organisations present in the collection

# Creating an aggregation pipeline to count unique organisations
pipeline = [
    {"$unwind": "$author"},
    {"$unwind": "$author.affiliation"},
    {"$group": {"_id": "$author.affiliation.name"}},
    {"$group": {"_id": None, "totalAffiliations": {"$sum": 1}}}
]

# Executing the aggregation pipeline
result = list(collection.aggregate(pipeline))
total_affiliations = result[0]["totalAffiliations"]

print(f"Number of organisations: {total_affiliations}")

Number of organisations: 163723


In [26]:
# Calculating number of researchers present in the collection

# Creating an aggregation pipeline to count researchers
pipeline = [
    {"$unwind": "$author"},
    {"$group": {"_id": "$author.given"}},
    {"$group": {"_id": None, "totalAuthors": {"$sum": 1}}}
]

# Executing the aggregation pipeline
result = list(collection.aggregate(pipeline))
total_authors = result[0]["totalAuthors"]

print(f"Number of researchers: {total_authors}")

Number of researchers: 215061


### Task 2

In [27]:
# Importing required library
import networkx as nx

# Creating an empty graph
G = nx.Graph()

# Iterating through all documents in the collection
data = collection.find()

# Finding all the organisations within each document
for document in data:
    organisations = []
    for author in document.get("author", []):
        if author.get("affiliation", [{}]):
            organisation = author.get("affiliation", [{}])[0].get("name", "")
            organisations.append(organisation)
    
    # Adding organisations as nodes to the graph
    for organisation in organisations:
        G.add_node(organisation)

    # Creating connections between organisations that wrote papers together
    for i in range(len(organisations)):
        for j in range(i + 1, len(organisations)):
            if organisations[i] != organisations[j]:
                org_i = set(organisations[i].split(","))
                org_j = set(organisations[j].split(","))
                if org_i.intersection(org_j):
                    G.add_edge(organisations[i], organisations[j])

# Calculating the centrality of each organisation
degree_centrality = nx.degree_centrality(G)

# Sorting the organisation based on centrality
sorted_organisations = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)

# Selecting the top 10 organizations
top_10_organisations = sorted_organisations[:10]

# Printing the top 10 organizations with highest centrality
print("Top 10 organisations with highest centrality:")
for org, centrality_score in top_10_organisations:
    print(centrality_score, org)


Top 10 organisations with highest centrality:
0.0011478694746340752 University of Wisconsin–Oshkosh , College of Business, Department of Accounting, Oshkosh, WI, USA
0.0011478694746340752 Bryant University , College of Business, Department of Accounting, Smithfield, RI, USA
0.0011478694746340752 University of Central Arkansas , College of Business, Department of Accounting, Conway, AR, USA
0.0011478694746340752 East Carolina University , College of Business, Department of Accounting, Greenville, NC, USA
0.0011478694746340752 Colorado State University emeritus, , College of Business, Department of Accounting, Fort Collins, CO, USA
0.0011478694746340752 Lewis University , College of Business, Department of Accounting, Romeoville, IL, USA
0.0011478694746340752 Lone Star College , College of Business, Department of Accounting, Montgomery, TX, USA
0.0011345993072972651 Loyola University Maryland , Sellinger School of Business and Management, Department of Accounting, Baltimore, MD, USA
0.00

In [28]:
# Creating an empty graph
G2 = nx.Graph()

# Iterating through all documents in the collection
data = collection.find()

# Finding all the authors within each document
for document in data:
    authors = []
    for author in document.get("author", [{}]):
        full_name = author.get("given","")+" "+author.get("family","")
        authors.append(full_name)

    # Adding authors as nodes to the graph
    for author in authors:
        G2.add_node(author)

    # Creating connections between authors that wrote papers together
    for i in range(len(authors)):
        for j in range(i + 1, len(authors)):
            if authors[i] != authors[j]:
                author_i = set(authors[i])
                author_j = set(authors[j])
                if author_i.intersection(author_j):
                    G2.add_edge(authors[i], authors[j])

# Calculating the centrality of each author
degree_centrality_auth = nx.degree_centrality(G2)

# Sorting the authors based on centrality
sorted_authors = sorted(degree_centrality_auth.items(), key=lambda x: x[1], reverse=True)

# Selecting the top 10 authors
top_10_authors = sorted_authors[1:11]

# Printing the top 10 authors with highest centrality
print("Top 10 authors with highest centrality:")
for auth, centrality_score in top_10_authors:
    print(centrality_score, auth)

Top 10 authors with highest centrality:
0.007558795848159102 Amanpreet Brar
0.007080170428188758 Arda Isik
0.0070614008038761954 Andrey Litvin
0.007053356679170812 Muhammed Elhadi
0.007053356679170812 Piotr Major
0.007053356679170812 Ionut Negoi
0.007053356679170812 Francesco Pata
0.007053356679170812 Gianluca Pellino
0.007053356679170812 Mihail Slavchev
0.007053356679170812 Kjetil Soreide


### Task 3

In [29]:
# Writing the graph to a file to be visualized in Cytoscape
nx.write_graphml(G, "organisations_graph.graphml")
nx.write_graphml(G2,"authors_graph.graphml")