In [1]:
# Imports
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from collections import defaultdict as ddict, Counter
from itertools import chain
from scipy.stats import pearsonr

import csv
import json
import itertools
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from operator import itemgetter

In [2]:
# Read papers

def read_papers(filename='authors.csv'):
    papers = ddict(set)
    authors = ddict(set)
    affiliations = ddict(set)
    author2affiliation = ddict(set)
    paper2authors = ddict(set)
    paper2authorsaffiliation = ddict(set)
    paper2affiliations = ddict(set)

    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
                continue
            paper_title = row[1].strip()
            paper_authors = row[2].strip()
            
            paper2authors[paper_title].add(paper_authors)
            
            full_name = paper_authors
            affiliation = row[3].strip()
            paper2authorsaffiliation[paper_title].add((paper_authors, affiliation))
            paper2affiliations[paper_title].add(affiliation)

            papers[paper_title.strip()].add(full_name)
            authors[full_name.strip()].add(paper_title)
            affiliations[affiliation.strip()].add(paper_title)
            author2affiliation[full_name.strip()].add(affiliation)
            line_count += 1
        return papers, authors, affiliations, author2affiliation, paper2authors, paper2authorsaffiliation, paper2affiliations

kfupm_papers, kfupm_authors, kfupm_affiliations, kfupm_author2affiliation, kfupm_paper2authors, kfupm_paper2authorsaffiliation, kfupm_paper2affiliations = read_papers(filename='kfupm_authors.csv')
ksu_papers, ksu_authors, ksu_affiliations, ksu_author2affiliation, ksu_paper2authors, ksu_paper2authorsaffiliation, ksu_paper2affiliations = read_papers(filename='ksu_authors.csv')
mit_papers, mit_authors, mit_affiliations, mit_author2affiliation, mit_paper2authors, mit_paper2authorsaffiliation, mit_paper2affiliations = read_papers(filename='mit_authors.csv')

# Affiliaiton -> Papers

def get_papers_for_name(affiliations):
    name2papers = ddict(set)
    for org in affiliations:
        name = org
        name2papers[name] = name2papers[name].union(set(affiliations[org]))
            
    return name2papers

kfupm_name2papers = get_papers_for_name(kfupm_affiliations)
ksu_name2papers = get_papers_for_name(ksu_affiliations)
mit_name2papers = get_papers_for_name(mit_affiliations)

In [3]:
# Collaboration Matrix (csv)

def collab_matrix(affiliations, paper2affiliations, filename):
    affiliations_list = []

    for affiliation in affiliations:
        affiliations_list.append(affiliation)

    lst = []*len(affiliations_list)
    sub_lst = []*len(affiliations_list)

    for i in range(len(affiliations_list)):
        for j in range(len(affiliations_list)):
            sub_lst.append(0)
        lst.append(sub_lst)
        sub_lst = []*len(affiliations_list)


    for paper in paper2affiliations:
        paper_affiliations = paper2affiliations.get(paper)
        for i in range(len(paper_affiliations)):
            if list(paper2affiliations.get(paper))[i] in set(affiliations_list):
                for j in range(i+1, len(paper_affiliations)):
                    if list(paper2affiliations.get(paper))[j] in set(affiliations_list):
                        index_of_i = affiliations_list.index(list(paper2affiliations.get(paper))[i])
                        index_of_j = affiliations_list.index(list(paper2affiliations.get(paper))[j])
                        lst[index_of_i][index_of_j]+=1
                        lst[index_of_j][index_of_i]+=1

    df = pd.DataFrame(lst, columns = affiliations_list, index = affiliations_list, dtype = int) 
    df.to_csv(filename)
    return affiliations_list, df
    
kfupm_affiliations_list, kfupm_df = collab_matrix(kfupm_affiliations, kfupm_paper2affiliations, r'kfupm_collab_matrix.csv')
ksu_affiliations_list, ksu_df = collab_matrix(ksu_affiliations, ksu_paper2affiliations, r'ksu_collab_matrix.csv')
mit_affiliations_list, mit_df = collab_matrix(mit_affiliations, mit_paper2affiliations, r'mit_collab_matrix.csv')

In [4]:
# Authors -> Collabs
def get_author2collabs(papers):
    author2coauthors = ddict(list)
    for paper in papers:
        for author1 in papers[paper]:
            for author2 in papers[paper]:
                if author1 != author2:
                    author2coauthors[author1].append(author2) 
    author2collabs = ddict(list)
    for author, coauthors in author2coauthors.items():
        for k, v in Counter(coauthors).items():
            author2collabs[author].append((v, k))
        
    return author2collabs

kfupm_author2collabs = get_author2collabs(kfupm_papers)
ksu_author2collabs = get_author2collabs(ksu_papers)
mit_author2collabs = get_author2collabs(mit_papers)

# Affiliation -> Collabs

def get_name2collabs(name2papers):
    name2collabs = ddict(list)
    for name1, papers1 in name2papers.items():
        for name2, papers2 in name2papers.items():
            if name1 != name2:
                collabs = len(papers1.intersection(papers2))
                if collabs:
                    name2collabs[name1].append((collabs, name2))
    return name2collabs

kfupm_name2collabs = get_name2collabs(kfupm_name2papers)
ksu_name2collabs = get_name2collabs(ksu_name2papers)
mit_name2collabs = get_name2collabs(mit_name2papers)

In [5]:
# Collab -> (size, collab)

def get_name2numcollabs(name2collabs):
    department_collaboration_size = 0;
    name2numcollabs = ddict(set)
    for name, collab in name2collabs.items():
        for item in collab:
            if name != item[1]:
                department_collaboration_size += item[0]
        name2numcollabs[name] = department_collaboration_size
        department_collaboration_size = 0

    for name, num_collabs in name2numcollabs.items():
        if num_collabs == set():
            name2numcollabs[name] = 0

    name2numcollabs = dict(sorted(name2numcollabs.items(), key=lambda item: item[1], reverse=True))
    return name2numcollabs

kfupm_name2numcollabs = get_name2numcollabs(kfupm_name2collabs)
ksu_name2numcollabs = get_name2numcollabs(ksu_name2collabs)
mit_name2numcollabs = get_name2numcollabs(mit_name2collabs)

In [6]:
# Data to json for visualization

# Write json file
def write_json(data, filename='data.json'): 
    with open(filename,'w') as f: 
        json.dump(data, f, indent=4)

# Nodes color scheme
def color_scheme(size):
    if size >= 0 and size <= 20:
        return '#f0fbaf'
    elif size >= 20 and size <= 40:
        return '#bce8a5'
    elif size >= 40 and size <= 60:
        return '#69c2af'
    elif size >= 60 and size <= 80:
        return '#268eb4'
    elif size >= 80 and size <= 100:
        return '#1c4f9b'
    elif size >= 100 and size <= 120:
        return '#161d6e'
    else:
        return '#3b4975'
    
def write_nodes(name2collabs, name2numcollabs, nodes_filename, node_names_filename):
    departments = []
    department_names = []
    names = []
    id = -1
    for name, collabs in name2collabs.items():
        if name not in departments:
            id += 1
            if name2numcollabs[name] >= 0 and name2numcollabs[name] <= 50:
                collabs_color = '#969696'
            if name2numcollabs[name] >= 51 and name2numcollabs[name] <= 100:
                collabs_color = '#636363'
            if name2numcollabs[name] >= 101 and name2numcollabs[name] <= 150:
                collabs_color = '#a1d99b'
            if name2numcollabs[name] >= 151 and name2numcollabs[name] <= 200:
                collabs_color = '#74c476'
            if name2numcollabs[name] >= 201 and name2numcollabs[name] <= 250:
                collabs_color = '#31a354'
            if name2numcollabs[name] >= 251 and name2numcollabs[name] <= 300:
                collabs_color = '#fdae6b'
            if name2numcollabs[name] >= 301 and name2numcollabs[name] <= 350:
                collabs_color = '#fd8d3c'
            if name2numcollabs[name] >= 351 and name2numcollabs[name] <= 400:
                collabs_color = '#e6550d'
            if name2numcollabs[name] >= 401 and name2numcollabs[name] <= 450:
                collabs_color = '#6baed6'
            if name2numcollabs[name] >= 451 and name2numcollabs[name] <= 500:
                collabs_color = '#3182bd'
            if "Department" in name or "Dept" in name:
                departments.append({"id": name, "name": name, "collaborations": name2numcollabs[name], "color": collabs_color})
                department_names.append({"id": id, "name": name, "type": "department"})
                names.append(name)
            elif "Center" in name or "Ctr " in name:
                departments.append({"id": name, "name": name, "collaborations": name2numcollabs[name], "color": collabs_color})
                department_names.append({"id": id, "name": name, "type": "center"})
                names.append(name)
            else:
                departments.append({"id": name, "name": name, "collaborations": name2numcollabs[name], "color": collabs_color})
                department_names.append({"id": id, "name": name, "type": "center"})
                names.append(name)

    write_json(departments, filename=nodes_filename)
    write_json(department_names, filename=node_names_filename)

write_nodes(kfupm_name2collabs, kfupm_name2numcollabs, 'kfupm_nodes.json', 'kfupm_nodes_names.json')
write_nodes(ksu_name2collabs, ksu_name2numcollabs, 'ksu_nodes.json', 'ksu_nodes_names.json')
write_nodes(mit_name2collabs, mit_name2numcollabs, 'mit_nodes.json', 'mit_nodes_names.json')

def write_links(name2collabs, links_filename):
    links = []
    num_collabs = 0
    sources = []
    targets = []
    final = ddict(set)
    for name, collabs in name2collabs.items():
        num_collabs += len(collabs)
        for num, neighbor in collabs:
            if neighbor not in sources and name not in final[neighbor]:
                collabs_color_length = len(collabs)
                if num >= 0 and num <= 6:
                    collabs_color = '#969696'
                    stroke_width = 6
                if num >= 7 and num <= 13:
                    collabs_color = '#636363' # purple
                    stroke_width = 6
                if num >= 14 and num <= 19:
                    collabs_color = '#dadaeb' # green
                    stroke_width = 6
                if num >= 20 and num <= 25:
                    collabs_color = '#bcbddc' # blue
                    stroke_width = 6
                if num >= 26 and num <= 31:
                    collabs_color = '#9e9ac8' # red
                    stroke_width = 6
                if num >= 32 and num <= 37:
                    collabs_color = '#756bb1' # red
                    stroke_width = 6
                if num >= 38 and num <= 43:
                    collabs_color = '#c7e9c0' # red
                    stroke_width = 6
                if num >= 44 and num <= 49:
                    collabs_color = '#a1d99b' # red
                    stroke_width = 6
                if num >= 50 and num <= 55:
                    collabs_color = '#74c476'
                    stroke_width = 6
                if num >= 56 and num <= 61:
                    collabs_color = '#31a354' # purple
                    stroke_width = 6
                if num >= 62 and num <= 67:
                    collabs_color = '#fdd0a2' # green
                    stroke_width = 6
                if num >= 68 and num <= 73:
                    collabs_color = '#fdae6b' # blue
                    stroke_width = 6
                if num >= 74 and num <= 79:
                    collabs_color = '#fd8d3c' # red
                    stroke_width = 6
                if num >= 80 and num <= 85:
                    collabs_color = '#e6550d' # red
                    stroke_width = 6
                if num >= 86 and num <= 91:
                    collabs_color = '#c6dbef' # red
                    stroke_width = 6
                if num >= 92 and num <= 97:
                    collabs_color = '#9ecae1' # red
                    stroke_width = 6
                if num >= 98 and num <= 103:
                    collabs_color = '#6baed6' # green
                    stroke_width = 12
                if num >= 105 and num <= 109:
                    collabs_color = '#3182bd' # blue
                    stroke_width = 6
                links.append({"source": name, "target": neighbor, "opacity": 0.7, "color": collabs_color, "strokeWidth": stroke_width})
            sources.append(name)
            targets.append(neighbor)
        final[name] = targets
        targets = []
    write_json(links, filename=links_filename)
    
write_links(kfupm_name2collabs, 'kfupm_links.json')
write_links(ksu_name2collabs, 'ksu_links.json')
write_links(mit_name2collabs, 'mit_links.json')

In [7]:
# Affiliations with more than 10 collaborations

def get_name2numcollabsafterthreshold(name2numcollabs):
    name2numcollabsafterthreshold = ddict(set)
    for name, numcollabs in name2numcollabs.items():
        if numcollabs > 10:
            name2numcollabsafterthreshold[name] = numcollabs
    return name2numcollabsafterthreshold

kfupm_name2numcollabsafterthreshold = get_name2numcollabsafterthreshold(kfupm_name2numcollabs)
ksu_name2numcollabsafterthreshold = get_name2numcollabsafterthreshold(ksu_name2numcollabs)
mit_name2numcollabsafterthreshold = get_name2numcollabsafterthreshold(mit_name2numcollabs)

In [8]:
# Isolated affilaitions

def get_isolated_units(name2numcollabs, name2papers, isolated_filename):
    name2numcollabsbeforethreshold = ddict(set)
    for name, numcollabs in name2numcollabs.items():
        if numcollabs <= 10:
            name2numcollabsbeforethreshold[name] = numcollabs
    isolated = []
    id = 1
    for name in name2numcollabsbeforethreshold:
        if "Department" in name or "Dept" in name or "department" in name or "dept" in name:
            type = "Department"
        elif "Center" in name or "Ctr " in name or "ctr " in name:
            type = "Center"
        else:
            type =""
        isolated.append({"id": id, "name": name, "type": type, "num_collaborations": name2numcollabsbeforethreshold[name], "num_papers": len(name2papers[name])})
        id +=1
    write_json(isolated, filename=isolated_filename)
    
get_isolated_units(kfupm_name2numcollabs, kfupm_name2papers, 'kfupm_isolated.json')
get_isolated_units(ksu_name2numcollabs, ksu_name2papers, 'ksu_isolated.json')
get_isolated_units(mit_name2numcollabs, mit_name2papers, 'mit_isolated.json')

In [9]:
# Write ndx csv fro crossfilter (visualization)

def get_authors_ndx(authors_ndx_filename, authors_filename, name2numcollabsafterthreshold, paper2affiliations):
    current_paper_id = -1
    with open(authors_ndx_filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["paper", "title", "author", "affiliation", "cite", "year", "collabs"])
        with open(authors_filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                if line_count == 0:
                    line_count += 1
                    continue
                paper_id = row[0].strip()
                paper_title = row[1].strip()
                paper_authors = row[2].strip()
                paper_affiliaitons = row[3].strip()
                paper_cite = row[4].strip()
                paper_year = row[5].strip()
                if paper_id == current_paper_id:
                    paper_year = '-'
                    paper_cite = 0
                else:
                    if paper_affiliaitons in name2numcollabsafterthreshold and name2numcollabsafterthreshold[paper_affiliaitons] != set() and name2numcollabsafterthreshold[paper_affiliaitons] > 0:
                        writer.writerow([paper_id, paper_title, paper_authors, paper_affiliaitons, paper_cite, paper_year, len(paper2affiliations[paper_title]) - 1])
                    current_paper_id = paper_id 

get_authors_ndx('kfupm_authors_ndx.csv', 'kfupm_authors.csv', kfupm_name2numcollabsafterthreshold, kfupm_paper2affiliations)
get_authors_ndx('ksu_authors_ndx.csv', 'ksu_authors.csv', ksu_name2numcollabsafterthreshold, ksu_paper2affiliations)
get_authors_ndx('mit_authors_ndx.csv', 'mit_authors.csv', mit_name2numcollabsafterthreshold, mit_paper2affiliations)

In [10]:
# Grants

def read_grants(paper_filename, grants_filename, paper2affiliations):
    name2grants = ddict(set)
    with open(grants_filename) as grant_csv_file:
        grant_csv_reader = csv.reader(grant_csv_file, delimiter=',')
        grant_line_count = 0
        for row in grant_csv_reader:
            if grant_line_count == 0:
                grant_line_count += 1
                continue
            id = row[0].strip()
            grant = row[1].strip()
            with open(paper_filename) as paper_csv_file:
                paper_csv_reader = csv.reader(paper_csv_file, delimiter=',')
                line_count = 0
                for paper_row in paper_csv_reader:
                    if line_count == 0:
                        line_count += 1
                        continue
                    paper_id = paper_row[0].strip()
                    paper_title = paper_row[1].strip()
                    if paper_id == id:
                        for affiliation in paper2affiliations[paper_title]:
                            if name2grants[affiliation] == set():
                                name2grants[affiliation] = 0
                                name2grants[affiliation] += int(grant)
                            else:
                                name2grants[affiliation] += int(grant)
                        break;
                    line_count += 1
            grant_line_count += 1
        return name2grants

kfupm_name2grants = json.load(open('kfupm_name2grants.json'))
# read_grants('kfupm_authors.csv', 'kfupm_grants.csv', kfupm_paper2affiliations)
ksu_name2grants = json.load(open('ksu_name2grants.json'))
# read_grants('ksu_authors.csv', 'ksu_grants.csv', ksu_paper2affiliations)
mit_name2grants = json.load(open('mit_name2grants.json'))
# mit_name2grants = read_grants('mit_authors.csv', 'mit_grants.csv', mit_paper2affiliations)

In [15]:
def get_faculty_lst(filename):
    faculty = []
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
                continue
            name = row[0].strip() 
            if name not in faculty:
                faculty.append(name)
            line_count += 1
    return faculty
  
kfupm_faculty_list = get_faculty_lst('kfupm_faculty.csv')
ksu_faculty_list = get_faculty_lst('ksu_faculty.csv')
mit_faculty_list = get_faculty_lst('mit_faculty.csv')

In [16]:
def get_authors2contribution(paper2authorsaffiliation):
    authors2contribution = ddict(set)
    for paper, authors in paper2authorsaffiliation.items():
        for author, affiliation in authors:
            if author not in authors2contribution:
                authors2contribution[author] = ddict(set)
            if affiliation not in authors2contribution[author]:
                authors2contribution[author][affiliation] = 0
            authors2contribution[author][affiliation] += 1/len(authors)
    return authors2contribution


kfupm_authors2contribution = get_authors2contribution(kfupm_paper2authorsaffiliation)
ksu_authors2contribution = get_authors2contribution(ksu_paper2authorsaffiliation)
mit_authors2contribution = get_authors2contribution(mit_paper2authorsaffiliation)
 
def get_authors2cfs(paper2authorsaffiliation, faculty_list):    
    authors2faculty = ddict(set)
    authors2students = ddict(set)
    for paper, authors in paper2authorsaffiliation.items():
        for author, affiliation in authors:
            if author not in authors2faculty:
                authors2faculty[author] = ddict(set)
                authors2faculty[author][affiliation] = []
            if author not in authors2students:
                authors2students[author] = ddict(set)
                authors2students[author][affiliation] = []
            if affiliation not in authors2faculty[author]:
                authors2faculty[author][affiliation] = []
            if affiliation not in authors2students[author]:
                authors2students[author][affiliation] = []
            for author2, affiliation2 in authors:
                if author2 == author:
                    continue
                if author not in authors2faculty:
                    authors2faculty[author] = ddict(set)
                if author not in authors2students:
                    authors2students[author] = ddict(set)
                if author2 in faculty_list:
                    if affiliation not in authors2faculty[author]:
                        authors2faculty[author][affiliation] = []
                    if author2 not in authors2faculty[author][affiliation]:
                        authors2faculty[author][affiliation].append(author2)
                else:
                    if affiliation not in authors2students[author]:
                        authors2students[author][affiliation] = []
                    if author2 not in authors2students[author][affiliation]:
                        authors2students[author][affiliation].append(author2)
        
    return authors2faculty, authors2students    

kfupm_authors2faculty, kfupm_authors2students = get_authors2cfs(kfupm_paper2authorsaffiliation, kfupm_faculty_list)
ksu_authors2faculty, ksu_authors2students = get_authors2cfs(ksu_paper2authorsaffiliation, ksu_faculty_list)
mit_authors2faculty, mit_authors2students = get_authors2cfs(mit_paper2authorsaffiliation, mit_faculty_list)
            

In [17]:
# Define each unit
    
def define_unit(filename, affiliations_list, name2collabs, name2numcollabs, name2grants, research_area_filename, authors2contribution, authors2faculty, authors2students, paper2authors):
    units = []
    paper_id = -1
    unit_id = 1
    for affiliation in affiliations_list:
        num_papers = 0
        num_citations = 0
        authors = []
        authors2num = ddict(set)
        areas2num = ddict(set)
        title2cite = ddict(set)
        collabs = []
        affiliation_collabs = name2collabs[affiliation]
        affiliation_collabs = sorted(affiliation_collabs, key=itemgetter(0), reverse=True)
        affiliation_collabs = affiliation_collabs[0:5]
        for collab in affiliation_collabs:
            collabs.append({"collab": collab[1], "number": collab[0]})
        with open(filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                if line_count == 0:
                    line_count += 1
                    continue
                current_paper_id = row[0].strip()
                paper_title = row[1].strip()
                paper_authors = row[2].strip()
                paper_affiliation = row[3].strip()
                paper_cite = row[4].strip()
                if paper_affiliation == affiliation:
                    if(current_paper_id != paper_id):
                        num_papers += 1 
                        if paper_cite != '': 
                            num_citations += int(paper_cite)
                            title2cite[paper_title] = paper_cite
                        with open(research_area_filename) as area_file:
                            area_reader = csv.reader(area_file, delimiter=',')
                            line_count = 0
                            for row1 in area_reader:
                                row_id = row1[0].strip()
                                area = row1[1].strip()
                                if row_id == current_paper_id:
                                    if areas2num[area] == set():
                                        areas2num[area] = 1
                                    else:
                                        areas2num[area] += 1
                    if authors2num[paper_authors] == set():
                        if paper_cite != '': 
                            cite = int(paper_cite)
                        else:       
                            cite = 0
                        authors2num[paper_authors] = [1, cite]
                    else:
                        num_papers_per_author = authors2num[paper_authors][0]
                        num_papers_per_author += 1
                        num_citations_per_author = authors2num[paper_authors][1]
                        if paper_cite != '': 
                            cite = int(paper_cite)
                        else: 
                            cite = 0
                        num_citations_per_author += cite
                        authors2num[paper_authors] = [num_papers_per_author, num_citations_per_author]
                    
                    paper_id = current_paper_id
                line_count += 1
        authors2num = dict(sorted(authors2num.items(), key=lambda item: item[1], reverse=True))
        authors2num = dict(itertools.islice(authors2num.items(), 5))
        for author, [num, cites] in authors2num.items():
            authors.append({"author": author, "papers": num, "citations": cites, "contributions": round(authors2contribution[author][affiliation],2), "faculty": len(authors2faculty[author][affiliation]), "students": len(authors2students[author][affiliation])})
        title2cite = dict(sorted(title2cite.items(), key=lambda item: item[1], reverse=True))
        title2cite = dict(itertools.islice(title2cite.items(), 5))
        papers = []
        for title, cite in title2cite.items():
            papers.append({"title": title, "citations": cite, "authors": len(paper2authors[title])})
        areas2num = dict(sorted(areas2num.items(), key=lambda item: item[1], reverse=True))
        areas2num = dict(itertools.islice(areas2num.items(), 10))
        areas = []
        for area, num in areas2num.items():
            areas.append({"area": area, "number": num})
        num_collabs = 0
        if affiliation in name2numcollabs and name2numcollabs[affiliation] != set():
            num_collabs = name2numcollabs[affiliation]
        num_grants = 0
        num_grants = name2grants[affiliation]
        units.append({"id": unit_id, "name": affiliation, "num_papers": num_papers, "num_collabs": num_collabs, "num_citations": num_citations, "num_grants": num_grants, "authors": authors, "papers": papers, "areas": areas, "collabs": collabs})
        unit_id += 1
    return units

kfupm_units = define_unit('kfupm_authors.csv', kfupm_affiliations_list, kfupm_name2collabs, kfupm_name2numcollabs, kfupm_name2grants, 'kfupm_research_areas.csv', kfupm_authors2contribution, kfupm_authors2faculty, kfupm_authors2students, kfupm_paper2authors)
write_json(kfupm_units, filename='kfupm_units.json')

ksu_units = define_unit('ksu_authors.csv', ksu_affiliations_list, ksu_name2collabs, ksu_name2numcollabs, ksu_name2grants, 'ksu_research_areas.csv',  ksu_authors2contribution, ksu_authors2faculty, ksu_authors2students, ksu_paper2authors)
write_json(ksu_units, filename='ksu_units.json')

mit_units = define_unit('mit_authors.csv', mit_affiliations_list, mit_name2collabs, mit_name2numcollabs, mit_name2grants, 'mit_research_areas.csv', mit_authors2contribution, mit_authors2faculty, mit_authors2students, mit_paper2authors)
write_json(mit_units, filename='mit_units.json')

In [35]:
# Collab heatmap

def get_heatmap(name2numcollabs, df, heatmap_filename):
    collab_heatmap = []
    zeroes = 0
    name2numcollabsafterthreshold = dict(itertools.islice(name2numcollabs.items(), 20))

    for name, num in name2numcollabsafterthreshold.items():
        collab_heatmap_row = []
        for index, row in df.iterrows():
            if index == name:
                for name2, num in name2numcollabsafterthreshold.items():
                    collab_heatmap_row.append({"x": name2, "y": int(row[name2])})
                    if int(row[name2]) == 0:
                        zeroes += 1
        collab_heatmap.append({"name": name, "data": collab_heatmap_row})

    heatmap_json = {'series': collab_heatmap}
  
    write_json(heatmap_json, filename=heatmap_filename)
           
get_heatmap(kfupm_name2numcollabs, kfupm_df, 'kfupm_heatmap.json')

get_heatmap(ksu_name2numcollabs, ksu_df, 'ksu_heatmap.json')

get_heatmap(mit_name2numcollabs, mit_df, 'mit_heatmap.json')  


In [20]:
def name_citations(filename, affiliations_list):
    name2citations = ddict(set)
    for affiliation in affiliations_list:
        citations = 0
        papers_id = []
        with open(filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                if line_count == 0:
                    line_count += 1
                    continue
                paper_id = row[0].strip()  
                paper_affiliation = row[3].strip()
                paper_cite = row[4].strip()
                if paper_affiliation == affiliation:
                    if paper_id not in papers_id:
                        citations += int(paper_cite)
                        papers_id.append(paper_id)
                line_count += 1
        name2citations[affiliation] = citations
    return name2citations
  
kfupm_name2citations = name_citations('kfupm_authors.csv', kfupm_affiliations_list)

ksu_name2citations = name_citations('ksu_authors.csv', ksu_affiliations_list)

mit_name2citations = name_citations('mit_authors.csv', mit_affiliations_list)


KeyboardInterrupt: 

In [21]:
def name_facutly(filename, affiliations_list):
    name2faculty = ddict(set)
    for affiliation in affiliations_list:
        faculty = []
        with open(filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                if line_count == 0:
                    line_count += 1
                    continue
                author = row[0].strip() 
                faculty_affiliation = row[1].strip()
                if faculty_affiliation == affiliation:
                    faculty.append(author)
                line_count += 1
        name2faculty[affiliation] = faculty
    return name2faculty
  
kfupm_name2faculty = name_facutly('kfupm_faculty.csv', kfupm_affiliations_list)
ksu_name2faculty = name_facutly('ksu_faculty.csv', ksu_affiliations_list)
mit_name2faculty = name_facutly('mit_faculty.csv', mit_affiliations_list)

In [22]:
def get_name2numfaculty(name2faculty, filename):
    name2numfaculty = ddict(set)
    # for author, affiliaiton in author2affiliation.items():
    #     if author not in name2faculty[list(affiliaiton)[0]]:
    #         name2faculty[list(affiliaiton)[0]].append(author)

    for name, faculties in name2faculty.items():
        name2numfaculty[name] = len(faculties)

    # df = pd.DataFrame(name2faculty["Information and Computer Science Department"]) 
    # df.to_csv(r'faculty_csv.csv')
    write_json(name2numfaculty, filename=filename)
    
get_name2numfaculty(kfupm_name2faculty, 'kfupm_faculty_num.json')
get_name2numfaculty(ksu_name2faculty, 'ksu_faculty_num.json')
get_name2numfaculty(mit_name2faculty, 'mit_faculty_num.json')

In [34]:
# Sacatter Plots

kfupm_collab_citations = []
ksu_collab_citations = []
mit_collab_citations = []
data1 = []
data2 = []
for affiliation in kfupm_affiliations_list:
    unit_collab_citations = []
    if affiliation in kfupm_name2numcollabs:
        collabs = kfupm_name2numcollabs[affiliation]
    else:
        collabs = 0
    unit_collab_citations.append(collabs)
    data1.append(collabs)
    data2.append(kfupm_name2citations[affiliation])
    unit_collab_citations.append(kfupm_name2citations[affiliation])
    kfupm_collab_citations.append(unit_collab_citations)
    
corr, _ = pearsonr(data1, data2)
print(corr)
    
for affiliation in ksu_affiliations_list:
    unit_collab_citations = []
    if affiliation in ksu_name2numcollabs:
        collabs = ksu_name2numcollabs[affiliation]
    else:
        collabs = 0
    unit_collab_citations.append(collabs)
    data1.append(collabs)
    data2.append(ksu_name2citations[affiliation])
    unit_collab_citations.append(ksu_name2citations[affiliation])
    ksu_collab_citations.append(unit_collab_citations)
    
corr, _ = pearsonr(data1, data2)
print(corr)

for affiliation in mit_affiliations_list:
    unit_collab_citations = []
    if affiliation in mit_name2numcollabs:
        collabs = mit_name2numcollabs[affiliation]
    else:
        collabs = 0
    if collabs > 585:
        continue
    unit_collab_citations.append(collabs)
    data1.append(collabs)
    data2.append(mit_name2citations[affiliation])
    unit_collab_citations.append(mit_name2citations[affiliation])
    mit_collab_citations.append(unit_collab_citations)
    
corr, _ = pearsonr(data1, data2)
print(corr)

collab_citations = [{"name": "KFUPM", "data": kfupm_collab_citations}, {"name": "KSU", "data": ksu_collab_citations}, {"name": "MIT", "data": mit_collab_citations}]
write_json(collab_citations, filename='collab_citation_scatter.json') 

kfupm_collab_grant = []
ksu_collab_grant = []
mit_collab_grant = []

for affiliation in kfupm_affiliations_list:
    unit_collab_grant = []
    if affiliation in kfupm_name2numcollabs:
        collabs = kfupm_name2numcollabs[affiliation]
    else:
        collabs = 0
    unit_collab_grant.append(collabs)
    data1.append(collabs)
    data2.append(kfupm_name2grants[affiliation])
    unit_collab_grant.append(kfupm_name2grants[affiliation])
    kfupm_collab_grant.append(unit_collab_grant)
    
corr, _ = pearsonr(data1, data2)
print(corr)
    
for affiliation in ksu_affiliations_list:
    unit_collab_grant = []
    if affiliation in ksu_name2numcollabs:
        collabs = ksu_name2numcollabs[affiliation]
    else:
        collabs = 0
    unit_collab_grant.append(collabs)
    data1.append(collabs)
    data2.append(ksu_name2grants[affiliation])
    unit_collab_grant.append(ksu_name2grants[affiliation])
    ksu_collab_grant.append(unit_collab_grant)
    
corr, _ = pearsonr(data1, data2)
print(corr)
    
for affiliation in mit_affiliations_list:
    unit_collab_grant = []
    if affiliation in mit_name2numcollabs:
        collabs = mit_name2numcollabs[affiliation]
    else:
        collabs = 0
    if collabs > 735:
        continue
    unit_collab_grant.append(collabs)
    data1.append(collabs)
    data2.append(mit_name2grants[affiliation])
    unit_collab_grant.append(mit_name2grants[affiliation])
    mit_collab_grant.append(unit_collab_grant)
    
corr, _ = pearsonr(data1, data2)
print(corr)

collab_grant = [{"name": "KFUPM", "data": kfupm_collab_grant}, {"name": "KSU", "data": ksu_collab_grant}, {"name": "MIT", "data": mit_collab_grant}]
write_json(collab_grant, filename='collab_grant_scatter.json') 

kfupm_collab_faculty = []
ksu_collab_faculty = []
mit_collab_faculty = []

for affiliation in kfupm_affiliations_list:
    unit_collab_faculty = []
    if affiliation in kfupm_name2numcollabs:
        collabs = kfupm_name2numcollabs[affiliation]
    else:
        collabs = 0
    unit_collab_faculty.append(collabs)
    data1.append(collabs)
    data2.append(len(kfupm_name2faculty[affiliation]))
    unit_collab_faculty.append(len(kfupm_name2faculty[affiliation]))
    kfupm_collab_faculty.append(unit_collab_faculty)
    
corr, _ = pearsonr(data1, data2)
print(corr)
    
for affiliation in ksu_affiliations_list:
    unit_collab_faculty = []
    if affiliation in ksu_name2numcollabs:
        collabs = ksu_name2numcollabs[affiliation]
    else:
        collabs = 0
    unit_collab_faculty.append(collabs)
    data1.append(collabs)
    data2.append(len(ksu_name2faculty[affiliation]))
    unit_collab_faculty.append(len(ksu_name2faculty[affiliation]))
    ksu_collab_faculty.append(unit_collab_faculty)
    
corr, _ = pearsonr(data1, data2)
print(corr)
    
for affiliation in mit_affiliations_list:
    unit_collab_faculty = []
    if affiliation in mit_name2numcollabs:
        collabs = mit_name2numcollabs[affiliation]
    else:
        collabs = 0
    if collabs > 735:
        continue
    unit_collab_faculty.append(collabs)
    data1.append(collabs)
    data2.append(len(mit_name2faculty[affiliation]))
    unit_collab_faculty.append(len(mit_name2faculty[affiliation]))
    mit_collab_faculty.append(unit_collab_faculty)

corr, _ = pearsonr(data1, data2)
print(corr)

collab_faculty = [{"name": "KFUPM", "data": kfupm_collab_faculty}, {"name": "KSU", "data": ksu_collab_faculty}, {"name": "MIT", "data": mit_collab_faculty}]
write_json(collab_faculty, filename='collab_faculty_scatter.json') 


0.8036594468755694
0.7835222160727313
0.8389233974362467
0.7892713412809362
0.7106224154830042
0.6409435589618206
0.6185853835908818
0.6264547550650493
0.6278965240523028


In [32]:
def get_name2collabsjson(name2collabs, filename):
    name2collabsjson = ddict(set)
    for name, collabs in name2collabs.items():
        for num, collab in collabs:
            if name not in name2collabsjson:
                 name2collabsjson[name] = ddict(set)
            name2collabsjson[name][collab] = num

    write_json(name2collabsjson, filename=filename) 

get_name2collabsjson(kfupm_name2collabs,'kfupm_name_collabs.json')
get_name2collabsjson(ksu_name2collabs,'ksu_name_collabs.json')
get_name2collabsjson(mit_name2collabs,'mit_name_collabs.json')

In [33]:
# Static

names=[]
num_collabs = 0
max_collabs = 0
min_collabs = 2000
d2d = 0
d2c = 0
c2c = 0
for name, collabs in kfupm_name2collabs.items():
    for num, collab in collabs:
        if collab not in names:
            num_collabs += num
            if num > max_collabs:
                max_collabs = num
            if num < min_collabs:
                min_collabs = num
            if "Department" in name and "Department" in collab:
                d2d += num
            if "Department" in name and "Center" in collab:
                d2c += num
            if "Center" in name and "Department" in collab:
                d2c += num
            if "Center" in name and "Center" in collab:
                c2c += num
    names.append(name)

static = {"total_num_collabs": num_collabs, "max_collabs": max_collabs, "min_collabs": min_collabs, "d2d": d2d, "d2c": d2c, "c2c": c2c}

write_json(static, filename='kfupm_static.json') 

names=[]
num_collabs = 0
max_collabs = 0
min_collabs = 2000
d2d = 0
d2c = 0
c2c = 0
for name, collabs in ksu_name2collabs.items():
    for num, collab in collabs:
        if collab not in names:
            num_collabs += num
            if num > max_collabs:
                max_collabs = num
            if num < min_collabs:
                min_collabs = num
            if "Department" in name and "Department" in collab:
                d2d += num
            if "Department" in name and "Center" in collab:
                d2c += num
            if "Center" in name and "Department" in collab:
                d2c += num
            if "Center" in name and "Center" in collab:
                c2c += num
    names.append(name)

static = {"total_num_collabs": num_collabs, "max_collabs": max_collabs, "min_collabs": min_collabs, "d2d": d2d, "d2c": d2c, "c2c": c2c}

write_json(static, filename='ksu_static.json') 

names=[]
num_collabs = 0
max_collabs = 0
min_collabs = 2000
d2d = 0
d2c = 0
c2c = 0
for name, collabs in mit_name2collabs.items():
    for num, collab in collabs:
        if collab not in names:
            num_collabs += num
            if num > max_collabs:
                max_collabs = num
            if num < min_collabs:
                min_collabs = num
            if "Department" in name and "Department" in collab:
                d2d += num
            if "Department" in name and "Center" in collab:
                d2c += num
            if "Center" in name and "Department" in collab:
                d2c += num
            if "Center" in name and "Center" in collab:
                c2c += num
    names.append(name)

static = {"total_num_collabs": num_collabs, "max_collabs": max_collabs, "min_collabs": min_collabs, "d2d": d2d, "d2c": d2c, "c2c": c2c}

write_json(static, filename='mit_static.json') 