In [None]:
import networkx as nx
import csv
from collections import Counter
import itertools
import datetime

#### api_query is a wrapper function for the Dimensions DSL API - get in touch with the Dimensions team for a copy ###
from dimension_api import api_query 

In [None]:
##### Generate Researcher Collaboration Network from Publications   #######
#
#  From the list of publications identified by DOI, extract the researchers that have a researcher_id.
#  For each researcher, establish their coauthors, number of publications, most recent institution, country,
#  and dominant FOR code.
#
# Record the start time; the last cell subtracts it from the end time to report the run duration.
start = datetime.datetime.now()
print(start)

In [None]:
def publicationsfromdoi(dois,limit=1000,skip=0):
    """Build the DSL query text that looks up publications by DOI.

    dois  -- iterable of DOI values; each one is wrapped in double quotes and
             the quoted values are comma-joined inside the `doi in [...]` filter
    limit -- value placed after `limit` in the query (default 1000)
    skip  -- value placed after `skip` in the query (default 0)

    The query is restricted to articles from 2012 to 2017 and asks for the
    year, author_affiliations and FOR fields.  Returns the query as a string;
    nothing is sent to the API here.
    """
    quoted_dois = ['"{}"'.format(doi) for doi in dois]
    doi_filter = ",".join(quoted_dois)

    query_template = """
    search publications
       where
          doi in [{}]        
          and year in [2012:2017]
          and type = "article"
    return publications[year+author_affiliations+FOR] limit {} skip {}
    """
    return query_template.format(doi_filter, limit, skip)

In [None]:
def eePublicationsfromlist(path='Authors_DOIs.txt', chunk_size=250):
    """Fetch the publications for every DOI listed in a tab-separated file.

    path       -- file to read; the DOI is taken from the first column of
                  each row (default 'Authors_DOIs.txt')
    chunk_size -- number of DOIs sent per API query (default 250)

    Blank rows and empty DOIs are skipped, and each DOI is queried only once
    (first occurrence wins).  Otherwise a DOI repeated in two different chunks
    would return its publication twice and be double-counted downstream.

    Returns the concatenated list of publication records returned by
    `api_query`.  Prints the running total after each chunk as a progress
    indicator.

    Raises ValueError if chunk_size is not between 1 and the per-query limit.
    """
    # Every query is sent with this `limit` and skip=0 (no paging), so a chunk
    # larger than the limit could be silently truncated.
    max_results = 1000
    if not 0 < chunk_size <= max_results:
        raise ValueError(
            'chunk_size must be between 1 and {}'.format(max_results))

    dois = []
    seen = set()
    # newline='' is what the csv module expects for files it parses itself.
    with open(path, 'rt', newline='') as csvfile:
        doireader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in doireader:
            # csv yields [] for a blank line; row[0] would raise IndexError.
            doi = row[0].strip() if row else ''
            if doi and doi not in seen:
                seen.add(doi)
                dois.append(doi)

    pubs = []
    for first in range(0, len(dois), chunk_size):
        ids = dois[first:first + chunk_size]
        query = publicationsfromdoi(dois=ids, limit=max_results, skip=0)
        pubs += api_query(query).get('publications', [])
        print(len(pubs))

    return pubs
    

In [None]:
#### Get all the publications
# Runs the API queries; the edge and node cells below both read `allpublications`.
allpublications = eePublicationsfromlist()

In [None]:
### Edges

# One list of researcher ids per author group of each publication.  Authors
# that carry no 'researcher_id' key are left out.
researchers = [[r['researcher_id'] for r in a if 'researcher_id' in r]
               for p in allpublications
               for a in p.get('author_affiliations', [])]

# Every ordered pairing of the researchers listed together, one list per
# author group.
cartesian_resaerchers = [list(itertools.product(rs, rs)) for rs in researchers]

# Preview the first group's pairs.  Guarded because indexing [0] raises
# IndexError when the query returned no author groups at all.
if cartesian_resaerchers:
    print(cartesian_resaerchers[0])
else:
    print('no author groups found - the collaboration network will be empty')

# Flatten the per-group lists so the pairs can be counted.  Keeping only
# pairs whose first id sorts before the second drops self-pairs and the
# mirrored copy of each pair, leaving one instance per unordered pair.
flat_list = [edge_instance
             for publist in cartesian_resaerchers
             for edge_instance in publist
             if edge_instance[0] < edge_instance[1]]

# Count how often each pair occurs; the count becomes the edge weight.
edges = Counter(flat_list)
list(edges.items())

In [None]:
### Nodes

# For every author group of every publication, pair each author that carries a
# 'researcher_id' key with the publication's year and its FOR records.
researcher_details = [
    [(author, pub.get('year'), pub.get('FOR', []))
     for author in group
     if 'researcher_id' in author]
    for pub in allpublications
    for group in pub.get('author_affiliations', [])
]

# Flatten the per-group lists into one list of (author, year, FOR) tuples.
researcher_instance_list = list(itertools.chain.from_iterable(researcher_details))

# Number of instances per researcher id; stored as the node's 'size'.
researcher_size = Counter(instance[0]['researcher_id']
                          for instance in researcher_instance_list)

# Node attributes keyed by researcher id, filled in by the loop further down.
nodes = dict()

def updateresearherdef(ri,previousri):
    """Build the node attributes for a researcher from one publication instance.

    ri         -- tuple of (author record, publication year, FOR records)
    previousri -- attributes already stored for this researcher ({} if none);
                  only its 'researchFields' entry is read

    Returns a new dict with the author's names, the country / id / name of the
    first listed affiliation ('' when there is none), the researcher's count
    from `researcher_size`, the publication year, the accumulated list of
    4-character field codes and the most frequent of those codes.
    """
    author, pub_year, for_records = ri[0], ri[1], ri[2]

    # Carry forward the codes collected so far and add this publication's.
    new_codes = [record['name'][:4] for record in for_records]
    field_codes = previousri.get('researchFields', []) + new_codes

    # The ('0000', 0) placeholder only wins when no codes exist at all.
    # max() returns the first of several equally frequent codes.
    tally = Counter(field_codes)
    candidates = list(tally.items()) + [('0000', 0)]
    dominant_code = max(candidates, key=lambda pair: pair[1])[0]

    # Appending {} guarantees index 0 exists even for an empty affiliation list.
    affiliation_options = author.get('affiliations', [{}]) + [{}]
    first_affiliation = affiliation_options[0]

    details = {
        'first_name': author['first_name'],
        'last_name': author['last_name'],
        'country': first_affiliation.get('country', ''),
        'grid_id': first_affiliation.get('id', ''),
        'insitution_name': first_affiliation.get('name', ''),
        'size': researcher_size[author['researcher_id']],
        'year': pub_year,
        'primaryFOR': dominant_code,
        'researchFields': field_codes,
    }
    return details
    

# Fold every (author, year, FOR) instance into its researcher's node.
#
# The FOR codes of *every* instance are accumulated so that primaryFOR
# reflects all of a researcher's publications.  Previously the update was
# skipped unless the publication was strictly newer than the stored one, so
# the codes of same-year and older publications were silently discarded and
# the result depended on iteration order.
for ri in researcher_instance_list:
    researcher = ri[0]['researcher_id']
    previous = nodes.get(researcher, {})
    candidate = updateresearherdef(ri, previous)

    if previous and previous['year'] >= ri[1]:
        # Not a newer publication: keep the stored name, affiliation and year,
        # and take only the accumulated field codes and recomputed primaryFOR.
        candidate = dict(previous,
                         primaryFOR=candidate['primaryFOR'],
                         researchFields=candidate['researchFields'])

    nodes[researcher] = candidate



In [None]:

# Drop the working list of field codes from each node before the graph is built.
# NOTE(review): presumably because list-valued attributes cannot be written to GraphML - confirm.
# pop() with a default keeps this cell re-runnable; `del` raised KeyError on a second run.
for attributes in nodes.values():
    attributes.pop('researchFields', None)

  

In [None]:
    # Assemble the collaboration graph: one node per researcher with its
    # attribute dict, one edge per researcher pair weighted by how often the
    # pair was counted, then save it in GraphML format.
    G = nx.Graph()
    G.add_nodes_from(list(nodes.items()))
    G.add_edges_from([(source, target, {'weight': count})
                      for (source, target), count in edges.items()])
    nx.write_graphml(G, "eeCollaborations2.graphml")

In [None]:
### You can now load the GraphML file written above into a network tool such as Gephi for further processing.

In [None]:
# Report the finish time and the elapsed time since `start` was recorded in the first timing cell.
end = datetime.datetime.now()
print(end)
print('the process took ',end-start,' to complete')