# Analyzing Biorxiv Data

### *Created: 6 May 2018*
#### Author: Ali Sina Booeshaghi
#### Summary: This notebook summarizes the efforts to analyze the Biorxiv Data. 

## **Loading the Data**

In [8]:
%matplotlib inline

## Cell to import all packages
import json
import os
import operator
import pprint
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
from dateutil import parser
import pandas as pd
from numpy import loadtxt
# use the convention variable_name

In [80]:
# Load the smallest file, change data1.txt -> data21.txt for full data
path_to_data = os.getcwd() + '/complete_data_2018_5_11/data/data1.txt'
path_to_summary = os.getcwd() + '/complete_data_2018_5_11/analysis/journal_summary.txt'
path_to_save = os.getcwd() + '/complete_data_2018_5_11/analysis'


with open(path_to_data, 'rb') as f:
    papers = json.load(f)

with open(path_to_summary, 'rb') as f:
    journal_summary = json.load(f)
    
# The way to access the data is as follows:
# papers['papers'] gives you a list of all of the downloaded papers
# papers['papers'][0] gives you the first one
# papers['papers'][0][keyword] for keyword in ['abstract', 'authors', 'date', 'journal', 'link', 'text', 'title', 'twitter'] to access that info

## **Now lets look at some general information about the data we have**

In [81]:
expected_number_of_papers = 20500
number_of_papers = len(papers['papers'])
number_of_journals = len(journal_summary)
journal_summary_sorted = sorted(journal_summary.items(), key=operator.itemgetter(1), reverse=True) # Sorts by most represented journal

print "Number of papers : ", number_of_papers
print "Number of journals : ", number_of_journals
print "Top ten journals by number of papers : "
pprint.pprint(journal_summary_sorted[0:10])

Number of papers :  24598
Number of journals :  1124
Top ten journals by number of papers : 
[(u'Pre print', 15979),
 (u'Scientific Reports', 473),
 (u'eLife', 406),
 (u'PLOS ONE', 368),
 (u'Bioinformatics', 286),
 (u'Nature Communications', 229),
 (u'PNAS', 220),
 (u'PLOS Computational Biology', 211),
 (u'Genetics', 179),
 (u'PLOS Genetics', 176)]


## **Here we have to now iterate through all of the data in papers to get richer information**

In [82]:
global_author_list = []
global_date_dict = {}
author_network = []
for paper_num in range(len(papers['papers'])):
    abstract = papers['papers'][paper_num]['abstract']
    authors  = papers['papers'][paper_num]['authors']
    date     = papers['papers'][paper_num]['date']
    journal  = papers['papers'][paper_num]['journal']
    link     = papers['papers'][paper_num]['link']
    text     = papers['papers'][paper_num]['text']
    title    = papers['papers'][paper_num]['title']
    twitter  = papers['papers'][paper_num]['twitter']
    
    author_network.append(authors)
    # Getting a list of all of the authors
    for dude in authors:
        global_author_list.append(dude)
    
    # Getting a list of all of the dates the papers were posted
    if date not in global_date_dict:
        global_date_dict[date] = 1
    else:
        global_date_dict[date] += 1

### Author Information

In [83]:
# Getting author number counts
duplicates =  len(global_author_list)
global_author_tuple = [tuple(lst) for lst in global_author_list]
global_author_list = list(set(global_author_tuple))

# Checking out affiliations
global_aff_dict = {}
global_aff_list = []
for dude in global_author_list:
    name = dude[0]
    aff = dude[2]
    if aff in global_aff_dict:
        # if the dude is aleady in the aff list
        global_aff_dict[aff] += 1
    else:
        global_aff_dict[aff] = 1
        
    global_aff_list.append(aff)

# NOTE: This sorts the instutitions by the number of unique authors form that institution, 
# TODO: I definitely double counted
# TODO: The same thing but for the number of papers
# TODO: fix numbers of authors

global_aff_list = global_aff_dict.keys()
len(global_aff_dict)
global_aff_list_sorted = sorted(global_aff_dict.items(), key=operator.itemgetter(1), reverse=True) # Sorts by most represented journal

In [84]:
number_of_total_authors = len(global_author_list)
number_of_authors_pub_many = duplicates - number_of_total_authors
number_of_authors_pub_once = number_of_total_authors - number_of_authors_pub_many

number_of_publishing_institutions = len(global_aff_list)

print "Total number of authors : ", number_of_total_authors
print "Total number of authors who pub once : ", number_of_authors_pub_once
print "Total number of authors who pub many : ", number_of_authors_pub_many
print "Total number of publishing institutions : ", number_of_publishing_institutions

Total number of authors :  140925
Total number of authors who pub once :  121874
Total number of authors who pub many :  19051
Total number of publishing institutions :  32600


### Graph of Authors and Collaborators

#### *Creating Edge and Node List*

In [91]:
trimmed_author_network = []
for paper in author_network:
    a_list = []
    for author in paper:
      a_list.append(author[0] + ', ' + author[1])  
    trimmed_author_network.append(a_list)


In [92]:
# create edge list
edges = [[i[0],j] for i in trimmed_author_network for j in i[1:]]
nodes = []
for paper in trimmed_author_network:
    for dude in paper:
        if dude not in nodes:
            nodes.append(dude)
# check for duplicates
len(nodes) != len(set(nodes))

False

In [93]:
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout, to_agraph
## TODO: add identifiers to each node, ie institution
def draw_graph(n, e):

    # get nodes and edges
    nodes = n
    edges = e
    
    # create networkx graph
    G=nx.Graph()

    # add nodes
    G.add_nodes_from(nodes)

    # add edges
    G.add_edges_from(edges)
    print nx.info(G)

    # draw graph
    return G

# draw example
G = draw_graph(nodes, edges)

Name: 
Type: Graph
Number of nodes: 118772
Number of edges: 130208
Average degree:   2.1926


In [99]:
G.neighbors('Thomson, Matthew')

[u'Aull, Katherine H.', u'Gehring, Jase', u'Hendel, Nathan L.']

### Date Information

In [258]:
# Working with dates
dates, number_of_papers_posted = zip(*sorted(global_date_dict.items()))
dates = list(dates)
number_of_papers_posted = list(number_of_papers_posted)

In [259]:
# Plotting number of papers posted over time
# TODO: for plotting dates need to remove the once with errors ie DOI not found
dates_dt = []
for day in dates:
    dates_dt.append(parser.parse(day))
    
dates_matplot = mdates.date2num(dates_dt)

date_series = pd.Series(dates_dt, number_of_papers_posted)
date_series.hist(bins=12)
plt.show()

## **Random Scripts that I don't really need anymore**

In [17]:
## This scripts takes the summary document and produces a dictionary with journal, num of paper pairs. it uses the old summary.txt from /complete_data/data/pdfs/
summary_dictionary = {}
for n in summary:
    new = n.split(':')
    journal = new[0:-1][0]
    ''.join(journal)
    number_of_journal_papers = new[-1].strip('\n')
    summary_dictionary[journal] = int(number_of_journal_papers)

with open(path_to_save + '/journal_summary.txt', 'w') as f:
    json.dump(summary_dictionary, f)