# Analyzing Biorxiv Data

### *Created: 6 May 2018*
#### Author: Ali Sina Booeshaghi
#### Summary: This notebook summarizes the efforts to analyze the Biorxiv Data. 

## **Load the Data**

In [21]:
%matplotlib inline
%time

## Cell to import all packages
import json
import os
import operator
import pprint
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
from dateutil import parser
import pandas as pd
from numpy import loadtxt
from uuid import uuid4
import ctypes
import hashlib
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

from bokeh.plotting import figure, gridplot
from bokeh.io import output_notebook, show

# for fuuzzy string matching
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# use the convention variable_name

output_notebook()


CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 4.05 µs


In [12]:
%time
# Load the smallest file, change data1.txt -> data21.txt for full data
path_to_data = os.getcwd() + '/complete_data_2018_5_11/data/data1.txt'
path_to_summary = os.getcwd() + '/complete_data_2018_5_11/analysis/journal_summary.txt'
path_to_save = os.getcwd() + '/complete_data_2018_5_11/analysis'


with open(path_to_data, 'rb') as f:
    papers = json.load(f)

with open(path_to_summary, 'rb') as f:
    journal_summary = json.load(f)
    
# The way to access the data is as follows:
# papers['papers'] gives you a list of all of the downloaded papers
# papers['papers'][0] gives you the first one
# papers['papers'][0][keyword] for keyword in ['abstract', 'authors', 'date', 'journal', 'link', 'text', 'title', 'twitter'] to access that info

data = pd.DataFrame(papers['papers'])
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs


In [13]:
fix = data['twitter'][1004]
data['twitter'][1004] = fix[1:]

In [14]:
data['twitter'][1004]

[[u' 19 Dec 2014 ', 6],
 [u' 22 Dec 2014 ', 3],
 [u' 24 Dec 2014 ', 1],
 [u' 20 Dec 2014 ', 6],
 [u' 13 Feb 2015 ', 9],
 [u' 12 Feb 2015 ', 1],
 [u' 16 Feb 2015 ', 5],
 [u' 21 Dec 2014 ', 2]]

## **Using the categories as lables in a classifier**

In [33]:
data['clean abstract'] = data['abstract'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
X_train, X_test, y_train, y_test = train_test_split(data['clean abstract'], data.category, test_size=0.2)

pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True)),
                     ('chi',  SelectKBest(chi2, k=10000)),
                     ('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))])

model = pipeline.fit(X_train, y_train)

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

target_names = ['1', '2', '3', '4', '5']
print("top 10 keywords per class:")
for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % (label, " ".join(feature_names[top10])))

print("accuracy score: " + str(model.score(X_test, y_test)))

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.

## **Now lets look at some general information about the data we have**

In [3]:
%time
expected_number_of_papers = 20500
number_of_papers = len(papers['papers'])
number_of_journals = len(journal_summary)
journal_summary_sorted = sorted(journal_summary.items(), key=operator.itemgetter(1), reverse=True) # Sorts by most represented journal

print "Number of papers : ", number_of_papers
print "Number of journals : ", number_of_journals
print "Top ten journals by number of papers : "
pprint.pprint(journal_summary_sorted[0:10])

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
Number of papers :  1025
Number of journals :  1124
Top ten journals by number of papers : 
[(u'Pre print', 15979),
 (u'Scientific Reports', 473),
 (u'eLife', 406),
 (u'PLOS ONE', 368),
 (u'Bioinformatics', 286),
 (u'Nature Communications', 229),
 (u'PNAS', 220),
 (u'PLOS Computational Biology', 211),
 (u'Genetics', 179),
 (u'PLOS Genetics', 176)]


## **Working with dataframe**

In [35]:
data.head()

Unnamed: 0,abstract,authors,category,date,journal,link,text,title,twitter
0,Since the discovery of tumour initiating cells...,"[[Scott, Jacob, H. Lee Moffitt Cancer Center ...",[Cancer Biology],"November 7, 2013",PLOS Computational Biology,http://dx.doi.org/10.1371/journal.pcbi.1003433,/Users/lynnyi/Documents/Projects/biorxiv-data-...,Microenvironmental variables must influence in...,"[[ 13 Jan 2014 , 2], [ 27 Nov 2013 , 15], [ 28..."
1,Incoherent feedforward loops represent importa...,"[[Sen, Shaunak, Indian Institute of Technolog...",[Synthetic Biology],"November 7, 2013",Pre print,,/Users/lynnyi/Documents/Projects/biorxiv-data-...,Designing Robustness to Temperature in a Feedf...,[]
2,Mimulus guttatus and M. nasutus are an evoluti...,"[[Brandvain, Yaniv, Department of Evolution a...",[Evolutionary Biology],"November 7, 2013",PLOS Genetics,http://dx.doi.org/10.1371/journal.pgen.1004410,/Users/lynnyi/Documents/Projects/biorxiv-data-...,Speciation and introgression between Mimulus n...,"[[ 13 Nov 2013 , 4], [ 20 May 2014 , 1], [ 12 ..."
3,Dysregulated microRNA (miRNA) expression is a ...,"[[Wan, Ying-Wooi, Baylor College of Medicine]...",[Genomics],"November 13, 2013",PLOS ONE,http://dx.doi.org/10.1371/journal.pone.0087782,/Users/lynnyi/Documents/Projects/biorxiv-data-...,On the Reproducibility of TCGA Ovarian Cancer ...,"[[ 11 Dec 2013 , 1], [ 13 Nov 2013 , 2], [ 18 ..."
4,Hybrid zones can be valuable tools for studyin...,"[[Nadeau, Nicola, University of Cambridge], [...",[Evolutionary Biology],"November 12, 2013",Genome Research,http://dx.doi.org/10.1101/gr.169292.113,/Users/lynnyi/Documents/Projects/biorxiv-data-...,Population genomics of parallel hybrid zones i...,"[[ 06 Dec 2016 , 1], [ 17 Nov 2013 , 2], [ 28 ..."


### **Making twitter list** 

In [15]:
dt = []
b = []
for paper in data['twitter']:
    b = []
    for dates in paper:
        b.append(dates[0])
    dt.append(b)
    
rt = []
b = []
for paper in data['twitter']:
    b = []
    for dates in paper:
        b.append(dates[-1])
    rt.append(b)

In [16]:
data['rt'] = rt
data['dt'] = dt

In [17]:
for date_ind in range(len(data['dt'])):
    if len(data['dt'][date_ind])>0:
        data['dt'][date_ind] = pd.to_datetime(data['dt'][date_ind],infer_datetime_format=True)

In [18]:
data['dt'][0]

DatetimeIndex(['2014-01-13', '2013-11-27', '2013-11-28', '2013-11-22',
               '2013-12-01'],
              dtype='datetime64[ns]', freq=None)

In [40]:
p1 = figure(width=1000, height=500, title="Number of retweets vs days elapsed")
p1.xaxis.axis_label = 'Number of retweets'
p1.yaxis.axis_label = 'Days elapsed'
print data['rt'][0]

p1.circle(data['dt'][0], data['rt'][0], size=7, alpha=0.5, legend='')
p1.circle(data['dt'][1], data['rt'][1], size=7, alpha=0.5, legend='')
p1.circle(data['dt'][2], data['rt'][2], size=7, alpha=0.5, legend='')
p1.circle(data['dt'][3], data['rt'][3], size=7, alpha=0.5, legend='')
show(p1)

[2, 15, 1, 1, 2]


## **Iterate through all of the data in papers to get richer information**

In [5]:
%time
global_author_list = []
global_date_dict = {}
author_network = []
for paper_num in range(len(papers['papers'])):
    abstract = papers['papers'][paper_num]['abstract']
    authors  = papers['papers'][paper_num]['authors']
    date     = papers['papers'][paper_num]['date']
    journal  = papers['papers'][paper_num]['journal']
    link     = papers['papers'][paper_num]['link']
    text     = papers['papers'][paper_num]['text']
    title    = papers['papers'][paper_num]['title']
    twitter  = papers['papers'][paper_num]['twitter']
    
    author_network.append(authors)
    # Getting a list of all of the authors
    for dude in authors:
        global_author_list.append(dude)
    
    # Getting a list of all of the dates the papers were posted
    if date not in global_date_dict:
        global_date_dict[date] = 1
    else:
        global_date_dict[date] += 1

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [6]:
papers['papers'][0]

{u'abstract': u'Since the discovery of tumour initiating cells (TICs) in solid tumours, studies focussing on their role in cancer initiation and progression have abounded.  The biological interrogation of these cells continues to yield volumes of information on their pro-tumourigenic behaviour, but actionable generalised conclusions have been scarce. Further, new information suggesting a dependence of tumour composition and growth on the microenvironment has yet to be studied theoretically. To address this point, we created a hybrid, discrete/continuous computational cellular automaton model of a generalised stem-cell driven tissue with a simple microenvironment.  Using the model we explored the phenotypic traits inherent to the tumour initiating cells and the effect of the microenvironment on tissue growth. We identify the regions in phenotype parameter space where TICs are able to cause a disruption in homeostasis, leading to tissue overgrowth and tumour maintenance. As our parameter

### Clean up the author network textual data using fuzzy string matching (This takes too long right now.. need to speed up)

In [79]:
author_list = []
for paper in author_network:
    for dude in paper:
        if dude not in author_list:
            wrt = dude[0] + ', ' + dude[1] + ', ' + dude[2][1:]
            author_list.append(wrt)

In [80]:
print compare('Sina Booeshaghi', 'Sina B. Booeshaghi')
fuzz.ratio('Sina Booeshaghi', 'Sina B. Booeshaghi')

NameError: name 'compare' is not defined

In [None]:
#%%time
#to_remove = []
#for i in range(len(author_list)):
#    compare = []
#    for j in author_list[i+1:]:
#        pwise_comp = fuzz.partial_ratio(author_list[i], j)
#        if pwise_comp > 70:
#            to_remove.append(j)

### **Generate author id and lookup table**

In [8]:
# Here we generate a lookup table for authors and associated id's
# For every author there is a uid associated with them
def gen_id_2_auth(author_network):
    author_lookup = {}
    uid_list = []

    for paper in author_network:
        for author in paper:
            # we make a unique ID hash that can be looked up
            # here I fix the stupid space left in front of affiliation
            author = [author[0], author[1], author[2][1:]]
            string_to_hash = author[0] + ', ' + author[1] + ', ' + author[2]
            uid = int(str(hash(string_to_hash))[1:15])
            aff = author[2]
            auth_name = author[0] + ', ' + author[1]

            # if we have added the author to the dict, we increment the number of papers they have published
            if uid in uid_list:
                author_lookup[uid]['count'] += 1

            else:
                to_add = {
                    'name': auth_name, 'count': 1, 'aff': aff
                }
                author_lookup[uid]= to_add

            uid_list.append(uid)
    return author_lookup

lookup_table = gen_id_2_auth(author_network)

In [9]:
# Given an UID and a lookup table, we return the author and their affiliation
def id_2_auth(uid, lookup_table):
    return [lookup_table[uid]['name'], lookup_table[uid]['aff']]

In [10]:
# Given the author which is [lname, fname, aff], return the UID
def auth_2_id(author):
    string_to_hash = author[0] + ', ' + author[1]
    uid = int(str(hash(string_to_hash))[1:15])
    return uid

In [11]:
# Returns a dictionary of all of the affiliations and the people who published under that affiliation
def affiliation_info(author_network):
    affiliations = {}
    added_list = []
    for paper in author_network:
        for author in paper:
            author = [author[0], author[1], author[2][1:]]
            auth_aff = author[2]
            auth_name = author[0] + ', ' + author[1]
            to_id = [auth_name, auth_aff]
            auth_id = auth_2_id(to_id)
            if auth_aff == '':
                # DOI NOT FOUND, just skip
                pass
            else:
                if auth_aff not in affiliations:
                    affiliations[auth_aff] = []

                if auth_id not in added_list:
                    affiliations[auth_aff].append(auth_id)
                    added_list.append(auth_id)
    return affiliations
                
affiliations = affiliation_info(author_network)

In [12]:
# Testing the auth_2_id and id_to_auth
test_id = 86201888899200
out = id_2_auth(test_id, lookup_table)

## **Generate paper ID's**

In [None]:
# Here we generate a lookup table for authors and associated id's
# For every author there is a uid associated with them
def gen_id_2_paper(paper_list):
    author_lookup = {}
    uid_list = []

    for paper in author_network:
        for author in paper:
            # we make a unique ID hash that can be looked up
            # here I fix the stupid space left in front of affiliation
            author = [author[0], author[1], author[2][1:]]
            string_to_hash = author[0] + ', ' + author[1] + ', ' + author[2]
            uid = int(str(hash(string_to_hash))[1:15])
            aff = author[2]
            auth_name = author[0] + ', ' + author[1]

            # if we have added the author to the dict, we increment the number of papers they have published
            if uid in uid_list:
                author_lookup[uid]['count'] += 1

            else:
                to_add = {
                    'name': auth_name, 'count': 1, 'aff': aff
                }
                author_lookup[uid]= to_add

            uid_list.append(uid)
    return author_lookup

lookup_table = gen_id_2_auth(author_network)

## **Create author collaboration graph**

### *Create edge and node lists*

In [13]:
def gen_auth_net_to_uid_net(author_network):
    trimmed_author_network = []
    for paper in author_network:
        a_list = []
        for author in paper:
            string_to_hash = author[0] + ', ' + author[1] + ',' + author[2]
            uid = int(str(hash(string_to_hash))[1:15])
            a_list.append(uid)  
        trimmed_author_network.append(a_list)
    return trimmed_author_network

uid_network = gen_auth_net_to_uid_net(author_network)

In [14]:
# create edge list
edges = [[i[0],j] for i in uid_network for j in i[1:]]

# create nodes
nodes = []
for paper in uid_network:
    for dude in paper:
        if dude not in nodes:
            nodes.append(dude)

### Make the graph

In [15]:
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout, to_agraph
## TODO: add identifiers to each node, ie institution
def draw_graph(n, e):

    # get nodes and edges
    nodes = n
    edges = e
    
    # create networkx graph
    G=nx.Graph()

    # add nodes
    G.add_nodes_from(nodes)

    # add edges
    G.add_edges_from(edges)
    print nx.info(G)

    # draw graph
    return G

# draw example
G = draw_graph(nodes, edges)

Name: 
Type: Graph
Number of nodes: 26001
Number of edges: 23395
Average degree:   1.7995


## Perform computations on this graph

In [16]:
# Returns the first degree neighbors of author
def get_all_neighbors(author):
    auth_id = auth_2_id(author)
    nbrs = []
    for uid in nx.all_neighbors(G, auth_id):
        nbrs.append(id_2_auth(uid, lookup_table))
    return nbrs

In [17]:
# Returns from 1 to n degree neighbors of author
def get_n_nearest_neigbors(author, n=1):
    nbrs = []
    auth_id = auth_2_id(author)
    path_lengths = nx.single_source_dijkstra_path_length(G, auth_id)
    nbr_ids = [node for node, length in path_lengths.iteritems() if length <= n]
    for nbr_id in nbr_ids:
        nbrs.append(id_2_auth(nbr_id, lookup_table))
    return nbrs

In [77]:
#author = id_2_auth(test_id, lookup_table)
author1 = ['Pachter, Lior', 'UC Berkeley']
author2 = ['Alabdulkareem, Ibrahim B.','King Abdullah International Medical Research Center, Ministry of National Guard Health Affairs']
test_id = 70686335484624
author2 = id_2_auth(test_id, lookup_table)
author3 = ['Murray, Richard', 'California Institute of Technology']

degree = range(1,30)
a1 = []
a2 = []
a3 = []
for deg in degree:
    a1.append(len(get_n_nearest_neigbors(author1, n=deg)))
    a2.append(len(get_n_nearest_neigbors(author2, n=deg)))
    a3.append(len(get_n_nearest_neigbors(author3, n=deg)))


p1 = figure(width=1000, height=500, title="Number of People v. Degrees of Separation")
p1.xaxis.axis_label = 'Degree of Separation'
p1.yaxis.axis_label = 'Number of People'
p1.circle(degree, a1, size=7, color="firebrick", alpha=0.5, legend=author1[0] + ', ' + author1[1])
p1.circle(degree, a2, size=7, color="blue", alpha=0.5, legend=author2[0] + ', ' + author2[1])
p1.circle(degree, a3, size=7, color="green", alpha=0.5, legend=author3[0] + ', ' + author3[1])

p1.legend.location = "top_left"
show(p1)

In [3]:
a1[-1]

NameError: name 'a1' is not defined

## Test stuff

In [78]:
author = ['Pachter, Lior', 'UC Berkeley']
author = ['Murray, Richard', 'California Institute of Technology']
test_id = auth_2_id(author)
#test_id = 5509899179575
test_id = 89925582758261

In [50]:
author = ['Doyle, John', 'California Institute of Technology']

affiliations['Harvard University']
author = id_2_auth(23206401549607, lookup_table)
nbrs = get_all_neighbors(id_2_auth(test_id, lookup_table))
nbrs

[[u'Alabdulkareem, Ibrahim B.',
  u'King Abdullah International Medical Research Center, Ministry of National Guard Health Affairs']]

In [451]:
nx.single_source_shortest_path_length(G, test_id, cutoff=1)

{30562699974351: 1,
 57512786246293: 1,
 58440851686067: 1,
 59149540451919: 1,
 89925582758261: 0}

In [452]:
neighbors = nx.all_neighbors(G, test_id)
for uid in nx.all_neighbors(G, test_id):
    print id_2_auth(uid, lookup_table)

[u'Basanta, David', u'H. Lee Moffitt Cancer Center and Research Institute']
[u'Hjelmeland, Anita', u'University of Alabama at Birmingham']
[u'Chinnaiyan, Prakash', u'H. Lee Moffitt Cancer Center and Research Institute']
[u'Anderson, Alexander R A', u'H. Lee Moffitt Cancer Center and Research Institute']


## Print interesting stats

In [58]:
number_of_total_authors = len(global_author_list)
number_of_authors_pub_many = duplicates - number_of_total_authors
number_of_authors_pub_once = number_of_total_authors - number_of_authors_pub_many

number_of_publishing_institutions = len(global_aff_list)

print "Total number of authors : ", number_of_total_authors
print "Total number of authors who pub once : ", number_of_authors_pub_once
print "Total number of authors who pub many : ", number_of_authors_pub_many
print "Total number of publishing institutions : ", number_of_publishing_institutions

Total number of authors :  26002
Total number of authors who pub once :  23110
Total number of authors who pub many :  2892
Total number of publishing institutions :  7327


### Date Information

In [258]:
# Working with dates
dates, number_of_papers_posted = zip(*sorted(global_date_dict.items()))
dates = list(dates)
number_of_papers_posted = list(number_of_papers_posted)

In [259]:
# Plotting number of papers posted over time
# TODO: for plotting dates need to remove the once with errors ie DOI not found
dates_dt = []
for day in dates:
    dates_dt.append(parser.parse(day))
    
dates_matplot = mdates.date2num(dates_dt)

date_series = pd.Series(dates_dt, number_of_papers_posted)
date_series.hist(bins=12)
plt.show()

## **Random Scripts that I don't really need anymore**

In [17]:
## This scripts takes the summary document and produces a dictionary with journal, num of paper pairs. it uses the old summary.txt from /complete_data/data/pdfs/
summary_dictionary = {}
for n in summary:
    new = n.split(':')
    journal = new[0:-1][0]
    ''.join(journal)
    number_of_journal_papers = new[-1].strip('\n')
    summary_dictionary[journal] = int(number_of_journal_papers)

with open(path_to_save + '/journal_summary.txt', 'w') as f:
    json.dump(summary_dictionary, f)