In [1]:
import json

In [2]:
class Author():
    """An author and the papers attributed to them.

    Attributes:
        name      -- the name passed to the constructor
        aid       -- the identifier passed to the constructor
        nicknames -- alternative names, without duplicates, in insertion order
        paper_ids -- ids of the author's papers, without duplicates, in insertion order
    """

    def __init__(self, name, aid):
        self.name, self.aid = name, aid
        self.nicknames, self.paper_ids = [], []

    def add_paper(self, pid):
        """Record `pid` for this author; a pid that is already listed is ignored."""
        if pid in self.paper_ids:
            return
        self.paper_ids.append(pid)

    def add_nickname(self, name):
        """Record `name` as an alias; an alias that is already listed is ignored."""
        if name in self.nicknames:
            return
        self.nicknames.append(name)
        

In [3]:
class Paper():
    """One paper record plus link fields that are filled in after loading."""

    def __init__(self, title, abstract, year, author_names, b_topic, topics, pid, detc, url):
        # Fields supplied by the caller.
        self.title, self.abstract, self.year = title, abstract, year
        self.author_names = author_names
        self.broad_topic, self.topics = b_topic, topics
        self.pid, self.detc, self.url = pid, detc, url

        # Link fields: start empty and are populated later.
        self.author_ids, self.citations, self.cited_by = [], [], []

    def add_author_id(self, aid):
        """Append `aid` to author_ids unless it is already listed."""
        if aid in self.author_ids:
            return
        self.author_ids.append(aid)
        

## 1. Read papers 

In [5]:
# Location of the paper database, relative to the working directory.
file_path = "data/DAC_Entire_DataBase.json"

# Parse the whole JSON document into memory; the next cell iterates over
# `database` and reads each record by string key.
with open(file_path, "r") as f:
    database = json.load(f)

In [6]:
# Turn every database record into a Paper and index it by its PaperID.
papers = {}
for p in database:
    paper = Paper(
        title=p['Title'],
        abstract=p['Abstract'],
        year=p['Year'],
        author_names=p['Authors'],
        b_topic=p['Broad_Topic'],
        topics=p['Topics'],
        pid=p['PaperID'],
        detc=p['DETC'],
        url=p['URL'],
    )
    papers[paper.pid] = paper

## 2. Read authors

In [7]:
## Collect the distinct author names that appear on any paper.
## The dict is used as a set here: the value 1 is only a placeholder,
## and the next cell replaces it with an id.
author_names = {}

for p in papers.values():
    for n in p.author_names:
        author_names[n] = 1

In [8]:
# Assign IDs to each author
#
# Every distinct name gets a sequential id stored as a string ("0", "1", ...),
# in the dict's iteration order. Values are overwritten in place; no key is
# added or removed while iterating.
# The counter comes from enumerate() rather than a variable named `id`, which
# would shadow the built-in id() for the rest of the session.
for next_id, n in enumerate(author_names.keys()):
    author_names[n] = str(next_id)

In [9]:
# Build the id -> Author table from the name -> id table.
authors = {}

for name, assigned_id in author_names.items():
    authors[assigned_id] = Author(name, assigned_id)

In [10]:
def make_name_to_author_dict(authors):
    """Return a dict mapping each author's `name` to the author object.

    `authors` is a mapping whose values expose a `name` attribute. If two
    values share a name, the one that comes later in iteration order wins.
    """
    return {person.name: person for person in authors.values()}

In [19]:
def extract_text(papers, start_year, end_year):
    """Concatenate the abstracts of papers published within a year range.

    papers     -- mapping whose values expose `year` and `abstract`
    start_year -- first year to include (inclusive)
    end_year   -- last year to include (inclusive)

    Returns the selected abstracts joined by single spaces, in the mapping's
    iteration order; an empty string when no paper falls in the range.
    """
    # Keep papers INSIDE [start_year, end_year]. The previous condition
    # (year < start or year > end) selected the papers outside the window,
    # the opposite of the window test author_network uses to skip papers.
    selected = [p.abstract for p in papers.values()
                if start_year <= p.year <= end_year]
    # The joined text used to be built and then dropped; return it.
    return " ".join(selected)

In [16]:
import rake
import operator

In [17]:
# Keyword extractor configured with a stop-word file and three numeric limits.
# NOTE(review): in the reference RAKE implementation the positional arguments
# are presumably min_char_length=5, max_words_length=3 and
# min_keyword_frequency=4 -- confirm against the installed `rake` module.
rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 4)

In [18]:
# Extract key phrases from `text` and print each phrase without its score.
# NOTE: `text` must already exist as a notebook-level variable; the `text`
# built inside extract_text() is local to that function.
keywords = rake_object.run(text)
for key in keywords:  # the original line was missing `in` (SyntaxError)
    # each entry is a (phrase, score) pair
    print(key[0])

Keywords: [(u'shot peening process', 7.889312977099236), (u'visual steering commands', 7.785714285714286), (u'monte carlo simulations', 7.714985994397759), (u'active shock absorber', 7.696214896214896), (u'monte carlo simulation', 7.668160597572362), (u'2d fea mesh', 7.656832298136646), (u'sequential quadratic programming', 7.603250270855904), (u'analytical target cascading', 7.601190476190476), (u'latin hypercube sampling', 7.553256704980843), (u'finite element analyses', 7.513083323537808), (u'finite element analysis', 7.507344158685245), (u'particle swarm optimization', 7.484733196761291), (u'finite element models', 7.435806474204993), (u'hybrid electric vehicle', 7.394783354783355), (u'finite element model', 7.380537965513888), (u'miniature machine tools', 7.249217638691324), (u'metal deposition process', 7.145335386062822), (u'multidisciplinary design optimization', 7.14512909736497), (u'high computational cost', 7.1418912124794485), (u'finite element method', 7.134885739890221), 

## 3. Build Connection (between author and paper)

In [10]:
# Lookup table from author name to Author object, used by the linking cells below.
name2author = make_name_to_author_dict(authors)

### Give each author a list of paper ids

In [11]:
# Register each paper's id with every author named on it.
for paper in papers.values():
    for name in paper.author_names:
        name2author[name].add_paper(paper.pid)

### Give each paper a list of author ids

In [12]:
# Mirror link: give each paper the ids of the authors named on it.
for paper in papers.values():
    for name in paper.author_names:
        linked_author = name2author[name]
        paper.add_author_id(linked_author.aid)

## 4. Name Disambiguation

### Detect similar name pairs

Running the following cell prints pairs of similar names, one pair per line. Each line is tab-separated and formatted as "author_id, name, author_id, name". For each line, if the two names do refer to the same person, then copy and paste the line into the data/disambiguation.txt file.

In [13]:
from fuzzywuzzy import fuzz
import Levenshtein
# Compare every unordered pair of author names and print the pairs that look
# alike as "aid<TAB>name<TAB>aid<TAB>name", for manual review.
# list(...) keeps the keys indexable on Python 3, where keys() is a view.
keys = list(name2author.keys())

for i in range(0, len(keys)):
    # The first author of the pair does not depend on j: look it up once.
    p1 = name2author[keys[i]]
    first = p1.name

    for j in range(i+1, len(keys)):
        p2 = name2author[keys[j]]
        second = p2.name

        pdist = fuzz.partial_ratio(first, second)
        dist = Levenshtein.distance(first, second)
        lv_ra = Levenshtein.ratio(first, second)

        # A pair is reported when ANY of the three measures passes its threshold.
        if pdist > 90 or dist <= 2 or lv_ra > 0.8:
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print(p1.aid + "\t" + first + "\t" + p2.aid + "\t" + second)

3	James Allison	463	James T. Allison
5	David J. Gorsich	1759	David Gorsich
8	Q. Cheng	608	Heidi Q. Chen
10	Steve C. Wang	253	C. Wang
11	Niclas Stromberg	679	Niclas Strömberg
12	Yu Gu	1219	P. Gu
12	Yu Gu	933	J. Gu
12	Yu Gu	987	Yu Liu
12	Yu Gu	1598	Xu Guo
12	Yu Gu	2385	Y. Fu
1219	P. Gu	95	C. Yu
1219	P. Gu	933	J. Gu
1219	P. Gu	941	Ashwin P. Gurnani
1219	P. Gu	1859	W. Hu
1219	P. Gu	2385	Y. Fu
22	Le Chen	502	Jie Chen
22	Le Chen	692	Ken Chen
22	Le Chen	1406	Wei Chen
22	Le Chen	2002	Li Chen
22	Le Chen	2277	Wen Chen
31	J.-C. Léon	2444	J. C. Léon
40	Ashraf Nassef	1397	Ashraf O. Nassef
47	Zhe Zhang	1553	Jie Zhang
50	John Ziegert	1370	John C. Ziegert
51	Shen Lu	2242	Zhen Hu
58	Mian Li	116	Xiang Li
58	Mian Li	458	Jia Li
58	Mian Li	929	Meifang Li
58	Mian Li	1305	Yan Li
58	Mian Li	1332	Ming Li
48	I. Horváth	1723	Imre Horváth
511	Weijun Wang	1284	Li-jun Wang
74	V. Krishnamurthy	772	Vivek Krishnamurthy
77	James L. Mathieson	2350	James J. Mason
82	Junfu Zhang	1680	Jun Zhang
91	Chao Qi	1640	Chao Xu
91	C

### Function for merging name1 and name2

In [14]:
def merge(id1, id2, authors, papers):
    """Fold the author stored under `id2` into the author stored under `id1`.

    id1, id2 -- keys into `authors` (survivor first, duplicate second)
    authors  -- dict of id -> Author; the `id2` entry is removed
    papers   -- dict of paper id -> Paper; `author_ids` lists are rewritten

    After the call, author1 owns all of author2's papers, has author2's name
    and every nickname author2 had collected as nicknames, and each of
    author2's papers lists `id1` where it used to list `id2`.

    Raises KeyError if either id is missing from `authors`, or if one of
    author2's paper ids is missing from `papers`.
    """
    if id1 == id2:
        # Merging an author into itself would end with authors.pop(id2)
        # deleting the only copy, so treat it as a no-op.
        return

    author1 = authors[id1]
    author2 = authors[id2]

    # 1. On Author level

    # let 1 has 2's all paper_ids
    for pid in author2.paper_ids:
        author1.add_paper(pid)

    # make 2's name as 1's nickname ...
    author1.add_nickname(author2.name)
    # ... and carry over the nicknames author2 picked up from earlier merges;
    # otherwise they are lost when merges are chained (c -> b, then b -> a).
    for nickname in author2.nicknames:
        if nickname != author1.name:
            author1.add_nickname(nickname)

    # 2. On Papers level
    # Make author2's papers that contain author2.id now contain author1.id
    for pid in author2.paper_ids:
        paper = papers[pid]
        relinked = []
        for aid in paper.author_ids:
            if aid == id2:
                aid = id1
            # skip repeats so a paper that listed both ids keeps id1 only once
            if aid not in relinked:
                relinked.append(aid)
        paper.author_ids = relinked

    # remove id2
    authors.pop(id2)

    # Same text as the old comma-separated print statement, but valid on
    # both Python 2 and Python 3.
    print("%s  AND  %s ARE MERGED!" % (author1.name, author2.name))

### Read from disambiguation file

Think of these name pairs as edges in a graph. We need to find the connected components of that graph; each component is the set of names that refer to one person.

In [15]:
import networkx as nx
# Build an undirected graph with one edge per confirmed "same person" pair,
# then read off its connected components: each component is the set of
# author ids that belong to one person.
G = nx.Graph()
disamb_file_path = "data/disambiguation.txt"

dependency = []  # not used by any visible cell; kept so the name stays defined
with open(disamb_file_path, "rb") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline left by copy/paste) used
            # to crash with IndexError on segs[2].
            continue
        # Lines are "id1<TAB>name1<TAB>id2<TAB>name2". The file is read as
        # bytes, so the separator is a bytes literal (identical to "\t" on
        # Python 2). Only the two id columns are used, so the names never
        # have to be decoded.
        segs = line.split(b"\t")
        id1 = segs[0]
        id2 = segs[2]
        G.add_edge(int(id1), int(id2))

# Largest components first; ids inside each component in ascending order.
names = [sorted(c) for c in sorted(nx.connected_components(G), key=len, reverse=True)]

### Perform merging

In [16]:
# Collapse every component into its lowest id: walk the ascending id list from
# the back, folding each id into the one just before it.
# This cell is not re-runnable: merge() removes the folded ids from `authors`,
# so a second run raises KeyError.
for name_list in names:
    for idx in range(len(name_list) - 1, 0, -1):
        merge(str(name_list[idx - 1]), str(name_list[idx]), authors, papers)

Niclas Strömberg  AND  Padmavathi K. Pakala ARE MERGED!
Niclas Stromberg  AND  Niclas Strömberg ARE MERGED!
Q. Cheng  AND  Heidi Q. Chen ARE MERGED!
James Allison  AND  James T. Allison ARE MERGED!
David J. Gorsich  AND  David Gorsich ARE MERGED!
J. K. Davidson  AND  Joseph K. Davidson ARE MERGED!
Steve C. Wang  AND  C. Wang ARE MERGED!


In [21]:
# Spot check: show the attributes of the author stored under id '680'.
authors['680'].__dict__

{'aid': '680',
 'name': u'Danny A. Hlavinka',
 'nicknames': [],
 'paper_ids': [u'DETC2008-49953 pp. 737-747; (11 pages)']}

In [24]:
# Spot check by name. name2author was built before the merge step, so it can
# still return Author objects that merge() has since removed from `authors`.
name2author['Kaarthic Madhavan'].__dict__

{'aid': '1949',
 'name': u'Kaarthic Madhavan',
 'nicknames': [],
 'paper_ids': [u'DETC2008-49953 pp. 737-747; (11 pages)']}

In [42]:
# List the ids of all loaded papers.
papers.keys()

[u'DETC2002/DAC-34072 pp. 327-336; (10 pages)',
 u'DETC2005-85056 pp. 1215-1224; (10 pages)',
 u'DETC2006-99178 pp. 381-391; (11 pages)',
 u'DETC2004-57509 pp. 999-1010; (12 pages)',
 u'DETC2013-12170 pp. V03BT03A046; (9 pages)',
 u'DETC2011-48404 pp. 509-520; (12 pages)',
 u'DETC2003/DAC-48729 pp. 257-268; (12 pages)',
 u'DETC2011-48385 pp. 55-66; (12 pages)',
 u'DETC2013-13303 pp. V03AT03A032; (13 pages)',
 u'DETC2006-99535 pp. 1193-1204; (12 pages)',
 u'DETC2010-29054 pp. 399-406; (8 pages)',
 u'DETC2005-84523 pp. 1143-1152; (10 pages)',
 u'DETC2003/DAC-48828 pp. 1175-1184; (10 pages)',
 u'DETC2015-46822, pp. V02BT03A052',
 u'DETC2012-70735 pp. 611-618; (8 pages)',
 u'DETC2006-99449 pp. 515-528; (14 pages)',
 u'DETC2013-12664 pp. V03BT03A012; (9 pages)',
 u'DETC2014-35213 pp. V02BT03A013; (8 pages)',
 u'DETC2013-12654 pp. V03AT03A020; (10 pages)',
 u'DETC2002/DAC-34052 pp. 161-168; (8 pages)',
 u'DETC2011-48525 pp. 25-32; (8 pages)',
 u'DETC2003/DAC-48809 pp. 1009-1018; (10 pages)',

In [43]:
# Spot check: show the attributes of one paper, looked up by its full id string.
papers['DETC2015-46822, pp. V02BT03A052'].__dict__

{'abstract': u'An integrated multiscale modeling framework that incorporates a simulation-based upscaling technique is developed and implemented for the material characterization of additively manufactured cellular structures in this paper. The proposed upscaling procedure enables the determination of homogenized parameters at multiple levels by matching the probabilistic performances between fine and coarse scale models. Polynomial chaos expansion is employed in upscaling procedure to handle the computational burden caused by the input uncertainties. Efficient uncertainty quantification is achieved at the mesocale level by utilizing the developed upscaling technique. The homogenized parameters of mesostructures are utilized again at the macroscale level in the upscaling procedure to accurately obtain the overall material properties of the target cellular structure. Actual experimental results of additively manufactured parts are integrated into the developed procedure to demonstrate t

## 5. Network Construction

In [39]:
def print_author_node(author_list):
    """Return node-table rows for the given authors (nothing is printed).

    The first row is the header "id<TAB>name"; each following row is one
    author's `aid` and `name` joined by a tab, in input order.
    """
    rows = ["\t".join((person.aid, person.name)) for person in author_list]
    return ["id\tname"] + rows

In [40]:
def print_paper_node(paper_list):
    """Return node-table rows for the given papers (nothing is printed).

    The first row is the header "id<TAB>title"; each following row is one
    paper's `pid` and `title` joined by a tab, in input order.
    """
    rows = ["\t".join((entry.pid, entry.title)) for entry in paper_list]
    return ["id\ttitle"] + rows

In [41]:
def print_to_file(info_list, filename):
    """Write each item of `info_list` to `filename` as one UTF-8 encoded line.

    info_list -- iterable of text lines (without a trailing newline)
    filename  -- path of the file to create or overwrite

    Every line is terminated with a single newline character on all
    platforms. Prints "<filename> DONE!" when finished.
    """
    import io  # local import: not provided by the notebook's import cell

    # The file used to be opened as "wb" with text written straight into it.
    # On Python 2 that raises UnicodeEncodeError as soon as a line holds a
    # non-ASCII character (accented author names do), and on Python 3 it
    # raises TypeError for every line. Encoding explicitly works on both.
    with io.open(filename, "w", encoding="utf-8", newline="\n") as f:
        for line in info_list:
            if isinstance(line, bytes):
                # Python 2 byte strings (e.g. the header row) -> text
                line = line.decode("utf-8")
            f.write(line)
            f.write(u"\n")
    # Same text as the old `print filename, "DONE!"` statement.
    print("%s DONE!" % filename)

### 5.1 Preparation

In [None]:
class Interpreter():
    """Two-way lookup table: key -> value and value -> key."""

    def __init__(self):
        self.digit_holder = {}   # key   -> value
        self.string_holder = {}  # value -> key

    def add(self, key, value):
        """Store the pair in both directions."""
        self.digit_holder[key] = value
        self.string_holder[value] = key

    def lookup(self, key):
        """Translate `key` to its partner.

        A key whose type is exactly int is looked up among the keys given to
        add(); anything else is looked up among the values given to add().
        Raises KeyError when the key is unknown.
        """
        table = self.digit_holder if type(key) is int else self.string_holder
        return table[key]

In [None]:
# Give every author id and every paper id a dense integer alias (0, 1, 2, ...)
# in the dict's iteration order; each Interpreter translates both ways.
fake_aid = Interpreter()
fake_pid = Interpreter()

# enumerate() walks the keys once. Indexing `authors.keys()[i]` built a fresh
# key list on every iteration under Python 2 and is not possible on Python 3,
# where keys() returns a view.
for i, aid in enumerate(authors.keys()):
    fake_aid.add(i, aid)

for i, pid in enumerate(papers.keys()):
    fake_pid.add(i, pid)

In [34]:
def make_pairs(input_list):
    """Return every unordered pair of elements as a list of 2-tuples.

    Pairs are (input_list[i], input_list[j]) with i < j, ordered by i and
    then by j. Inputs with fewer than two elements give an empty list.
    """
    count = len(input_list)
    return [
        (input_list[left], input_list[right])
        for left in range(count - 1)
        for right in range(left + 1, count)
    ]

In [37]:
def author_network(authors, papers, start_year, end_year):
    """Build the co-authorship edge list for papers within a year range.

    authors    -- accepted for interface compatibility; not used here
    papers     -- mapping whose values expose `year`, `author_ids`, `title`
    start_year -- first year to include (inclusive)
    end_year   -- last year to include (inclusive)

    Returns a list of tab-separated strings: the header
    "from<TAB>to<TAB>weight<TAB>paper" followed by one row per pair of
    co-authors of each selected paper. Every row carries the fixed weight "10".
    """
    edge_list = ["from\tto\tweight\tpaper"]
    for p in papers.values():
        if p.year < start_year or p.year > end_year:
            continue
        for source, target in make_pairs(p.author_ids):
            # str.join has to be CALLED with a list. The original subscripted
            # it ("\t".join[...]), which raises TypeError on the first edge.
            edge_list.append("\t".join([source, target, "10", p.title]))
    return edge_list

## 6. Paper Network Construction

# Topic

In [44]:
import rake
import operator

ImportError: No module named rake

In [25]:
# Scratch test of networkx edge-list export.
# NOTE: this rebinds G, replacing the disambiguation graph built earlier.
G=nx.path_graph(4)

In [26]:
# Write the scratch graph to "test.edgelist" in the working directory.
nx.write_edgelist(G, "test.edgelist")

In [30]:
# Collect the topics of all papers into one flat list (duplicates included).
o = []
for p in papers.values():
    o.extend(p.topics)

In [31]:
# Drop duplicate topics; going through a set does not preserve the original order.
o = list(set(o))

In [32]:
# Number of distinct topics.
len(o)

794

In [33]:
# Number of loaded papers.
len(papers)

1668

In [36]:
# Header row for an edge table. Single-argument print(...) produces the same
# output on Python 2 and runs on Python 3, where the print statement is a
# syntax error.
print("from\tto\tweight\ttype")

from	to	weight	type
