# DAC Network Construction

In [1]:
import json

## Author Object

In [2]:
class Author():
    """One author of the database: an id, a display name and the papers written.

    Attributes:
        name: the author's name as it was first read from the papers.
        aid: the author's id.
        nicknames: alternative names recorded through ``add_nickname``.
        paper_ids: ids of the papers recorded through ``add_paper``.
    """

    def __init__(self, name, aid):
        self.name = name
        self.aid = aid
        # Both lists start empty and are filled by the helper methods below.
        self.nicknames = []
        self.paper_ids = []

    def add_paper(self, pid):
        """Append ``pid`` to ``paper_ids`` unless it is already listed."""
        if pid in self.paper_ids:
            return
        self.paper_ids.append(pid)

    def add_nickname(self, name):
        """Append ``name`` to ``nicknames`` unless it is already listed."""
        if name in self.nicknames:
            return
        self.nicknames.append(name)
        

## Paper Object

In [3]:
class Paper():
    """One paper record plus the links that are attached to it later.

    The constructor stores the values it is given unchanged; ``author_ids``,
    ``citations`` and ``cited_by`` start as empty lists.
    """

    def __init__(self, title, abstract, year, author_names, b_topic, topics, pid, detc, url):
        # Values copied straight from the constructor arguments.
        self.title = title
        self.abstract = abstract
        self.year = year
        self.author_names = author_names
        self.broad_topic = b_topic
        self.topics = topics
        self.pid = pid
        self.detc = detc
        self.url = url

        # Links that are populated after construction.
        self.author_ids = []
        self.citations = []
        self.cited_by = []

    def add_author_id(self, aid):
        """Append ``aid`` to ``author_ids`` unless it is already listed."""
        if aid in self.author_ids:
            return
        self.author_ids.append(aid)
        

## Procedure 1. Read papers 

In [4]:
# Location of the JSON file holding the raw paper records.
file_path = "data/DAC_Entire_DataBase.json"

# Parse the whole file; `database` is iterated record by record in the next cell.
with open(file_path, "r") as f:
    database = json.load(f)

In [5]:
# Wrap every raw record in a Paper object and index the objects by paper id.
papers = {}
for record in database:
    paper = Paper(
        record['Title'],
        record['Abstract'],
        record['Year'],
        record['Authors'],
        record['Broad_Topic'],
        record['Topics'],
        record['PaperID'],
        record['DETC'],
        record['URL'],
    )
    papers[paper.pid] = paper

## Procedure 2. Read authors

In [6]:
# Gather every distinct author name that appears on any paper.
# The value 1 is only a placeholder; the next cell replaces it with an id.
author_names = {}

for paper in papers.values():
    for author_name in paper.author_names:
        author_names[author_name] = 1

In [7]:
# Assign a sequential string id ("0", "1", ...) to each author name.
# enumerate() replaces the hand-rolled counter that was called `id` and
# therefore shadowed the built-in id() for the rest of the notebook.
# list() snapshots the keys so the dict is not iterated while being updated.
for index, n in enumerate(list(author_names.keys())):
    author_names[n] = str(index)

In [8]:
# Create one Author per distinct name, keyed by the id assigned above.
authors = {}

for name, aid in author_names.items():
    authors[aid] = Author(name, aid)

In [9]:
def make_name_to_author_dict(authors):
    """Return a dict mapping each author's ``name`` to the author object.

    ``authors`` is a dict whose values expose a ``name`` attribute. If two
    values share a name, the one iterated last is kept.
    """
    return dict((person.name, person) for person in authors.values())

## 3. Build Connection (between author and paper)

In [10]:
# Index the Author objects by name; the linking cells below look authors up by name.
name2author = make_name_to_author_dict(authors)

In [27]:
# Number of distinct author names in the lookup table.
len(name2author)

2515

### Give each author a list of paper ids

In [11]:
# Record each paper's id on every one of its authors.
for paper in papers.values():
    for author_name in paper.author_names:
        name2author[author_name].add_paper(paper.pid)

### Give each paper a list of author ids

In [12]:
# Record on each paper the id of every one of its authors.
for paper in papers.values():
    for author_name in paper.author_names:
        author = name2author[author_name]
        paper.add_author_id(author.aid)

## 4. Name Disambiguation

### Detect similar name pairs

Running the following cell prints one line per pair of similar names. Each line is formatted as "author_id, name, author_id, name" (tab-separated). For each line, if the two names do refer to the same person, copy and paste the line into the data/disambiguation.txt file.

In [None]:
from itertools import combinations

from fuzzywuzzy import fuzz
import Levenshtein

# Snapshot the names as a list: dict.keys() cannot be indexed on Python 3.
keys = list(name2author.keys())

# Compare every unordered pair of names exactly once, in the same order as
# the nested index loops `for i ...: for j in range(i+1, ...)`.
for key1, key2 in combinations(keys, 2):
    p1 = name2author[key1]
    p2 = name2author[key2]

    first = p1.name
    second = p2.name

    pdist = fuzz.partial_ratio(first, second)
    dist = Levenshtein.distance(first, second)
    lv_ra = Levenshtein.ratio(first, second)

    # A pair is reported as soon as any one of the three measures flags it.
    if pdist > 90 or dist <= 2 or lv_ra > 0.8:
        # One tab-separated line: "author_id, name, author_id, name".
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\t".join([p1.aid, first, p2.aid, second]))

### Function for merging name1 and name2

In [13]:
def merge(id1, id2, authors, papers):
    """Fold the author ``id2`` into the author ``id1`` and drop ``id2``.

    Args:
        id1: id of the author that survives the merge.
        id2: id of the author that is absorbed and removed from ``authors``.
        authors: dict mapping author id -> author object; modified in place.
        papers: dict mapping paper id -> paper object; the ``author_ids`` of
            every paper of ``id2`` is rewritten in place.

    Raises:
        KeyError: if ``id1`` or ``id2`` is not a key of ``authors``, or a
            paper id of author ``id2`` is not a key of ``papers``.

    Merging an author with itself is a no-op (previously it deleted the
    author from ``authors``).
    """
    if id1 == id2:
        return

    author1 = authors[id1]
    author2 = authors[id2]

    # 1. Author level: author1 inherits every paper of author2.
    for pid in author2.paper_ids:
        author1.add_paper(pid)

    # author2's name becomes a nickname of author1. Names author2 has already
    # absorbed in an earlier merge are carried over as well, so chained merges
    # (c -> b, then b -> a) no longer lose c's name.
    author1.add_nickname(author2.name)
    for nickname in author2.nicknames:
        author1.add_nickname(nickname)

    # 2. Paper level: every paper of author2 now refers to id1 instead of id2.
    for pid in author2.paper_ids:
        paper = papers[pid]
        replaced = []
        for x in paper.author_ids:
            x = id1 if x == id2 else x
            # Keep id1 only once when the paper listed both ids.
            if x not in replaced:
                replaced.append(x)
        paper.author_ids = replaced

    # remove id2
    authors.pop(id2)

    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("%s  AND  %s ARE MERGED!" % (author1.name, author2.name))

### Read from disambiguation file

Think of these name pairs as edges in a graph. We need to find the connected components of that graph; each component is the set of names that refer to one person.

In [14]:
import networkx as nx

# Each line of the disambiguation file is "author_id, name, author_id, name"
# (tab-separated); the two ids of a line become one edge of the graph.
G = nx.Graph()
disamb_file_path = "data/disambiguation.txt"

# Text mode, so the lines are str on Python 3 and split("\t") keeps working.
with open(disamb_file_path, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline left by copy/paste);
            # they used to raise IndexError on segs[2].
            continue
        segs = line.split("\t")
        id1 = segs[0]
        id2 = segs[2]
        G.add_edge(int(id1), int(id2))

# One ascending list of ids per connected component, largest component first.
names = [sorted(c) for c in sorted(nx.connected_components(G), key=len, reverse=True)]

### Perform merging

In [15]:
# Each name_list is sorted ascending. Walking it from the end folds every id
# into the id just before it, so only the smallest id of the component remains.
# NOTE: merge() removes ids from `authors`, so re-running this cell on the
# same kernel fails with KeyError.
for name_list in names:
    for idx in range(len(name_list) - 1, 0, -1):
        merge(str(name_list[idx - 1]), str(name_list[idx]), authors, papers)

Richard J. Malak, Jr.  AND  Richard J. Malak ARE MERGED!
Richard Malak  AND  Richard J. Malak, Jr. ARE MERGED!
Kenneth Chase  AND  Ken W. Chase ARE MERGED!
Kenneth W. Chase  AND  Kenneth Chase ARE MERGED!
Ali Farhang-Mehr  AND  Ali Farhang Mehr ARE MERGED!
A. Farhang-Mehr  AND  Ali Farhang-Mehr ARE MERGED!
Jami J. Shah  AND  Jami Shah ARE MERGED!
J. J. Shah  AND  Jami J. Shah ARE MERGED!
Fuewen Frank Liou  AND  Frank Liou ARE MERGED!
Frank W. Liou  AND  Fuewen Frank Liou ARE MERGED!
George Fadel  AND  Georges M. Fadel ARE MERGED!
Georges Fadel  AND  George Fadel ARE MERGED!
Bryony L. DuPont  AND  Bryony DuPont ARE MERGED!
Bryony L. Du Pont  AND  Bryony L. DuPont ARE MERGED!
Carolyn Seepersad  AND  Carolyn C. Seepersad ARE MERGED!
Carolyn Conner Seepersad  AND  Carolyn Seepersad ARE MERGED!
Panos Papalambros  AND  P. Papalambros ARE MERGED!
Panos Y. Papalambros  AND  Panos Papalambros ARE MERGED!
Nam H. Kim  AND  Nam Ho Kim ARE MERGED!
Nam-Ho Kim  AND  Nam H. Kim ARE MERGED!
Katie White

## 5. Network Construction

In [16]:
import networkx as nx

In [28]:
def make_pairs(input_list):
    """Return every unordered pair ``(input_list[i], input_list[j])`` with i < j.

    A one-element list yields a single self-pair ``[(x, x)]``; an empty list
    yields ``[]``.

    NOTE: a later cell defines ``make_pairs`` again with a different
    single-element result (``[]``); whichever cell ran last is the one in use.
    """
    count = len(input_list)
    if count == 1:
        only = input_list[0]
        return [(only, only)]
    return [
        (input_list[a], input_list[b])
        for a in range(count - 1)
        for b in range(a + 1, count)
    ]

In [29]:
def papers_by_year(papers, inf, sup):
    """Return the values of ``papers`` whose ``year`` lies in ``[inf, sup]``.

    Both bounds are inclusive. ``papers`` is a dict; the result is a list in
    the dict's iteration order.
    """
    return [paper for paper in papers.values() if inf <= paper.year <= sup]

In [30]:
def make_edges(papers_selected):
    """Return the concatenated ``make_pairs(p.author_ids)`` of all given papers."""
    return [pair for p in papers_selected for pair in make_pairs(p.author_ids)]

In [31]:
def author_network(papers, inf_year, sup_year):
    """Build an undirected graph from the papers dated ``[inf_year, sup_year]``.

    Papers are selected with ``papers_by_year`` and each pair produced by
    ``make_edges`` is added as one edge of a ``networkx.Graph``.

    NOTE: a later cell defines ``author_network`` again with a different
    signature and return value; whichever cell ran last is the one in use.
    """
    graph = nx.Graph()
    selected = papers_by_year(papers, inf_year, sup_year)
    for left, right in make_edges(selected):
        graph.add_edge(left, right)
    return graph

In [32]:
# Graph built from the papers whose year is between 2000 and 2015 (inclusive).
g = author_network(papers, 2000, 2015)

In [33]:
# Number of nodes in the graph.
len(g)

2374

In [35]:
# Degree of every node in g, keyed by author id (a long listing).
g.degree()

{'643': 7,
 '592': 3,
 '344': 2,
 '345': 6,
 '346': 2,
 '347': 2,
 '340': 3,
 '341': 5,
 '342': 5,
 '343': 12,
 '348': 3,
 '349': 3,
 '2318': 9,
 '2319': 1,
 '2316': 2,
 '2317': 7,
 '2314': 7,
 '2315': 2,
 '2313': 2,
 '2311': 7,
 '298': 4,
 '299': 4,
 '296': 4,
 '297': 1,
 '294': 2,
 '295': 2,
 '292': 2,
 '293': 2,
 '290': 3,
 '291': 2,
 '270': 1,
 '271': 4,
 '272': 3,
 '273': 2,
 '274': 1,
 '275': 7,
 '276': 3,
 '277': 2,
 '278': 2,
 '279': 2,
 '738': 4,
 '581': 3,
 '2268': 2,
 '2269': 7,
 '2262': 3,
 '2263': 3,
 '2260': 3,
 '2261': 2,
 '2266': 2,
 '2267': 2,
 '2264': 2,
 '2265': 3,
 '2442': 3,
 '2443': 9,
 '2440': 4,
 '2441': 1,
 '2446': 5,
 '2447': 1,
 '2445': 1,
 '2448': 1,
 '108': 1,
 '109': 2,
 '102': 6,
 '103': 4,
 '100': 13,
 '101': 2,
 '106': 1,
 '107': 2,
 '104': 1,
 '105': 4,
 '2046': 2,
 '2047': 1,
 '2045': 1,
 '2043': 1,
 '2040': 8,
 '2041': 4,
 '1297': 1,
 '2049': 3,
 '1210': 6,
 '99': 1,
 '98': 1,
 '91': 2,
 '90': 3,
 '93': 2,
 '92': 2,
 '95': 3,
 '94': 5,
 '97': 2,
 '96

In [None]:
# Number of entries currently in `authors`.
len(authors)

In [None]:
def print_author_node(author_list):
    """Return the node table of the authors as a list of tab-separated lines.

    The first line is the header ``"id\\tname"``; it is followed by one
    ``aid<TAB>name`` line per author. Nothing is printed.
    """
    rows = ["\t".join([person.aid, person.name]) for person in author_list]
    return ["id\tname"] + rows

In [None]:
def print_paper_node(paper_list):
    """Return the node table of the papers as a list of tab-separated lines.

    The first line is the header ``"id\\ttitle"``; it is followed by one
    ``pid<TAB>title`` line per paper. Nothing is printed.
    """
    rows = ["\t".join([item.pid, item.title]) for item in paper_list]
    return ["id\ttitle"] + rows

In [None]:
def print_to_file(info_list, filename):
    """Write every entry of ``info_list`` to ``filename``, one per line.

    Args:
        info_list: iterable of text lines (without the trailing newline).
        filename: path of the file to create or overwrite.

    The file is opened in binary mode, so text is encoded as UTF-8 before it
    is written, the same way ``network_output`` does. Writing the text
    unencoded failed on non-ASCII names/titles under Python 2 and on every
    line under Python 3.
    """
    with open(filename, "wb") as f:
        for line in info_list:
            # Values that are already bytes (plain Python 2 str) pass through.
            if not isinstance(line, bytes):
                line = line.encode("utf8")
            f.write(line)
            f.write(b"\n")
    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("%s DONE!" % filename)

### 5.1 Preparation

In [None]:
class Interpreter():
    """Two-way lookup table between ``key`` and ``value`` pairs.

    ``digit_holder`` maps key -> value and ``string_holder`` maps value -> key.
    """

    def __init__(self):
        self.digit_holder = {}
        self.string_holder = {}

    def add(self, key, value):
        """Register the pair in both directions."""
        self.digit_holder[key] = value
        self.string_holder[value] = key

    def lookup(self, key):
        """Translate ``key`` to its counterpart.

        An argument whose type is exactly ``int`` is looked up in
        ``digit_holder``; anything else is looked up in ``string_holder``.
        Raises ``KeyError`` when the argument was never registered.
        """
        table = self.digit_holder if type(key) is int else self.string_holder
        return table[key]

In [None]:
# Two-way tables between a running index (0, 1, 2, ...) and the real ids.
fake_aid = Interpreter()
fake_pid = Interpreter()

# enumerate() walks the keys once. Indexing `authors.keys()[i]` rebuilt the
# whole key list on every iteration under Python 2 and is not supported by
# the key views of Python 3.
for i, aid in enumerate(authors.keys()):
    fake_aid.add(i, aid)

for i, pid in enumerate(papers.keys()):
    fake_pid.add(i, pid)

In [None]:
def make_pairs(input_list):
    """Return every unordered pair ``(input_list[i], input_list[j])`` with i < j.

    Lists with fewer than two elements yield ``[]``.

    NOTE: this replaces the ``make_pairs`` defined in an earlier cell, which
    returned a self-pair ``[(x, x)]`` for a one-element list.
    """
    count = len(input_list)
    if count <= 1:
        return []
    return [
        (input_list[a], input_list[b])
        for a in range(count - 1)
        for b in range(a + 1, count)
    ]

# 6 Author Network Construction

In [None]:
def author_network(authors, papers, start_year, end_year):
    """Return the co-authorship edges of the papers dated ``[start_year, end_year]``.

    Each edge is one tab-separated string ``from<TAB>to<TAB>10<TAB>title``:
    the two author ids of a pair from ``make_pairs``, the constant weight
    ``"10"`` and the title of the paper. ``authors`` is accepted but not used.

    NOTE: this replaces the three-argument ``author_network`` defined in an
    earlier cell, which returned a ``networkx.Graph``.
    """
    rows = []
    for paper in papers.values():
        # Both year bounds are inclusive.
        if start_year <= paper.year <= end_year:
            for source, target in make_pairs(paper.author_ids):
                rows.append("\t".join([source, target, "10", paper.title]))
    return rows

In [None]:
def author_from_edgelist(edge_list):
    """Return the distinct values found in the first two columns of the edges.

    Every edge is a tab-separated string; duplicates are removed through a
    set, so the order of the returned list is unspecified.
    """
    seen = set()
    for row in edge_list:
        fields = row.split("\t")
        seen.update((fields[0], fields[1]))
    return list(seen)

In [None]:
def network_output(authors, papers, start_year, end_year, edge_file, node_file, max_edges=50):
    """Write the author network of ``[start_year, end_year]`` to two TSV files.

    Args:
        authors: dict mapping author id -> author object (``name`` is read).
        papers: dict mapping paper id -> paper object.
        start_year, end_year: year bounds passed on to ``author_network``.
        edge_file: output path; header ``from, to, weight, paper``.
        node_file: output path; header ``ID, Name, Type``.
        max_edges: keep only the first ``max_edges`` edges (default 50, the
            limit that used to be hard-coded); ``None`` writes every edge.

    The node file lists every author referenced by a written edge. It used to
    be cut to 50 entries as well, although 50 edges can reference up to 100
    authors, so the edge file could name nodes missing from the node file.
    Nodes are written in sorted id order so the output is reproducible.
    """
    edge_list = author_network(authors, papers, start_year, end_year)
    if max_edges is not None:
        edge_list = edge_list[:max_edges]

    # Files are binary: literals are bytes and text is encoded as UTF-8.
    with open(edge_file, "wb") as f:
        f.write(b"from\tto\tweight\tpaper\n")
        for line in edge_list:
            f.write(line.encode('utf8'))
            f.write(b"\n")

    node_list = sorted(author_from_edgelist(edge_list))

    with open(node_file, "wb") as f:
        f.write(b"ID\tName\tType\n")
        for au in node_list:
            line = "\t".join([au, authors[au].name, "P"])
            f.write(line.encode('utf8'))
            f.write(b"\n")
    return

In [None]:
# Export the author network of 2011-2015 to edges.txt and nodes.txt.
network_output(authors, papers, 2011, 2015, "edges.txt", "nodes.txt")

In [None]:
# Number of entries currently in `authors`.
len(authors)

## 6. Paper Network Construction

## 10. Export

In [None]:
import pickle

In [None]:
# Export the attribute dict of every author as one JSON array.
out_au = [au.__dict__ for au in authors.values()]

# json.dump writes text, so the file is opened in text mode ("wb" raises
# TypeError on Python 3).
# NOTE(review): the input database is read from "data/" (lower-case); confirm
# that "./Data/" is the intended directory on case-sensitive filesystems.
with open("./Data/Author_Data.json", "w") as f:
    json.dump(out_au, f)

In [None]:
# Export the attribute dict of every paper as one JSON array.
out_pp = [pp.__dict__ for pp in papers.values()]

# json.dump writes text, so the file is opened in text mode ("wb" raises
# TypeError on Python 3).
# NOTE(review): the input database is read from "data/" (lower-case); confirm
# that "./Data/" is the intended directory on case-sensitive filesystems.
with open("./Data/Paper_Data.json", "w") as f:
    json.dump(out_pp, f)

In [None]:
# Pickle the whole `authors` dict (author id -> Author object) to disk.
with open("./Data/Author_Data.pickle", "wb") as f:
    pickle.dump(authors, f)

In [None]:
# Pickle the whole `papers` dict (paper id -> Paper object) to disk.
with open("./Data/Paper_Data.pickle", "wb") as f:
    pickle.dump(papers, f)

In [None]:
import rake
import operator

In [None]:
# Small throw-away graph written out by the next cell.
# NOTE: this rebinds `G`, the name the disambiguation cell uses for its graph.
G=nx.path_graph(4)

In [None]:
# Write the edges of G to the file "test.edgelist".
nx.write_edgelist(G, "test.edgelist")

In [None]:
# Flatten the `topics` of every paper into one list (duplicates included).
o = [topic for p in papers.values() for topic in p.topics]

In [None]:
# Drop duplicate topics; going through a set does not keep the original order.
o = list(set(o))

In [None]:
# Number of distinct topics.
len(o)

In [None]:
# Number of papers loaded.
len(papers)

In [None]:
# Header line of a tab-separated edge table. The single-argument call form
# prints the same text on Python 2 and Python 3.
print("from\tto\tweight\ttype")