<h1 align="center">DAC Topic Modeling</h1> 

Reference:    
[1] Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic keyword extraction from individual documents. Text Mining, 1-20.

In [1]:
import json

In [2]:
class Author():
    """An author record that accumulates nicknames and paper IDs.

    Attributes:
        name: Name given at construction.
        aid: Author identifier given at construction.
        nicknames: Names recorded through ``add_nickname`` (no duplicates,
            insertion order preserved).
        paper_ids: IDs recorded through ``add_paper`` (no duplicates,
            insertion order preserved).
    """

    def __init__(self, name, aid):
        self.name, self.aid = name, aid
        # Both collections start empty and are filled via the add_* helpers.
        self.nicknames = []
        self.paper_ids = []

    def add_paper(self, pid):
        """Record ``pid`` unless it has already been recorded."""
        if pid in self.paper_ids:
            return
        self.paper_ids.append(pid)

    def add_nickname(self, name):
        """Record ``name`` as a nickname unless it is already present."""
        if name in self.nicknames:
            return
        self.nicknames.append(name)
        

In [3]:
class Paper():
    """A paper record: caller-supplied fields plus link lists that start empty.

    The constructor stores every argument unchanged. Note that the
    ``b_topic`` argument is exposed as the ``broad_topic`` attribute.
    """

    def __init__(self, title, abstract, year, author_names, b_topic, topics, pid, detc, url):
        # Identification
        self.pid = pid
        self.detc = detc
        self.url = url

        # Descriptive fields
        self.title = title
        self.abstract = abstract
        self.year = year
        self.author_names = author_names
        self.broad_topic = b_topic
        self.topics = topics

        # Link fields: three independent empty lists, meant to be filled
        # after construction. Only author_ids has a helper in this class.
        self.author_ids, self.citations, self.cited_by = [], [], []

    def add_author_id(self, aid):
        """Record ``aid`` unless it has already been recorded."""
        if aid in self.author_ids:
            return
        self.author_ids.append(aid)
        

## 1. Read papers 

In [4]:
# Path of the JSON export; being relative, it resolves against the
# current working directory.
file_path = "../data/DAC_Entire_DataBase.json"

# Read the file in one go and parse it; the handle is closed on leaving
# the ``with`` block.
with open(file_path, "r") as f:
    database = json.loads(f.read())

In [5]:
# Wrap every database record in a Paper and index it by its paper ID.
# A later record with the same PaperID replaces the earlier one.
papers = {}
for p in database:
    paper = Paper(
        title=p['Title'],
        abstract=p['Abstract'],
        year=p['Year'],
        author_names=p['Authors'],
        b_topic=p['Broad_Topic'],
        topics=p['Topics'],
        pid=p['PaperID'],
        detc=p['DETC'],
        url=p['URL'],
    )
    papers[paper.pid] = paper

## 2. Read authors

In [6]:
# Collect each distinct author name found on any paper. The value 1 is
# only a placeholder at this stage; the dict is used for its unique keys.
author_names = {}

for p in papers.values():
    for n in p.author_names:
        # setdefault leaves an already-seen name (and its position) untouched.
        author_names.setdefault(n, 1)

In [7]:
# Assign IDs to each author: overwrite every value in author_names with a
# sequential string ID ("0", "1", ...) following the dict's iteration order.
# enumerate() supplies the counter, so no module-level variable named `id`
# is created and the built-in id() stays usable in later cells.
for author_index, n in enumerate(author_names):
    # Only existing keys are reassigned, so the dict's size never changes
    # while it is being iterated.
    author_names[n] = str(author_index)

In [8]:
# Build one Author object per name, keyed by the author's ID string.
authors = {}

for name, aid in author_names.items():
    authors[aid] = Author(name, aid)

In [9]:
def make_name_to_author_dict(authors):
    """Return a new dict mapping each author's ``name`` to the author object.

    Args:
        authors: Mapping whose values expose a ``name`` attribute.

    Returns:
        A dict keyed by ``name``. If several values share a name, the one
        that comes last in ``authors.values()`` is kept.
    """
    return {entry.name: entry for entry in authors.values()}

## 3. Retrieve Documents

In [24]:
def extract_text(papers, start_year, end_year):
    """Concatenate the abstracts of papers whose year lies in a range.

    Args:
        papers: Mapping whose values expose ``year`` and ``abstract``
            attributes.
        start_year: First year to include (inclusive).
        end_year: Last year to include (inclusive).

    Returns:
        The selected abstracts joined by single spaces, in the mapping's
        iteration order; an empty string when no paper is in the range.
    """
    # Both bounds must hold. Joining the two comparisons with ``or`` accepts
    # every year whenever start_year <= end_year, which filters nothing.
    # NOTE(review): assumes ``year`` is numeric like the bounds -- confirm
    # against the JSON schema.
    return " ".join(
        p.abstract
        for p in papers.values()
        if start_year <= p.year <= end_year
    )

In [25]:
# Build the corpus string, requesting the years 2002 through 2015.
text = extract_text(papers, start_year=2002, end_year=2015)

## 4. Keyword Extraction

In [26]:
import rake
import operator
import pyprind

In [27]:
# Configure the keyword extractor with a stop-word file and three numeric
# settings.
# NOTE(review): in the commonly used RAKE-tutorial implementation the
# positional arguments are (stop-word path, min characters per word,
# max words per phrase, min keyword frequency) -- confirm against the
# `rake` module actually installed here.
rake_object = rake.Rake("SmartStoplist.txt", 5, 5, 8)

In [None]:
# Run the extractor over the corpus and print the first element of every
# returned entry.
# NOTE(review): entries are presumably (phrase, score) pairs -- confirm
# against the `rake` module in use.
keywords = rake_object.run(text)
for key in keywords:
    # The parenthesised call prints the same output on Python 2 and 3;
    # the bare `print x` statement is a syntax error on Python 3.
    print(key[0])