In [1]:
%load_ext autoreload
%autoreload 2

import glob
import textract
from typing import Dict
from tqdm.auto import tqdm
import time
import dataclasses
import os
import csv

# Data loading

In [2]:
# Note: to get textract to work here, we had to modify the decode function in
# textract/parsers/utils.py so that it falls back to UTF-8 whenever the chardet
# confidence is not high enough, e.g.:
#
#   result = chardet.detect(text)
#   encoding = result['encoding'] if result['confidence'] > 0.85 else 'utf-8'
#   return text.decode(encoding, 'ignore')
#
# We'd otherwise just get a bunch of UnicodeDecodeErrors

# Paper struct
@dataclasses.dataclass()
class Paper:
    """One paper from the collection: its title, extracted text and file type."""

    # Title of the paper
    title: str
    # Text extracted from the paper's file
    contents: str
    # File extension of the source document
    extension: str


# Load the master CSV and keep every row in memory. Rows are indexed by paper
# id further down (column 1 is used as the title).
# newline="" is what the csv module expects from the file object so that
# newlines embedded in quoted fields are parsed correctly.
with open("../masterdatafinal.csv", newline="") as file:
    csv_contents = list(csv.reader(file))

# Load up all papers
paper_from_id: Dict[int, Paper] = {}

# Keep only the first 70% or 500000 characters of each paper, whichever is
# shorter. This is kind of a hack: gets rid of citations, speeds up
# computation, saves memory, etc
max_length = 500000

paper_paths = glob.glob("papers/*")
for path in tqdm(paper_paths):
    # Files are named "<paper id>.<extension>". Split on the first dot only so
    # that a name with several dots (e.g. "12.tar.gz") no longer breaks the
    # tuple unpacking.
    paper_id, _dot, extension = os.path.basename(path).partition(".")
    paper_id = int(paper_id)

    contents = textract.process(path).decode("utf8")

    desired_length = min(int(len(contents) * 0.7), max_length)

    # Cut at the last space so the text does not end on a partial word. If the
    # truncated text contains no space at all, keep it as-is: rpartition would
    # otherwise hand back an empty head and the whole paper would be lost.
    truncated = contents[:desired_length]
    head, separator, _tail = truncated.rpartition(" ")
    contents = head if separator else truncated

    paper_from_id[paper_id] = Paper(
        title=csv_contents[paper_id][1], contents=contents, extension=extension
    )

HBox(children=(FloatProgress(value=0.0, max=138.0), HTML(value='')))




# NLP stuff 

First, let's just define a helper for pulling out keywords.

We'll just use TextRank, which may not be state of the art but will hopefully be sufficient.

In [3]:
# Uncomment to download language model
# !python -m spacy download en_core_web_lg

# Keyword struct
@dataclasses.dataclass()
class Keyword:
    """A keyphrase together with its occurrence count and rank score.

    Instances are intentionally mutable: counts and ranks are updated in place
    when keywords are normalized and aggregated.
    """

    # The keyphrase text
    keyword: str
    # Number of occurrences reported for the phrase
    count: int
    # Rank score of the phrase
    rank: float


# NLP helper: a module-level spaCy pipeline shared by get_keywords below.
import spacy
import pytextrank
import en_core_web_lg

# Load the en_core_web_lg language model (see the download command above)
nlp = en_core_web_lg.load()
# Register pytextrank's TextRank component as the last pipeline stage, under
# the name "textrank". get_keywords reads doc._.phrases, which this component
# is expected to populate -- NOTE(review): confirm against the installed
# pytextrank version, this is the older `PipelineComponent` style API.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

# Parsing helper
def get_keywords(text):
    """Extract keyphrases from ``text`` with the shared ``nlp`` pipeline.

    Args:
        text: The document text to analyze.

    Returns:
        A list of ``Keyword`` objects, one per entry of ``doc._.phrases`` and
        in the same order. Ranks are rescaled by the largest rank so that the
        best phrase has rank 1.0. If there are no phrases the list is empty;
        if no phrase has a positive rank the ranks are returned unscaled.
    """
    doc = nlp(text)
    output = [
        Keyword(keyword=p.text, count=p.count, rank=p.rank)
        for p in doc._.phrases
    ]

    # Normalize ranks. Guard against a zero maximum (every phrase ranked 0),
    # which would otherwise raise ZeroDivisionError.
    max_rank = max((keyword.rank for keyword in output), default=0)
    if max_rank > 0:
        for keyword in output:
            keyword.rank = keyword.rank / max_rank
    return output

In [4]:
# Parallelized keyword extraction


def _():
    """Run get_keywords over every paper's contents in a process pool.

    Wrapped in a function so that the helper names stay out of the notebook's
    global namespace.

    Returns:
        A list with one keyword list per paper, in the same order as
        ``paper_from_id.values()``.
    """
    import multiprocessing
    import time

    texts = [paper.contents for paper in paper_from_id.values()]

    # The context manager shuts the worker processes down once the blocking
    # map() call has returned, instead of leaving them alive for as long as
    # the kernel runs.
    with multiprocessing.Pool() as pool:
        start_time = time.time()
        keywords_list = pool.map(get_keywords, texts)
        print("Keyword extraction took:", time.time() - start_time, "seconds")

    return keywords_list


# keys() and values() of a dict iterate in the same order, so zipping them
# pairs each paper id with its own keyword list.
keywords_from_id = dict(zip(paper_from_id.keys(), _()))

Keyword extraction took: 138.6468641757965 seconds


In [38]:
# Consolidate keywords: help us determine which ones to actually use
def list_top_keywords():
    """Print the most prominent keywords across all papers.

    Counts and ranks of identical phrases are summed over every paper in
    ``keywords_from_id``. The 1000 phrases with the highest summed rank are
    then filtered (longer than 4 characters, lowercase ASCII letters and
    spaces only) and printed as one alphabetically sorted list.
    """
    merged = {}
    for paper_keywords in tqdm(keywords_from_id.values()):
        for entry in paper_keywords:
            seen = merged.get(entry.keyword)
            if seen is None:
                # Store a copy so the per-paper Keyword is not mutated below
                merged[entry.keyword] = dataclasses.replace(entry)
            else:
                seen.count += entry.count
                seen.rank += entry.rank

    # Highest accumulated rank first
    by_rank = sorted(merged, key=lambda phrase: -merged[phrase].rank)

    allowed_chars = set("abcdefghijklmnopqrstuvwxyz ")

    def is_valid(k):
        return len(k) > 4 and set(k) <= allowed_chars

    # Top 1000 by rank, minus the invalid ones, in alphabetical order
    print(sorted(phrase for phrase in by_rank[:1000] if is_valid(phrase)))


list_top_keywords()

HBox(children=(FloatProgress(value=0.0, max=138.0), HTML(value='')))


['a control policy', 'a function', 'a given state', 'a large number', 'a number', 'a random action', 'a real robot', 'a reward function', 'a set', 'a small number', 'a stochastic policy', 'a variety', 'a way', 'a wide range', 'abbeel', 'abbeel et al', 'abstract', 'access', 'account', 'accuracy', 'accurate models', 'action', 'action execution', 'action selection', 'actions', 'active learning', 'actuators', 'adaptation', 'adaptive control', 'addition', 'additional information', 'advance', 'advantage', 'agent', 'agents', 'algorithm', 'algorithms', 'an optimal policy', 'analysis', 'animals', 'application', 'applications', 'apprenticeship learning', 'approach', 'approaches', 'approximate inference', 'approximate inference methods', 'approximation', 'areas', 'artificial agents', 'artificial intelligence', 'artificial neural networks', 'artificial systems', 'assumptions', 'atkeson', 'attention', 'autonomous mental development', 'autonomous robots', 'autonomous systems', 'background', 'bagnel

In [39]:
# Drop any previously loaded mapping before it is imported again below.
# pop() with a default is used instead of `del` so that this cell also runs on
# a fresh kernel, where the name does not exist yet and `del` would raise
# NameError.
globals().pop("curated_keyword_map", None)

In [40]:
# curated_keyword_map is a dict that is looked up by extracted phrase and
# yields the canonical ("nominal") keyword the phrase should be reported as.
from curated_keywords import curated_keyword_map

# "total" is the number of keys (accepted phrases); "unique" is the number of
# distinct values (canonical keywords).
print(
    f"Loaded {len(curated_keyword_map)} total keywords, {len(set(curated_keyword_map.values()))} unique"
)

Loaded 120 total keywords, 25 unique


In [41]:
def _():
    """Select the curated keywords to attach to each paper.

    For every paper, extracted phrases are mapped through
    ``curated_keyword_map`` to canonical keywords and their counts are summed;
    canonical keywords whose summed count reaches ``count_threshold`` are kept.
    Canonical keywords whose phrase occurs in the paper title, or in column 5
    of the paper's CSV row, are then put at the front of the list.

    Also prints how often each canonical keyword was seen, least frequent
    first.

    Returns:
        Dict mapping paper id to its list of canonical keywords. Papers that
        end up with no keywords are omitted.
    """
    import collections

    top_keywords_from_id = {}

    rank_threshold = 0.6
    count_threshold = 2
    min_keywords = 4

    # NOTE(review): a keyword is counted here as soon as it is seen in a
    # paper, even if it later misses count_threshold, and is counted again if
    # the title/CSV match below then adds it. The printed totals can therefore
    # exceed the number of papers that actually carry the keyword.
    keyword_counts = collections.defaultdict(int)

    for paper_id, keywords in keywords_from_id.items():
        paper_keyword_counts = collections.defaultdict(int)
        for k in keywords:
            # Stop at the first low-ranked phrase once enough distinct
            # keywords were found.
            # assumes keywords are ordered by descending rank -- TODO confirm
            if (
                k.rank <= rank_threshold
                and len(paper_keyword_counts) >= min_keywords
            ):
                break

            if k.keyword not in curated_keyword_map:
                continue

            nominal = curated_keyword_map[k.keyword]
            if nominal not in paper_keyword_counts:
                keyword_counts[nominal] += 1
            paper_keyword_counts[nominal] += k.count

        # A list rather than a set: insertion order matters for the output
        top_keywords = [
            keyword
            for keyword, count in paper_keyword_counts.items()
            if count >= count_threshold
        ]

        title_text = paper_from_id[paper_id].title.lower()
        # NOTE(review): column 5 is matched as free text; confirm which CSV
        # field this is.
        csv_text = csv_contents[paper_id][5].lower()

        for keyword, nominal in curated_keyword_map.items():
            if nominal in top_keywords:
                continue

            in_title = keyword in title_text
            # "reinforcement learning" is only accepted from the title, never
            # from the CSV column. This used to read
            # `nominal not in ("reinforcement learning")`, which is a
            # substring test against a str (the parentheses do not make a
            # tuple), so any nominal that is a substring of that phrase was
            # excluded too.
            in_csv = (
                nominal != "reinforcement learning" and keyword in csv_text
            )
            if in_title or in_csv:
                top_keywords.insert(0, nominal)
                keyword_counts[nominal] += 1

        if len(top_keywords) > 0:
            top_keywords_from_id[paper_id] = top_keywords

    for k in sorted(keyword_counts, key=keyword_counts.get):
        print(f"{k} ({keyword_counts[k]}) ", end="")
    print()
    return top_keywords_from_id


top_keywords_from_id = _()

contact dynamics (4) genetic algorithms (7) evolution (8) cognitive sciences (9) unsupervised learning (9) nonlinear systems (10) legged robots (10) dynamic programming (13) trajectory optimization (13) humanoid robotics (14) locomotion (15) survey (17) mobile robots (18) policy gradients (22) state estimation (23) gaussians (24) manipulation (25) visual perception (26) optimal control (27) planning (32) probabilistic models (34) dynamical systems (41) neural networks (44) learning from demonstration (48) reinforcement learning (77) 


In [42]:
# Papers that picked up the "survey" keyword but should not carry it
not_surveys = (130, 43)

# sorted() materializes the ids up front, so entries can safely be removed
# from the dict inside the loop.
for paper_id in sorted(top_keywords_from_id.keys()):
    keywords = top_keywords_from_id[paper_id]
    if "survey" in keywords:
        if paper_id in not_surveys:
            keywords.remove("survey")
            if not keywords:
                # Never leave an empty keyword list behind: a paper that is
                # present with no keywords would be missed by any check that
                # looks for ids absent from top_keywords_from_id.
                del top_keywords_from_id[paper_id]
        # Printed for the overridden papers as well, so every "survey" match
        # can be reviewed.
        print(f"{paper_id}: {paper_from_id[paper_id].title}")

# Spot check of one paper; .get() avoids a KeyError if it has no keywords
print(top_keywords_from_id.get(128))


20: a survey on policy search for robotics
34: a survey of iterative learning control
43: robots that can adapt like animals
61: is imitation learning the route to humanoid robots?
72: data-driven grasp synthesis-a survey
76: reinforcement learning: a survey
84: an algorithmic perspective on imitation learning
93: model learning for robot control: a survey
94: a brief survey of deep reinforcement learning
96: locally weighted learning and locally weighted learning for control
97: reinforcement learning in robotics: a survey
101: cognitive developmental robotics: a survey
129: affordances in psychology|neuroscience and robotics: a survey
130: assessing grasp stability based on learning and haptic data
144: learning control in robotics
145: a review of robot learning for manipulation: challenges, representations, and algorithms
['optimal control']


In [43]:
# Print papers with missing keywords: every CSV row index from 1 up that has no
# entry in top_keywords_from_id. The loop variable is deliberately not called
# `id`, which would shadow the builtin for the rest of the notebook.
all_paper_ids = set(range(1, len(csv_contents)))
for missing_id in sorted(all_paper_ids - set(top_keywords_from_id)):
    print(missing_id, csv_contents[missing_id][1])

31 applied nonlinear control
50 robot skill learning: from reinforcement learning to evolution strategies
70 robotics|vision and control - fundamental algorithms in matlab
73 a simple learning strategy for high-speed quadrocopter multi-flips
102 on the adaptive control of robot manipulator
105 forward models: supervised learning with a distal teacher
109 representations for robot knowledge in the knowrob framework
115 resilient machines through continuous self-modeling
120 how the body shapes the way we think: a new view of intelligence
132 optimal control and estimation
139 sequential composition of dynamically dexterous robot behaviors
140 map learning with uninterpreted sensors and effectors
141 experiments in synthetic psychology


In [47]:
# Hand-picked keywords keyed by paper id. The comment above each entry is the
# paper's title.
manual_keywords_from_id = {
    # applied nonlinear control
    31: ["nonlinear systems", "optimal control"],
    # robot skill learning: from reinforcement learning to evolution strategies
    50: ["reinforcement learning", "evolution"],
    # robotics|vision and control - fundamental algorithms in matlab
    70: ["visual perception"],
    # a simple learning strategy for high-speed quadrocopter multi-flips
    73: ["policy gradients"],
    # on the adaptive control of robot manipulator
    102: ["dynamical systems", "manipulation"],
    # forward models: supervised learning with a distal teacher
    105: ["dynamical systems", "neural networks"],
    # representations for robot knowledge in the knowrob framework
    109: [],
    # resilient machines through continuous self-modeling
    115: ["legged robots", "locomotion"],
    # how the body shapes the way we think: a new view of intelligence
    120: ["cognitive sciences"],
    # the coordination of arm movements: an experimentally confirmed mathematical model
    128: ["cognitive sciences", "dynamical systems"],
    # optimal control and estimation
    132: ["optimal control", "state estimation"],
    # sequential composition of dynamically dexterous robot behaviors
    139: ["dynamical systems"],
    # map learning with uninterpreted sensors and effectors
    140: [],
    # experiments in synthetic psychology
    141: ["cognitive sciences"],
    # closing the sim-to-real loop: adapting simulation randomization with real world experience
    146: ["policy gradients", "manipulation"],
}

# Validate first, before anything is modified: every manual keyword must be a
# canonical keyword, i.e. one that curated_keyword_map maps to itself. Using
# .get() turns an unknown keyword into a readable assertion failure instead of
# a bare KeyError.
for k, v in manual_keywords_from_id.items():
    for keyword in v:
        assert (
            curated_keyword_map.get(keyword) == keyword
        ), f"{keyword!r} (paper {k}) is not a canonical curated keyword"

# Merge: papers that already have automatic keywords get the manual ones
# appended (skipping duplicates). Papers without automatic keywords are left
# alone here and stay available through manual_keywords_from_id.
for k, v in manual_keywords_from_id.items():
    if k in top_keywords_from_id:
        print(paper_from_id[k].title)
        print("Current:", top_keywords_from_id[k])
        print("Adding:", v)
        top_keywords_from_id[k].extend(
            [
                keyword
                for keyword in v
                if keyword not in top_keywords_from_id[k]
            ]
        )

the coordination of arm movements: an experimentally confirmed mathematical model
Current: ['optimal control']
Adding: ['cognitive sciences', 'dynamical systems']
closing the sim-to-real loop: adapting simulation randomization with real world experience
Current: ['reinforcement learning']
Adding: ['policy gradients', 'manipulation']


In [48]:
# Copy every row so that csv_contents itself is left untouched, then add the
# new column to the header row.
new_csv_contents = [list(row) for row in csv_contents]
new_csv_contents[0].append("Keywords")

# Automatic keywords take precedence; papers without any fall back to the
# manual table. Keywords are stored comma-separated in a single cell.
for row in new_csv_contents[1:]:
    paper_id = int(row[0])
    keywords = (
        top_keywords_from_id[paper_id]
        if paper_id in top_keywords_from_id
        else manual_keywords_from_id[paper_id]
    )
    row.append(",".join(keywords))

In [49]:
# Write the augmented table out; newline="" leaves line endings to the csv
# module.
with open("./masterdata_keywords.csv", "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerows(new_csv_contents)

In [51]:
# keywords_from_id[146]