In [1]:
%load_ext autoreload
%autoreload 2

import glob
import textract
from typing import Dict
from tqdm.auto import tqdm
from dataclasses import dataclass
import os
import csv

# Data loading

In [39]:
# Note: to get textract to work here, we had to modify textract/parsers/utils.py
# decode function to return UTF8 whenever the chardet confidence was not high
# enough, eg:
#
#   result = chardet.detect(text)
#   encoding = result['encoding'] if result['confidence'] > 0.85 else 'utf-8'
#   return text.decode(encoding, 'ignore')
#
# We'd otherwise just get a bunch of UnicodeDecodeErrors

# Paper struct
@dataclass
class Paper:
    """One paper found under ``papers/``, as assembled by the loading loop."""

    title: str  # column 1 of the paper's row in masterdatafinal.csv
    contents: str  # text extracted by textract, decoded as UTF-8
    extension: str  # extension of the source file name, without the dot


# Load CSV
# newline="" is what the csv module documents for files passed to csv.reader,
# so that newlines embedded inside quoted fields are parsed correctly.
with open("../masterdatafinal.csv", newline="") as file:
    csv_contents = list(csv.reader(file))

paper_map: Dict[int, Paper] = {}

# glob returns paths in arbitrary order; sort so every run processes the
# papers (and therefore builds paper_map) in the same order.
paper_paths = sorted(glob.glob("papers/*"))
for path in tqdm(paper_paths):
    # Files are expected to be named "<paper_id>.<extension>". splitext copes
    # with a missing extension, and a non-numeric stem fails in int() with a
    # message that names the offending stem instead of an opaque unpack error.
    stem, dotted_extension = os.path.splitext(os.path.basename(path))
    paper_id = int(stem)
    extension = dotted_extension[1:]  # drop the leading "."
    contents = textract.process(path).decode("utf8")
    # NOTE(review): the title lookup is positional -- it assumes CSV row number
    # `paper_id` describes this paper and that column 1 is the title. Confirm
    # against the CSV layout.
    paper_map[paper_id] = Paper(
        title=csv_contents[paper_id][1], contents=contents, extension=extension
    )

HBox(children=(FloatProgress(value=0.0, max=133.0), HTML(value='')))




# NLP stuff 

First, let's just define a helper for pulling out keywords.

We'll use TextRank (via `pytextrank`). It is not state of the art, but it should be sufficient for pulling out candidate keywords.

In [24]:
# Uncomment to download language model
# !python -m spacy download en_core_web_lg

import spacy
import pytextrank
import en_core_web_lg

# Build the shared spaCy pipeline and append pytextrank's TextRank component as
# the last stage; documents processed by `nlp` are then read via
# `doc._.phrases` in get_keywords.
# NOTE(review): passing a component object to add_pipe looks like the
# spaCy 2.x / pytextrank 2.x registration style -- confirm the pinned versions.
nlp = en_core_web_lg.load()
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)


@dataclass
class Keyword:
    """A key phrase extracted from a text, with its TextRank statistics."""

    keyword: str  # phrase text (``text`` attribute of the extracted phrase)
    count: int  # ``count`` attribute of the extracted phrase
    rank: float  # rank score; get_keywords rescales it relative to the top phrase


def get_keywords(text, pipeline=None):
    """Extract key phrases from ``text`` and normalize their ranks.

    Args:
        text: The document text to analyse.
        pipeline: Optional callable that maps a string to a document exposing
            ``doc._.phrases`` (items with ``text``, ``count`` and ``rank``).
            Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        A list of ``Keyword`` objects in the order the pipeline produced them.
        Ranks are divided by the highest rank in the document, so the top
        phrase has rank 1.0. If no phrase has a positive rank, the ranks are
        returned unchanged.
    """
    if pipeline is None:
        pipeline = nlp

    doc = pipeline(text)
    output = [
        Keyword(keyword=p.text, count=p.count, rank=p.rank)
        for p in doc._.phrases
    ]

    # Normalize ranks. Guard against a zero maximum (every phrase ranked 0),
    # which previously raised ZeroDivisionError.
    max_rank = max((kw.rank for kw in output), default=0)
    if max_rank > 0:
        for kw in output:
            kw.rank = kw.rank / max_rank
    return output

In [62]:
# Per-paper character budget passed to spaCy. spaCy's own limit
# (`nlp.max_length`) is measured in characters and defaults to 1,000,000.
max_length = 300000
keyword_map = {}

for paper_id, paper in tqdm(paper_map.items()):
    text = paper.contents
    if len(text) > max_length:
        # Only trim when the paper is actually over budget, then back up to
        # the last space so the text does not end in a cut-off word. If the
        # slice contains no space at all, keep the hard cut rather than
        # discarding the whole paper.
        truncated = text[:max_length]
        text = truncated.rpartition(" ")[0] or truncated

    # Merge this paper's keywords into the corpus-wide map: counts and
    # per-paper normalized ranks are summed across papers.
    for keyword in get_keywords(text):
        existing = keyword_map.get(keyword.keyword)
        if existing is None:
            keyword_map[keyword.keyword] = keyword
        else:
            existing.count += keyword.count
            existing.rank += keyword.rank

HBox(children=(FloatProgress(value=0.0, max=133.0), HTML(value='')))

24 .48 .2 .52 .83 .34 .15 .21 .97 .38 .56 .130 .101 .1 .104 .42 .91 .49 .26 .126 .9 .36 .113 .28 .118 .110 .99 .131 .37 .23 .114 .106 .80 .14 .98 .127 .71 .5 .78 .103 .85 .123 .18 .45 .87 .11 .61 .46 .22 .4 .63 .72 .133 .107 .20 .17 .41 .136 .13 .66 .96 .44 .39 .122 .62 .3 .137 .89 .7 .112 .100 .90 .76 .124 .31 .117 .33 .58 .93 .65 .64 .67 .40 .86 .134 .108 .84 .139 .142 .119 .88 .53 .47 .16 .60 .54 .6 .27 .10 .128 .95 .25 .35 .121 .29 .59 .43 .111 .57 .82 .30 .51 .68 .135 .79 .8 .69 .116 .129 .32 .73 .125 .120 .138 .94 .19 .92 .81 .55 .12 .75 .115 .77 .


In [94]:
# Keyword strings ordered from highest to lowest accumulated rank. The
# commented-out print further down takes the top 1000 of these and lists them
# alphabetically.
sorted_keywords = sorted(
    keyword_map, key=lambda name: keyword_map[name].rank, reverse=True
)


def is_valid(k):
    """Return True if ``k`` is a usable keyword candidate.

    A candidate qualifies when it is longer than four characters and consists
    solely of lowercase ASCII letters and spaces.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz ")
    return len(k) > 4 and set(k).issubset(allowed_chars)


## Uncomment this line to print out top ~1000 keywords, sorted alphabetically
# print(sorted(filter(is_valid, sorted_keywords[:1000])))

In [98]:
# Load the curated keyword collection from the `curated_keywords` module and
# report its size as a sanity check.
# NOTE(review): presumably hand-picked from the candidate list printed by the
# previous cell -- confirm where this module is maintained.
from curated_keywords import curated_keywords

print(f"Loaded {len(curated_keywords)} curated keywords")

Loaded 41 curated keywords
