# python から scrapbox に記入
webbrowser だと tab を開いてしまうのが課題

https://ich.hatenadiary.com/entry/scrapbox-newpage-from-terminal

In [146]:
import webbrowser
# import chromedriver_binary
# from selenium import webdriver
import urllib.request

import time, datetime
import subprocess

In [2]:
class WriteToScrapbox():
    """Create Scrapbox pages by opening pre-filled page URLs in a web browser.

    A page is created by visiting
    ``https://scrapbox.io/<project>/<title>?body=<text>``; this class builds
    that URL and hands it to ``webbrowser.open``.
    """

    def __init__(self,
                 project: str,   # project name
                 ):
        """Remember the target project.

        Args:
            project: Scrapbox project name; a single trailing ``/`` is dropped.
        """
        # Date stamp taken at construction time, formatted as YYYY.MM.DD.
        self.today = datetime.datetime.today().strftime("%Y.%m.%d")
        self.project = project[:-1] if project.endswith('/') else project

    def _build_url(self, title, contents, tag):
        """Return the page URL for ``title`` whose body is ``tag``, a blank line, then ``contents``."""
        # safe='' also percent-encodes '/', so a slash inside the title stays
        # part of the title instead of becoming an extra URL path segment.
        quoted_title = urllib.parse.quote(title, safe='')
        quoted_tag = urllib.parse.quote(tag + '\n\n')
        quoted_contents = urllib.parse.quote(contents)
        return 'https://scrapbox.io/{}/{}?body={}{}'.format(
            self.project, quoted_title, quoted_tag, quoted_contents)

    def write(self, title, contents, tag, open=False):
        """Open the pre-filled page URL in the default web browser.

        Args:
            title: Page title.
            contents: Page body text, placed after the tag line.
            tag: Text placed at the top of the body, followed by a blank line.
            open: Accepted for backward compatibility; currently ignored
                (the URL is always opened).
        """
        webbrowser.open(self._build_url(title, contents, tag))

In [3]:
# Target Scrapbox project and the writer used to create pages in it.
project = 'ms-papers'
sbwriter = WriteToScrapbox(project)

# Scraping

In [90]:
import time
from typing import List, Dict, Tuple
from tqdm import tqdm

import re
import requests
from bs4 import BeautifulSoup

import arxiv
from googletrans import Translator


# Single googletrans client shared by every translator() call.
trans = Translator()
def translator(text, src='en', tgt='ja'):
    """Translate ``text`` from language ``src`` to language ``tgt``.

    Delegates to ``trans.translate`` and returns its result unchanged
    (callers in this file read the translated string from ``.text``).
    """
    options = {'src': src, 'dest': tgt}
    return trans.translate(text, **options)


In [5]:
class Scraping():
    """Small base class that fetches a page over HTTP and parses it as HTML."""

    def __init__(self, url:str):
        """Store ``url`` as the default page to fetch."""
        self.url = url

    def _request(self, url=None):   # -> requests.Response
        """GET ``url`` (or ``self.url`` when ``url`` is None) with a 10 s timeout.

        Returns the ``requests`` response object; the HTTP status is not
        checked here.
        """
        # Honour the explicit argument; previously it was accepted but ignored
        # and ``self.url`` was always fetched.
        target = self.url if url is None else url
        return requests.get(target, timeout=10)

    def _bs4(self, r):
        """Parse the response body ``r.content`` with the 'html.parser' backend."""
        return BeautifulSoup(r.content, 'html.parser')

In [140]:
class ReadFromAnthology(Scraping):
    """List papers from one ACL Anthology event page, filtered by title."""

    def __init__(self,
                 conference: str,
                 year: int,
                 url: str = 'https://www.aclweb.org/anthology',
                 ):
        """Fetch and parse the event page for ``conference``/``year``.

        The page is downloaded immediately, so every instantiation performs
        one HTTP request.

        Args:
            conference: Venue name, e.g. 'ACL'; lower-cased for the URL.
            year: Event year.
            url: Base URL of the anthology site.
        """
        # Trailing slashes would otherwise produce '//' in the event URL.
        url = url.rstrip('/')
        super().__init__(url)
        self.conference = '{}-{}'.format(conference, year)
        self.parent = url
        self.url = self.parent + '/events/{}-{}/'.format(conference.lower(), year)
        self.r = self._request(self.url)
        self.soup = self._bs4(self.r)

    def read(self, *query) -> List[Dict[str, str]]:
        """Return the papers whose title matches any of ``query``.

        Each term is used as a regular-expression alternative and matched
        case-insensitively against the link text. With no terms the pattern
        is empty and therefore does not restrict the title text.

        Expected page markup:
            <span class=d-block>
                <strong><a class=align-middle href=/anthology/W19-3202/>
                    Lexical Normalization of User-Generated Medical Text
                </a></strong>

        Returns:
            One dict per matching link with keys 'title', 'url' and 'venue'.
        """
        from urllib.parse import urljoin

        title_pattern = re.compile('|'.join(query), flags=re.IGNORECASE)
        link_attrs = {
            'href': re.compile('/anthology/[A-Z]([0-9]*?)-([0-9]*?)/'),
            'class': 'align-middle',
        }
        # `string=` is the current name of the filter formerly called `text=`.
        papers = self.soup.find_all('a', attrs=link_attrs, string=title_pattern)
        # hrefs are site-absolute paths; resolve them against the base URL
        # rather than slicing a fixed-length suffix off it.
        return [
            {
                'title': p.text,
                'url': urljoin(self.parent, p['href']),
                'venue': self.conference,
            }
            for p in papers
        ]

In [141]:
# Build a reader for the ACL 2019 event page; the constructor fetches and parses the page.
reader = ReadFromAnthology('ACL', 2019)

In [142]:
# Keep the papers whose title matches any of the terms (case-insensitive), then show the first hit.
papers = reader.read('semantic Role', 'QA', 'Question Answering')
papers[0]

{'title': 'The Meaning of “Most” for Visual Question Answering Models',
 'url': 'https://www.aclweb.org/anthology/W19-4806/',
 'venue': 'ACL-2019'}

### ReadFromAnthology.read から取得可能な paper が arxiv にヒットしない

In [163]:
from functools import lru_cache

    
class ArxivSearch():
    """Search arXiv and collect paper metadata with English and Japanese abstracts."""

    def __init__(self):
        pass

    def replacing(self, abst:str):
        """Return ``abst`` with every entry of the conversion table applied.

        Newlines become spaces; backslashes and opening braces are removed.
        The replacements run in the table's insertion order.
        """
        # NOTE(review): '}' is not in the table, so closing braces are kept -- confirm intended.
        conversions = {'\n':' ', 'G\\':'', '\\':'', '{':''}
        # Apply eagerly: the previous lazy map() was never consumed, so none
        # of these replacements ever took effect.
        for before, after in conversions.items():
            abst = abst.replace(before, after)
        return abst

    def paper_info(self, year=None, sort_by='submittedDate', order='descending', max=50, **query) -> List[Dict[str, str]]:
        """Query arXiv and return info dicts for the matching papers.

        Args:
            year: Keep only papers whose 'published' value starts with this
                year. ``None`` disables the year filter.
            sort_by: One of relevance, lastUpdatedDate, submittedDate.
            order: Sort direction passed to arXiv, e.g. 'descending'.
            max: Maximum number of results requested from arXiv (the name
                shadows the builtin but is kept for existing callers).
            **query: Field prefix -> term or iterable of terms. All resulting
                ``prefix:term`` clauses are joined with OR. Prefixes:
                ti: Title
                au: Author
                abs: Abstract
                cat: Subject Category
                all: All

        Returns:
            One dict per kept paper with keys 'title', 'author', 'arxiv',
            'en_abst' (cleaned abstract) and 'ja_abst' (its translation).
        """
        clauses = []
        for field, terms in query.items():
            if isinstance(terms, str):
                terms = [terms]
            clauses.extend('{}:{}'.format(field, term) for term in terms)
        q = ' OR '.join(clauses)

        # Forward the sort options; they were previously accepted but ignored.
        results = arxiv.query(query=q, max_results=max,
                              sort_by=sort_by, sort_order=order)
        out = []
        for entry in results or []:
            published = entry.get('published')
            if year is not None:
                # With a year filter, entries lacking a date cannot match.
                if published is None or int(published[:4]) != year:
                    continue
            en_abst = self.replacing(entry.get('summary', ''))
            ja_abst = translator(en_abst).text
            out.append({
                'title': entry.get('title', ''),
                'author': entry.get('author', ''),
                'arxiv': entry.get('arxiv_url', ''),
                'en_abst': en_abst,
                'ja_abst': ja_abst,
            })
        return out

In [133]:
# Search arXiv by title term; no publication year is passed in this call.
arxiver = ArxivSearch()
papers = arxiver.paper_info(ti='semantic role')

ti:semantic role


## main

In [165]:
if __name__ == '__main__':
    # Select the paper source by index: 0 -> 'arxiv', 1 -> 'anthology'.
    reading_src = ['arxiv', 'anthology'][0]
    YEAR = 2019
    QUERY_TI = ('semantic Role', 'QA', 'Question Answering')
    VENUE = 'ACL'

    ## scrapbox
    project = 'ms-papers'
    sbwriter = WriteToScrapbox(project)

    ## papers from ACLAnthology
    if reading_src == 'anthology':
        # Do not instantiate repeatedly: each construction requests the event page.
        reader = ReadFromAnthology(VENUE, YEAR)
        papers = reader.read(*QUERY_TI)

        # One Scrapbox page per paper: venue tag on top, paper URL as the body.
        for paper in papers:
            tag = '#' + paper['venue']
            title = paper['title']
            body = paper['url']
            sbwriter.write(title, body, tag)

    elif reading_src == 'arxiv':
        reader = ArxivSearch()
        papers = reader.paper_info(YEAR, ti=QUERY_TI)

        tag = '#' + str(YEAR)
        for paper in papers:
            title = paper['title']
            # Build a fresh body for every paper; accumulating into one
            # shared string copied all earlier papers into each later page.
            body = ''.join([
                paper['author'] + '\n',
                paper['arxiv'] + '\n\n',
                '>' + paper['en_abst'] + '\n\n',
                '>' + paper['ja_abst'] + '\n\n',
            ])
            sbwriter.write(title, body, tag)