In [3]:
import requests
import re,os
from time import sleep
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from pprint import pprint
from tqdm.notebook import tqdm
import mojimoji

# Example URL of a single contents page (contents/2154.html), kept for manual experiments.
example_url = 'http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/contents/2154.html'

In [4]:
def editor(dd_tag):
    """Extract the editor name from a ``<dd>`` tag.

    The name is the second line of ``dd_tag.text`` with every ideographic
    space (U+3000) removed.

    Args:
        dd_tag: Object exposing a ``text`` string attribute (e.g. a
            BeautifulSoup tag).

    Returns:
        The editor name, or ``'No name'`` when ``dd_tag`` has no usable
        ``text`` attribute or the text has no second line.
    """
    try:
        second_line = dd_tag.text.split('\n')[1]
    except (AttributeError, IndexError):
        # Missing tag/text, or the text is a single line: use the placeholder.
        return 'No name'
    return second_line.replace('\u3000', '')

def title(dd_tag):
    """Extract the (English, Japanese) title pair from a ``<dd>`` tag.

    The first and third children of the tag's ``<a>`` element are used as
    the English and Japanese titles respectively. When the anchor has fewer
    than three children, the anchor's whole text is returned as the first
    element and the second element is empty.

    Args:
        dd_tag: Object exposing an ``a`` attribute (e.g. a BeautifulSoup tag).

    Returns:
        A 2-tuple ``(english_title, japanese_title)``; ``('', '')`` when
        there is no anchor at all.
    """
    try:
        # ``children`` replaces the deprecated bs4 alias ``childGenerator()``.
        children = list(dd_tag.a.children)
        return children[0], children[2]
    except (AttributeError, IndexError, TypeError):
        pass

    # Fallback: no three-part structure, use the full link text if any.
    try:
        return dd_tag.a.text, ''
    except AttributeError:
        return '', ''

def link(dd_tag,kokyuroku_path='http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/'):
    """Build the absolute URL for the link found in a ``<dd>`` tag.

    Args:
        dd_tag: Object exposing an ``a`` attribute whose ``get('href')``
            returns the relative link (e.g. a BeautifulSoup tag).
        kokyuroku_path: Base URL that the ``href`` value is appended to.

    Returns:
        ``kokyuroku_path + href``, or ``'No URL'`` when the tag has no
        anchor or the anchor has no ``href``.
    """
    try:
        return kokyuroku_path + dd_tag.a.get('href')
    except (AttributeError, TypeError):
        # AttributeError: no tag / no <a>; TypeError: href is missing (None).
        return 'No URL'

In [5]:
def scrape_koukyuroku_list(start_year=1964,end_year=2020,print_info=False):
    """Scrape the yearly index pages and return their entries as a DataFrame.

    For every year in ``start_year..end_year`` (inclusive) the page
    ``.../kokyuroku/<year>.html`` is downloaded, its ``<dt>``/``<dd>`` tags
    are read, and one row per zipped (dt, dd) pair is collected. The function
    sleeps one second after each page.

    Args:
        start_year: First year to fetch.
        end_year: Last year to fetch (inclusive).
        print_info: When true, print number, editor and both titles of every
            collected row.

    Returns:
        A DataFrame indexed by the text of each ``<dt>`` with its first three
        characters dropped, with columns ``editors``, ``English Title``,
        ``Japanese Title``, ``Link`` and ``Year``. Line breaks (``\\n``,
        ``\\r``) are removed from every cell.
    """
    rows = []
    for year in tqdm(range(start_year, end_year + 1)):
        response = requests.get('http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/{0}.html'.format(year))
        soup = bs(response.text.encode(response.encoding), 'html5lib')
        dd_tags = soup.find_all('dd')

        # Every per-page list is reversed before the lists are zipped together.
        numbers = [dt_tag.text[3:] for dt_tag in soup.find_all('dt')][::-1]
        editor_names = [editor(dd_tag) for dd_tag in dd_tags][::-1]
        title_pairs = [title(dd_tag) for dd_tag in dd_tags][::-1]
        urls = [link(dd_tag) for dd_tag in dd_tags][::-1]

        for number, editor_name, title_pair, url in zip(numbers, editor_names, title_pairs, urls):
            rows.append([number, editor_name, title_pair[0], title_pair[1], url, year])
            if print_info:
                print(number, editor_name, title_pair[0], title_pair[1])
        sleep(1)

    # Convert to a DataFrame
    columns = ['editors','English Title','Japanese Title','Link','Year']
    table = np.array(rows)
    n_rows = table.shape[0]
    for r in range(n_rows):
        for c in range(table.shape[1]):
            cell = table[r, c]
            if '\n' in cell or '\r' in cell:
                table[r, c] = cell.replace('\n', '').replace('\r', '')
    # The first column becomes the index; the remaining five are the data columns.
    return pd.DataFrame(table[:, 1:], index=table[:, 0], columns=columns)

def save_koukyuroku_list(data_df,save_path):
    """Write ``data_df`` to ``save_path`` as CSV using pandas' default options."""
    data_df.to_csv(path_or_buf=save_path)
    
def remove_new_lines(csv_path,save_path):
    """Clean a scraped CSV: merge broken records and normalise its fields.

    Step 1 joins physical lines that belong to the same record: any line
    (after the header) whose first comma-separated field is not made of
    digits only is appended to the previous record, replacing that record's
    final character (its line break).

    Step 2 rewrites each record: full-width commas become ``'・'``,
    full-width ASCII characters are converted to half-width, the editor
    field is set to ``'No editor'`` when it merely repeats the title fields,
    and the two title fields are swapped when a title sits in the wrong
    column.

    The file is read and written as UTF-8 (the encoding pandas uses by
    default for ``to_csv``) rather than the platform's locale encoding.

    Args:
        csv_path: Path of the CSV file to read.
        save_path: Path of the cleaned file to write (may equal ``csv_path``).
    """
    # NOTE: inside the character class, " -:" is a range from space to colon,
    # so most ASCII punctuation and digits are accepted as well.
    ascii_title = re.compile(r"[a-zA-Z0-9\(\), -:']+")

    # Remove line breaks embedded in records
    with open(csv_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    new_lines = lines[:1]  # header (stays empty for an empty file)
    for line in lines[1:]:
        if re.fullmatch(r'[0-9]+', line.split(',')[0]):
            new_lines.append(line)
        else:
            # Continuation of the previous record: drop its trailing character and glue.
            new_lines[-1] = ''.join([new_lines[-1][:-1], line])

    # Convert full-width alphanumerics and mark entries without an editor as 'No editor'
    for i in range(len(new_lines)):
        # Presumably '，' is replaced first so that zen_to_han cannot turn it into an
        # ASCII comma and add a field.
        # NOTE(review): the plain split(',') does not honour CSV quoting, so a quoted
        # field containing a comma is spread over several elements of `record`.
        record = mojimoji.zen_to_han(new_lines[i].replace('，','・'), kana=False, ascii=True).split(',')
        if len(record) >= 4:  # shorter records have no title fields to inspect
            if record[1].replace(' ','') == ''.join(record[2:4]).strip().replace(' ',''):
                record[1] = 'No editor'
            # A non-ASCII title in the English column with an empty Japanese column: swap.
            if ascii_title.fullmatch(record[2]) is None and record[3] == '':
                record[2], record[3] = record[3], record[2]
            # An ASCII title in the Japanese column with an empty English column: swap.
            if ascii_title.fullmatch(record[3]) and record[2] == '':
                record[2], record[3] = record[3], record[2]
        new_lines[i] = ",".join(record)

    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(''.join(new_lines))
        
def pipeline_scrape_to_save(start_year=1964,end_year=2020):
    """Scrape the given year range, save it as CSV and clean that CSV in place.

    The output file is ``./kokyuroku_<start_year>_to_<end_year>.csv``.
    """
    csv_path = f'./kokyuroku_{start_year}_to_{end_year}.csv'
    save_koukyuroku_list(scrape_koukyuroku_list(start_year, end_year), csv_path)
    # The cleaning step reads and overwrites the same file.
    remove_new_lines(csv_path=csv_path, save_path=csv_path)

In [168]:
# Run the full pipeline with the default year range (1964-2020): scrape, save, clean.
pipeline_scrape_to_save()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


A Jupyter Widget




In [12]:
# res = requests.get('http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/2008.html')
# soup = bs(res.text.encode(res.encoding),'html5lib')
# dd = soup.find_all('dd')
# pprint(list(zip(list(map(lambda x:x.text[3:], soup.find_all('dt'))),list(map(title, dd)),list(map(author, dd)),list(map(link, dd)))))
# pprint(list(map(title, dd)))
# pprint(list(map(link, dd)))
# pprint(list(map(title, dd)))

In [None]:
# data = []
# for year in tqdm(range(1964,2021)):
#     res = requests.get('http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/{0}.html'.format(year))
#     soup = bs(res.text.encode(res.encoding),'html5lib')
#     dd = soup.find_all('dd')
#     numbers = reversed(list(map(lambda x:x.text[3:],soup.find_all('dt'))))
#     authors = reversed(list(map(author, dd)))
#     titles = reversed(list(map(title, dd)))
#     links = reversed(list(map(link, dd)))
#     for nu,au,ti, li in zip(numbers,authors,titles,links):
#         data.append([nu,au,ti[0],ti[1],li,year])
#         print(nu,au,ti[0],ti[1])
#     sleep(1)

In [253]:
# data_np = np.array(data)
# for i in range(data_np.shape[0]):
#     for j in range(data_np.shape[1]):
#         if '\n' in data_np[i,j] or '\r' in data_np[i,j]:
#             print(repr(data_np[i,j]))
#             data_np[i,j] = data_np[i,j].replace('\n','').replace('\r','')
# col = ['authors','English Title','Japanese Title','Link','Year']
# df = pd.DataFrame(data_np[:,1:],index=data_np[:,0],columns=col)
# df

In [255]:
# df.to_csv('./kokyuroku.csv')

In [8]:
# pd.read_csv('./kokyuroku.csv',index_col=0)

In [10]:
# with open('./kokyuroku.csv') as f:
#     lines = f.readlines()
#     new_lines = [lines[0]]
#     for i in range(1,len(lines)):
#         line = lines[i]
#         if re.fullmatch(r'[0-9]+',line.split(',')[0]):
#             new_lines.append(line)
#         else:
#             new_lines[-1] = ''.join([new_lines[-1][:-1],line])

# for i in range(len(new_lines)):
#     record = mojimoji.zen_to_han(new_lines[i].replace('，','・'), kana=False, ascii=True).split(',')
#     if record[1].replace(' ','')==(''.join(record[2:4]).strip().replace(' ','')):
#         record[1] = 'No author'
#     new_lines[i] = ",".join(record)

# with open('./kokyuroku_organized.csv', 'w') as f:
#     f.write(''.join(new_lines))
#     f.flush()

In [11]:
# NOTE(review): './kokyuroku_organized.csv' is only written by the commented-out cell above;
# pipeline_scrape_to_save() writes './kokyuroku_<start>_to_<end>.csv' instead, so this cell
# presumably relies on a file left over from an earlier run — confirm.
pd.read_csv('./kokyuroku_organized.csv',index_col=0)

Unnamed: 0,authors,English Title,Japanese Title,Link,Year
1,"中野茂男 (NAKANO,SHIGEO )",Some Analytic Structures Associated to Algebra...,代数曲線に附随する二三の解析的構造,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
2,岩野 正宏、木村 俊房、大久保謙二郎,境界層と変わり点に関するシンポジウム報告(I),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
3,角谷 典彦、今井 巧、森口 治生,境界層と変わり点に関するシンポジウム報告(II),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
4,"阿曾義之 (ASO,YOSHIYUKI )",Numerical Solution of Singular Integral Equati...,特異積分方程式の数値解法,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
5,No author,Proceeding of the Conference on Operator Ring ...,作用素環とその物理的応用に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
6,No author,Proceeding of the Conference on the Mathematic...,散乱の理論の数学に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
7,No author,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
8,No author,Reports on the Symposium of the Functional Equ...,最適制御問題の函数方程式研究シンポジウム報告1,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
9,"荒木不二洋 (ARAKI,HUZIHIRO )",Proceeding of the Conference on Operator Ring,作用素環研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
10,No author,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966


In [None]:
# Fetch the example contents page for manual inspection.
# (requests.get() requires a URL; calling it without one raises TypeError.)
res = requests.get(example_url)
soup = bs(res.text.encode(res.encoding),'html5lib')

In [6]:
def number_of_pdf(div_tag):
    """Return the first "<digits>." number found in ``div_tag.text`` as an int.

    Args:
        div_tag: Object exposing a ``text`` string attribute.

    Returns:
        The integer formed by the digits that precede the first such dot.

    Raises:
        AttributeError: If the text contains no digits followed by a dot.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    match = re.search(r'([0-9]+)\.', div_tag.text)
    return int(match.group(1))
    
def title_of_pdf(div_tag):
    """Return the title of an entry.

    When the tag contains a link, the link text is the title. Otherwise the
    title is the second space-separated token of the tag text, cut at its
    first ``'-'``.
    """
    anchor = div_tag.a
    if anchor is not None:
        return anchor.text
    # No link: take the token after the first space and drop anything from the first hyphen on.
    second_token = div_tag.text.split(' ')[1]
    return second_token.split('-')[0]
    
def link_of_pdf(div_tag):
    """Return the PDF URL of an entry, or ``''`` when the tag has no link.

    Every ``'./'`` in the link's ``href`` is replaced by the contents base URL.
    """
    anchor = div_tag.a
    if anchor is None:
        return ''
    base_url = "http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/contents/"
    href = anchor.get('href')
    return href.replace('./', base_url)

def authors_of_pdf(div_tag):
    """Split an author line into affiliations, Japanese names and English names.

    The tag text has the form ``<affiliations><3 ideographic spaces><authors>``
    or just ``<authors>``. Within ``authors`` an opening parenthesis directly
    followed by ASCII letters starts the English names; everything before it
    is the Japanese names. Multiple entries are separated by ``'/'``.

    Args:
        div_tag: Object exposing a ``text`` string attribute.

    Returns:
        A 3-tuple ``(affiliations, authors_JPN, authors_ENG)`` of lists of
        stripped strings. Without an affiliation part, ``affiliations`` is a
        list of empty strings as long as ``authors_ENG``; without an English
        part, the names go to ``authors_ENG`` and ``authors_JPN`` is a list of
        empty strings of the same length.
    """
    separator = '\u3000\u3000\u3000'

    text = div_tag.text.strip()
    has_affiliation = separator in text
    if has_affiliation:
        # Split at the FIRST separator only, so that a longer run of ideographic
        # spaces (two or more separators) no longer fails to unpack.
        affiliation_part, _, authors = text.partition(separator)
        affiliations = [name.strip() for name in affiliation_part.split('/')]
    else:
        authors = text

    # "(" followed by ASCII letters marks the start of the English names.
    english_start = re.search(r'\([a-zA-Z]+', authors)
    if english_start:
        japanese_part = authors[:english_start.start()]
        english_part = authors[english_start.start() + 1:]
        authors_JPN = [name.strip() for name in japanese_part.strip().split('/')]
        authors_ENG = [name.strip() for name in english_part.replace(')', '').strip().split('/')]
    else:
        authors_ENG = [name.strip() for name in authors.strip().split('/')]
        authors_JPN = [''] * len(authors_ENG)

    if not has_affiliation:
        affiliations = [''] * len(authors_ENG)
    return affiliations, authors_JPN, authors_ENG

def make_list_of_pdf_info(url):
    """Download a contents page and collect the information of each entry.

    All ``<div>`` tags of the page are scanned in order. A div whose text
    contains "<digits>." is treated as an entry (number, title, link) and the
    div immediately following it is parsed as its author line; both are then
    skipped.

    Args:
        url: URL of the contents page.

    Returns:
        A list of ``[number, title, link, affiliations, authors_JPN,
        authors_ENG]`` lists, one per entry.

    Raises:
        IndexError: If an entry div is the last div on the page (there is no
            following author div).
    """
    res = requests.get(url)
    soup = bs(res.text.encode(res.encoding),'html5lib')
    all_div = soup.find_all('div')

    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    numbered_entry = re.compile(r'[0-9]+\.')

    list_pdf_info = []
    i = 0
    while i < len(all_div):
        div_tag = all_div[i]
        if numbered_entry.search(div_tag.text) is None:
            i += 1
            continue
        number = number_of_pdf(div_tag)
        pdf_title = title_of_pdf(div_tag)
        pdf_link = link_of_pdf(div_tag)
        # The author line is taken from the very next div.
        affiliations, authors_JPN, authors_ENG = authors_of_pdf(all_div[i+1])
        list_pdf_info.append([number, pdf_title, pdf_link, affiliations, authors_JPN, authors_ENG])
        i += 2  # skip the author div just consumed
    return list_pdf_info

def save_pdf_info(url,kokyuroku_number):
    """Scrape one contents page and save its entries to ``./pdf_info/<number>.csv``.

    List-valued fields (affiliations, author names) are joined with ``','``
    into a single string per cell. The ``./pdf_info`` directory is created if
    it does not exist yet.

    Args:
        url: URL of the contents page.
        kokyuroku_number: Value used as the CSV file name.
    """
    rows = make_list_of_pdf_info(url)
    table = np.array([
        [','.join(field) if isinstance(field, list) else field for field in row]
        for row in rows
    ])
    columns = ['No.','title','link','Affiliation','Author in JPN','Author in ENG']
    info = pd.DataFrame(table,columns=columns).set_index('No.')
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs('./pdf_info', exist_ok=True)
    info.to_csv(f'./pdf_info/{kokyuroku_number}.csv')
    
def save_all_pdf_info(kokyuroku_csv_path, print_title=False, skip_existing=True):
    """Run ``save_pdf_info`` for every row of the index CSV.

    The CSV is read with its first column as index. The index values are used
    as file numbers, the 4th remaining column as the page URL and the 3rd as
    the title to print (falling back to the 2nd column when the 3rd is NaN).
    Failures are appended to ``./error_log.txt`` (removed at the start of each
    call) and the function sleeps one second after every attempted download.

    Args:
        kokyuroku_csv_path: Path of the index CSV.
        print_title: When true, print a title for every row.
        skip_existing: When true, rows whose ``./pdf_info/<number>.csv``
            already exists are skipped (without sleeping).
    """
    df = pd.read_csv(kokyuroku_csv_path, index_col=0)
    kokyuroku_numbers = df.index.values
    table = df.values
    links = table[:, 3]
    titles = table[:, 2]

    if os.path.exists('./error_log.txt'):
        os.remove('./error_log.txt')

    for i in tqdm(range(len(kokyuroku_numbers))):
        number = kokyuroku_numbers[i]

        if print_title:
            shown = titles[i]
            # A missing title is read as NaN: show the other title column instead.
            if not isinstance(shown, str) and np.isnan(shown):
                shown = df.values[i, 1]
            print(shown)

        if skip_existing and os.path.exists(f'./pdf_info/{number}.csv'):
            continue

        try:
            save_pdf_info(links[i], number)
        except Exception as e:
            # Record the failure and keep going with the next row.
            log_line = ','.join([str(number), str(links[i]), str(e), '\n'])
            with open('./error_log.txt', 'a') as f:
                f.write(log_line)
        finally:
            sleep(1)

In [1]:
# kokyuroku_csv_path = './kokyuroku_1964_to_2020.csv'
# save_all_pdf_info(kokyuroku_csv_path,print_title=True,skip_existing=True)

In [45]:
def concatenate_csv():
    """Combine every ``./pdf_info/<number>.csv`` file into one DataFrame.

    Files are processed in ascending numeric order of their file name. Each
    frame gets two extra columns, ``kokyuroku_number`` (the number in the file
    name) and ``kokyuroku_url`` (the matching contents page URL).

    Returns:
        A DataFrame indexed by ``('kokyuroku_number', 'No.')``.

    Raises:
        ValueError: If the directory contains no ``.csv`` file.
    """
    info_dir = './pdf_info/'
    suffix = '.csv'
    csv_names = sorted(
        (name for name in os.listdir(info_dir) if name.endswith(suffix)),
        key=lambda name: int(name[:-len(suffix)]),
    )
    if not csv_names:
        raise ValueError(f'no CSV files found in {info_dir}')

    frames = []
    for name in tqdm(csv_names):
        kokyuroku_number = int(name[:-len(suffix)])
        frame = pd.read_csv(info_dir + name, index_col=0)
        frame['kokyuroku_number'] = kokyuroku_number
        frame['kokyuroku_url'] = f'http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/contents/{kokyuroku_number}.html'
        frames.append(frame)

    # One concat at the end: DataFrame.append was removed in pandas 2.0 and
    # appending inside the loop copied the accumulated frame on every iteration.
    concatenated_df = pd.concat(frames)
    return concatenated_df.reset_index().set_index(['kokyuroku_number','No.'])

def read_all_pdf_info():
    """Load ``./all_pdf_info.csv`` indexed by ``('kokyuroku_number', 'No.')``."""
    flat = pd.read_csv('./all_pdf_info.csv')
    return flat.set_index(['kokyuroku_number', 'No.'])

In [46]:
# NOTE(review): no visible cell writes './all_pdf_info.csv'; presumably it was produced by
# saving the result of concatenate_csv() — confirm before re-running on a fresh checkout.
read_all_pdf_info()

Unnamed: 0_level_0,Unnamed: 1_level_0,title,link,Affiliation,Author in JPN,Author in ENG,kokyuroku_url
kokyuroku_number,No.,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,"代数曲線に付随するニ,三の解析的構造 (代数曲線に附随する二三の解析的構造)",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,京都大学数理解析研究所,中野 茂男,"NAKANO,SHIGEO",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
2,1,変わり点を含む線型常微分方程式 : Reductionと解の漸近展開 (境界層と変わり点に関...,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,東京都立大学理学部,岩野 正宏,"IWANO,MASAHIRO",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
2,2,変わり点を含む線型常微分方程式 : 解の漸近展開 (境界層と変わり点に関するシンポジウム報告 I),http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,東京大学教養学部,木村 俊房,"KIMURA,TOSIHUSA",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
2,3,変わり点を含む線型常微分方程式 : 接続公式 (境界層と変わり点に関するシンポジウム報告 I),http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,上智大学理工学部,大久保 謙二郎,"OKUBO,KENJIRO",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
3,1,Orr-Sommerfeld方程式の転移点 (境界層と変わり点に関するシンポジウム報告 II),http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,京都大学理学部,角谷 典彦,"KAKUTANI,TSUNEHIKO",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
3,2,物理学におけるWKB法の応用 (境界層と変わり点に関するシンポジウム報告 II),http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,東京大学理学部,今井 功,"IMAI,ISAO",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
3,3,WKB法の精密化 (境界層と変わり点に関するシンポジウム報告 II),http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,静岡大学文理学部,森口 治生,"MORIGUCHI,HARUO",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
4,1,"特異積分方程式の数値解法 : 除く, 固有値問題, Volterra型, 微積分方程式, 純...",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,群馬大学学芸学部,阿曾 義之,"ASO,YOSHIYUKI",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
5,1,$C^*$-algebraのテンソル積とその表現 (作用素環とその物理的応用に関する研究会報告集),http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,東北大学理学部,竹崎 正道,"TAKESAKI,MASAMICHI",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
5,2,Factorsの構成法と接合積 (作用素環とその物理的応用に関する研究会報告集),http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,東北大学教育学部,鶴丸 孝司,"TSURUMARU,TAKASHI",http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...
