In [168]:
import requests
import re
from time import sleep
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import mojimoji

In [228]:
def author(dd_tag):
    """Extract the author string from one ``<dd>`` entry.

    The author is taken from the second line of ``dd_tag.text`` (the text
    split on ``'\\n'``), with ideographic spaces (U+3000) removed.

    Args:
        dd_tag: Object exposing a ``text`` string attribute. In this notebook
            it is an element returned by ``soup.find_all('dd')``.

    Returns:
        str: The author string, or ``'No name'`` if it cannot be extracted
        (for example when the text has no second line).
    """
    try:
        author_name = dd_tag.text.split('\n')[1].replace('\u3000','')
    except Exception:
        # Best-effort fallback for malformed entries. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt/SystemExit propagate
        # so a long scrape can still be interrupted.
        author_name = 'No name'
    return author_name

def title(dd_tag):
    """Extract the (English, Japanese) title pair from one ``<dd>`` entry.

    The direct children of ``dd_tag.a`` are listed; the first child is used
    as the English title and the third as the Japanese title.
    NOTE(review): presumably the second child is a ``<br>`` separator —
    confirm against the page markup.

    Args:
        dd_tag: Object whose ``a`` attribute exposes ``children`` and
            ``text``. In this notebook it is an element returned by
            ``soup.find_all('dd')``.

    Returns:
        tuple: ``(english_title, japanese_title)``. If the anchor has fewer
        than three children, ``(dd_tag.a.text, '')`` is returned; if there is
        no usable anchor at all, ``('', '')``.
    """
    try:
        # ``.children`` is the documented iterator over direct children;
        # ``childGenerator()`` is only a deprecated alias for it.
        title_list = list(dd_tag.a.children)
        return title_list[0],title_list[2]
    except Exception:
        # ``Exception`` instead of a bare ``except`` keeps the best-effort
        # fallback but no longer swallows KeyboardInterrupt/SystemExit.
        pass
    try:
        return dd_tag.a.text,''
    except Exception:
        return '',''

def link(dd_tag,kokyuroku_path='http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/'):
    """Build the URL for one ``<dd>`` entry.

    Args:
        dd_tag: Object whose ``a`` attribute supports ``get('href')``. In
            this notebook it is an element returned by
            ``soup.find_all('dd')``.
        kokyuroku_path: Prefix that the anchor's ``href`` is appended to.

    Returns:
        str: ``kokyuroku_path + href``, or ``'No path'`` when the entry has
        no anchor or the anchor has no ``href``.
    """
    try:
        path = kokyuroku_path+dd_tag.a.get('href')
    except Exception:
        # Covers a missing anchor (AttributeError) and a missing href
        # (str + None -> TypeError). ``Exception`` instead of a bare
        # ``except`` lets KeyboardInterrupt/SystemExit propagate.
        path = 'No path'
    return path

In [260]:
def scrape_koukyuroku_list(start_year=1964,end_year=2020,print_info=False):
    """Scrape the per-year index pages and return the entries as a DataFrame.

    For every year in ``[start_year, end_year]`` the page
    ``.../kokyuroku/{year}.html`` is fetched and parsed. ``<dt>`` elements
    supply the entry number (their text with the first three characters
    dropped — presumably a ``No.``-style prefix, confirm against the page)
    and ``<dd>`` elements supply author, titles and link through
    ``author``, ``title`` and ``link``. Each page's entries are appended in
    reverse document order. The function sleeps one second after each page.

    Args:
        start_year: First year to fetch (inclusive).
        end_year: Last year to fetch (inclusive).
        print_info: If true, print number, author and titles of every entry
            as it is collected.

    Returns:
        pandas.DataFrame: Indexed by entry number, with columns ``authors``,
        ``English Title``, ``Japanese Title``, ``Link`` and ``Year``. Every
        value, including the index and the year, is stored as text with
        ``\\n`` and ``\\r`` removed. If nothing was collected, an empty frame
        with the same columns is returned.
    """
    data = []
    for year in tqdm(range(start_year,end_year+1)):
        # NOTE(review): no timeout is passed and the HTTP status is not
        # checked, so a hung connection blocks and an error page is parsed
        # like a normal one.
        res = requests.get('http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/{0}.html'.format(year))
        # The decoded text is re-encoded with the response's reported
        # encoding so the parser receives bytes.
        soup = bs(res.text.encode(res.encoding),'html5lib')
        dd = soup.find_all('dd')
        numbers = reversed([dt.text[3:] for dt in soup.find_all('dt')])
        authors = reversed([author(entry) for entry in dd])
        titles = reversed([title(entry) for entry in dd])
        links = reversed([link(entry) for entry in dd])
        for nu,au,ti,li in zip(numbers,authors,titles,links):
            data.append([nu,au,ti[0],ti[1],li,year])
            if print_info:
                print(nu,au,ti[0],ti[1])
        # Pause between requests.
        sleep(1)

    # Convert to a DataFrame. Fields are normalised to plain text with line
    # breaks removed; building the rows in pure Python also handles the
    # "no rows collected" case, where indexing a 2-D array shape would fail.
    col = ['authors','English Title','Japanese Title','Link','Year']
    rows = [
        [str(field).replace('\n','').replace('\r','') for field in record]
        for record in data
    ]
    data_df = pd.DataFrame(
        [row[1:] for row in rows],
        index=[row[0] for row in rows],
        columns=col,
    )
    return data_df

def save_koukyuroku_list(data_df,save_path):
    """Write the scraped table to ``save_path`` as CSV (index included).

    Args:
        data_df: pandas DataFrame to save.
        save_path: Destination file path.
    """
    # ``DataFrame`` has no ``tocsv`` attribute; the writer is ``to_csv``.
    data_df.to_csv(save_path)
    
def remove_new_lines(csv_path,save_path,encoding='utf-8'):
    """Repair records split across lines and normalise the text of a CSV.

    Steps:
      1. Any line after the header whose first comma-separated field is not
         purely digits is treated as a continuation: it is appended to the
         previous line after dropping that line's last character (its line
         break).
      2. Each resulting line has full-width commas replaced by ``・`` and is
         passed through ``mojimoji.zen_to_han(..., kana=False, ascii=True)``.
      3. If the second field equals fields three and four joined together
         (ignoring spaces), the record has no real author and the second
         field is replaced with ``'No author'``.

    ``csv_path`` and ``save_path`` may be the same file; the input is read
    completely before the output is opened.

    Args:
        csv_path: Path of the CSV file to read.
        save_path: Path the cleaned CSV is written to.
        encoding: Text encoding used for both reading and writing. Defaults
            to UTF-8, which is what ``DataFrame.to_csv`` writes by default,
            so the result no longer depends on the platform locale.
    """
    # Remove stray line breaks by merging continuation lines.
    with open(csv_path,'r',encoding=encoding) as f:
        lines = f.readlines()

    new_lines = lines[:1]  # header line; stays empty for an empty file
    for line in lines[1:]:
        if re.fullmatch(r'[0-9]+',line.split(',')[0]):
            new_lines.append(line)
        else:
            new_lines[-1] = ''.join([new_lines[-1][:-1],line])

    # Convert full-width alphanumerics to half-width and mark records that
    # have no author as 'No author'.
    for i,raw_line in enumerate(new_lines):
        record = mojimoji.zen_to_han(raw_line.replace('，','・'), kana=False, ascii=True).split(',')
        # Guard against lines with a single field, which have no author slot.
        if len(record) > 1 and record[1].replace(' ','')==(''.join(record[2:4]).strip().replace(' ','')):
            record[1] = 'No author'
        new_lines[i] = ",".join(record)

    # Leaving the with-block flushes and closes the file.
    with open(save_path,'w',encoding=encoding) as f:
        f.write(''.join(new_lines))
        
def pipeline_scrape_to_save(start_year=1964,end_year=2020):
    """Scrape the given year range, save it as CSV, then clean that CSV in place.

    The output file is ``./kokyuroku_{start_year}_to_{end_year}.csv``.

    Args:
        start_year: First year to scrape (inclusive).
        end_year: Last year to scrape (inclusive).
    """
    output_csv = f'./kokyuroku_{start_year}_to_{end_year}.csv'
    # Scrape and write the raw table in one step ...
    save_koukyuroku_list(
        scrape_koukyuroku_list(start_year,end_year),
        output_csv,
    )
    # ... then rewrite the same file with the line-break/author clean-up applied.
    remove_new_lines(output_csv,output_csv)

In [251]:
# Fetch a single year's index page (2008) so its structure can be inspected by hand.
res = requests.get('http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/2008.html')
# Re-encode the decoded text with the response's reported encoding and parse it with html5lib.
soup = bs(res.text.encode(res.encoding),'html5lib')
# All <dd> elements of the page; the disabled calls below map title/author/link over them.
dd = soup.find_all('dd')
# Ad-hoc inspection calls, left disabled:
# pprint(list(zip(list(map(lambda x:x.text[3:], soup.find_all('dt'))),list(map(title, dd)),list(map(author, dd)),list(map(link, dd)))))
# pprint(list(map(title, dd)))
# pprint(list(map(link, dd)))
# pprint(list(map(title, dd)))

In [None]:
# data = []
# for year in tqdm(range(1964,2021)):
#     res = requests.get('http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuroku/{0}.html'.format(year))
#     soup = bs(res.text.encode(res.encoding),'html5lib')
#     dd = soup.find_all('dd')
#     numbers = reversed(list(map(lambda x:x.text[3:],soup.find_all('dt'))))
#     authors = reversed(list(map(author, dd)))
#     titles = reversed(list(map(title, dd)))
#     links = reversed(list(map(link, dd)))
#     for nu,au,ti, li in zip(numbers,authors,titles,links):
#         data.append([nu,au,ti[0],ti[1],li,year])
#         print(nu,au,ti[0],ti[1])
#     sleep(1)

In [253]:
# data_np = np.array(data)
# for i in range(data_np.shape[0]):
#     for j in range(data_np.shape[1]):
#         if '\n' in data_np[i,j] or '\r' in data_np[i,j]:
#             print(repr(data_np[i,j]))
#             data_np[i,j] = data_np[i,j].replace('\n','').replace('\r','')
# col = ['authors','English Title','Japanese Title','Link','Year']
# df = pd.DataFrame(data_np[:,1:],index=data_np[:,0],columns=col)
# df

In [255]:
# df.to_csv('./kokyuroku.csv')

In [256]:
# Display the raw scraped CSV, using its first column as the index.
pd.read_csv('./kokyuroku.csv',index_col=0)

Unnamed: 0,authors,English Title,Japanese Title,Link,Year
1,"中野茂男 (NAKANO,SHIGEO )",Some Analytic Structures Associated to Algebra...,代数曲線に附随する二三の解析的構造,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
2,岩野 正宏、木村 俊房、大久保謙二郎,境界層と変わり点に関するシンポジウム報告(I),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
3,角谷 典彦、今井 巧、森口 治生,境界層と変わり点に関するシンポジウム報告(II),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
4,"阿曾義之 (ASO,YOSHIYUKI )",Numerical Solution of Singular Integral Equati...,特異積分方程式の数値解法,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
5,Proceeding of the Conference on Operator Ring ...,Proceeding of the Conference on Operator Ring ...,作用素環とその物理的応用に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
6,Proceeding of the Conference on the Mathematic...,Proceeding of the Conference on the Mathematic...,散乱の理論の数学に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
7,Reports on the Symposium of the Numerical Comp...,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
8,Reports on the Symposium of the Functional Equ...,Reports on the Symposium of the Functional Equ...,最適制御問題の函数方程式研究シンポジウム報告１,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
9,"荒木不二洋 (ARAKI,HUZIHIRO )",Proceeding of the Conference on Operator Ring,作用素環研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
10,Reports on the Symposium of the Numerical Comp...,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966


In [284]:
# Read the raw CSV and merge records that were split across several lines.
# NOTE(review): no encoding is passed to open(), so the locale default is used.
with open('./kokyuroku.csv') as f:
    lines = f.readlines()
    # Start from the header line.
    new_lines = [lines[0]]
    for i in range(1,len(lines)):
        line = lines[i]
        # A line whose first comma-separated field is purely digits starts a new record.
        if re.fullmatch(r'[0-9]+',line.split(',')[0]):
            new_lines.append(line)
        else:
            # Continuation line: drop the previous line's last character
            # (its line break) and append this line to it.
            new_lines[-1] = ''.join([new_lines[-1][:-1],line])

# Normalise every line (header included) and flag records without an author.
for i in range(len(new_lines)):
    # Full-width commas become '・' before splitting on ','.
    # zen_to_han presumably converts full-width ASCII to half-width while
    # leaving kana untouched (kana=False, ascii=True) — confirm against the mojimoji docs.
    record = mojimoji.zen_to_han(new_lines[i].replace('，','・'), kana=False, ascii=True).split(',')
    # If field 1 equals fields 2 and 3 joined (ignoring spaces), the author
    # slot only repeats the titles, so it is replaced with a placeholder.
    if record[1].replace(' ','')==(''.join(record[2:4]).strip().replace(' ','')):
        record[1] = 'No author'
    new_lines[i] = ",".join(record)

In [285]:
# Write the lines held in new_lines (built by an earlier cell) to a separate file.
# NOTE(review): no encoding is passed to open(), so the locale default is used.
with open('./kokyuroku_organized.csv', 'w') as f:
    f.write(''.join(new_lines))
    # NOTE(review): the with-block closes f on exit, so this flush looks redundant.
    f.flush()

In [286]:
# Display the cleaned CSV, using its first column as the index.
pd.read_csv('./kokyuroku_organized.csv',index_col=0)

Unnamed: 0,authors,English Title,Japanese Title,Link,Year
1,"中野茂男 (NAKANO,SHIGEO )",Some Analytic Structures Associated to Algebra...,代数曲線に附随する二三の解析的構造,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
2,岩野 正宏、木村 俊房、大久保謙二郎,境界層と変わり点に関するシンポジウム報告(I),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
3,角谷 典彦、今井 巧、森口 治生,境界層と変わり点に関するシンポジウム報告(II),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
4,"阿曾義之 (ASO,YOSHIYUKI )",Numerical Solution of Singular Integral Equati...,特異積分方程式の数値解法,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
5,No author,Proceeding of the Conference on Operator Ring ...,作用素環とその物理的応用に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
6,No author,Proceeding of the Conference on the Mathematic...,散乱の理論の数学に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
7,No author,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
8,No author,Reports on the Symposium of the Functional Equ...,最適制御問題の函数方程式研究シンポジウム報告1,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
9,"荒木不二洋 (ARAKI,HUZIHIRO )",Proceeding of the Conference on Operator Ring,作用素環研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
10,No author,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966


In [283]:
# Display the cleaned CSV, using its first column as the index.
# NOTE(review): this cell's execution count (283) is lower than that of the
# cells above it (284-286), and its output still shows titles in the authors
# column — it appears to have been run before the file was rewritten. Re-run
# after a kernel restart to confirm.
pd.read_csv('./kokyuroku_organized.csv',index_col=0)

Unnamed: 0,authors,English Title,Japanese Title,Link,Year
1,"中野茂男 (NAKANO,SHIGEO )",Some Analytic Structures Associated to Algebra...,代数曲線に附随する二三の解析的構造,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
2,岩野 正宏、木村 俊房、大久保謙二郎,境界層と変わり点に関するシンポジウム報告(I),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
3,角谷 典彦、今井 巧、森口 治生,境界層と変わり点に関するシンポジウム報告(II),,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1964
4,"阿曾義之 (ASO,YOSHIYUKI )",Numerical Solution of Singular Integral Equati...,特異積分方程式の数値解法,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
5,Proceeding of the Conference on Operator Ring ...,Proceeding of the Conference on Operator Ring ...,作用素環とその物理的応用に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
6,Proceeding of the Conference on the Mathematic...,Proceeding of the Conference on the Mathematic...,散乱の理論の数学に関する研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
7,Reports on the Symposium of the Numerical Comp...,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1965
8,Reports on the Symposium of the Functional Equ...,Reports on the Symposium of the Functional Equ...,最適制御問題の函数方程式研究シンポジウム報告1,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
9,"荒木不二洋 (ARAKI,HUZIHIRO )",Proceeding of the Conference on Operator Ring,作用素環研究会報告集,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966
10,Reports on the Symposium of the Numerical Comp...,Reports on the Symposium of the Numerical Comp...,流体力学における数値計算シンポジウム報告,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,1966


In [263]:
# Debugging variant of the line-merging step: prints every continuation line
# it meets together with the record it is about to be merged into.
with open('./kokyuroku.csv') as f:
    lines = f.readlines()
    # Start from the header line.
    new_lines = [lines[0]]
    for i in range(1,len(lines)):
        line = lines[i]
        # A line whose first comma-separated field is purely digits starts a new record.
        if re.fullmatch(r'[0-9]+',line.split(',')[0]):
            new_lines.append(line)
        else:
            # Show the continuation's first field, then the previous record
            # next to the continuation's first two fields joined together.
            print('\n',line.split(',')[0])
            print(new_lines[-1],"".join(line.split(',')[:2]))
            # If the previous record's last field equals the continuation's
            # first two fields joined, keep only the previous record's first
            # field, insert ",''" and append the continuation.
            if new_lines[-1].split(',')[-1] == "".join(line.split(',')[:2]):
                new_lines[-1] = ''.join([new_lines[-1].split(',')[0],",''",line])
            else:
                # Otherwise drop the previous line's last character (its line
                # break) and append the continuation.
                new_lines[-1] = ''.join([new_lines[-1][:-1],line])


 ― around dynamical systems
1493,吉野正史(MasafumiYoshino),"New Trends and Applications of Complex Asymptotic Analysis
 ― around dynamical systems summability

 The Taisei Sankei
1858,森本光生(Mitsuo Morimoto),"
 The Taisei Sankei Text Collated by Komatsu Hikosaburo Part 1"

 "
1984,仙葉隆(Takasi Senba),"Reconsideration of the method of estimates on partial differential equations from a point of view of the theory on abstract evolution equations
 "抽象発展方程式理論から見た偏微分方程式に関する評価方法の再考

 "
1994,辻川亨(Tohru Tsujikawa),"Theory of Biomathematics and Its Applications XII -Mathematical and experimental approach to clarify patterns in a transition process-
 "第１２回生物数学の理論とその応用-遷移過程に現れるパターンの解明に向けて-

 "
2001,芦野隆一(Ryuichi Ashino),"Wavelet analysis and signal processing
 "ウェーブレット解析と信号処理

 "
2015,古庄英和(Hidekazu Furusho),"Various Aspects of Multiple Zeta Value
 "多重ゼータ値の諸相

 "
2027,林俊介(Shunsuke Hayashi),"The state-of-the-art optimization technique and future development
 "最適化技法の最先端と今後の展開

 "
2082,岡部真也(Shinya Okabe),Analy

In [118]:
# Load the cleaned CSV and show the rows whose index lies strictly between 1550 and 1610.
x = pd.read_csv('./kokyuroku_organized.csv',index_col=0)
# Multiplying the two boolean arrays acts as an element-wise AND.
x[(1550<x.index.values)*(x.index.values<1610)]

Unnamed: 0,authors,English Title,Japanese Title,Link,Year
1551,稲葉寿(Hisashi Inaba),,,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1552,亀山敦(Atsushi Kameyama),Recent Developments in Dynamical Systems,力学系理論の最近の発展,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1553,鈴木紀明(Noriaki Suzuki),Potential Theory and its Related Fields,ポテンシャル論とその関連分野,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1554,中野浩嗣(Koji Nakano),Theory of Computer Science and Its Applications,計算機科学の理論とその応用,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1555,池田宏一郎(Koichiro Ikeda),Model theoretic aspects of the notion of indep...,モデル理論における独立概念と次元,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1556,瀬野裕美(Hiromi Seno),Kyoto Winter School of Mathematical Biology 2006,新しい生物数学の研究交流プロジェクト,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1557,丸山徹(Toru Maruyama),Mathematical Economics,経済の数理解析,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1558,中村誠(Makoto Nakamura),Boundary value problems for partial differenti...,偏微分方程式に対する境界値問題,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1559,中井達(Toru Nakai),Developments of probability models on optimiza...,最適化問題における確率モデルの展開と応用,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
1560,赤平昌文(Masafumi Akahira),Statistical Decision for Multiple Comparison a...,多重比較の統計的決定とそれに関連する話題,http://www.kurims.kyoto-u.ac.jp/~kyodo/kokyuro...,2007
