In [None]:
import csv
import time
import requests
from bs4 import BeautifulSoup

class Timer:
    """
    Context manager that prints a message and then how long the enclosed
    code block took to run.

    On entry the message is printed without a trailing newline; on exit the
    elapsed time is appended to the same line. If the block raised an
    exception the line reads ``failed`` instead of ``done``; the exception is
    not suppressed and keeps propagating.

    Usage:
        with Timer(message):
            # code block to be measured here...

    Attributes:
        msg:   the message given to the constructor.
        start: ``time.perf_counter()`` reading taken on entry (only meaningful
               as a difference, it is not a timestamp).
    """
    def __init__(self, msg: str):
        self.msg = msg

    def __enter__(self):
        # Flush explicitly: the message has no newline, so a buffered stdout
        # would otherwise show it only after the measured block has finished.
        print(self.msg, end='', flush=True)
        # perf_counter() is monotonic, so system clock adjustments during the
        # block cannot distort (or negate) the measured duration.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        elapsed = time.perf_counter() - self.start
        # args is (exc_type, exc_value, traceback); exc_type is None on success.
        status = 'done' if not args or args[0] is None else 'failed'
        print(' %s: elapsed %.2f sec.' % (status, elapsed))

In [None]:
def get_num_pages(url, timeout=30):
    """
    Return the total number of result pages for a search-result URL.

    The page at `url` is downloaded and the text of the last link inside
    ``ol.pagination-parts`` is interpreted as the total page count.

    Args:
        url: URL of a search-result page.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        The page count as an int; 1 when the page has no pagination links.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.RequestException: the request failed or timed out.
        ValueError: the last pagination link's text is not an integer.
    """
    # Fetch the page; fail loudly on an HTTP error instead of parsing an
    # error page as if it were a result list.
    result = requests.get(url, timeout=timeout)
    result.raise_for_status()

    # Build a BeautifulSoup object from the returned HTML
    soup = BeautifulSoup(result.content, 'lxml')

    # Collect the pagination links
    pages = soup.select('ol.pagination-parts li a')
    if not pages:
        # No pagination links at all: everything fits on a single page.
        return 1

    # The last page number is the total number of pages
    return int(pages[-1].text)

In [None]:
def parse_item(cassetteitem, num_stations=3):
    """
    Turn one building element (a ``.cassetteitem`` node) into CSV rows, one
    row per room listed in that building.

    Each row is laid out as
    ``[name, address, station_1..station_N, age, height, <6 room columns>]``
    where N is `num_stations`. The station list is padded with empty strings
    or truncated to exactly N entries, so every row has the same number of
    columns regardless of how many stations the listing shows.

    Args:
        cassetteitem: node exposing ``select_one`` / ``select`` (CSS
            selectors) whose results expose ``.text``.
        num_stations: number of station columns to emit per row.

    Returns:
        A list of rows (lists of strings); empty if the building has no rooms.

    Raises:
        AttributeError: the title or address element is missing.
        ValueError: the age/height cell does not contain exactly two ``div``s.
    """
    # Building name
    name = cassetteitem.select_one('.cassetteitem_content-title').text

    # Address
    address = cassetteitem.select_one('.cassetteitem_detail-col1').text

    # Nearest stations, normalised to a fixed number of columns
    stations = [item.text for item in cassetteitem.select('.cassetteitem_detail-col2 .cassetteitem_detail-text')]
    stations = (stations + [''] * num_stations)[:num_stations]

    # Building age and building height
    age, height = [item.text for item in cassetteitem.select('.cassetteitem_detail-col3 div')]

    # Columns shared by every room of this building
    building = [name, address] + stations + [age, height]

    # One row per room on offer; cells 2..7 of each table row hold the room data
    rooms = cassetteitem.select('table.cassetteitem_other tbody tr')
    return [building + [col.text for col in room.select('td')[2:8]] for room in rooms]

In [None]:
def scrape(url, timeout=30):
    """
    Download one search-result page and parse every building on it.

    The HTTP request is made immediately when this function is called; the
    individual buildings are parsed lazily as the returned generator is
    consumed.

    Args:
        url: URL of a single search-result page.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        A generator yielding, for each building, the list of rows produced by
        ``parse_item``.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.RequestException: the request failed or timed out.
    """
    result = requests.get(url, timeout=timeout)
    # Without this check an error page would parse to zero buildings and the
    # whole page would be dropped from the output without any warning.
    result.raise_for_status()
    page = BeautifulSoup(result.content, 'lxml')

    # Extract every cassetteitem; each one holds the building name, address,
    # location (nearest station / minutes on foot), age and height
    cassetteitems = page.select('#js-bukkenList .cassetteitem')

    # Generator yielding the parse result of each cassetteitem
    return (parse_item(item) for item in cassetteitems)

In [None]:
# Search-result URLs to scrape, one entry per area.
#   'area': short label, used to build the output CSV file name
#   'url' : search-result URL; the scraping loop appends '&pn=<page>' to it
# NOTE(review): the repeated 'sc=' parameters look like ward codes and 'pc=50'
# like the number of results per page — confirm against the site.
base_urls = [
    {
        # Central and eastern Tokyo (13 wards)
        'area': 'central_east',
        'url': 'https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&ta=13&sc=13101&sc=13102&sc=13103&sc=13104&sc=13105&sc=13113&sc=13106&sc=13107&sc=13108&sc=13118&sc=13121&sc=13122&sc=13123&cb=0.0&ct=9999999&mb=0&mt=9999999&et=9999999&cn=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&sngz=&po1=25&pc=50'
    },
    {
        # Western, northern and southern Tokyo (10 wards)
        'area': 'west_north_south',
        'url': 'https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&ta=13&sc=13109&sc=13110&sc=13111&sc=13112&sc=13114&sc=13115&sc=13120&sc=13116&sc=13117&sc=13119&cb=0.0&ct=9999999&mb=0&mt=9999999&et=9999999&cn=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&sngz=&po1=25&pc=50'
    },
]

In [None]:
# For every configured area: find out how many result pages exist, scrape each
# page, and write all rooms to 'suumo_<area>.csv'.
for base_url in base_urls:
    url = base_url['url']

    with Timer('Fetching the number of total pages of %s ...' % url):
        # `url` is the first page of the search results
        num_pages = get_num_pages(url)
    print('Total %d pages to scrape.' % num_pages)

    # One URL per result page, from page 1 through the last page
    urls = ['%s&pn=%d' % (url, i) for i in range(1, num_pages + 1)]

    out_file = 'suumo_%s.csv' % base_url['area']
    # newline='' lets the csv module control line endings itself (otherwise
    # Windows gets blank lines between rows and embedded newlines are
    # mangled); explicit UTF-8 keeps the Japanese text portable across
    # platforms with different default encodings.
    with open(out_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)

        # Write the CSV header
        writer.writerow(['マンション名','住所','立地1','立地2','立地3','築年数','建物高さ','階','賃料','管理費',
                         '敷/礼/保証/敷引,償却','間取り','専有面積'])

        # Scrape each page; a separate loop variable avoids clobbering `url`
        for page_url in urls:
            with Timer('Scraping %s ...' % page_url):
                for rows in scrape(page_url):
                    writer.writerows(rows)

    print('Successfully created %s.' % out_file)