In [2]:
import pandas as pd
import numpy as np
import json 
import os
import os.path
import requests
import matplotlib.pyplot as plt
import re
from datetime import datetime
from bs4 import BeautifulSoup
import doctest

In [3]:
def circle(href):
    """BeautifulSoup ``href`` filter: truthy when the link contains "circle".

    Falsy input (``None`` or an empty string) is handed back unchanged;
    otherwise the regex match object (or ``None``) is returned.
    """
    if not href:
        return href
    return re.search("circle", href)
def item(href):
    """BeautifulSoup ``href`` filter: truthy when the link contains "item".

    Falsy input (``None`` or an empty string) is handed back unchanged;
    otherwise the regex match object (or ``None``) is returned.
    """
    if not href:
        return href
    return re.search("item", href)

def price(string):
    """BeautifulSoup ``string`` filter: truthy when the text contains "円".

    Falsy input (``None`` or an empty string) is handed back unchanged;
    otherwise the regex match object (or ``None``) is returned.
    """
    if not string:
        return string
    return re.search("円", string)

def get_price(tag_string):
    """Convert a price string such as ``'1,234円'`` into an integer.

    The "円" marker is removed, surrounding whitespace is stripped and
    thousands separators are dropped before the number is parsed.
    Raises ValueError when what remains is not numeric.
    """
    digits = tag_string.replace('円', '').strip().replace(',', '')
    # Parse through float so decimal text is accepted, then truncate.
    return int(float(digits))

In [4]:
def check_price_format(string):
    """Return the parsed price of ``string``, or ``False`` if it does not parse.

    The result is meant to be used for its truthiness: note that a string
    that parses to 0 yields 0, which is falsy just like ``False``.
    """
    try:
        parsed = get_price(string)
    except ValueError:
        # Text contained "円" but the remainder was not a number.
        return False
    return parsed

In [5]:
def send_request(url, timeout=30):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a stalled connection would block
        the whole extraction run indefinitely.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the ``html.parser`` backend.

    The HTTP status code is printed but not checked: a non-200 response is
    still parsed and returned.
    """
    resp = requests.get(url, timeout=timeout)
    print(resp.status_code)
    return BeautifulSoup(resp.content, 'html.parser')

def find_main_product_div(soup):
    """Return the first ``<div class="product-list">`` element in ``soup``.

    Raises IndexError when the page contains no such element.
    """
    product_lists = soup.find_all('div', class_='product-list')
    # Only the first match is used.
    return product_lists[0]

def get_product_div(main_div):
    """Return the element children of ``main_div`` as a list.

    Every text node among ``main_div.children`` is skipped, not just the
    bare ``'\\n'`` separators: other whitespace runs or HTML comments would
    otherwise be handed on as if they were product elements. BeautifulSoup
    text nodes (NavigableString, Comment, ...) are ``str`` subclasses while
    tags are not, so an ``isinstance`` check separates the two.
    """
    return [child for child in main_div.children if not isinstance(child, str)]

def extract_info(item_tag):
    """Extract the relevant fields of one product element.

    Parameters
    ----------
    item_tag : bs4 element
        The element of a single product in the ranking list.

    Returns
    -------
    dict
        Keys ``item_name``, ``item_url``, ``circle_name``, ``circle_url``
        and ``item_price``. Name/URL values are ``None`` when the matching
        link is absent; ``item_price`` is 0 when no price text parses.

    URLs are built by prefixing the module-level ``base_url``, which must be
    defined before this function is called.
    """
    item_info = {}

    # Item name and URL. The second matching anchor is preferred because its
    # text is easier to retrieve; fall back to the first when it is the only
    # one instead of failing with IndexError.
    item_links = item_tag.find_all(href=item)
    if item_links:
        item_link = item_links[1] if len(item_links) > 1 else item_links[0]
        item_info['item_name'] = item_link.text.strip()
        item_info['item_url'] = base_url + item_link['href']
    else:
        item_info['item_name'] = None
        item_info['item_url'] = None

    # Circle name and URL (not every product has a circle link).
    circle_links = item_tag.find_all(href=circle)
    if circle_links:
        circle_link = circle_links[0]
        item_info['circle_name'] = circle_link.text.strip()
        item_info['circle_url'] = base_url + circle_link['href']
    else:
        item_info['circle_name'] = None
        item_info['circle_url'] = None

    # Price. Temporary approach: scan every string containing "円" and keep
    # the last one that parses to a non-zero amount.
    # TODO: extract the exact price tag instead.
    amount = 0
    for text in item_tag.find_all(string=price):
        parsed = check_price_format(text)  # parsed once; int or False
        if parsed:
            amount = parsed
    item_info['item_price'] = amount

    return item_info

def extract_all_info(item_div_list):
    """Map each rank to the info dict of the product at that position.

    Ranks start at 1 and follow the order of ``item_div_list``; each value
    is whatever ``extract_info`` returns for that element.
    """
    return {
        rank: extract_info(item_tag)
        for rank, item_tag in enumerate(item_div_list, start=1)
    }

In [6]:
# Filename formatting helpers.
# Example of the date text:
txt = '[集計期間：2021年07月03日 00時~ 2021年07月09日 24時]'

def extract_date_info(soup):
    """Return the dates found in the page's lead text.

    The text of the first ``<p class="heading01_leadtext">`` element is
    scanned for ``<year>年<month>月<day>日`` patterns. The result is a list
    of ``(year, month, day)`` string tuples, one per date, in order of
    appearance. Raises IndexError when the page has no such element.
    """
    lead_text = soup.find_all('p', class_='heading01_leadtext')[0].text
    return re.findall(r'(\d+)年(\d+)月(\d+)日', lead_text)

# Note: the date-based base name is also used as the top-level key of the saved dict, so it is built by its own function (format_basename), separately from the filename.

def format_basename(date_groups, format_):
    """Build the base name for a daily, weekly or monthly ranking.

    Parameters
    ----------
    date_groups : list of tuple of str
        ``(year, month, day)`` tuples as returned by ``extract_date_info``.
    format_ : str
        ``'d'`` -> ``YYYYMMDD`` from the first date;
        ``'w'`` -> every date as ``YYYYMMDD``, joined with ``_``;
        ``'m'`` -> ``YYYY_MM`` from the first date.

    Returns
    -------
    str
        The formatted base name (no file extension).

    Raises
    ------
    ValueError
        If ``date_groups`` is empty or ``format_`` is not one of the
        supported codes. Previously these cases surfaced as an IndexError or
        UnboundLocalError, or silently produced an empty name for ``'w'``.
    """
    if not date_groups:
        raise ValueError("date_groups is empty; cannot build a base name.")

    if format_ == 'd':
        return ''.join(date_groups[0])
    if format_ == 'w':
        return '_'.join(''.join(group) for group in date_groups)
    if format_ == 'm':
        return '_'.join(date_groups[0][:2])

    raise ValueError(
        "Unsupported format_ {!r}; expected 'd', 'w' or 'm'.".format(format_)
    )

def format_filename(base_name, ext='json'):
    """Return ``base_name`` and ``ext`` joined by a dot, e.g. ``20210614.json``."""
    return '.'.join((base_name, ext))

In [7]:
# Daily ranking pages, one per product category.
tora_doujinshi_daily = 'https://ec.toranoana.jp/tora_r/ec/cot/ranking/daily/all'
tora_book_daily = 'https://ec.toranoana.jp/tora_r/ec/bok/ranking/daily/all'

# (category label, ranking URL) pairs iterated by extract_daily_rankings.
types = [
    ('doujinshi', tora_doujinshi_daily),
    ('book', tora_book_daily),
]

def extract_daily_rankings(overwrite=False):
    """Fetch the daily rankings listed in ``types`` and save them as JSON.

    Each page in the module-level ``types`` list is downloaded and parsed.
    The output file is named after the date found on the last page fetched,
    and the saved dict has the shape ``{date: {category: {rank: info}}}``.

    Returns the filename written, or ``None`` when the file already exists
    and ``overwrite`` is false (nothing is written in that case).
    """
    per_category = {}
    for type_, url in types:
        page = send_request(url)
        product_tags = get_product_div(find_main_product_div(page))
        per_category[type_] = extract_all_info(product_tags)

    # The date is read from the last page fetched in the loop above.
    basename = format_basename(extract_date_info(page), format_='d')
    payload = {basename: per_category}

    filename = format_filename(basename, ext='json')
    # Leave an existing file alone unless the caller asked to overwrite it.
    if os.path.isfile(filename) and not overwrite:
        print('Daily ranking for ', basename, 'has been retrieved. Exiting...')
        return None

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)
        print('Data written to: ', filename)
    return filename

In [8]:
# Site root that relative item/circle links are appended to.
base_url = 'https://ec.toranoana.jp'

# (category label, ranking URL) pairs: daily, weekly, monthly for each category.
tasks = [
    ('doujinshi', 'https://ec.toranoana.jp/tora_r/ec/cot/ranking/daily/all'),
    ('doujinshi', 'https://ec.toranoana.jp/tora_r/ec/cot/ranking/weekly/all'),
    ('doujinshi', 'https://ec.toranoana.jp/tora_r/ec/cot/ranking/monthly/all'),
    ('book', 'https://ec.toranoana.jp/tora_r/ec/bok/ranking/daily/all'),
    ('book', 'https://ec.toranoana.jp/tora_r/ec/bok/ranking/weekly/all'),
    ('book', 'https://ec.toranoana.jp/tora_r/ec/bok/ranking/monthly/all'),
]

# Every third entry belongs to the same period (positions 0/3, 1/4, 2/5).
daily = tasks[0::3]
weekly = tasks[1::3]
monthly = tasks[2::3]

def extract_data(tasks, duration, overwrite=False):
    """Fetch the rankings listed in ``tasks`` and save them as JSON.

    Parameters
    ----------
    tasks : iterable of (str, str)
        ``(category label, ranking URL)`` pairs to download.
    duration : str
        Format code passed to ``format_basename`` (``'d'``, ``'w'`` or
        ``'m'``) to name the output after the date(s) on the page.
    overwrite : bool, optional
        When false (the default) an existing output file is left untouched.

    Returns
    -------
    str or None
        The filename written, or ``None`` when the file already exists and
        ``overwrite`` is false.

    The saved dict has the shape ``{basename: {category: {rank: info}}}``.
    """
    per_category = {}
    for type_, url in tasks:
        print(url)
        page = send_request(url)
        product_tags = get_product_div(find_main_product_div(page))
        per_category[type_] = extract_all_info(product_tags)

    # The date(s) are read from the last page fetched in the loop above.
    basename = format_basename(extract_date_info(page), format_=duration)
    payload = {basename: per_category}

    filename = format_filename(basename, ext='json')
    # Leave an existing file alone unless the caller asked to overwrite it.
    if os.path.isfile(filename) and not overwrite:
        print('Data for ', basename, 'already exists. Exiting...')
        return None

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)
        print('Data written to: ', filename)
    return filename

def extract_rankings(duration):
    """Extract and save the rankings for one period.

    Parameters
    ----------
    duration : str
        ``'d'`` (daily), ``'w'`` (weekly) or ``'m'`` (monthly).

    Returns
    -------
    str or None
        Whatever ``extract_data`` returns: the filename written, or ``None``
        when the data already exists (existing files are never overwritten).

    Raises
    ------
    ValueError
        If ``duration`` is not one of the supported codes. An explicit raise
        is used instead of ``assert`` so the check also runs under
        ``python -O``, where assertions are stripped and an invalid code
        would otherwise silently return ``None``.
    """
    if duration not in ('d', 'w', 'm'):
        raise ValueError("Enter 'd', 'w' or 'm'.")

    tasks_by_duration = {'d': daily, 'w': weekly, 'm': monthly}
    return extract_data(tasks_by_duration[duration], duration, overwrite=False)

def extract_all():
    """Run the daily, weekly and monthly extractions, in that order."""
    for duration in ('d', 'w', 'm'):
        extract_rankings(duration)

# -------------

In [9]:
# Run every extraction when this code is executed directly; the captured
# output below shows this cell running inside the notebook as well.
if __name__ == '__main__':
    extract_all()

https://ec.toranoana.jp/tora_r/ec/cot/ranking/daily/all
200
https://ec.toranoana.jp/tora_r/ec/bok/ranking/daily/all
200
Data written to:  20210614.json
https://ec.toranoana.jp/tora_r/ec/cot/ranking/weekly/all
200
https://ec.toranoana.jp/tora_r/ec/bok/ranking/weekly/all
200
Data for  20210606_20210612 already exists. Exiting...
https://ec.toranoana.jp/tora_r/ec/cot/ranking/monthly/all
200
https://ec.toranoana.jp/tora_r/ec/bok/ranking/monthly/all
200
Data for  2021_05 already exists. Exiting...
