Scrapes data from the manga section of anime-planet.com. Inspired by https://github.com/victor-soeiro/WebScraping-Projects/blob/main/anime-planet/main.ipynb
Data from 05/17/21

In [6]:
# import libraries

import numpy as np
import pandas as pd
import requests
import time

from tqdm.notebook import tnrange
from bs4 import BeautifulSoup



In [35]:
# Shared configuration and state for the scraping run.

# Listing endpoint; the page number is sent as the ``page`` query parameter.
URL = 'https://www.anime-planet.com/manga/all'

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
}

# Number of listing pages to walk (pages 1 through end_page).
end_page = 2051

# Column names of the output table, in the order item_scaper returns its values.
field_names = [
    'title',
    'description',
    'latest chapter',
    'publisher',
    'year',
    'rating',
    'tags',
]

# Scraped rows keyed by page number; filled in by the page loop.
pages_data = {}

In [11]:
# Safely reads the text of a scraped element; a missing element yields NaN.

def check_text(value):
    """Return ``value.text`` when ``value`` is truthy, otherwise ``np.nan``."""
    return value.text if value else np.nan



In [25]:
# Extracts the fields of one manga card from the tooltip HTML stored in the
# ``title`` attribute of the card's link. Missing elements become NaN (or an
# empty list for tags) instead of raising.

def _text_or_nan(node):
    """Return the stripped text of a parsed element, or NaN if it is missing."""
    return np.nan if node is None else node.text.strip()


def item_scaper(item):
    """Parse one listing card into a row matching ``field_names``.

    Parameters
    ----------
    item
        A parsed ``<li>`` card whose first ``<a>`` carries the tooltip HTML
        in its ``title`` attribute.

    Returns
    -------
    list
        ``[title, description, latest_chapter, publisher, year, rating, tags]``.
        Any field whose element is absent is ``np.nan``; ``tags`` is ``[]``
        when there is no ``<h4>`` heading or no element following it.

    Notes
    -----
    The ``entryBar`` items are mapped to fields purely by position and count.
    When a card lacks one of them, the remaining values may land in the wrong
    column; the original author notes that this is corrected in a separate
    data-cleaning step.

    A card without an ``<a>`` or without a ``title`` attribute still raises
    (``TypeError`` / ``KeyError``), as before.
    """
    tooltip_html = item.a['title']
    info_soup = BeautifulSoup(tooltip_html, 'html.parser')

    title = _text_or_nan(info_soup.find('h5'))
    description = _text_or_nan(info_soup.find('p'))

    # The tag list is the element right after the <h4> heading.
    # find_next_sibling() skips whitespace text nodes, which have no find_all().
    heading = info_soup.h4
    tag_list = heading.find_next_sibling() if heading is not None else None
    tags = [t.text for t in tag_list.find_all('li')] if tag_list is not None else []

    body = info_soup.find('ul', attrs={"class": 'entryBar'})
    entries = [li.text for li in body.find_all('li')] if body is not None else []

    # Default every positional field to NaN, then fill in what the bar provides.
    latest_chapter = publisher = year = rating = np.nan
    count = len(entries)
    if count == 4:
        latest_chapter, publisher, year_range, rating = entries
        # Only the text before the first ' - ' (the start of a range) is kept.
        year = year_range.split(' - ')[0]
    elif count == 3:
        latest_chapter, year_range, rating = entries
        year = year_range.split(' - ')[0]
    elif count == 2:
        latest_chapter, year_range = entries
        year = year_range.split(' - ')[0]
    elif count == 1:
        latest_chapter = entries[0]

    return [title, description, latest_chapter, publisher, year, rating, tags]

In [37]:
# Fetches one listing page and parses every card found on it.

def scraper(page=1):
    """Scrape a single listing page.

    Parameters
    ----------
    page : int
        Page number, sent as the ``page`` query parameter of ``URL``.

    Returns
    -------
    list
        One ``item_scaper`` row per ``<li>`` inside the ``cardDeck`` list.
        An empty list is returned when the request fails, the response
        status is not 200, or the page has no ``cardDeck`` list, so the
        caller can treat the page as "not scraped yet" and retry it later.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        req = requests.get(URL, headers=headers, params={'page': page}, timeout=30)
    except requests.RequestException:
        return []

    if req.status_code != 200:
        return []

    soup = BeautifulSoup(req.text, 'html.parser')

    container = soup.find('ul', attrs={'class': 'cardDeck'})
    if container is None:
        # Layout changed or the page is empty: nothing to parse.
        return []

    return [item_scaper(card) for card in container.find_all('li')]


In [38]:
# Walks every listing page, scraping the ones not yet stored in pages_data.

for page in tnrange(1, end_page + 1, desc='Pages'):
    # A non-empty entry means this page was already scraped; skip it so the
    # cell can be re-run without repeating work.
    if pages_data.get(page):
        continue

    pages_data[page] = scraper(page)

    # Pause between requests.
    time.sleep(1)

Pages:   0%|          | 0/2051 [00:00<?, ?it/s]

In [39]:
# Flattens the per-page results into one list of rows and saves them as CSV.

full_data = [row for page_rows in pages_data.values() for row in page_rows]

frame = pd.DataFrame(full_data, columns=field_names)
frame.to_csv('manga_planet_data.csv')