In [13]:
import json
import re
import requests

from typing import Any, Dict, List, Tuple

import pandas as pd

from bs4 import BeautifulSoup as bs

In [2]:
BASE_URL = "https://web.archive.org"
START_URL = "https://socialblade.com/youtube/top/category/entertainment"

CATEGORIES_DICT = {}
CHANNELS_DICT = {}

In [3]:
def comma_separated_to_num(string: str) -> int:
    if string is None:
        return None
    splits = [s.strip() for s in string.split(",")[::-1]]
    splits = [s for s in splits if s]
    num = 0
    base = 0
    for split in splits:
        num += int(split) * (10 ** base)
        base += len(split)
    return num

def short_form_to_num(short_form: str) -> int:
    if short_form is None:
        return None
    if short_form[-1].lower() == "k":
        multiplier = 1000
    elif short_form[-1].lower() == "m":
        multiplier = 1e6
    elif short_form[-1].lower() == "b":
        multiplier = 1e9
    else:
        try:
            return comma_separated_to_num(short_form)
        except Exception as e:
            print(short_form)
            raise e
    num = float(short_form[:-1].strip())
    return int(num * multiplier)

def find_sb_url(url: str) -> Tuple[str, str]:
    if url is None:
        return None, None
    pat = re.compile("\/web\/(\d+)\/(.*)")
    return re.search(pat, url).groups()

In [4]:
class Page:
    def __init__(self, name: str, url: str, archive_datetime: str):
        self.sb_url = url
        self.url = f"{BASE_URL}/web/{archive_datetime}/{url}"
        
    def process(self) -> List[Any]:
        raise NotImplementedError("Base Page can not process.")
        
    def _get_soup(self):
        try:
            response = requests.get(self.url)
        except requests.TooManyRedirects:
            print(f"Too many redirects for the URL {self.url}.")
            return None
        if response.status_code != 200:
            if "The Wayback Machine has not archived that URL." in response.text:
                print(f"The Wayback Machine has not archived the URL - {self.url}.")
                return None
            print(f"Error fetching page {self.url}")
            return None
            
        return bs(response.text, 'html.parser')

In [5]:
class Category(Page):
    def __init__(self, name: str, url: str, archive_datetime: str):
        super(Category, self).__init__(name, url, archive_datetime)
        self.url = f"{self.url}/mostsubscribed"
        print(f"processing URL {self.url}")
        self.channels = []
        self.channels_visited = {}
        
    def process(self) -> List[Dict[str, str]]:
        soup = self._get_soup()
        if not soup:
            return []
        else:
            left = soup.find("div", {"style": "float: left; width: 300px;"})
            right = soup.find("div", {"style": "float: right; width: 900px;"})
            self._add_channels(right)
            return self._other_categories(left)
    
    def _add_channels(self, root):
        global CHANNELS_DICT
        
        for channel_div in self._get_channel_divs(root):
            children = channel_div.findChildren("div", recursive=False)
            archive_date, sb_url = find_sb_url(children[2].find('a')['href'])
            if sb_url in CHANNELS_DICT:
                if not self.channels_visited.get(sb_url, False):
                    channel = CHANNELS_DICT[sb_url]
                else:
                    continue
            else:
                channel = Channel(children[2].find("a").text.strip(), sb_url, archive_date)
                channel.process()
                CHANNELS_DICT[sb_url] = channel
            self.channels_visited[sb_url] = True
            self.channels.append(channel)

    def _other_categories(self, root) -> List[Dict[str, str]]:
        others = []
        for div in self._get_category_divs(root):
            others.append(self._get_category_details(div))
        return others
    
    @staticmethod
    def _get_channel_divs(root):
        i = 0
        children = root.findChildren("div", recursive=False)
        for div in children:
            i += 1
            if div.attrs.get("id") == "sort-by":
                i += 1
                break
        return children[i:]
    
    @staticmethod
    def _get_category_divs(root):
        i = 0
        children = root.findChildren("div", recursive=False)
        for div in children:
            i += 1
            if div.text.strip() == 'Top 100 by Channeltype':
                break
        return children[i].find_all("a")
    
    @staticmethod
    def _get_category_details(cat_div):
        archive_date, sb_url = find_sb_url(cat_div["href"])
        return {
            "name": cat_div.text.strip(),
            "archive_datetime": archive_date,
            "url": sb_url
        }

In [6]:
class Channel(Page):
    def __init__(self, name: str, url: str, archive_datetime: str):
        super(Channel, self).__init__(name, url, archive_datetime)
        print(f"processing URL {self.url}")
        self.uploads = None
        self.subs = None
        self.views = None
        self.country = None
        self.country_archive_date = None
        self.country_url = None
        self.category = None
        self.category_archive_date = None
        self.category_url = None
        self.created_at = None
        self.grade = None
        self.sb_rank = None
        self.subs_rank = None
        self.vid_view_rank = None
        self.country_rank = None
        self.category_rank = None
        
    def process(self):
        soup = self._get_soup()
        if soup:
            top_info = soup.find("div", {"id": "YouTubeUserTopInfoWrap"})
            if top_info:
                self._set_top_info(top_info)
            user_content = soup.find("div", {"id": "socialblade-user-content"})
            if user_content:
                contents = user_content.findChildren("div", recursive=False)
                self._set_grade_rank(contents[0])
    
    def _set_top_info(self, root):
        avatar = root.find("img")["src"]
        info = root.find("div", {"id": "YouTubeUserTopInfoBlockTop"})
        info = info.find("div", {"id": "YouTubeUserTopInfoBlock"})
        info = info.find_all("div", {"class": "YouTubeUserTopInfo"})
        uploads = info[0].find_all("span")[1].text.strip()
        subs = info[1].find_all("span")[1].text.strip()
        views = info[2].find_all("span")[1].text.strip()
        country = info[3].find_all("span")[1].find("a")
        archive_date, url = find_sb_url(country["href"])
        category = info[4].find_all("span")[2].find("a")
        cat_archive_date, cat_url = find_sb_url(category["href"])

        self.uploads = comma_separated_to_num(uploads)
        self.subs = short_form_to_num(subs)
        self.views = comma_separated_to_num(views)
        self.country = country.text.strip()
        self.country_archive_date = archive_date
        self.country_url = url
        self.category = category.text.strip()
        self.category_archive_date = cat_archive_date
        self.category_url = cat_url
        self.created_at = info[5].find_all("span")[1].text.strip()
        
    def _set_grade_rank(self, root):
        grade, ranks = root.findChildren("div", recursive=False)
        ranks = ranks.findChildren("div", recursive=False)
        self.grade = grade.findChild("div", recursive=False).text.strip()
        self.sb_rank = ranks[0].find("p").text.strip()
        self.subs_rank = ranks[1].find("p").text.strip()
        self.vid_view_rank = ranks[2].find("p").text.strip()
        self.country_rank = ranks[3].find("p").text.strip()
        self.category_rank = ranks[4].find("p").text.strip()
        
    def serialize(self):
        return {
            "name": self.name,
            "url": self.url,
            "sb_url": self.sb_url,
            "uploads": self.uploads,
            "subs": self.subs,
            "views": self.views,
            "country": self.country,
            "country_url": self.country_url,
            "category": self.category,
            "cat_url": self.category_url,
            "created_at": self.created_at,
            "grade": self.grade,
            "sb_rank": self.sb_rank,
            "subs_rank": self.subs_rank,
            "vid_view_rank": self.vid_view_rank,
            "country_rank": self.country_rank,
            "category_rank": self.category_rank
        }

In [7]:
def process_category(name: str, url: str, archive_datetime: str):
    if url in CATEGORIES_DICT:
        return
    category = Category(name, url, archive_datetime)
    other_categories = category.process()
    CATEGORIES_DICT[url] = category
    print(len(CATEGORIES_DICT))
    for category in other_categories:
        process_category(**category)

In [8]:
process_category("Entertainment", START_URL, "20210318122140")

processing URL https://web.archive.org/web/20210318122140/https://socialblade.com/youtube/top/category/entertainment/mostsubscribed
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/set-india
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/kids-diana-show
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/channel/UCJplp5SjeGSdVdwsfb9Q7lQ
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/vladandniki
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/user/zeetv
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/mrbeast6000
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/sonysab
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/badabunoficials
processing URL htt

processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/nbcagt
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/channel/UC4JCksJF76g_MdzPVBJoC3Q
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/venusmovies
The Wayback Machine has not archived the URL - https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/venusmovies.
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/channel/UC5ma-WCc8jNCV0WyRnfbHTg
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/trans7official
processing URL https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/slivkichanel\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
The Wayback Machine has not archived the URL - https://web.archive.org/web/20210301042225/https://socialblade.com/youtube/c/slivkichanel\\\\\\\\\\\\\\\\\\\\\\\\\\\\

processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/thestradman\
The Wayback Machine has not archived the URL - https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/thestradman\.
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/1320videos
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/carthrottle
Error fetching page https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/carthrottle
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/explained
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/user/bayancover
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/user/dailydrivenexotics
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/arabgt
processing URL https

Error fetching page https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/cartvpress
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/mercedes-benz
The Wayback Machine has not archived the URL - https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/mercedes-benz.
processing URL https://web.archive.org/web/20210301042114/https://socialblade.com/youtube/c/acelerados


TypeError: 'NoneType' object is not subscriptable

In [9]:
len(CATEGORIES_DICT), len(CHANNELS_DICT)

(1, 183)

In [12]:
list(CHANNELS_DICT.values())[0].serialize()

{'uploads': 51750,
 'subs': 98900000,
 'views': 81559621102,
 'country': 'IN',
 'country_archive_date': '20210318120511',
 'country_url': 'https://socialblade.com/youtube/top/country/in',
 'category': 'Entertainment',
 'cat_archive_date': '20210318120511',
 'cat_url': 'https://socialblade.com/youtube/top/category/entertainment',
 'created_at': 'Sep 20th, 2006',
 'grade': 'A++',
 'sb_rank': '3rd',
 'subs_rank': '6th',
 'vid_view_rank': '3rd',
 'country_rank': '2nd',
 'category_rank': '1st'}

In [16]:
def get_channel_df():
    data = []
    for channel in CHANNELS_DICT.values():
        data.append(channel.serialize())
    return pd.DataFrame.from_records(data)

df = get_channel_df()
df.head()

Unnamed: 0,uploads,subs,views,country,country_archive_date,country_url,category,cat_archive_date,cat_url,created_at,grade,sb_rank,subs_rank,vid_view_rank,country_rank,category_rank
0,51750.0,98900000.0,81559620000.0,IN,20210318120511,https://socialblade.com/youtube/top/country/in,Entertainment,20210318120511,https://socialblade.com/youtube/top/category/e...,"Sep 20th, 2006",A++,3rd,6th,3rd,2nd,1st
1,831.0,75700000.0,52859330000.0,US,20210316082321,https://socialblade.com/youtube/top/country/us,Film,20210316082321,https://socialblade.com/youtube/top/category/film,"May 12th, 2015",A++,4th,8th,7th,3rd,2nd
2,548.0,70300000.0,53279430000.0,US,20210228122718,https://socialblade.com/youtube/top/country/us,Entertainment,20210228122718,https://socialblade.com/youtube/top/category/e...,"Dec 6th, 2016",A++,6th,12th,6th,6th,3rd
3,330.0,63100000.0,44123480000.0,US,20210225134935,https://socialblade.com/youtube/top/country/us,Entertainment,20210225134935,https://socialblade.com/youtube/top/category/e...,"Apr 23rd, 2018",A++,7th,15th,11th,7th,4th
4,67367.0,57200000.0,27873770000.0,IN,20210304092457,https://socialblade.com/youtube/top/country/in,Entertainment,20210304092457,https://socialblade.com/youtube/top/category/e...,"Dec 11th, 2005",A++,8th,18th,4th,4th,5th


In [17]:
df.to_csv("channels.csv", index=False)