In [10]:
# Dependencies
import os
import json
#import wordcloud
import requests
import pandas as pd
from scipy import stats
from config import api_key
import googleapiclient.errors
import matplotlib.pyplot as plt
import googleapiclient.discovery
from youtube_api import YoutubeDataApi
from googleapiclient.discovery import build

In [12]:
# List of simple to collect features
snippet_features = ["title",
                    "publishedAt",
                    "channelId",
                    "channelTitle",
                    "categoryId"]

# Any characters to exclude, generally these are things that become problematic in CSV files
unsafe_characters = ['\n', '"']

# Used to identify columns, currently hardcoded order
header = ["video_id"] + snippet_features + ["trending_date", "tags", "view_count", "likes", "dislikes",
                                            "comment_count", "thumbnail_link", "comments_disabled",
                                            "ratings_disabled", "description"]


#def setup(api_path, code_path):
    #with open(api_path, 'r') as file:
        #api_key = file.readline()

    with open(code_path) as file:
        country_codes = [x.rstrip() for x in file]

    return api_key, country_codes


def prepare_feature(feature):
    # Removes any character from the unsafe characters list and surrounds the whole item in quotes
    for ch in unsafe_characters:
        feature = str(feature).replace(ch, "")
    return f'"{feature}"'


def api_request(api_key, country_code):
    # Builds the URL and requests the JSON from it
    request_url = f"https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet{page_token}chart=mostPopular&regionCode={US}&maxResults=50&key={api_key}"
    request = requests.get(request_url)
    if request.status_code == 429:
        print("Temp-Banned due to excess requests, please wait and continue later")
        sys.exit()
    return request.json()


def get_tags(tags_list):
    # Takes a list of tags, prepares each tag and joins them into a string by the pipe character
    return prepare_feature("|".join(tags_list))


def get_videos(items):
    lines = []
    for video in items:
        comments_disabled = False
        ratings_disabled = False

        # We can assume something is wrong with the video if it has no statistics, often this means it has been deleted
        # so we can just skip it
        if "statistics" not in video:
            continue

        # A full explanation of all of these features can be found on the GitHub page for this project
        video_id = prepare_feature(video['id'])

        # Snippet and statistics are sub-dicts of video, containing the most useful info
        snippet = video['snippet']
        statistics = video['statistics']

        # This list contains all of the features in snippet that are 1 deep and require no special processing
        features = [prepare_feature(snippet.get(feature, "")) for feature in snippet_features]

        # The following are special case features which require unique processing, or are not within the snippet dict
        description = snippet.get("description", "")
        thumbnail_link = snippet.get("thumbnails", dict()).get("default", dict()).get("url", "")
        trending_date = time.strftime("%y.%d.%m")
        tags = get_tags(snippet.get("tags", ["[none]"]))
        view_count = statistics.get("viewCount", 0)

        # This may be unclear, essentially the way the API works is that if a video has comments or ratings disabled
        # then it has no feature for it, thus if they don't exist in the statistics dict we know they are disabled
        if 'likeCount' in statistics and 'dislikeCount' in statistics:
            likes = statistics['likeCount']
            dislikes = statistics['dislikeCount']
        else:
            ratings_disabled = True
            likes = 0
            dislikes = 0

        if 'commentCount' in statistics:
            comment_count = statistics['commentCount']
        else:
            comments_disabled = True
            comment_count = 0

        # Compiles all of the various bits of info into one consistently formatted line
        line = [video_id] + features + [prepare_feature(x) for x in [trending_date, tags, view_count, likes, dislikes,
                                                                       comment_count, thumbnail_link, comments_disabled,
                                                                       ratings_disabled, description]]
        lines.append(",".join(line))
    return lines


def get_pages(country_code, next_page_token="&"):
    country_data = []

    # Because the API uses page tokens (which are literally just the same function of numbers everywhere) it is much
    # more inconvenient to iterate over pages, but that is what is done here.
    while next_page_token is not None:
        # A page of data i.e. a list of videos and all needed data
        video_data_page = api_request(next_page_token, country_code)

        # Get the next page token and build a string which can be injected into the request with it, unless it's None,
        # then let the whole thing be None so that the loop ends after this cycle
        next_page_token = video_data_page.get("nextPageToken", None)
        next_page_token = f"&pageToken={next_page_token}&" if next_page_token is not None else next_page_token

        # Get all of the items as a list and let get_videos return the needed features
        items = video_data_page.get('items', [])
        country_data += get_videos(items)

    return country_data


def write_to_file(country_code, country_data):

    print(f"Writing {country_code} data to file...")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(f"{output_dir}/{time.strftime('%y.%d.%m')}_{country_code}_videos.csv", "w+", encoding='utf-8') as file:
        for row in country_data:
            file.write(f"{row}\n")


def get_data():
    for country_code in country_codes:
        country_data = [",".join(header)] + get_pages(country_code)
        write_to_file(country_code, country_data)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('--key_path', help='Path to the file containing the api key, by default will use api_key.txt in the same directory', default='api_key.txt')
    parser.add_argument('--country_code_path', help='Path to the file containing the list of country codes to scrape, by default will use country_codes.txt in the same directory', default='country_codes.txt')
    parser.add_argument('--output_dir', help='Path to save the outputted files in', default='output/')

    args = parser.parse_args()

    output_dir = args.output_dir
    api_key, country_codes = setup(args.key_path, args.country_code_path)

    get_data()

IndentationError: unexpected indent (<ipython-input-12-dd0acf9b2c1f>, line 21)

In [7]:
from time import sleep
from time import perf_counter
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
sns.set()

# Dates info to use for file name later
date = datetime.datetime.now() # Gets current date
day = date.day # Gets day
month = date.month # Gets month
year = date.year # Gets year
hour = date.hour
minute = date.minute
if len(str(minute)) == 1: minute = '0'+str(minute)

# Function to find the category text
def findCat():
    showmore = driver.find_element_by_xpath('//*[@id="more"]/yt-formatted-string')
    driver.execute_script("arguments[0].click();", showmore)

    category = driver.find_element_by_xpath('//*[@id="content"]/yt-formatted-string/a')
    return category.get_attribute('innerHTML')
# Function to go to a video and find it's category
def getCategory(link): 
    print('========================================')
    print('Going to '+link)
    driver.get(link) # Go to video
    sleep(1) # Wait to load

    c = None
    try: # Tries to get it
        c = findCat()
    except: # If it fails, wait some seconds and try again
        try:
            sleep(3)
            c = findCat()
        except: 
            pass
    
    print(f'Getting categories ({percent(videos.index(link)+1, len(videos))}%)')
    return c

# Simple function to get percentage
def percent(n, total):
    return round((n/total)*100, 1)

tic = perf_counter()

driver = webdriver.Chrome(executable_path = 'tools/chromedriver') # Gets Chrome driver
driver.get('https://www.youtube.com/feed/trending') # Go to Youtube Trending page

print('Getting links')
videos = [thumb.get_attribute('href') for thumb in driver.find_elements_by_id('thumbnail') if thumb.get_attribute('href') != None]
print(f'All links listed! ({len(videos)})')

categories = [getCategory(link) for link in videos if not None]

driver.close()

# Create Data Frame
print('========================================')
print('Making data frame')
n = [1 for category in categories]
total = sum(n)
data = pd.DataFrame({"Category": categories, "n": n})
data = data.groupby('Category')['n'].sum()
data = data.to_frame()
data.reset_index(level=0, inplace=True)
data = data.sort_values(by='n', ascending=True)
print(data) 

# Create chart
print('========================================')
print('Building chart')
mpl.rcParams['font.size'] = 9.0
fig1, chart = plt.subplots(constrained_layout=True)

cols = [
    '#FFDB15', '#3F5E98', '#918E80', '#2F2440', 
    '#01949A', '#F7BEC0', '#E7625F', '#02894B', 
    '#EAB996', '#C22660','#CFC1CE', '#9F2B00', 
    '#729663', '#9E3A14','#ed0000', '#2E3B51'
] # Set color palette

chart.pie(data['n'], shadow=False, startangle=90, colors=cols)
chart.axis('equal')

# Sets legend
percents = [percent(n, total) for n in data['n']]
plt.legend(labels=['%s, %1.1f %%' % (l, s) for l, s in zip(data['Category'], percents)], frameon=False, loc=2, bbox_to_anchor=(.94,.85))

centre_circle = plt.Circle((0,0),0.75,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.suptitle(f'\nYoutube Trending Categories\n{hour}:{minute} on {day}/{month}/{year}')
toc = perf_counter()

totalsecs = toc - tic
minutes = int(totalsecs//60)
fmin = totalsecs/60
remain = float(str(fmin-int(fmin))[1:])
seconds = int(60*remain)
time = f'{minutes} minutes'
if seconds != 0: time = time+f' and {seconds} seconds'
print('~~~~~~~~~~~~~~~~~~~~')
print(f'-~ Done in {time}. ~-')
print('~~~~~~~~~~~~~~~~~~~~')


# Save figure file
fname = f'yt-trending-{day}-{month}-{year}.png'
print(f'Saving chart as {fname}')

plt.draw()
plt.savefig(fname, dpi=700)
plt.show()

ModuleNotFoundError: No module named 'selenium'

In [6]:
import requests
import json
import html
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError


class Scraper:
    """
    Scraper for the trending page on YouTube.
    """
    URL = "https://www.youtube.{0}"
    @staticmethod
    def scrape(country_code="com"):
        json_array = []
    try:
        response = requests.get(Scraper.URL.format(country_code) + "/feed/trending")
    except ConnectionError as err:
            json_array.append({"error": str(err)})
        return json.dumps(json_array)
        soup = BeautifulSoup(response.text, "html.parser")
        trending_videos = soup.find_all(attrs={"class": "expanded-shelf-content-item"})
        for video_element in trending_videos:
            video_obj = dict()
            thumbnail = video_element.find(attrs={"class": ["yt-thumb", "video-thumb"]}).find(attrs={"class": "yt-thumb-simple"}).find("img")
            if thumbnail.get("data-thumb") is not None:
                    video_obj["video_thumbnail"] = thumbnail["data-thumb"]
                else:
                    video_obj["video_thumbnail"] = thumbnail["src"]
                    title_content = video_element.find(attrs={"class": "yt-lockup-title"})
                    link_meta = title_content.find("a")
                    video_obj["video_url"] = link_meta.get("href")
                    video_obj["video_title"] = link_meta.get("title")
                    video_time = title_content.select("span.accessible-description")
                    if len(video_time) != 0:
                        video_obj["video_time"] = video_time[0].text
                    else:
                        video_obj["video_time"] = "LIVE NOW"
            profile_content = video_element.find(attrs={"class": "yt-lockup-byline"})
            profile_meta = profile_content.find("a")
            video_obj["profile_url"] = profile_meta.get("href")
            video_obj["profile_name"] = profile_meta.string
            meta_info = video_element.find(attrs={"class": "yt-lockup-meta-info"})
            if len(meta_info.contents) > 1:
                video_obj["upload_date"] = meta_info.contents[0].string
                video_obj["view_count"] = meta_info.contents[1].string.split(" ")[0]
            else:
                video_page_response = requests.get(Scraper.URL.format(country_code) + video_obj["video_url"])
                parsed_response = BeautifulSoup(video_page_response.text, "html.parser")
            if parsed_response.find("span", class_="date"):
                video_obj["upload_date"] = parsed_response.find("span", class_="date").string
            elif parsed_response.find("strong", class_="watch-time-text"):
                video_obj["upload_date"] = parsed_response.find("strong", class_="watch-time-text").string
                video_obj["view_count"] = meta_info.contents[0].string
            description_content = video_element.select("div.yt-lockup-description")
            video_description = ""
            if len(description_content) > 0:
                video_description = description_content[0].text
                video_obj["video_desc"] = html.escape(str(video_description))
                json_array.append(video_obj)
return json.dumps(json_array, sort_keys=True)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 20)