In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import datetime
import os
from IPython.display import display


def getlinks(playlist_soup):
    links=["https://youtube.com" + element['href'] for element in playlist_soup.select(f'#pl-video-table tr a.pl-video-title-link')]
    return links
    
def videosoup(url):
    
    response = requests.get(url, headers=headers).content
    video_soup = BeautifulSoup(response, "html.parser")
    
    return video_soup

def gettingtitle(video_soup):
    title = video_soup.select("h1.watch-title-container")
    if len(title)>0 :
        return (title[0].text.strip())
    else :
        return "No title available"    #Will not actually be added to the df.
                                    #Just a step to avoid errors when a video is unavailable.


def nbviews(video_soup):

    views = video_soup.select("div#watch7-views-info")
    nb_of_views=" ".join(views[0].text.strip().split()).strip(" vues")     #stripping the "vues" =views
    nb_of_views_numerical = int("".join([nb for nb in nb_of_views if nb.isdigit()]))   #turning the nb of views into int

    return(nb_of_views_numerical)



def likes(video_soup):
    likes_dislikes = video_soup.select('div span.yt-uix-clickcard button[aria-label]')
    likes = int(''.join(likes_dislikes[1].text.split()))
    return(likes)

def dislikes(video_soup):
    
    likes_dislikes = video_soup.select('div span.yt-uix-clickcard button[aria-label]')
    dislikes = int(''.join(likes_dislikes[3].text.split()))
    return(dislikes)

def duration(video_soup):
    regex = r"length_seconds\":\"(\d*)"
    duration = int(re.search(regex, str(video_soup)).group(1))
    duration_dt = datetime.timedelta(seconds=duration)

    return duration_dt

#Short descrition is also in the metadata.

def shortdesc(video_soup):
    regex2 = r"shortDescription\\\":\\\"(.*?)\\"
    short_desc = re.search(regex2, str(video_soup))
    if short_desc is None:
        p=2
    else:
        short_desc=short_desc.group(1)
        p=0
    return short_desc,p

def pubdate(video_soup):
    regexdate = r"([0-9]{4}-[0-9]{2}-[0-9]{2})\" itemprop=\"datePublished" 
    datepub=re.search(regexdate, str(video_soup)).group(1)
    datepub_object = datetime.datetime.strptime(datepub, '%Y-%m-%d')
    return datepub_object                  

def episodenb(descriptiontxt):
    regexnb = r"episode (\d*)" 
    ep_nb=re.search(regexnb, descriptiontxt).group(1)
    return ep_nb

def subject(descriptiontxt):
    regexsubj = r"episode \d* (.*)"
    subj=re.search(regexsubj, descriptiontxt).group(1)
    return subj

def getimglinks(playlist_soup):
    longlinks=[element["data-thumb"] for element in playlist_soup.select('#pl-video-table tr span.yt-thumb-clip img[data-thumb]')]
    regeximg = r"https.*jpg"  #shortening the links
    imglinks = [re.findall(regeximg, imglink) for imglink in longlinks]
    imglinks = [imglink[0] for imglink in imglinks if len(imglink)>0] #cleaning the list from non-existent thumbnails
    return imglinks

In [5]:

class IronhackSpider:
    def __init__(self,sleep_interval=-1, content_parser=None):
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser
    
    def sleep_interval(self):
        if self.sleep_interval > 0:
            time.sleep(1)
    
    def scrape_url(self, playlist_url,headers):
        response = requests.get(playlist_url, headers=headers)
  
        try :
            result = self.content_parser(response.content)
            self.output_results(result)
        except requests.exceptions.SSLError:
            print("This site is not secure.")
        except requests.exceptions.Timeout:
            print("The request timed out.")
        except requests.exceptions.TooManyRedirects:
            print("Too many redirects.")
        except requests.exceptions.HTTPError:
            print("Http error.")
        except requests.exceptions.URLRequired:
            print("The URL is not valid.")
        except requests.exceptions.ConnectionError:
            print("Connection Error")

    def output_results(self, r):
        
        export_yn = input("Do you wish to export the dataframe to CSV file ? Y/N")
        
        if export_yn in ['Y','y']:
            print(f"The resulting file will be saved to {os.getcwd()}.")
            filename = input("Please type in a valid file name")
            r.to_csv(filename)
            print(f"Congratulations ! You can find {filename} as a CSV file in {os.getcwd()}.")
            display(r)
        else :
            display(r)
            
    def kickstart(self):
        self.scrape_url(playlist_url,headers)

###################### Enter the URL to the playlist here

playlist_url = "https://www.youtube.com/playlist?list=PL0LnWnL-0hb6xblHqC_LA3f-UMAD_qEPm"
headers = {'User-Agent': 'Chrome/71.0.3578.98'}
######################


def yt_playlist(content): 
    
    playlist_soup = BeautifulSoup(content, "html.parser")

    links = getlinks(playlist_soup)

    yt_df = pd.DataFrame(columns=["TITLE", "SHORT_DESC", "PUBLICATION DATE", "DURATION", "NB_OF_VIEWS","LIKES", "DISLIKES","THUMBNAIL","URL" ])
    counter = 0
    img_counter = 0
    for link in links :
        i=0
        url = link
        video_soup = videosoup(url)
        title = gettingtitle(video_soup)
        short_desc,i = shortdesc(video_soup)
        img_counter +=1
        if i>1:
            continue
        thumbnail_url = getimglinks(playlist_soup)[img_counter-1]
        pub_date = pubdate(video_soup)
        duration_obj = duration(video_soup)
        nb_of_views = nbviews(video_soup)
        likes_nb = likes(video_soup)
        dislikes_nb = dislikes(video_soup)

        link_info_list = [title, short_desc, pub_date, duration_obj, nb_of_views, likes_nb, dislikes_nb, thumbnail_url, url]

        # Add a new row at index k with values provided in list
        yt_df.loc[counter] = link_info_list
        counter+=1
        
    return yt_df
    
# Instantiate the IronhackSpider class
my_spider = IronhackSpider(playlist_url, content_parser=yt_playlist)

# Start scraping jobs
my_spider.kickstart()

Do you wish to export the dataframe to CSV file ? Y/Nn


Unnamed: 0,TITLE,SHORT_DESC,PUBLICATION DATE,DURATION,NB_OF_VIEWS,LIKES,DISLIKES,THUMBNAIL,URL
0,Foo Fighters - Best Of You (Official Music Video),Foo Fighters' official music video for 'Best O...,2009-10-03,00:04:16,159400690,674073,20436,https://i.ytimg.com/vi/h_L4Rixya64/hqdefault.jpg,https://youtube.com/watch?v=h_L4Rixya64&list=P...
1,Foo Fighters - The Sky Is A Neighborhood (Offi...,Get,2017-08-23,00:04:08,49404014,390841,12728,https://i.ytimg.com/vi/TRqiFPpw2fY/hqdefault.jpg,https://youtube.com/watch?v=TRqiFPpw2fY&list=P...
2,Foo Fighters - Run (Official Music Video),Get,2017-06-01,00:06:22,38181931,372093,14029,https://i.ytimg.com/vi/ifwc5xgI3QM/hqdefault.jpg,https://youtube.com/watch?v=ifwc5xgI3QM&list=P...
3,Learn to Fly,Provided to YouTube by Sony Music Entertainment,2015-08-11,00:03:55,2443821,43799,5737,https://i.ytimg.com/vi/HJMLLKgknvk/hqdefault.jpg,https://youtube.com/watch?v=HJMLLKgknvk&list=P...
4,Foo Fighters - The Line (Audio),Get,2017-09-07,00:03:40,3835898,37842,1078,https://i.ytimg.com/vi/8TsNkgW2ox0/hqdefault.jpg,https://youtube.com/watch?v=8TsNkgW2ox0&list=P...
5,"Foo Fighters: Hey, Johnny Park!","Foo Fighters: Hey, Johnny Park!...yes i know i...",2007-10-15,00:04:10,968441,3448,114,https://i.ytimg.com/vi/XnZ1Ft20pHw/hqdefault.jpg,https://youtube.com/watch?v=XnZ1Ft20pHw&list=P...
6,Foo Fighters - Stacked Actors,"Viernes,13 Diciembre - Foro Sol - Mexico D.F.",2013-10-30,00:04:17,327583,2067,36,https://i.ytimg.com/vi/qj91k4Omyfo/hqdefault.jpg,https://youtube.com/watch?v=qj91k4Omyfo&list=P...
7,Foo Fighters - Stacked Actors (Live on Letterman),Music video by Foo Fighters performing Stacked...,2011-04-19,00:08:29,1874333,11306,178,https://i.ytimg.com/vi/kqmTxasTnio/hqdefault.jpg,https://youtube.com/watch?v=kqmTxasTnio&list=P...
8,Foo Fighters.. Band On The Run (cover),Foo Fighters playing a Wings cover,2007-11-13,00:05:09,1676612,6570,277,https://i.ytimg.com/vi/DSIUrba-aoc/hqdefault.jpg,https://youtube.com/watch?v=DSIUrba-aoc&list=P...
9,Foo Fighters - February Stars,Foo FIghters track from their CD. The Colour a...,2008-08-20,00:04:52,485796,2574,73,https://i.ytimg.com/vi/9wG1xEHij08/hqdefault.jpg,https://youtube.com/watch?v=9wG1xEHij08&list=P...


In [4]:
display(yt_df)

NameError: name 'yt_df' is not defined