# Scraping YouTube

## Initial Setup

In [1]:
from bs4 import BeautifulSoup
import requests

## Connect to webpage

In [26]:
base_url = "https://www.youtube.com/"
r = requests.get(base_url)
r.status_code

200

In [4]:
# get HTML
html = r.content

In [5]:
# convert HTML to BeautifulSoup object
soup = BeautifulSoup(html)

In [6]:
with open('youtube.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

## 1) Scrape the text from each span tag
## 2) How many images are on YouTube'e homepage?
## 3) Can you find the URL of the link with title = "Movies"?  Music? Sports?
## 4) Now, try connecting to and scraping https://www.youtube.com/results?search_query=stairway+to+heaven
## a) Can you get the names of the first few videos in the search results?
## b) Next, connect to one of the search result videos - https://www.youtube.com/watch?v=qHFxncb1gRY
## c) Can you find the "related" videos?  What are their titles?  Durations?  URLs? Number of views?
## d) Try finding (and scraping) the Twitter description of the video.

-----------


## 1) Scrape span tag

In [7]:
span_div = soup.find_all('span')
span_div

[<span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-appbar-guide yt-sprite"></span></span>,
 <span class="yt-uix-button-icon yt-uix-button-icon-appbar-guide yt-sprite"></span>,
 <span id="yt-masthead-logo-fragment"><a class="masthead-logo-renderer yt-uix-sessionlink spf-link" data-sessionlink="itct=CAMQsV4iEwj24ZOoxOnoAhXkxXMBHcvpBno" href="/" id="logo-container" title="YouTube Home"> <span class="logo masthead-logo-renderer-logo yt-sprite" title="YouTube Home"></span>
 </a></span>,
 <span class="logo masthead-logo-renderer-logo yt-sprite" title="YouTube Home"></span>,
 <span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-material-upload yt-sprite"></span></span>,
 <span class="yt-uix-button-icon yt-uix-button-icon-material-upload yt-sprite"></span>,
 <span class="yt-uix-button-content">Sign in</span>,
 <span class="yt-uix-button-content">Search</span>,
 <span class="yt-uix-button-content">Trending</span>,


In [13]:
span_list = [div.text.strip() for div in span_div  if div.text.strip()!='' ]
span_list

['Sign in',
 'Search',
 'Trending',
 'Home',
 'Home',
 'Home',
 'Trending',
 'Trending',
 'Trending',
 'History',
 'History',
 'History',
 'Music',
 'Music',
 'Music',
 'Sports',
 'Sports',
 'Sports',
 'Gaming',
 'Gaming',
 'Gaming',
 'News',
 'News',
 'News',
 'Live',
 'Live',
 'Live',
 'Fashion',
 'Fashion',
 'Fashion',
 'Learning',
 'Learning',
 'Learning',
 '360Â° Video',
 '360Â° Video',
 '360Â° Video',
 'Browse channels',
 'Browse channels',
 'Browse channels',
 'Sign in']

In [15]:
unique_span_list = set(span_list)
unique_span_list

{'360Â° Video',
 'Browse channels',
 'Fashion',
 'Gaming',
 'History',
 'Home',
 'Learning',
 'Live',
 'Music',
 'News',
 'Search',
 'Sign in',
 'Sports',
 'Trending'}

------------

## 2) How many images in youtube home page?

In [23]:
images = soup.find_all('img')
images

[<img alt="" aria-hidden="true" data-thumb="//yt3.ggpht.com/zYQhquP150XaBl5f-C5PPNa9qd4ux-b4zoJnUtlESSXVI4g0CCfAAWGEsYLqyP8mjDbHcHjm0g=s20-c-k-c0xffffffff-no-nd-rj" data-ytimg="1" height="20" onload=";window.__ytRIL &amp;&amp; __ytRIL(this)" src="/yts/img/pixel-vfl3z5WfW.gif" width="20"/>,
 <img alt="" aria-hidden="true" data-thumb="//yt3.ggpht.com/0suiXm6iuhFWYcAy3Yp_PCvskNY2Ri_MOqWVSL27T5iHXJCebB1_GOpYIBeObW6ypmKY4-wAwas=s20-c-k-c0xffffffff-no-nd-rj" data-ytimg="1" height="20" onload=";window.__ytRIL &amp;&amp; __ytRIL(this)" src="/yts/img/pixel-vfl3z5WfW.gif" width="20"/>,
 <img alt="" aria-hidden="true" data-thumb="//yt3.ggpht.com/YtiQPvkW7Z0eIPkwpfgu_JC3qXNrha2nHp-sIJXjYXFzyFPVhk6TlHZ2mQqPkZzWvB_H4wfI=s20-c-k-c0xffffffff-no-nd-rj" data-ytimg="1" height="20" onload=";window.__ytRIL &amp;&amp; __ytRIL(this)" src="/yts/img/pixel-vfl3z5WfW.gif" width="20"/>,
 <img alt="" aria-hidden="true" data-thumb="//yt3.ggpht.com/u1XZByRtsdqkeDaKlGWce_2q0naKuLjkdB9dk7Rcywi6NsfH6_Tg0TGgyyeJPsM--1m5

In [24]:
len(images)

8

In [30]:
img_url_list = [img.get('src') for img in images]
img_url_list

['/yts/img/pixel-vfl3z5WfW.gif',
 '/yts/img/pixel-vfl3z5WfW.gif',
 '/yts/img/pixel-vfl3z5WfW.gif',
 '/yts/img/pixel-vfl3z5WfW.gif',
 '/yts/img/pixel-vfl3z5WfW.gif',
 '/yts/img/pixel-vfl3z5WfW.gif',
 '/yts/img/pixel-vfl3z5WfW.gif',
 '/yts/img/pixel-vfl3z5WfW.gif']

In [31]:
from urllib.parse import urljoin

img_with_full_path = [urljoin(base_url,img_url) for img_url in img_url_list]
img_with_full_path

['https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif',
 'https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif',
 'https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif',
 'https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif',
 'https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif',
 'https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif',
 'https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif',
 'https://www.youtube.com/yts/img/pixel-vfl3z5WfW.gif']

-----------

## 3) URL of the link with title = "Movies"? Music? Sports?

In [64]:
links = []

for item in unique_span_list:
    link = soup.find_all('a',{'title':item})
    links.append(link)
    
len(links)

14

In [85]:
from urllib.parse import urljoin

url_list = []

for link in links:
    if len(link) != 0:
        url = link[0].get('href')
        url_list.append(urljoin(base_url, url))
        
print(url_list)

['https://www.youtube.com/gaming', 'https://www.youtube.com/channel/UCEgdi0XIXXZ-qJOFPf4JSKw', 'https://www.youtube.com/channel/UCYfdidRxbB8Qhf0Nx7ioOYw', 'https://www.youtube.com/channel/UC4R8DWoMoI7CAwX8_LjQHig', 'https://www.youtube.com/feed/history', 'https://www.youtube.com/learning', 'https://www.youtube.com/channel/UCzuqhhs6NWbgTzMuM09WKDQ', 'https://www.youtube.com/channel/UC-9-kyTW8ZkZNDHQJ6FgpwQ', 'https://www.youtube.com/channel/UCrpQ4p1Ql_hG8rKXIKM1MOQ', 'https://www.youtube.com/feed/guide_builder', 'https://www.youtube.com/', 'https://www.youtube.com/feed/trending']


----------


## 4) scraping https://www.youtube.com/results?search_query=stairway+to+heaven

In [125]:
r = requests.get('https://www.youtube.com/results?search_query=stairway+to+heaven')

In [126]:
r.status_code

200

In [127]:
html = r.content

In [128]:
soup = BeautifulSoup(html,'lxml')

In [129]:
with open('stairway_to_heaven.html','wb') as file:
    file.write(soup.prettify('utf-8'))

## 5) get the names of the first few videos in the search results

In [130]:
videos_div = soup.find_all('div', {'class':'yt-lockup-content'})
len(videos_div)

20

In [131]:
for div in videos_div:
    print(div.find('a').text)


Led Zeppelin -  Stairway to Heaven Live
Led Zeppelin - Stairway To Heaven
Led Zeppelin - Stairway To Heaven (Official Audio)
Stairway to Heaven Led Zeppelin Lyrics
Heart - Stairway to Heaven (Live at Kennedy Center Honors) [FULL VERSION]
Stairway to Heaven - OST
레전드 드라마 [천국의 계단] '1~6회 정주행 가즈아!' / 'Stairway to heaven' Review
Stairway to heaven
Led Zeppelin Stairway To Heaven Live Earls Court 1975 HD
Led Zeppelin -  Stairway to Heaven
STAIRWAY TO HEAVEN -  Flashmob - LEGENDADO PORTUGUÊS - INGLÊS
Led Zeppelin Live Aid 1985 3 Stairway to Heaven Stereo
Stairway to Heaven (Led Zeppelin Tribute): Heart's Ann and Nancy Wilson - 2012 Kennedy Center Honors
Led Zeppelin "Stairway to Heaven" performed by The Classic Rock Show
Ost. Temptation & Stairway To Heaven
Mix - Led Zeppelin -  Stairway to Heaven Live
stairway to heaven love song
Stairway to Heaven- Korean Drama, Episode 1 (Ave Maria)
Stairway to Heaven with Amazing Gimnazija Kranj Symphony Orchestra
Led Zeppelin - Stairway To Heaven (HQ)


----------

## 6)  search result videos - https://www.youtube.com/watch?v=qHFxncb1gRY

In [133]:
for div in videos_div:
    url = div.find('a').get('href')
    full_url = urljoin(base_url, url)
    print(full_url)

https://www.youtube.com/watch?v=xbhCPt6PZIU
https://www.youtube.com/watch?v=Nnu1E5Kslig
https://www.youtube.com/watch?v=QkF3oxziUI4
https://www.youtube.com/watch?v=qHFxncb1gRY
https://www.youtube.com/watch?v=LFxOaDeJmXk
https://www.youtube.com/watch?v=3LtnRRxpUxc
https://www.youtube.com/watch?v=aasYVxGlXds
https://www.youtube.com/watch?v=5rOGUOKPQXc&list=PLVn8mr8EMW7ebpxW1R6BeytfYMLCiNOXX
https://www.youtube.com/watch?v=CxfniXCwrJA
https://www.youtube.com/watch?v=t2qqFCNuys0
https://www.youtube.com/watch?v=t9R8uzCXS10
https://www.youtube.com/watch?v=CBk-iRihSUg
https://www.youtube.com/watch?v=2u-PjvRyr0I
https://www.youtube.com/watch?v=o0RlZ0DBJaQ
https://www.youtube.com/watch?v=8ahD7WcHhc4
https://www.youtube.com/watch?v=xbhCPt6PZIU&list=RDxbhCPt6PZIU&start_radio=1
https://www.youtube.com/watch?v=HanQrTkbQ7I
https://www.youtube.com/watch?v=75x4HzVKhsQ
https://www.youtube.com/watch?v=dR5GN2aPsyY
https://www.youtube.com/watch?v=CPSkNFODVRE


-----------

## 8) description of the video

In [138]:
r = requests.get('https://www.youtube.com/watch?v=qHFxncb1gRY')
html = r.content
soup = BeautifulSoup(html,'lxml')

In [139]:
with open('searched_video.html','wb') as file:
    file.write(soup.prettify('utf-8'))

In [144]:
video_desc = soup.find('p', {'id' : 'eow-description'}).text
print(video_desc)

"Stairway to Heaven" is a song by the English rock band Led Zeppelin. It was composed by guitarist Jimmy Page and vocalist Robert Plant for the band's fourth unnamed studio album, (see Led Zeppelin IV (1971)). The song was voted #3 in 2000 by VH1 on their list of the 100 Greatest Rock Songs.[1] It was the most requested song on FM radio stations in the United States in the 1970s, despite never having been released as a single there.[2] In November 2007, through download sales promoting Led Zeppelin's Mothership release, "Stairway to Heaven" hit #37 on the UK Singles Chart.[3]


--------------

### 7) "related" videos? What are their titles? Durations? URLs? Number of views?

In [148]:
related_video_li = soup.find_all('li',{'class':'video-list-item'})
len(related_video_li)

20

In [178]:
for video in related_video_li:
    #title
    related_video_title = video.find('span', {'class' : 'title'}).text.strip()

    #durations
    span_class = video.find('span', {'class' : 'accessible-description'})
    if(span_class is not None):    
        related_video_duration = span_class.text.strip()
        
    #URLs
    related_video_url = urljoin(base_url, video.find('a').get('href'))
    
    #number of views
    views = video.find('span',{'class':'stat view-count'})
    if views is not None:
        related_video_number_of_views = views.text