# Web Scraping using Python BeautifulSoup

## FAANGM Stock data scraping from Yahoo Finance

In [4]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

In [5]:
url ="https://finance.yahoo.com/quote/NFLX"

In [6]:
req = requests.get(url)

In [7]:
req.status_code

200

In [132]:
#req.text

In [9]:
soup = BeautifulSoup(req.text,'html.parser')

In [10]:
print(soup.title.text)

Netflix, Inc. (NFLX) Stock Price, News, Quote & History - Yahoo Finance


In [11]:
price = soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text
change = soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[1].text
percent = soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[2].text

In [12]:
print(price,change,percent)

361.20 -4.73 (-1.29%)


In [13]:
def getData(comp):
    url =f'https://finance.yahoo.com/quote/{comp}'
    req = requests.get(url)
    soup = BeautifulSoup(req.text,'html.parser')
    stock = {
        'comp' : comp,
        'price' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
        'change' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[1].text,
        'percent' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[2].text,
            }
    return stock

In [14]:
getData('NFLX')

{'comp': 'NFLX', 'price': '361.20', 'change': '-4.73', 'percent': '(-1.29%)'}

In [15]:
getData('GOOG')

{'comp': 'GOOG', 'price': '140.29', 'change': '-1.41', 'percent': '(-1.00%)'}

In [16]:
getData('PXD')

{'comp': 'PXD', 'price': '240.36', 'change': '-0.46', 'percent': '(-0.19%)'}

In [17]:
mystocks = ['META','NFLX','GOOG','MSFT','AAPL','AMZN']
stockData = []

def getData(comp):
    url =f'https://finance.yahoo.com/quote/{comp}'
    req = requests.get(url)
    soup = BeautifulSoup(req.text,'html.parser')
    stock = {
        'comp' : comp,
        'price' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
        'change' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[1].text,
        'percent' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[2].text,
            }
    return stock

for i in mystocks:
    stockData.append(getData(i))
    print('Stock Price of :',i)
    
print(stockData)    

Stock Price of : META
Stock Price of : NFLX
Stock Price of : GOOG
Stock Price of : MSFT
Stock Price of : AAPL
Stock Price of : AMZN
[{'comp': 'META', 'price': '324.16', 'change': '-3.66', 'percent': '(-1.12%)'}, {'comp': 'NFLX', 'price': '361.20', 'change': '-4.73', 'percent': '(-1.29%)'}, {'comp': 'GOOG', 'price': '140.29', 'change': '-1.41', 'percent': '(-1.00%)'}, {'comp': 'MSFT', 'price': '331.16', 'change': '-1.26', 'percent': '(-0.38%)'}, {'comp': 'AAPL', 'price': '180.71', 'change': '+0.91', 'percent': '(+0.51%)'}, {'comp': 'AMZN', 'price': '132.33', 'change': '+0.50', 'percent': '(+0.38%)'}]


### Storing Retrieved data in JSON and CSV format 

In [18]:
mystocks = ['META','NFLX','GOOG','MSFT','AAPL','AMZN']
stockData = []

def getData(comp):
    url =f'https://finance.yahoo.com/quote/{comp}'
    req = requests.get(url)
    soup = BeautifulSoup(req.text,'html.parser')
    stock = {
        'comp' : comp,
        'price' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
        'change' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[1].text,
        'percent' : soup.find('div',{'class' : 'D(ib) Mend(20px)'}).find_all('fin-streamer')[2].text,
            }
    return stock

for i in mystocks:
    stockData.append(getData(i))
    print('Stock :',i)

with open('stockData.json','w') as f:
    json.dump(stockData,f)
    
df = pd.DataFrame(stockData) 
df.to_csv('stockData.csv')

Stock : META
Stock : NFLX
Stock : GOOG
Stock : MSFT
Stock : AAPL
Stock : AMZN


## Web Scraping IMDB to retrieve Movie details

In [19]:
imdb_url = 'https://www.imdb.com/search/title/?title_type=feature&year=2022-01-01,2022-12-31&sort=num_votes,desc'
r = requests.get(imdb_url)
r.status_code

200

In [130]:
#r.text

In [21]:
imdb_soup = BeautifulSoup(r.text, 'html.parser')

In [131]:
#print(imdb_soup.prettify())

In [23]:
imdb_soup.title.text

'Feature Film,\nReleased between 2022-01-01 and 2022-12-31\n(Sorted by Number of Votes Descending) - IMDb'

In [24]:
movie_containers = imdb_soup.find_all('div', class_ = 'lister-item mode-advanced')

In [25]:
len(movie_containers)

50

In [133]:
#movie_containers[0]

In [27]:
movie_containers[0].h3

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt1877830/">The Batman</a>
<span class="lister-item-year text-muted unbold">(2022)</span>
</h3>

In [28]:
movie_containers[0].h3

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt1877830/">The Batman</a>
<span class="lister-item-year text-muted unbold">(2022)</span>
</h3>

In [29]:
name = movie_containers[0].h3.a.text

In [30]:
print(name)

The Batman


In [31]:
year = movie_containers[0].h3.find('span',{'class' : 'lister-item-year text-muted unbold' }).text

In [32]:
print(year)

(2022)


In [33]:
rating = movie_containers[0].strong.text

In [34]:
print(rating)

7.8


In [35]:
runtime = movie_containers[0].find('span', class_ = "runtime").text

In [36]:
print(runtime)

176 min


In [37]:
dir = movie_containers[0].find('p', class_ = "").a.text

In [38]:
print(dir)

Matt Reeves


In [39]:
metascore = int(movie_containers[0].find('span', class_ = "metascore favorable").text)

In [40]:
metascore

72

In [41]:
#span data-value="736356" name="nv">736,356</span>
votes = int(movie_containers[0].find('span', {'name' : "nv"})['data-value'])

In [42]:
votes

736532

In [43]:
genre_one = movie_containers[0].find('span',{'class' : 'genre'})

In [44]:
genre_one

<span class="genre">
Action, Crime, Drama            </span>

In [45]:
eighth_movie_metascore = movie_containers[6].find('div', class_ = 'ratings-metascore')
eighth_movie_metascore

<div class="inline-block ratings-metascore">
<span class="metascore favorable">81        </span>
        Metascore
            </div>

In [121]:
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000, 2022)] 

In [122]:
names = []
years = []
votes = []
metascores = []
imdb_ratings = []
genres = []
runtimes = []
directors = []
grosss = []


for year_url in years_url:
    for page in pages:
        page_html = BeautifulSoup(r.text, 'html.parser')
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
        for container in mv_containers:
            if container.find('div', class_ = 'ratings-metascore') is not None:
#name   
                name = container.h3.a.text
                names.append(name)
#year
                year = container.find('span', class_ = "lister-item-year").text
                years.append(year)
#rating
                rating = float(container.strong.text)
                imdb_ratings.append(rating)
#votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))
#metascore
                score = container.find('span', class_ = 'metascore').text
                metascores.append(int(score))
#genre
                genre = container.find('span', class_ = "genre").text.strip()
                genres.append(genre)
#runtime
                runtime = container.find('span', class_ = "runtime").text
                runtimes.append(runtime)
#director       
                director = container.find('p', class_ = "").a.text
                directors.append(director)
#gross
                gross = container.find('span', attrs = {'name':'nv'})['data-value'][1]
                grosss.append(int(gross))

In [123]:
test_df = pd.DataFrame({'Movie':names, 'Year': years, 'IMDB': imdb_ratings, 'Metascore': metascores, 'Votes':votes, 'Runtime' : runtimes, 'Director':directors, 'Genre':genres})

In [124]:
test_df

Unnamed: 0,Movie,Year,IMDB,Metascore,Votes,Runtime,Director,Genre
0,The Batman,(2022),7.8,72,736532,176 min,Matt Reeves,"Action, Crime, Drama"
1,Top Gun: Maverick,(2022),8.3,78,634206,130 min,Joseph Kosinski,"Action, Drama"
2,Everything Everywhere All at Once,(2022),7.8,81,482646,139 min,Daniel Kwan,"Action, Adventure, Comedy"
3,Avatar: The Way of Water,(2022),7.6,67,462939,192 min,James Cameron,"Action, Adventure, Fantasy"
4,Doctor Strange in the Multiverse of Madness,(2022),6.9,60,455443,126 min,Sam Raimi,"Action, Adventure, Fantasy"
...,...,...,...,...,...,...,...,...
4131,The Pale Blue Eye,(2022),6.6,56,115206,128 min,Scott Cooper,"Crime, Horror, Mystery"
4132,Brahmastra Part One: Shiva,(2022),5.6,57,110008,167 min,Ayan Mukerji,"Action, Adventure, Fantasy"
4133,Where the Crawdads Sing,(2022),7.2,43,108566,125 min,Olivia Newman,"Drama, Mystery, Romance"
4134,Guillermo del Toro's Pinocchio,(2022),7.6,79,105476,117 min,Guillermo del Toro,"Animation, Drama, Family"


In [125]:
test_df = test_df[['Movie', 'Year', 'IMDB', 'Metascore', 'Votes', 'Runtime', 'Director', 'Genre']]
test_df.tail()

Unnamed: 0,Movie,Year,IMDB,Metascore,Votes,Runtime,Director,Genre
4131,The Pale Blue Eye,(2022),6.6,56,115206,128 min,Scott Cooper,"Crime, Horror, Mystery"
4132,Brahmastra Part One: Shiva,(2022),5.6,57,110008,167 min,Ayan Mukerji,"Action, Adventure, Fantasy"
4133,Where the Crawdads Sing,(2022),7.2,43,108566,125 min,Olivia Newman,"Drama, Mystery, Romance"
4134,Guillermo del Toro's Pinocchio,(2022),7.6,79,105476,117 min,Guillermo del Toro,"Animation, Drama, Family"
4135,The Fabelmans,(2022),7.5,84,103788,151 min,Steven Spielberg,Drama


In [126]:
test_df.loc[:, 'Year'] = test_df['Year'].str[-5:-1].astype(int)

  test_df.loc[:, 'Year'] = test_df['Year'].str[-5:-1].astype(int)


## IMDB Dataset for Movies in 2022

In [127]:
test_df.head()

Unnamed: 0,Movie,Year,IMDB,Metascore,Votes,Runtime,Director,Genre
0,The Batman,2022,7.8,72,736532,176 min,Matt Reeves,"Action, Crime, Drama"
1,Top Gun: Maverick,2022,8.3,78,634206,130 min,Joseph Kosinski,"Action, Drama"
2,Everything Everywhere All at Once,2022,7.8,81,482646,139 min,Daniel Kwan,"Action, Adventure, Comedy"
3,Avatar: The Way of Water,2022,7.6,67,462939,192 min,James Cameron,"Action, Adventure, Fantasy"
4,Doctor Strange in the Multiverse of Madness,2022,6.9,60,455443,126 min,Sam Raimi,"Action, Adventure, Fantasy"


In [129]:
test_df.to_csv('IMDB_movie_rating5.csv')