In [None]:
pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25l[?25hdone
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1256 sha256=5ccdeecb76ae786737dc48736a453bf396c11cb75ef14dc09e776b5f7319c237
  Stored in directory: /root/.cache/pip/wheels/25/42/45/b773edc52acb16cd2db4cf1a0b47117e2f69bb4eb300ed0e70
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [None]:
pip install requests



In [4]:
pip install texttable

Collecting texttable
  Downloading texttable-1.6.7-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable
Successfully installed texttable-1.6.7


## **Scraping the data from wikipedia**

URL: https://en.wikipedia.org/wiki/World_War_II

In [None]:
# ## scrap data from wikipedia

import requests
from bs4 import BeautifulSoup

wiki=requests.get("https://en.wikipedia.org/wiki/World_War_II")
soup=BeautifulSoup(wiki.text,'html')
print(soup.find('title'))


# ### find html tags with classes

ww2_contents=soup.find_all("div",class_='toc')
for i in ww2_contents:
    print(i.text)


overview=soup.find_all('table',class_='infobox vevent')
for z in overview:
    print(z.text)

<title>World War II - Wikipedia</title>
World War IIClockwise from the top: 
German Stuka dive bombers on the Eastern Front, 1943
British Matilda II tank during the North African campaign, 1941
Soviet troops at the Battle of Stalingrad, 1942–1943
U.S. naval force in the Lingayen Gulf, 1945
Soviet soldier raising a flag over the Reichstag after the Battle of Berlin, 1945
U.S. atomic bombing of Nagasaki in Japan, 1945
Date1 September 1939 – 2 September 1945[a] (6 years, 1 day)LocationMajor theaters: 
Europe
Pacific
Atlantic
Indian Ocean
South-East Asia
China
Japan
Middle East
Mediterranean
North Africa
Horn of Africa
Central Africa
Australia
Caribbean
North and South America
Result
Allied victory
Fall of Nazi Germany, Fascist Italy, and Imperial Japan
Allied military occupations of Germany, Japan, Austria, and Korea
Beginning of the Nuclear Age
Dissolution of the League of Nations and creation of the United Nations
Decolonisation of Asia and Africa and decline of European international i

# **P1 - Scraping Quotes from website**

**URL:** [Quotes Website](http://www.values.com/inspirational-quotes)

**scrapes the website and saves quotes to a file inspirational_quotes.csv**

In [2]:
#Python program to scrape website and save quotes to a file inspirational_quotes.csv

import requests
from bs4 import BeautifulSoup
import csv

#URL = "http://www.values.com/inspirational-quotes"
URL = "https://www.passiton.com/inspirational-quotes"

r = requests.get(URL)

soup = BeautifulSoup(r.content, 'html5lib')

quotes=[] # a list to store quotes

table = soup.find('div', attrs = {'id':'all_quotes'})

for row in table.findAll('div',
						attrs = {'class':'col-6 col-lg-4 text-center margin-30px-bottom sm-margin-30px-top'}):
	quote = {}
	quote['theme'] = row.h5.text
	quote['url'] = row.a['href']
	quote['img'] = row.img['src']
	quote['lines'] = row.img['alt'].split(" #")[0]
	quote['author'] = row.img['alt'].split(" #")[1]
	quotes.append(quote)

filename = 'inspirational_quotes.csv'
with open(filename, 'w', newline='') as f:
	w = csv.DictWriter(f,['theme','url','img','lines','author'])
	w.writeheader()
	for quote in quotes:
		w.writerow(quote)


# **P2 - Scraping Covid-19 stats**

URL: [COVID-19 STATS COUNTRY WISE](https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/)

In [5]:
# URl to Scrap: https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/

import requests
from bs4 import BeautifulSoup
import texttable as tt

# URL for scrapping data
url = 'https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/'

# get URL's html
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

data = []

# soup.find_all('td') will scrape every element in the url's table
data_iterator = iter(soup.find_all('td'))
# data_iterator is the iterator of the table

# This loop will keep repeating till there is data available in the iterator
while True:
	try:
		country = next(data_iterator).text
		confirmed = next(data_iterator).text
		deaths = next(data_iterator).text
		continent = next(data_iterator).text

		# For 'confirmed' and 'deaths', make sure to remove the commas and convert to int
		data.append((
			country,
			int(confirmed.replace(',', '')),
			int(deaths.replace(',', '')),
			continent
		))

	# StopIteration error is raised when there are no more elements left to iterate through
	except StopIteration:
		break

# Sort the data by the number of confirmed cases
data.sort(key = lambda row: row[1], reverse = True)


# create texttable object
table = tt.Texttable()
table.add_rows([(None, None, None, None)] + data)  # Add an empty row at the beginning for the headers
table.set_cols_align(('c', 'c', 'c', 'c'))  # 'l' denotes left, 'c' denotes center, and 'r' denotes right
table.header((' Country ', ' Number of cases ', ' Deaths ', ' Continent '))

print(table.draw())


+---------------------------+-------------------+----------+-------------------+
|          Country          |  Number of cases  |  Deaths  |     Continent     |
|       United States       |     1.078e+08     | 1172433  |   North America   |
+---------------------------+-------------------+----------+-------------------+
|           India           |     44996426      |  531924  |       Asia        |
+---------------------------+-------------------+----------+-------------------+
|          France           |     40138560      |  167642  |      Europe       |
+---------------------------+-------------------+----------+-------------------+
|          Germany          |     38428685      |  174352  |      Europe       |
+---------------------------+-------------------+----------+-------------------+
|          Brazil           |     37750389      |  705054  |   South America   |
+---------------------------+-------------------+----------+-------------------+
|        South Korea        

# **P3 - Scraping GPU Card Product Information**

URL: [GPU Card Info](https://www.newegg.com/p/pl?d=graphics+card&nm_mc=KNC-GoogleKWLess-Search-Broad&cm_mmc=KNC-GoogleKWLess-Search-Broad-_-VGA-_-graphics-card-_-PLP-Feature&page=2)

In [None]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq

my_url = 'https://www.newegg.com/p/pl?d=graphics+card&nm_mc=KNC-GoogleKWLess-Search-Broad&cm_mmc=KNC-GoogleKWLess-Search-Broad-_-VGA-_-graphics-card-_-PLP-Feature&page=2'
uclient = ureq(my_url)
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")
#print(page_soup)
#print(page_soup.body.id)
containers = page_soup.findAll("div",{"class":"item-container"})
print(containers)

filename = "products.csv"
f = open(filename, "w")
headers = "brand, productname, shipping\n"
f.write(headers)
print("before for")
for container in containers:
  print("after for")
  #brand = container.div.a["title"]
  brand = container.div.a["title"]
  title_container = container.findAll("a",{"class":"item-title"})
  product_name = title_container[0].text
  shipping = container.findAll("li",{"class":"price-ship"})
  shipping_price = shipping[0].text.strip()
  print(brand)
  print(product_name)
  print(shipping_price)
  f.write(brand + "," + product_name.replace(",","|") + "," + shipping_price + "\n")
f.close()


[<div class="item-container" id="9SIA24GJU06761"><a class="item-img" href="https://www.newegg.com/lenovo-nvidia-t400-4x61j52234/p/N82E16814502022"><img alt="Lenovo Nvidia T400 Graphic Card - 4GB GDDR6 4X61J52234" src="https://c1.neweggimages.com/ProductImageCompressAll300/14-502-022-01.png" title="Lenovo Nvidia T400 Graphic Card - 4GB GDDR6 4X61J52234"/><div class="btn btn-large btn-quickview">Quick View</div><div class="item-quick-action-container"><button class="quick-action"><i class="fa fa-heart-outline"></i></button></div></a><div class="item-info"><div class="item-branding has-brand-store"></div><a class="item-title" href="https://www.newegg.com/lenovo-nvidia-t400-4x61j52234/p/N82E16814502022" title="View Details"><span class="item-open-box-italic"></span>Lenovo Nvidia T400 Graphic Card - 4GB GDDR6 4X61J52234</a><ul class="item-features"><li><strong>Max Resolution:</strong> 7680 x 4320</li><li><strong>DisplayPort:</strong> 3 x Mini DisplayPort</li><li><strong>Chipset Manufacturer

TypeError: ignored

In [8]:
pip install fake_useragent



# **P4 - Web Scraping Customer Reports**

URL: http://www.consumerreports.org/cro/a-to-z-index/products/index.htm


In [None]:
import requests
from fake_useragent import UserAgent

url = 'http://www.consumerreports.org/cro/a-to-z-index/products/index.htm'                    # input your url here
file_name = 'consumer_reports.txt'              # output file name having complete HTML content

user_agent = UserAgent()

page = requests.get(url,headers={'user-agent':user_agent.chrome})
with open(file_name,'w') as file:
    file.write(page.content.decode('utf-8')) if type(page.content) == bytes else file.write(page.content)


from bs4 import BeautifulSoup
import re

def read_file():
    file = open('consumer_reports.txt')
    data = file.read()
    file.close()
    return data

soup = BeautifulSoup(read_file(),'lxml')

'''
When you inspect the consumer_report website we can see that all catagories From A - Z:
for example air conditioners are present in 'a' tag which inside the 'div' tag.
hence we use below code to extract all the 'a' tags.
'''

all_divs = soup.find_all('div',attrs={'class':'crux-body-copy'})


for div in all_divs:
  print(div.a.string)
  pass

'''
products = [div.a.string for div in all_divs]

for product in products:
    #print(product)
    #print()
    pass
'''
'''
Assignment: remove the space before ad after the string and copy the content to
a csv file consumer_list_formatted.
'''

In [None]:
'''
Here we are using the same consumer_reports.txt and creating a dictionary with
product name as key and product link as value and trying to dispaly.
'''

from bs4 import BeautifulSoup
import re


def read_file():
    file = open('consumer_reports.txt')
    data = file.read()
    file.close()
    return data

soup = BeautifulSoup(read_file(),'lxml')
products = {}   # product name - key and product link - value


product_names = [div.a.string for div in soup.find_all('div',class_='crux-body-copy')]

product_links = [div.a['href'] for div in soup.find_all('div',class_='crux-body-copy')]

products = {div.a.string:div.a['href'] for div in soup.find_all('div',class_='crux-body-copy')}  # Dictionary Comprahension

for key,value in products.items():
    print(key , '   -->',value)


'''
Assignment: remove the space before ad after the string and copy the content (Name + URL) to
a csv file consumer_list_link.
'''

AttributeError: ignored

# **P5 - Scraping Multiple web Pages**

Task is to Scrap java questions from codingbat website

URL: http://codingbat.com/java

I will divide the project into 3 parts:
1.   First script will describe you how to fetch the link of each section of Java questions.
2.   Secondly we will open each section(catagory)and we scrap link for each question.
3.   Thirdly we will open each question and get the problem statement, example associated with it.





In [9]:
#Part 1 - script will describe you how to fetch the link of each section of Java questions.

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

'''
Here we are scraping the link to each section.
Observe in inspect element that link is a ralative link (Warm-up) not absolute link
thus we used base_url above
'''
all_divs = soup.find_all('div',class_='summ')

#prints all the relative link
print("Printing all relative links\n")
for div in all_divs:
    print(div.a['href']) #Here 'a' is a child of 'div' tag


#prints all the absolute link
print("Printing all absolute links\n")
for div in all_divs:
    print(base_url + div.a['href'])  #Here 'a' is a child of 'div' tag

Printing all relative links

/java/Warmup-1
/java/Warmup-2
/java/String-1
/java/Array-1
/java/Logic-1
/java/Logic-2
/java/String-2
/java/String-3
/java/Array-2
/java/Array-3
/java/AP-1
/java/Recursion-1
/java/Recursion-2
/java/Map-1
/java/Map-2
/java/Functional-1
/java/Functional-2
Printing all absolute links

http://codingbat.com/java/Warmup-1
http://codingbat.com/java/Warmup-2
http://codingbat.com/java/String-1
http://codingbat.com/java/Array-1
http://codingbat.com/java/Logic-1
http://codingbat.com/java/Logic-2
http://codingbat.com/java/String-2
http://codingbat.com/java/String-3
http://codingbat.com/java/Array-2
http://codingbat.com/java/Array-3
http://codingbat.com/java/AP-1
http://codingbat.com/java/Recursion-1
http://codingbat.com/java/Recursion-2
http://codingbat.com/java/Map-1
http://codingbat.com/java/Map-2
http://codingbat.com/java/Functional-1
http://codingbat.com/java/Functional-2


In [None]:
#Secondly we will open each section and we scrap link for each question.
#--------Start - Same as above Script ----------------------------
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

all_divs = soup.find_all('div',class_='summ')


# all_links has link for each section (Page 1)
all_links = [base_url + div.a['href'] for div in all_divs] # This is list Comprahension

#--------End - Same as above Script ----------------------------

#Below code is to get link for each/all the section

for link in all_links:
    #link correspons to 2nd page ex:https://codingbat.com/java/Warmup-1
    inner_page = requests.get(link,headers={'user-agent':user_agent.chrome})
    inner_soup = BeautifulSoup(inner_page.content,'lxml')

    #Now we need to scrap the link from 2nd inner page. (Inspect the HTML Page)

    div = inner_soup.find('div',class_='tabc')
    question_links = [base_url + td.a['href'] for td in div.table.find_all('td')] # has link to all the questions (list Comprahension)
    print(question_links)

    break #on commenting it you will get complete links for all the sections



['http://codingbat.com/prob/p187868', 'http://codingbat.com/prob/p181646', 'http://codingbat.com/prob/p154485', 'http://codingbat.com/prob/p116624', 'http://codingbat.com/prob/p140449', 'http://codingbat.com/prob/p182873', 'http://codingbat.com/prob/p184004', 'http://codingbat.com/prob/p159227', 'http://codingbat.com/prob/p191914', 'http://codingbat.com/prob/p190570', 'http://codingbat.com/prob/p123384', 'http://codingbat.com/prob/p136351', 'http://codingbat.com/prob/p161642', 'http://codingbat.com/prob/p112564', 'http://codingbat.com/prob/p183592', 'http://codingbat.com/prob/p191022', 'http://codingbat.com/prob/p192082', 'http://codingbat.com/prob/p144535', 'http://codingbat.com/prob/p178986', 'http://codingbat.com/prob/p165701', 'http://codingbat.com/prob/p100905', 'http://codingbat.com/prob/p151713', 'http://codingbat.com/prob/p199720', 'http://codingbat.com/prob/p101887', 'http://codingbat.com/prob/p172021', 'http://codingbat.com/prob/p132134', 'http://codingbat.com/prob/p177372', 

In [None]:
#Final Script

#part 1

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

all_divs = soup.find_all('div',class_='summ')

all_links = [base_url + div.a['href'] for div in all_divs]


# part 2

for link in all_links:
    inner_page = requests.get(link,headers={'user-agent':user_agent.chrome})
    inner_soup = BeautifulSoup(inner_page.content,'lxml')
    div = inner_soup.find('div',class_='tabc')
    question_links = [base_url + td.a['href'] for td in div.table.find_all('td')]


# part 3

    for question_link in question_links:
        final_page = requests.get(question_link)
        final_soup = BeautifulSoup(final_page.content, 'lxml')
        indent_div = final_soup.find('div', attrs={'class':'indent'})

        problem_statement = indent_div.table.div.string

        siblings_of_statement = indent_div.table.div.next_siblings

        examples = [sibling for sibling in siblings_of_statement if sibling.string is not None]

        print(problem_statement)
        for example in examples:
            print(example)

        print('\n\n\n')

The parameter weekday is true if it is a weekday, and the parameter vacation is true if we are on vacation. We sleep in if it is not a weekday or we're on vacation. Return true if we sleep in.
sleepIn(false, false) → true
sleepIn(true, false) → false
sleepIn(false, true) → true




We have two monkeys, a and b, and the parameters aSmile and bSmile indicate if each is smiling. We are in trouble if they are both smiling or if neither of them is smiling. Return true if we are in trouble.
monkeyTrouble(true, true) → true
monkeyTrouble(false, false) → true
monkeyTrouble(true, false) → false




Given two int values, return their sum. Unless the two values are the same, then return double their sum.
sumDouble(1, 2) → 3
sumDouble(3, 2) → 5
sumDouble(2, 2) → 8




Given an int n, return the absolute difference between n and 21, except return double the absolute difference if n is over 21.
diff21(19) → 2
diff21(10) → 11
diff21(21) → 0




We have a loud talking parrot. The "hour" parameter is t

**WorkSpace**

In [None]:
'''

#Requirements
#pip3 install requests
#pip3 install bs4


# ## Basic fundamentals of web scraping

# import these two modules bs4 for selecting HTML tags easily
from bs4 import BeautifulSoup
# requests module is easy to operate some people use urllib but I prefer this one because it is easy to use.
import requests

# I put here my own blog url ,you can change it.
url="https://getpython.wordpress.com/"

#Requests module use to data from given url
source=requests.get(url)

# BeautifulSoup is used for getting HTML structure from requests response.(craete your soup)
soup=BeautifulSoup(source.text,'html')

# Find function is used to find a single element if there are more than once it always returns the first element.
title=soup.find('title') # place your html tagg in parentheses that you want to find from html.
print("this is with html tags :",title)

qwery=soup.find('h1') # here i find first h1 tagg in my website using find operation.

#use .text for extract only text without any html tags
print("this is without html tags:",qwery.text)


links=soup.find('a') #i extarcted link using "a" tag
print(links)


# ## extarct data from innerhtml

# here i extarcted href data from anchor tag.
print(links['href'])

# similarly i got class details from a anchor tag
print(links['class'])


# ## findall operation in Bs4

# findall function is used to fetch all tags at a single time.
many_link=soup.find_all('a') # here i extracted all the anchor tags of my website
total_links=len(many_link) # len function is use to calculate length of your array
print("total links in my website :",total_links)
print()
for i in many_link[:6]: # here i use slicing to fetch only first 6 links from rest of them.
    print(i)

second_link=many_link[1] #here i fetch second link which place on 1 index number in many_links.
print(second_link)
print()
print("href is :",second_link['href']) #only href link is extracted from ancor tag


# select div tag from second link
nested_div=second_link.find('div')
# As you can see div element extarcted , it also have inner elements
print(nested_div)
print()
#here i extracted class element from div but it give us in the form of list
z=(nested_div['class'])
print(z)
print(type(z))
print()
#  " " .join () method use to convert list type  into string type
print("class name of div is :"," ".join(nested_div['class']))

'''





<title>World War II - Wikipedia</title>
World War IIClockwise from the top: 
German Stuka dive bombers on the Eastern Front, 1943
British Matilda II tank during the North African campaign, 1941
Soviet troops at the Battle of Stalingrad, 1942–1943
U.S. naval force in the Lingayen Gulf, 1945
Soviet soldier raising a flag over the Reichstag after the Battle of Berlin, 1945
U.S. atomic bombing of Nagasaki in Japan, 1945
Date1 September 1939 – 2 September 1945[a] (6 years, 1 day)LocationMajor theaters: 
Europe
Pacific
Atlantic
Indian Ocean
South-East Asia
China
Japan
Middle East
Mediterranean
North Africa
Horn of Africa
Central Africa
Australia
Caribbean
North and South America
Result
Allied victory
Fall of Nazi Germany, Fascist Italy, and Imperial Japan
Allied military occupations of Germany, Japan, Austria, and Korea
Beginning of the Nuclear Age
Dissolution of the League of Nations and creation of the United Nations
Decolonisation of Asia and Africa and decline of European international i

In [1]:
pip install fake_useragent

Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

user_agent = UserAgent()
main_url = 'https://www.goodreads.com/'
page = requests.get(main_url, headers={'user-agent': user_agent.chrome})
soup = BeautifulSoup(page.content, 'lxml')

base_url = 'https://www.goodreads.com'

all_divs = soup.find_all('div', class_='left')

all_links1 = [base_url + link['href'] for div in all_divs for link in div.find_all('a')]

if all_links1:
    first_link = all_links1[0]

    inner_page = requests.get(first_link, headers={'user-agent': user_agent.chrome})
    inner_soup = BeautifulSoup(inner_page.content, 'lxml')

    divs_with_class = inner_soup.find_all('div', class_='coverWrapper')
    all_links2 = [base_url + link['href'] for div in divs_with_class for link in div.find_all('a')]
    
    for inner_link in all_links2:
        inner_inner_page = requests.get(inner_link, headers={'user-agent': user_agent.chrome})
        inner_inner_soup = BeautifulSoup(inner_inner_page.content, 'lxml')

        title_element = inner_inner_soup.find('div', class_='BookPageTitleSection__title')
        if title_element:
            title = title_element.get_text(strip=True)  # Get the text content of the element
            print(title)
        else:
            print("Title element not found on:", inner_link)

        print('\n\n')


Title element not found on: https://www.goodreads.com/book/show/62919376-alchemy-of-a-blackbird



Title element not found on: https://www.goodreads.com/book/show/53802058-contradiction-days



Title element not found on: https://www.goodreads.com/book/show/62593018-disobedient



Art Monsters : Unruly Bodies in Feminist Art



Spider-Verse: The Art of the Movies#2Spider-Man: Across the Spider-Verse: The Art of the Movie



Title element not found on: https://www.goodreads.com/book/show/58667414-worlds-beyond-time



Title element not found on: https://www.goodreads.com/book/show/62710654-the-glass-ch-teau



Title element not found on: https://www.goodreads.com/book/show/62854814-hands-of-time



1964: Eyes of the Storm



Title element not found on: https://www.goodreads.com/book/show/63883284-the-collector



When the Rain Ends



Title element not found on: https://www.goodreads.com/book/show/59109077-horse



Title element not found on: https://www.goodreads.com/book/show/60415700