In [1]:
import re

import requests
from bs4 import BeautifulSoup

### Question 1: TigerDirect Website

In [49]:
# Product page to scrape (TigerDirect item details)
url = "https://www.tigerdirect.com/applications/SearchTools/item-details.asp?EdpNo=1501390"
# The request carries a browser-like User-Agent header.
# timeout: without one, requests waits indefinitely if the server never answers.
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
page

<Response [200]>

In [50]:
# Optionally, use print(page) to check whether the webpage was successfully accessed;
# Output "Response [200]" means a successful connection.

In [51]:
# Parse the downloaded HTML into a BeautifulSoup tree with the lxml parser
soup = BeautifulSoup(page.text, features="lxml")

### Find and print the list price

In [52]:
# Select every <span> that is a direct child of <p class="list-price">
# located somewhere inside <div class="pdp-price">
list_price_selector = "div.pdp-price p.list-price > span"
list_of_contents = soup.select(list_price_selector)
list_of_contents

[<span>List price:</span>,
 <span class="sr-only">$1,399
           and 99 cents
         </span>,
 <span><del aria-hidden="true">$1,399.99</del></span>]

In [53]:
# Show the text of every matched <span>. list_price is rebound on each pass,
# so after the loop it holds the text of the last <span> in the list.
for span in list_of_contents:
    span_text = span.text
    print(span_text)
    list_price = span_text

List price:
$1,399
          and 99 cents
        
$1,399.99


In [54]:
import re # Use Python's RegEx functionality; import package "re"

In [55]:
# Convert the list price to plain "1234.56" form and print it.
# Any number of comma-separated thousands groups is accepted (e.g. "$999.99" or
# "$12,399.99"), not only a single digit followed by one comma.
price_match = re.search(r"\$\s*(\d[\d,]*(?:\.\d+)?)", list_price)
# When no dollar amount is found the text is printed unchanged,
# which is what re.sub did when its pattern failed to match.
print(price_match.group(1).replace(",", "") if price_match else list_price)

1399.99


### Find and print the current price (1)

In [56]:
# Alternative 1: follow one exact path down to the screen-reader ("sr-only")
# <span> nested inside the sale-price <span>
sale_price_path = "div.col-sm-12.col-lg-5.pdp-specs-info > div > div.pdp-price > p:nth-child(3) > span.sale-price > span.sr-only"
list_of_price = soup.select(sale_price_path)
list_of_price

[<span class="sr-only">$1,029
           and 99 cents
         </span>]

In [57]:
# Show the text of each matched element; current_price ends up holding
# the text of the last one
for price_span in list_of_price:
    price_text = price_span.text
    print(price_text)
    current_price = price_text

$1,029
          and 99 cents
        


In [58]:
# Convert and print current price to "1234.56" format:
# First attempt with re.sub, kept to show why it is not enough: the pattern only
# matches the "$1,029" part and the replacement hard-codes ".99", so the rest of
# the string (the "and 99 cents" line and its surrounding whitespace) is printed as-is.
print(re.sub("[$]([0-9]),([0-9]+)","\\1\\2.99",current_price))

1029.99
          and 99 cents
        


In [59]:
# Instead, compile two patterns: one for the dollars line and one for the cents
# line of the screen-reader text ("$1,029" / "and 99 cents").
# Dollars: 1-3 leading digits, a comma, then a 3-digit group.
pattern1 = re.compile(r'\$(\d{1,3}),(\d{3})')
# Cents: accept any run of whitespace around "and" instead of exactly 10 spaces,
# so a change in the page's HTML indentation does not break the match.
pattern2 = re.compile(r'\s+and\s+(\d{2})')

# Each search returns a match object, or None when the text does not fit the pattern
match1 = pattern1.search(current_price)
match2 = pattern2.search(current_price)

In [60]:
# Assemble the pieces in order: both dollar groups, a decimal point, then the cents
price = [*match1.group(1, 2), '.', match2.group(1)]
price

['1', '029', '.', '99']

In [61]:
# Print the pieces back to back, with no separator, as one number
print(*price, sep='')

1029.99


### Find and print the current price (2)

In [62]:
# Alternative 2: select the <span class="sale-price"> itself, reached through
# <div class="pdp-price"> and its child <p class="final-price">
sale_span_selector = "div.pdp-price > p.final-price> span.sale-price"
list_of_price_2 = soup.select(sale_span_selector)
list_of_price_2

[<span class="sale-price"><sup aria-hidden="true">$</sup><span aria-hidden="true">1,029</span><sup aria-hidden="true"><span class="priceDecimalMark">.</span>99<span class="priceFlag"></span></sup><span class="sr-only">$1,029
           and 99 cents
         </span></span>]

In [63]:
# Print the full text of each sale-price <span> and keep only the visible price.
# The text is the visible price immediately followed by the screen-reader copy
# ("$1,029.99$1,029 and 99 cents"), so take the leading "$..." amount with a regex
# rather than a fixed 9-character slice, which only fits prices shaped like "$1,029.99".
for sale_span in list_of_price_2:
    string = sale_span.text
    print(string)
    leading_price = re.match(r"\s*(\$[\d,]+(?:\.\d+)?)", string)
    # Fall back to the original fixed-width slice if the text does not start with a dollar amount
    current_price_2 = leading_price.group(1) if leading_price else string[0:9]

$1,029.99$1,029
          and 99 cents
        


In [64]:
# Convert and print current price to "1234.56" format; got the same result as Alternative 1.
# Everything that is not a digit or a dot is removed. The pattern is a raw string
# so that \d reaches the regex engine without Python treating it as an (invalid)
# string escape sequence.
print(re.sub(r"[^\d.]", "", current_price_2))

1029.99


### Question 2: USNews Website

In [65]:
# USNews homepage, fetched with a browser-like User-Agent header
web_url = "https://www.usnews.com/"
# timeout: without one, requests waits indefinitely if the server never answers
page_usnews = requests.get(web_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)

In [66]:
# Parse the USNews homepage with the lxml parser.
# This rebinds `soup`, which until now held the parsed TigerDirect product page.
soup = BeautifulSoup(page_usnews.text, 'lxml')

### Find access to its current "Top Stories"

In [67]:
# Alternative 1: Find all <div> elements of the following class...
all_links = soup.find_all('div', class_='Box-w0dun1-0 ArmRestTopStories__Part-s0vo7p-1 erkdnc biVKSR')

for div in all_links:
    # find_all is the current Beautiful Soup 4 name; findAll is a deprecated alias
    links = div.find_all('a')
    # The anchors at positions 0 and 3 are the ones printed as the two top stories
    # NOTE(review): these positions depend on the page layout; confirm if the markup changes
    TopStory1 = links[0]
    TopStory2 = links[3]
    print(TopStory1.get('href'))
    print(TopStory2.get('href'))

https://www.usnews.com/news/politics/articles/2023-01-20/mccarthy-biden-agree-to-sit-down-over-debt-ceiling
https://www.usnews.com/news/economy/articles/2023-01-20/existing-homes-fall-1-5-in-december-marking-11th-month-of-declines


In [68]:
# Alternative 2: select every <a> that has an href and is a direct child of an <h3>
list_of_contents = soup.select("h3 > a[href]")

# The first match is the first top story
first_link = list_of_contents[0]
print(first_link['href'])
# The second match is the second top story
second_link = list_of_contents[1]
print(second_link['href'])

https://www.usnews.com/news/politics/articles/2023-01-20/mccarthy-biden-agree-to-sit-down-over-debt-ceiling
https://www.usnews.com/news/economy/articles/2023-01-20/existing-homes-fall-1-5-in-december-marking-11th-month-of-declines


In [69]:
# Keep the first top story's URL (href of the first matched link) as a string
first_top_story = list_of_contents[0]["href"]
first_top_story

'https://www.usnews.com/news/politics/articles/2023-01-20/mccarthy-biden-agree-to-sit-down-over-debt-ceiling'

In [70]:
# Keep the second top story's URL (href of the second matched link) as a string
second_top_story = list_of_contents[1]["href"]
second_top_story

'https://www.usnews.com/news/economy/articles/2023-01-20/existing-homes-fall-1-5-in-december-marking-11th-month-of-declines'

In [71]:
# Alternative 3: select the <a href> links through one exact path:
# content-box <div> > story-headline <h3> > <a href>.
# The selector text is unchanged, only split across lines for readability
# (the CollapseBorderContentBox/fTDCpH class pair appears twice, as in the original).
top_story_selector = (
    "div.Box-w0dun1-0.ContentBox__Container-sc-1egb8dt-0.iZCosX.lmOexQ"
    ".ArmRestTopStories__CollapseBorderContentBox-s0vo7p-2.fTDCpH"
    ".ArmRestTopStories__CollapseBorderContentBox-s0vo7p-2.fTDCpH"
    " > h3.Heading-sc-1w5xk2o-0.ContentBox__StoryHeading-sc-1egb8dt-3.MRvpF.fqJuKa.story-headline"
    " > a[href]"
)
list_of_top_stories = soup.select(top_story_selector)

# Print the URL of every matched link
for story_link in list_of_top_stories:
    print(story_link['href'])

https://www.usnews.com/news/politics/articles/2023-01-20/mccarthy-biden-agree-to-sit-down-over-debt-ceiling
https://www.usnews.com/news/economy/articles/2023-01-20/existing-homes-fall-1-5-in-december-marking-11th-month-of-declines


### Load webpage of the first/second top story with the above URL

In [72]:
# Fetch the first top story's page.
# timeout: without one, requests waits indefinitely if the server never answers.
page1 = requests.get(first_top_story, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
page1

<Response [200]>

In [73]:
# Fetch the second top story's page.
# timeout: without one, requests waits indefinitely if the server never answers.
page2 = requests.get(second_top_story, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
page2

<Response [200]>

In [74]:
# Parse the first top story's page and collect all of its <h1> elements
soup1 = BeautifulSoup(page1.text, features="lxml")
story1 = soup1.select("h1")

# Display story1 to confirm it contains the headline of the first top story
story1

[<h1 class="Heading-sc-1w5xk2o-0 iQhOvV">McCarthy, Biden Agree to Sit Down as Debt Ceiling Crisis Looms</h1>]

In [75]:
# Print only the text of each <h1>, without the surrounding tag
for heading in story1:
    print(heading.get_text())

McCarthy, Biden Agree to Sit Down as Debt Ceiling Crisis Looms


In [76]:
# Parse the second top story's page and collect all of its <h1> elements
soup2 = BeautifulSoup(page2.text, features="lxml")
story2 = soup2.select("h1")

# Display story2 to confirm it contains the headline of the second top story
story2

[<h1 class="Heading-sc-1w5xk2o-0 iQhOvV">Existing Homes Fall 1.5% in December, Marking 11th Month of Declines</h1>]

In [77]:
# Print only the text of each <h1>, without the surrounding tag
for heading in story2:
    print(heading.get_text())

Existing Homes Fall 1.5% in December, Marking 11th Month of Declines


### Read and print the first 3 sentences of the main body in the second top story

In [78]:
# Alternative 1: collect every <div> of class "Raw-slyvem-0 bCYKCn" and keep the first three
body_div_class = 'Raw-slyvem-0 bCYKCn'
sentences = soup2.find_all('div', class_=body_div_class)
target = sentences[:3]

# Display target to confirm it covers the first 3 sentences
target

[<div class="Raw-slyvem-0 bCYKCn"><p>Sales of existing homes slid 1.5% in December, somewhat better than expected but the 11th straight month of decline, the National Association of Realtors said on Friday.<br/></p></div>,
 <div class="Raw-slyvem-0 bCYKCn"><p></p></div>,
 <div class="Raw-slyvem-0 bCYKCn"><p>The number was better than estimates of a 3.4% drop and brings the annual rate of home sales just a hair above 4 million. Sales are now down 34% from year-ago levels.</p></div>]

In [79]:
# Print the text of each selected <div>, followed by a blank line
for body_div in target:
    print(body_div.text, end="\n\n")

Sales of existing homes slid 1.5% in December, somewhat better than expected but the 11th straight month of decline, the National Association of Realtors said on Friday.



The number was better than estimates of a 3.4% drop and brings the annual rate of home sales just a hair above 4 million. Sales are now down 34% from year-ago levels.



In [80]:
# Alternative 2: select the <div> whose id is "ad-in-text-target"
article_div_selector = "div#ad-in-text-target"
all_content = soup2.select(article_div_selector)

In [81]:
# Print the first 3 sentences of the main body
for div in all_content:
    # find_all is the current Beautiful Soup 4 name; findAll is a deprecated alias
    main_body = div.find_all('p')
    # <p> 0 holds the first sentence and <p> 3 holds the next two
    # NOTE(review): presumably the <p> elements in between are empty or not article text; confirm
    print(main_body[0].text+"\n")
    print(main_body[3].text)

Sales of existing homes slid 1.5% in December, somewhat better than expected but the 11th straight month of decline, the National Association of Realtors said on Friday.

The number was better than estimates of a 3.4% drop and brings the annual rate of home sales just a hair above 4 million. Sales are now down 34% from year-ago levels.


In [82]:
# Alternative 3: select each <p> that is a direct child of a <div> with classes
# "Raw-slyvem-0" and "bCYKCn", then keep the first three
body_paragraph_selector = 'div.Raw-slyvem-0.bCYKCn > p'
paragraph = soup2.select(body_paragraph_selector)
three = paragraph[:3]

# Display three to confirm these paragraphs cover the first 3 sentences
three

[<p>Sales of existing homes slid 1.5% in December, somewhat better than expected but the 11th straight month of decline, the National Association of Realtors said on Friday.<br/></p>,
 <p></p>,
 <p>The number was better than estimates of a 3.4% drop and brings the annual rate of home sales just a hair above 4 million. Sales are now down 34% from year-ago levels.</p>]

In [83]:
# Print the sentences out in the correct format
for i in three:
    print(i.text+"\n")

Sales of existing homes slid 1.5% in December, somewhat better than expected but the 11th straight month of decline, the National Association of Realtors said on Friday.



The number was better than estimates of a 3.4% drop and brings the annual rate of home sales just a hair above 4 million. Sales are now down 34% from year-ago levels.

