#### This contains the learning from Documentation
```bash
pip install beautifulsoup4
```

#### BeautifulSoup have 4 types of Objects
- Tag : which itself is a HTML or XML tag
- BeautifulSoup 
- NavigableString : these are the texts
- Comment : this contains the HTML comment

In [None]:
from bs4 import BeautifulSoup
html_text = '''
<html>
<head>
<title>Demo Webscrapping</title>
</head>
<body>
<h1>Web Scraping Example</h1>
<p>This is a simple example of web scraping using Python.</p>
<p>Web scraping is the process of extracting data from websites.</p>
<p class="important">Important: Always check the website's terms of service before scraping.</p>
<p class="note">Note: This example is for educational purposes only.</p>
<p>For more information, visit <a href="https://www.example.com">Example Website</a>.</p>
<p>Follow us on <a href="https://twitter.com/example">Twitter</a> for updates.</p>
<div class="footer">
<p>&copy; 2023 Web Scraping Inc.</p>
</div>
</body>
</html>
'''

In [None]:
# Tag
soup = BeautifulSoup(html_text, 'html.parser')  # multi_valued_attributes = None  # to get the attributes as a list, otherwise it will be a string
tag = soup.title
print(type(tag))  # <class 'bs4.element.Tag'>
print(tag.name)  # title
print(tag.string)  # Demo Webscrapping, the text inside the title tag is a type of NavigableString
print(tag.attrs)  # {}
tag = soup.find('p', class_='important')
print(tag.attrs) # all attributes of the tag
print(tag.attrs.keys())  # keys of the attributes
print(tag.attrs.values())  # values of the attributes
print(tag['class'])  # important, accessing the class attribute directly
print(tag.get('class'))  # important, using get method to access the class attribute
print(tag.get('id', 'No id attribute'))  # No id attribute, using get method with default value
tag['id'] = 'important-paragraph'  # adding a new attribute
print(tag)  # <p class="important" id="important-paragraph">Important: Always check the website's terms of service before scraping.</p>
tag['class'] = 'critical'  # modifying an existing attribute
print(tag)  # <p class="critical" id="important-paragraph">Important: Always check the website's terms of service before scraping.</p>
tag.attrs['class'] = ['critical', 'urgent']# appending a new class to the existing class attribute
print(tag.attrs['class'])  # <p class="critical urgent" id="important-paragraph">Important: Always check the website's terms of service before scraping.</p>
print(tag)
del tag['id']  # deleting an attribute
print(tag)  # <p class="critical urgent">Important: Always check the website's terms of service before scraping.</p>
print(tag.attrs['class'])

<class 'bs4.element.Tag'>
title
Demo Webscrapping
{}
{'class': ['important']}
dict_keys(['class'])
dict_values([['important']])
['important']
['important']
No id attribute
<p class="important" id="important-paragraph">Important: Always check the website's terms of service before scraping.</p>
<p class="critical" id="important-paragraph">Important: Always check the website's terms of service before scraping.</p>
['critical', 'urgent']
<p class="critical urgent" id="important-paragraph">Important: Always check the website's terms of service before scraping.</p>
<p class="critical urgent">Important: Always check the website's terms of service before scraping.</p>
['critical', 'urgent']
['critical', 'urgent']


#### For xml 
class_is_multi= { '*' : 'class'}
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml', multi_valued_attributes=class_is_multi)
xml_soup.p['class']
- Output -> ['body', 'strikeout']

In [36]:
# NavigableString
soup = BeautifulSoup('<p>Hello World</p>', 'html.parser')
# soup = BeautifulSoup('<p>Hello World <a> this is link</a></p>', 'html.parser')

print(type(soup.p.string))  # <class 'bs4.element.NavigableString'>
print(soup.p.string)  # Hello World
print(soup.p.string.replace('World', 'Python'))  # Hello Python
print(soup.p.string.upper())  # HELLO WORLD
soup.p.string = 'New Text'  # modifying the NavigableString
print(soup.p)  # <p>New Text</p>
soup.p.string.replace_with('Replaced Text')  # replacing the NavigableString with a new string
print(soup.p.string)
# print(soup.p.contents)  # this will give you a list of all the children of the p tag, including NavigableString and Tag objects

'''
We can also use the `stripped_strings` generator to get all the strings in the tag without leading or trailing whitespace.
for string in soup.p.stripped_strings:
    print(string)  # New Text   

We can not use .string on a Tag object that contains other tags, as it will return None.
# If we try to access the string of a tag that contains other tags, it will return None.
'''

<class 'bs4.element.NavigableString'>
Hello World
Hello Python
HELLO WORLD
<p>New Text</p>
Replaced Text


'\nWe can also use the `stripped_strings` generator to get all the strings in the tag without leading or trailing whitespace.\nfor string in soup.p.stripped_strings:\n    print(string)  # New Text   \n\nWe can not use .string on a Tag object that contains other tags, as it will return None.\n# If we try to access the string of a tag that contains other tags, it will return None.\n'

In [None]:
# NavigableString
doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document", "xml")
footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
doc.find(text="INSERT FOOTER HERE").replace_with(footer)
print(doc)

<?xml version="1.0" encoding="utf-8"?>
<document><content/><footer>Here's the footer</footer></document>


  doc.find(text="INSERT FOOTER HERE").replace_with(footer)


In [41]:
# Comment
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
type(comment)
print(comment)
print(soup.b.prettify())

Hey, buddy. Want to buy a used parser?
<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>

