# Introduction to Web Scraping 

### Example 01

In [102]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [103]:
url = "https://ticker.finology.in/"

r= requests.get(url)
print(r)

<Response [200]>


### Create a soup object out of the html document that the module requests retrieved for us

In [104]:
soup = BeautifulSoup(r.text, "lxml")

### Create a table object from the html table contained in the soup object

In [105]:
table = soup.find("table", class_="table table-sm table-hover screenertable")
#print(table)     #uncomment to see full output

##3 Find all the headers in the scraped table

In [106]:
headers = table.find_all("th")
print(headers)

[<th scope="col">Company</th>, <th scope="col">Price<span class="muted">Rs.</span></th>, <th scope="col">Day High<span class="muted">Rs.</span></th>]


### Place the headers in a list called "titles"

In [107]:
titles = []

for i in headers:
    title = i.text
    titles.append(title)
    
print(titles)

['Company', 'PriceRs.', 'Day HighRs.']


### Create an empty pandas dataframe with the column names equal to the table headers in the list "titles"

In [108]:
df= pd.DataFrame(columns=titles)
print(df)

Empty DataFrame
Columns: [Company, PriceRs., Day HighRs.]
Index: []


### Find all the row in the table and place them into an object called rows

In [109]:
rows = table.find_all("tr")
#print(rows)           #uncomment to see full output

### Display all the rows except for the first one (which is has the headers)

In [110]:
# Uncomment to see full output

#for i in rows[1:]:
#    print(i)     


### Find all the data cells in each row

In [111]:
####Uncomment code below to see full output###

#for i in rows[1:]:
#    data = table.find_all("td")
#    print(data)

### Place the text from each row into a list called "row". A list comprehension is used. The attribute "text" pick only the text (not the tags) from each row.

In [112]:
## Uncomment code below to see full output

#for i in rows[1:]:
#    data = table.find_all("td")
#    #print(data)
#    row = [tr.text for tr in data]
#    print(row)

# We now have a list called rows with all the data in the table

### This is practically the same code as above

In [113]:
# Remember: rows is a long list. Each list item has tags.
# Example:
rows[1:2]

[<tr>
 <td>
 <a class="complink" href="company/NHPC">NHPC</a>
 </td>
 <td class="Number">100.65</td>
 <td class="Number">103.00</td>
 </tr>]

In [114]:
for i in rows[1:]:
    data = i.find_all("td")            # loop around, find the td tag in the first row. The td tag contain the data in the first row
    row = [tr.text for tr in data]     # remove the td tags from the first row and only keep the text
    l =len(df)                         # initially, the size of the dataframe is 0 but as we loop around it increases (more rows get added)
    df.loc[l]= row                     # cleverly, the rows of the dataframe df get filled up with the stripped up rows of the table

df.head(10)     

Unnamed: 0,Company,PriceRs.,Day HighRs.
0,\nNHPC\n,100.65,103.0
1,\nNBCC (India)\n,168.65,169.95
2,\nReliance Industries\n,2915.4,2949.8
3,\nBPCL\n,558.35,572.4
4,\nSJVN\n,142.25,146.35
5,\nPNB\n,125.4,128.25
6,\nHousing & Urban Dev.\n,208.0,226.45
7,\nSBI\n,649.65,660.55
8,\nIndian Oil Corp.\n,162.9,166.8
9,\nEngineers India\n,261.35,273.9


In [115]:
len(df)

15

## Example 02

### `find()`
* Method for finding the first tag with the specified name (as a string) or id and returning an object of type bs4. 
* Used for getting merely the first tag of the incoming HTML object for which condition is satisfied. 
* The return type is <class ‘bs4.element.Tag’>.

### `find_all()`
* Method for finding out ALL tags with the specified tag name or id and returning them as a list of type bs4. 
* It is used for getting all the incoming HTML objects for which condition is satisfied.  
* The return type is <class ‘bs4.element.ResultSet’>,

## Create a string that simulates a html page with a table

In [116]:
from bs4 import BeautifulSoup

html_doc = """
<html>
    <body>
        <h1>Hello, BeautifulSoup!</h1>
        <table border="1" >
  <tr>
    <th>Name</th>
    <th>Age</th>  
    <th>Country</th>
  </tr>
  <tr>
    <td>Todd Harris</td>
    <td>28</td>
    <td>Britain</td>
  </tr>
  <tr>
    <td>Nicole Johnson</td>
    <td>35</td>
    <td>USA</td>
  </tr>
  <tr>
    <td>Samuel Jackson</td>
    <td>19</td>
    <td>USA</td>
  </tr>
</table>
    </body>
</html>
"""
# Creating a soup object using the html.parser
#The parser has the job of "parsing" the html formatted code
# Parsing comes from Latin pars (orationis), meaning part (of speech).HTML parsing is basically taking in HTML code and extracting relevant information 
soup = BeautifulSoup(html_doc, 'html.parser')    

## Find All the (table> tags in the soup object by apply the method find("table") to it
table = soup.find('table')
print(table)

<table border="1">
<tr>
<th>Name</th>
<th>Age</th>
<th>Country</th>
</tr>
<tr>
<td>Todd Harris</td>
<td>28</td>
<td>Britain</td>
</tr>
<tr>
<td>Nicole Johnson</td>
<td>35</td>
<td>USA</td>
</tr>
<tr>
<td>Samuel Jackson</td>
<td>19</td>
<td>USA</td>
</tr>
</table>


In [117]:
headers = table.find_all("th")
print(headers)

[<th>Name</th>, <th>Age</th>, <th>Country</th>]


In [118]:
titles = []

for i in headers:
    title = i.text
    titles.append(title)
    
print(titles)

['Name', 'Age', 'Country']


In [119]:
df2 = pd.DataFrame(columns=titles)
print(df2)

Empty DataFrame
Columns: [Name, Age, Country]
Index: []


In [120]:
rows = table.find_all("tr")
print(rows) 

[<tr>
<th>Name</th>
<th>Age</th>
<th>Country</th>
</tr>, <tr>
<td>Todd Harris</td>
<td>28</td>
<td>Britain</td>
</tr>, <tr>
<td>Nicole Johnson</td>
<td>35</td>
<td>USA</td>
</tr>, <tr>
<td>Samuel Jackson</td>
<td>19</td>
<td>USA</td>
</tr>]


In [121]:
for i in rows[1:3]:
    print(i)  

<tr>
<td>Todd Harris</td>
<td>28</td>
<td>Britain</td>
</tr>
<tr>
<td>Nicole Johnson</td>
<td>35</td>
<td>USA</td>
</tr>


In [122]:
# Clean the rows (strip all tags) and place the rows in the list inside the empty dataframe named df2
for i in rows[1:]:
    data = i.find_all("td")            
    row = [tr.text for tr in data]     
    l =len(df2)                        
    df2.loc[l]= row                     

# Inspect the dataset 
df2.head() 

Unnamed: 0,Name,Age,Country
0,Todd Harris,28,Britain
1,Nicole Johnson,35,USA
2,Samuel Jackson,19,USA
