Steps for Scraping Any Website

To scrape a website using Python, you need to perform these four basic steps:
Sending an HTTP GET request to the URL of the webpage that you want to scrape; the server responds with the page's HTML content. We can do this by using the Requests library for Python.
Fetching and parsing the data using BeautifulSoup and maintaining the data in some data structure such as a dict or list.
Analyzing the HTML tags and their attributes, such as class, id, and other HTML tag attributes. Also, identifying the HTML tags where your content lives.
Outputting the data in any file format such as CSV, XLSX, JSON, etc.

In [120]:
import pandas as pd

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to use a different install; when it is unset, the original default
# path below is used, so existing setups keep working unchanged.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
# Publish the chosen path under the "webdriver.chrome.driver" environment key.
os.environ["webdriver.chrome.driver"] = chromedriver

In [3]:
## LET'S START WITH #1: NAVIGATING TO THE HOMEPAGE OF THE WEBSITE THAT HOSTS THE CLIMATE DATA TABLE I WANT
# Launch a Chrome session through Selenium, passing the chromedriver path
# configured in an earlier cell, and load the Stormfax El Nino page in it.
# NOTE(review): no cell visible in this notebook reads from `driver` again or
# calls driver.quit(); the HTML that gets parsed is fetched separately with
# requests, so confirm whether this browser session is still needed.

driver = webdriver.Chrome(chromedriver)
driver.get('http://www.stormfax.com/elnino.htm')

In [14]:
# Make a GET request to fetch the raw HTML content.
# A timeout is set so the cell cannot hang indefinitely on an unresponsive
# server, and raise_for_status() stops the notebook on a 4xx/5xx response
# instead of letting an error page be parsed as if it were the data page.
response = requests.get('http://www.stormfax.com/elnino.htm', timeout=30)
response.raise_for_status()
html_content = response.text


In [15]:
# Build a navigable parse tree from the downloaded markup with the lxml parser
soup = BeautifulSoup(html_content, features="lxml")
# Print the re-indented document so the tag structure can be inspected by eye
print(soup.prettify())

<html>
 <head>
  <title>
   STORMFAX® - El Niño Weather
  </title>
  <meta content="Stormfax" name="author"/>
  <meta content="weather" name="classification"/>
  <meta content="Here are the facts without the mumbo-jumbo." name="description"/>
  <meta content="El Nino, El Niño, El-Nino, La Nina, La-Nina, La Niña, ENSO" name="keywords"/>
 </head>
 <body background="/parch.jpg">
  <center>
   <img src="/ElNinoWeb.png"/>
  </center>
  <p>
  </p>
  <center>
   <img src="/ninotele.gif"/>
  </center>
  <center>
   <img src="/smalfax2.gif"/>
  </center>
  <p>
  </p>
  <center>
   <h1>
    <font color="BLUE">
     El Niño and La Niña
    </font>
   </h1>
  </center>
  <center>
   <font size="+1">
    <b>
     (
     <i>
      without
     </i>
     the Mumbo-Jumbo)
    </b>
   </font>
  </center>
  <p>
  </p>
  <center>
   Copyright
   <b>
    ©
   </b>
   1996-2021 STORMFAX
  </center>
  <p>
  </p>
  <center>
   <b>
    El Niño/La Niña
    <a href="http://www.stormfax.com/ninoword.htm">
     G

In [17]:
# Show the text content of the page's <title> element
print(soup.title.get_text())

STORMFAX® - El Niño Weather


In [63]:
# Exploratory: look at the node that immediately follows the first <h2> in the document.
# The result is not assigned, and no later cell visible here depends on it.
soup.find('h2').next_sibling

In [74]:
# Locate the anchor tagged name="Table", then take the first <table> nested inside it
my_table = (
    soup
    .find('a', attrs={'name': 'Table'})
    .find('table')
)

In [75]:
# Display the extracted <table> element to verify the right one was found
my_table

<table border="8" cellpadding="8" cellspacing="5" width="50%">
<tr bgcolor="FFFFE0">
<th><h2><font color="RED">El Niño<br/>Years</font></h2></th><th><h2><font color="BLUE">La Niña<br/>Years</font></h2></th></tr>
<tr align="center"><td><b>1900-1901</b></td><td></td></tr>
<tr align="center"><td><b>1902-1903</b></td><td><b>1903-1904</b></td></tr>
<tr align="center"><td><b>1905-1906</b></td><td><b>1906-1907</b></td></tr>
<tr align="center"><td></td><td><b>1908-1909</b></td></tr>
<tr align="center"><td><b>1911-1912</b></td><td></td></tr>
<tr align="center"><td><b>1914-1915</b></td><td><b>1916-1917</b></td></tr>
<tr align="center"><td><b>1918-1919</b></td><td><b>1920-1921</b></td></tr>
<tr align="center"><td><b>1923-1924</b></td><td><b>1924-1925</b></td></tr>
<tr align="center"><td><b>1925-1926</b></td><td><b>1928-1929</b></td></tr>
<tr align="center"><td><b>1930-1931</b></td><td><b>1931-1932</b></td></tr>
<tr align="center"><td><b>1932-1933</b></td><td><b>1938-1939</b></td></tr>
<tr align="

In [80]:
# Collect every <tr> element inside the table (the header row is included at this point)
rows = my_table.find_all('tr')

In [99]:
# Just get the rows we care about: drop the first <tr> (the header row of <th>
# cells) and the final <tr>.
# The slice is taken from my_table directly rather than from `rows` itself, so
# re-running this cell always gives the same result instead of trimming two
# more rows on every execution.
rows = my_table.find_all('tr')[1:-1]

In [100]:
# Display the retained rows to confirm the slice kept the expected data
rows

[<tr align="center"><td><b>1900-1901</b></td><td></td></tr>,
 <tr align="center"><td><b>1902-1903</b></td><td><b>1903-1904</b></td></tr>,
 <tr align="center"><td><b>1905-1906</b></td><td><b>1906-1907</b></td></tr>,
 <tr align="center"><td></td><td><b>1908-1909</b></td></tr>,
 <tr align="center"><td><b>1911-1912</b></td><td></td></tr>,
 <tr align="center"><td><b>1914-1915</b></td><td><b>1916-1917</b></td></tr>,
 <tr align="center"><td><b>1918-1919</b></td><td><b>1920-1921</b></td></tr>,
 <tr align="center"><td><b>1923-1924</b></td><td><b>1924-1925</b></td></tr>,
 <tr align="center"><td><b>1925-1926</b></td><td><b>1928-1929</b></td></tr>,
 <tr align="center"><td><b>1930-1931</b></td><td><b>1931-1932</b></td></tr>,
 <tr align="center"><td><b>1932-1933</b></td><td><b>1938-1939</b></td></tr>,
 <tr align="center"><td><b>1939-1940</b></td><td></td></tr>,
 <tr align="center"><td><b>1940-1941</b></td><td></td></tr>,
 <tr align="center"><td><b>1941-1942</b></td><td><b>1942-1943</b></td></tr>,
 <

In [105]:
# For each table row, gather its <td> cells into a list (one inner list per row)
split_rows = [
    table_row.find_all('td')
    for table_row in rows
]

In [106]:
# Inspect the per-row lists of <td> cells
split_rows

[[<td><b>1900-1901</b></td>, <td></td>],
 [<td><b>1902-1903</b></td>, <td><b>1903-1904</b></td>],
 [<td><b>1905-1906</b></td>, <td><b>1906-1907</b></td>],
 [<td></td>, <td><b>1908-1909</b></td>],
 [<td><b>1911-1912</b></td>, <td></td>],
 [<td><b>1914-1915</b></td>, <td><b>1916-1917</b></td>],
 [<td><b>1918-1919</b></td>, <td><b>1920-1921</b></td>],
 [<td><b>1923-1924</b></td>, <td><b>1924-1925</b></td>],
 [<td><b>1925-1926</b></td>, <td><b>1928-1929</b></td>],
 [<td><b>1930-1931</b></td>, <td><b>1931-1932</b></td>],
 [<td><b>1932-1933</b></td>, <td><b>1938-1939</b></td>],
 [<td><b>1939-1940</b></td>, <td></td>],
 [<td><b>1940-1941</b></td>, <td></td>],
 [<td><b>1941-1942</b></td>, <td><b>1942-1943</b></td>],
 [<td><b>1946-1947</b></td>, <td><b>1949-1950</b></td>],
 [<td><b>1951-1952</b></td>, <td></td>],
 [<td><b>1953-1954</b></td>, <td><b>1954-1955</b></td>],
 [<td><b>1957-1958</b></td>, <td></td>],
 [<td><b>1963-1964</b></td>, <td><b>1964-1965</b></td>],
 [<td><b>1965-1966</b></td>, 

In [116]:
# Spot-check: .text on a single <td> yields just the cell's string content
split_rows[0][0].text

'1900-1901'

In [117]:
# Convert the grid of <td> tags into a grid of plain strings:
# one inner list per table row, one string per cell.
clean_table = [
    [cell.text for cell in cells]
    for cells in split_rows
]

In [118]:
# Inspect the extracted text values; blank table cells show up as empty strings
clean_table

[['1900-1901', ''],
 ['1902-1903', '1903-1904'],
 ['1905-1906', '1906-1907'],
 ['', '1908-1909'],
 ['1911-1912', ''],
 ['1914-1915', '1916-1917'],
 ['1918-1919', '1920-1921'],
 ['1923-1924', '1924-1925'],
 ['1925-1926', '1928-1929'],
 ['1930-1931', '1931-1932'],
 ['1932-1933', '1938-1939'],
 ['1939-1940', ''],
 ['1940-1941', ''],
 ['1941-1942', '1942-1943'],
 ['1946-1947', '1949-1950'],
 ['1951-1952', ''],
 ['1953-1954', '1954-1955'],
 ['1957-1958', ''],
 ['1963-1964', '1964-1965'],
 ['1965-1966', ''],
 ['1969-1970', '1970-1971'],
 ['1972-1973', '1973-1974'],
 ['', '1975-1976'],
 ['1976-1977', ''],
 ['1977-1978', ''],
 ['1982-1983', ''],
 ['1986-1987', '1988-1989'],
 ['1991-1992', ''],
 ['1992-1993', ''],
 ['1994-1995', '1995-1996'],
 ['1997-1998', '1998-1999'],
 ['', '2000-2001'],
 ['2002-2003', ''],
 ['2004-2005', ''],
 ['', 'early 2006'],
 ['2006-2007', ''],
 ['', '2007-2008'],
 ['2009', ''],
 ['', 'late 2010 - early 2011'],
 ['2015 - mid 2016', 'late 2016'],
 ['late 2018 - mid 2019

In [128]:
# Load the cleaned rows into a DataFrame; the first value of each row goes to El_Nino and the second to La_Nina
Nino_table = pd.DataFrame(clean_table, columns = ['El_Nino', 'La_Nina'])

In [129]:
# Display the DataFrame to check the two columns
Nino_table

Unnamed: 0,El_Nino,La_Nina
0,1900-1901,
1,1902-1903,1903-1904
2,1905-1906,1906-1907
3,,1908-1909
4,1911-1912,
5,1914-1915,1916-1917
6,1918-1919,1920-1921
7,1923-1924,1924-1925
8,1925-1926,1928-1929
9,1930-1931,1931-1932


In [139]:
## Goal: split the dates in each of the two columns.
# NOTE: this expression only references the .str.split method without calling
# it, so nothing is split here; evaluating it just shows the bound-method repr.
Nino_table.El_Nino.str.split

<bound method StringMethods.split of <pandas.core.strings.accessor.StringMethods object at 0x1102d1d60>>

In [144]:
# Split each El_Nino period into its start and end parts.
# - n=1 caps the split at the first hyphen, so a value with more than one
#   hyphen cannot produce extra columns and break the two-column assignment.
# - The pattern also consumes whitespace around the hyphen, and the value is
#   stripped first, so entries written like '2015 - mid 2016' give
#   '2015' / 'mid 2016' rather than '2015 ' / ' mid 2016'.
# Values without a hyphen keep their text in the left part; the right part is missing.
el_nino_parts = (
    Nino_table["El_Nino"]
    .str.strip()
    .str.split(r"\s*-\s*", n=1, expand=True)
)
Nino_table[['El_Nino_Left', 'El_Nino_Right']] = el_nino_parts

In [159]:
# Preview the first rows of the table
Nino_table.head()

Unnamed: 0,El_Nino,La_Nina,El_Nino_Left,El_Nino_Right,La_Nina_Left,La_Nina_Right
0,1900-1901,,1900.0,1901.0,,
1,1902-1903,1903-1904,1902.0,1903.0,1903.0,1904.0
2,1905-1906,1906-1907,1905.0,1906.0,1906.0,1907.0
3,,1908-1909,,,1908.0,1909.0
4,1911-1912,,1911.0,1912.0,,


In [150]:
# Remove the 'Nino_dont_use' column if it is present.
# No cell visible in this notebook creates that column, so a plain drop raises
# KeyError on a fresh Restart & Run All; errors='ignore' turns the cell into a
# no-op in that case. The result is rebound rather than dropped in place so the
# cell can be re-run safely.
Nino_table = Nino_table.drop(columns=['Nino_dont_use'], errors='ignore')

In [153]:
# Remove the 'La_nina_dont_use' column if it is present.
# No cell visible in this notebook creates that column, so a plain drop raises
# KeyError on a fresh Restart & Run All; errors='ignore' turns the cell into a
# no-op in that case. The result is rebound rather than dropped in place so the
# cell can be re-run safely.
Nino_table = Nino_table.drop(columns=['La_nina_dont_use'], errors='ignore')

In [155]:
# Split each La_Nina period into its start and end parts.
# - n=1 caps the split at the first hyphen, so a value with more than one
#   hyphen cannot produce extra columns and break the two-column assignment.
# - The pattern also consumes whitespace around the hyphen, and the value is
#   stripped first, so entries written like 'late 2010 - early 2011' give
#   'late 2010' / 'early 2011' without stray spaces.
# Values without a hyphen keep their text in the left part; the right part is missing.
la_nina_parts = (
    Nino_table["La_Nina"]
    .str.strip()
    .str.split(r"\s*-\s*", n=1, expand=True)
)
Nino_table[['La_Nina_Left', 'La_Nina_Right']] = la_nina_parts

In [160]:
# Display the table to check the result of the splits
Nino_table

Unnamed: 0,El_Nino,La_Nina,El_Nino_Left,El_Nino_Right,La_Nina_Left,La_Nina_Right
0,1900-1901,,1900,1901,,
1,1902-1903,1903-1904,1902,1903,1903,1904
2,1905-1906,1906-1907,1905,1906,1906,1907
3,,1908-1909,,,1908,1909
4,1911-1912,,1911,1912,,
5,1914-1915,1916-1917,1914,1915,1916,1917
6,1918-1919,1920-1921,1918,1919,1920,1921
7,1923-1924,1924-1925,1923,1924,1924,1925
8,1925-1926,1928-1929,1925,1926,1928,1929
9,1930-1931,1931-1932,1930,1931,1931,1932


In [157]:
## Export this table to be merged with the larger climate dataset
# Writes Nino_Nina_dates.csv relative to the current working directory;
# index=False leaves the DataFrame's row index out of the file.

Nino_table.to_csv('Nino_Nina_dates.csv', index = False)