# Wikipedia Web Scrape: Largest US Companies by Revenue

In [1]:
# Import dependencies.

from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page and set up BeautifulSoup.

url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, which can differ between machines.
soup = BeautifulSoup(response.text, 'html.parser')

## First Method

In [3]:
# Define the tag and class explicitly.
# Note: a class_ string containing a space is matched against the exact value
# of the class attribute, i.e. class="wikitable sortable".

table = soup.find('table', class_ = 'wikitable sortable')

# find() returns None when nothing matches; fail with a clear message rather
# than letting read_html choke on the string "None".
if table is None:
    raise ValueError("No <table class='wikitable sortable'> found on the page.")

# Read HTML as Pandas DataFrame. Note that 'table' must be stringed and wrapped
# in StringIO (passing literal HTML to read_html is deprecated), and since the
# result is a list, indicate the list index.

df = pd.read_html(StringIO(str(table)))[0]
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,513983,,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,413680,,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,,130000,"Waltham, Massachusetts"


## Second Method 

In [4]:
# Count the <table> elements on the page.
# Calling the soup object directly is shorthand for soup.find_all(...).

len(soup('table'))

4

In [5]:
# Upon inspection of the webpage, the table needed was determined to be in position 1, or the second table.

alt_table = soup.find_all('table')[1]

# Wrap the stringified table in StringIO: passing literal HTML to read_html is
# deprecated. read_html returns a list of DataFrames, so take the first one.
alt_df = pd.read_html(StringIO(str(alt_table)))[0]
alt_df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,513983,,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,413680,,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,,130000,"Waltham, Massachusetts"


## Third Method (Manual)

In [6]:
# Step 1: pull out the header cells.
# The table of interest is the one at index 1 among all <table> tags on the page.

alt_table2 = soup('table')[1]

# Collect every <th> cell inside that table (calling a tag is shorthand for find_all).
column_headers = alt_table2('th')
column_headers

[<th>Rank
 </th>,
 <th>Name
 </th>,
 <th>Industry
 </th>,
 <th>Revenue <br/>(USD millions)
 </th>,
 <th>Revenue growth
 </th>,
 <th>Employees
 </th>,
 <th>Headquarters
 </th>]

In [7]:
# Keep only each header's text, with surrounding whitespace/newlines stripped.

column_headers_cleaned = [th.get_text().strip() for th in column_headers]
column_headers_cleaned

['Rank',
 'Name',
 'Industry',
 'Revenue (USD millions)',
 'Revenue growth',
 'Employees',
 'Headquarters']

In [8]:
# Start from a DataFrame that has the cleaned headers as columns and no rows yet.

alt_df2 = pd.DataFrame(data=None, columns=column_headers_cleaned)
alt_df2

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [9]:
# Collect every <tr> element of the table. Despite the variable name,
# each entry is one table row, not one column.

column_data = alt_table2('tr')

In [10]:
# Extract each individual row data and build the DataFrame in one step.
# Collecting the rows in a list first avoids growing the DataFrame one row at a
# time, and rebuilding alt_df2 from scratch makes this cell safe to re-run
# (re-running no longer appends duplicate rows).

table_rows = []
for row in column_data[1:]: # Beginning at '1' skips the first <tr>, which yields no <td> cells.
    individual_row_data = [data.text.strip() for data in row.find_all('td')]

    # Ignore any other row that has no <td> cells.
    if individual_row_data:
        table_rows.append(individual_row_data)

alt_df2 = pd.DataFrame(table_rows, columns = column_headers_cleaned)

In [11]:
# Display the DataFrame populated from the manually extracted table rows.

alt_df2

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,6.7%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,513983,9.4%,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,413680,44.8%,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,7.8%,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,12.7%,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,10.6%,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,0.5%,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,82.5%,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,14.5%,130000,"Waltham, Massachusetts"
