In [10]:
import requests
import pandas as pd
from pandas.core.frame import DataFrame
from bs4 import BeautifulSoup

In [9]:
# Specify the path to your Excel file
#excel_file_path = "data\raw\Forbes_2000_2023_all_companies.xlsx"
# Load the Excel file into a DataFrame
df_comp = pd.read_excel("C:\\Users\\sjess\\Identificaci-n-de-Segmentos-de-Negocio\\data\\raw\\Forbes_2000_2023_all_companies.xlsx")
# Display the DataFrame
df_comp

Unnamed: 0,RANK,UNIQUE_ID,NAME,COUNTRY,SALES,PROFIT,ASSETS,MARKET VALUE,URL-NAME
0,1,1,JPMorgan Chase,United States,$179.93 B,$41.8 B,"$3,744.3 B",$399.59 B,jpmorgan-chase
1,2,2,Saudi Arabian Oil Company (Saudi Aramco),Saudi Arabia,$589.47 B,$156.36 B,$660.99 B,"$2,055.22 B",saudi-arabian-oil-company-saudi-aramco
2,3,3,ICBC,China,$216.77 B,$52.47 B,"$6,116.82 B",$203.01 B,icbc
3,4,4,China Construction Bank,China,$203.08 B,$48.25 B,"$4,977.48 B",$172.99 B,china-construction-bank
4,5,5,Agricultural Bank of China,China,$186.14 B,$37.92 B,"$5,356.86 B",$141.82 B,agricultural-bank-of-china
...,...,...,...,...,...,...,...,...,...
1995,1996,1996,Alfa Laval,Sweden,$5.4 B,$489.5 M,$7.8 B,$15.6 B,alfa-laval
1996,1996,1997,Gap,United States,$15.6 B,$-202 M,$11.4 B,$3.2 B,gap
1997,1996,1998,Yes Bank,India,$3.3 B,$91.6 M,$43.2 B,$5.6 B,yes-bank
1998,1999,1999,BEKB-BCBE,Switzerland,$556 M,$167.1 M,$43 B,$2.5 B,bekb-bcbe


In [11]:
def getData(company_name):
  """
  Extracts data from a Forbes company page.
  Args:
    company_name: The name of the company.
  Returns:
    A dictionary containing the extracted data.
  """

  # Set the URL of the Forbes company page.
  url = f"https://www.forbes.com/companies/{company_name}"

  # Make a GET request to the URL.
  r = requests.get(url)

  # Parse the HTML response using BeautifulSoup.
  soup = BeautifulSoup(r.text, "html.parser")

  # Extract company basic information
  # Get the company name.
  company_name = soup.find("h1", class_="listuser-header__name")
  company_name = company_name.text if company_name else ""

  # Get the stock ticker.
  stock_ticker = soup.find("div", class_="listuser-ticker")
  stock_ticker = stock_ticker.text if stock_ticker else ""

  # Get the company headquarters location.
  company_location = soup.find("div", class_="listuser-header__headline--premium-location")
  company_location = company_location.text if company_location else ""

  # Get the current stock price.
  current_stock_price = soup.find("div", class_="profile-info__item-value")
  current_stock_price = current_stock_price.text if current_stock_price else ""

  # Create a dictionary with the extracted data.
  company_basic = {
    "company_name": company_name,
    "stock_ticker": stock_ticker,
    "company_location": company_location,
    "current_stock_price": current_stock_price
  }

  # Get the company description.
  if soup.find("div", class_="listuser-content__bio--shortened") is not None:
    value = soup.find("div", class_="listuser-content__bio--expanded hidden").text
  else:
    value = soup.find("p", class_="listuser-content__bio--copy").text

  # Create a dictionary with the description
  description_dict = {"description": value}

  # Get the company stats.
  company_stats = {}
  for element in soup.findAll("dl", class_="listuser-block__item"):
    dt = element.find("dt", class_="profile-stats__title")
    dd = element.find("dd", class_="profile-stats__text")
    company_stats[dt.text] = dd.text

  # Get the company financial data for a specific data index (e.g., "0").
  data_index = "0"
  company_financial_data = {}

  # Find the financial data div with the specified data index.
  financial_data_div = soup.find("div", {"class": "listuser-financial-data", "data-index": data_index})

  if financial_data_div:
    for element in financial_data_div.findAll("div", class_="listuser-financial-item"):
        title = element.find("div", class_="listuser-financial-item__title").text
        value = element.find("div", class_="listuser-financial-item__value").text
        company_financial_data[title] = value

  # Get the key data.
  key_data = {}
  for div in soup.find_all('div', class_='profile-datapoint__data'):
    title = div.find('div', class_='profile-datapoint__data-title').text.strip()
    value = div.find('span', class_='profile-datapoint__data-value').text.strip()
    key_data[title] = value

  # Create a new dictionary by merging dict1 and dict2
  data = {**company_basic,**description_dict,**company_stats,**company_financial_data,**key_data}

  # Return the extracted data.
  return data

In [12]:
# Extract the "URL-NAME" column and convert it to a list
companies = df_comp['URL-NAME'].tolist()

# Display the list
companies

['jpmorgan-chase',
 'saudi-arabian-oil-company-saudi-aramco',
 'icbc',
 'china-construction-bank',
 'agricultural-bank-of-china',
 'bank-of-america',
 'alphabet',
 'exxonmobil',
 'microsoft',
 'apple',
 'shell',
 'bank-of-china',
 'toyota-motor',
 'samsung-electronics',
 'unitedhealth-group',
 'ping-an-insurance-group',
 'wells-fargo',
 'chevron',
 'petrochina',
 'hsbc-holdings',
 'totalenergies',
 'verizon-communications',
 'walmart',
 'citigroup',
 'china-mobile',
 'china-merchants-bank',
 'postal-savings-bank-of-china-psbc',
 'bp',
 'volkswagen-group',
 'morgan-stanley',
 'meta-platforms',
 'sinopec',
 'bnp-paribas',
 'goldman-sachs-group',
 'tencent-holdings',
 'amazon',
 'allianz',
 'rbc',
 'pfizer',
 'johnson--johnson',
 'deutsche-telekom',
 'mercedes-benz-group',
 'td-bank-group',
 'taiwan-semiconductor',
 'reliance-industries',
 'bmw-group',
 'lvmh-moët-hennessy-louis-vuitton',
 'axa-group',
 'santander',
 'nestlé',
 'comcast',
 'equinor',
 'bank-of-communications',
 'alibaba-g

In [13]:
# Initialize an empty list to store data for each company
company_data_list = []

# Initialize an empty list to store company names with errors
comp_errors = []

# Iterate through the list of companies and extract data
for company_name in companies:
    try:
        company_data = getData(company_name)  # Assuming you have your getData function defined
        company_data_list.append(company_data)
    except Exception as e:
        # Handle the exception (e.g., print an error message)
        print(f"Error fetching data for {company_name}: {str(e)}")
        # Add the problematic company name to the error list
        comp_errors.append(company_name)

# Create a Pandas DataFrame from the list of company data
df = pd.DataFrame(company_data_list)

# Display the list of companies with errors
print("Companies with errors:", comp_errors)

Error fetching data for jpmorgan-chase: 'NoneType' object has no attribute 'text'
Error fetching data for saudi-arabian-oil-company-saudi-aramco: 'NoneType' object has no attribute 'text'
Error fetching data for icbc: 'NoneType' object has no attribute 'text'
Error fetching data for china-construction-bank: 'NoneType' object has no attribute 'text'
Error fetching data for agricultural-bank-of-china: 'NoneType' object has no attribute 'text'
Error fetching data for bank-of-america: 'NoneType' object has no attribute 'text'
Error fetching data for alphabet: 'NoneType' object has no attribute 'text'
Error fetching data for exxonmobil: 'NoneType' object has no attribute 'text'
Error fetching data for microsoft: 'NoneType' object has no attribute 'text'
Error fetching data for apple: 'NoneType' object has no attribute 'text'
Error fetching data for shell: 'NoneType' object has no attribute 'text'
Error fetching data for bank-of-china: 'NoneType' object has no attribute 'text'
Error fetching

KeyboardInterrupt: 

In [None]:
# Caso de que el scraping de de de funcionar creamso un archivo llamando "companies_base_1763.csv" con la información descargada
# Specify the path where you want to save the CSV file
csv_file_path = "C:\\Users\\sjess\\Identificaci-n-de-Segmentos-de-Negocio\\data\\processed\\companies_base_1763.csv"
# Save the DataFrame as a CSV file
df.to_csv(csv_file_path, index=False)  # Set index=False to exclude the index column
# Confirmation message
print(f"DataFrame has been saved to {csv_file_path}")

In [18]:
# Specify the path to your Excel file
csv_file_path = "C:\\Users\\sjess\\Identificaci-n-de-Segmentos-de-Negocio\\data\\processed\\companies_base_1763.csv"
# Load the Excel file into a DataFrame
df_financial_data = pd.read_csv(csv_file_path)
# Display the DataFrame
df_financial_data.head()

Unnamed: 0,company_name,stock_ticker,company_location,current_stock_price,description,Industry,Founded,Headquarters,Country/Territory,CEO and Chair,...,President & CEO,CEO & Co-Founder,Co-CEOs,CEO & Founder,Chairman and Co-CEO,Chairman and Chief Executive Officer,"Co-Founder, Co-CEO and Chairman",Charter President,President & Representative Director,CEO and Chairman
0,JPMorgan Chase,NYSE: JPM,"New York, New York",$180.90,JPMorgan Chase & Co. is a financial holding co...,Banking and Financial Services,2000,"New York, New York",United States,James Dimon,...,,,,,,,,,,
1,Saudi Arabian Oil Company (Saudi Aramco),,"Dhahran, Saudi Arabia",,Saudi Arabian Oil Co. engages in the explorati...,"Construction, Oil & Gas Operations, Mining and...",1933,Dhahran,Saudi Arabia,,...,,,,,,,,,,
2,ICBC,,"Beijing, China",,Industrial & Commercial Bank of China Ltd. eng...,Banking and Financial Services,1984,Beijing,China,,...,,,,,,,,,,
3,China Construction Bank,,"Beijing, China",,China Construction Bank Corp. engages in the p...,Banking,1954,Beijing,China,,...,,,,,,,,,,
4,Agricultural Bank of China,,"Beijing, China",,Agricultural Bank of China engages in the prov...,Banking and Financial Services,1951,Beijing,China,,...,,,,,,,,,,
