This notebook contains code to extract sustainability report links (and optionally download the PDF reports) from https://www.responsibilityreports.com/, looking each company up by its ticker symbol.

Ticker symbols are taken from the Wikipedia list of S&P 500 companies - https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

In [None]:
import io
import os
import urllib.request
from typing import List, Optional, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup

class SustainabilityDataCollection:
    """Collect sustainability-report links for S&P 500 companies.

    Ticker symbols are read from the Wikipedia list of S&P 500 companies and
    each ticker is then looked up on responsibilityreports.com.

    NOTE(review): all HTML selectors below (``companyName``, ``text_block``,
    ``most_recent_content_block``, ...) assume the markup of the scraped
    sites has not changed -- confirm against the live pages.
    """

    WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    BASE_URL = 'https://www.responsibilityreports.com'
    # Seconds to wait for a response; without a timeout ``requests`` can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        pass

    def get_tickers(self) -> List[str]:
        """Return the ticker symbols listed on the S&P 500 Wikipedia page.

        Returns:
            The values of the ``Symbol`` column of the first table on the page.

        Raises:
            requests.RequestException: if the page cannot be fetched or the
                server answers with an HTTP error status.
        """
        response = requests.get(self.WIKI_URL, timeout=self.REQUEST_TIMEOUT)
        # Fail with a clear HTTP error instead of a confusing
        # "no tables found" from the parser.
        response.raise_for_status()
        # Passing literal HTML to read_html is deprecated; wrap it in a
        # file-like object instead.
        sp_data = pd.read_html(io.StringIO(response.text))
        return sp_data[0]['Symbol'].to_list()

    def get_company_details(
        self, ticker: str
    ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
        """Look up a company on responsibilityreports.com by ticker.

        Args:
            ticker: ticker symbol used as the search term.

        Returns:
            ``(company_name, company_link, industry_name, sector_name)`` taken
            from the first search hit, or ``(None, None, None, None)`` when
            the request fails or the page has no usable result.
        """
        not_found = (None,) * 4
        try:
            response = requests.get(
                self.BASE_URL + '/Companies',
                params={'search': ticker},
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            # Best effort: an unreachable company page just means "no data".
            return not_found

        soup = BeautifulSoup(response.text, 'html.parser')
        company_span = soup.find("span", {"class": "companyName"})
        industry_span = soup.find("span", {"class": "industryName"})
        sector_span = soup.find("span", {"class": "sectorName"})
        if company_span is None or industry_span is None or sector_span is None:
            return not_found

        link_tag = company_span.find('a')
        if link_tag is None or not link_tag.has_attr('href'):
            return not_found

        return company_span.text, link_tag['href'], industry_span.text, sector_span.text

    def get_report_links(self, company_link: str) -> List[List[str]]:
        """Extract report PDF links from a company page.

        Args:
            company_link: link to the company page as returned by
                ``get_company_details``.

        Returns:
            A list of ``[report_url, title]`` pairs. Page blocks that do not
            have the expected layout are skipped.

        Raises:
            requests.RequestException: if the company page cannot be fetched.
        """
        company_data = requests.get(
            self.BASE_URL + '/' + company_link,
            timeout=self.REQUEST_TIMEOUT,
        ).text
        soup = BeautifulSoup(company_data, 'html.parser')

        report_list = []
        for div in soup.find_all("div", {"class": "text_block"}):
            spans = div.find_all('span')
            if len(spans) < 2:
                continue
            report_tag = spans[1].find('a')
            if (
                report_tag is None
                or not report_tag.has_attr('href')
                or not report_tag.has_attr('title')
            ):
                continue
            report_list.append([self.BASE_URL + report_tag['href'], report_tag['title']])

        # The most recent report is only usable when it is not locked.
        # ``find`` returns None (never "") when the locked marker is absent.
        recent_block = soup.find("div", {"class": "most_recent_content_block"})
        if (
            recent_block is not None
            and recent_block.find("div", {"class": "report_locked_block"}) is None
        ):
            view_btn = recent_block.find("div", {"class": "view_btn"})
            recent_tag = view_btn.find('a') if view_btn is not None else None
            title_tag = recent_block.find("span", {"class": "bold_txt"})
            if (
                recent_tag is not None
                and recent_tag.has_attr('href')
                and title_tag is not None
            ):
                report_list.append([self.BASE_URL + recent_tag['href'], title_tag.text])

        return report_list

    def get_sustainability_reports(self, download: bool = False, path: str = '.') -> pd.DataFrame:
        """Build a DataFrame of sustainability reports for all tickers.

        Args:
            download: when True, also download every listed PDF into ``path``.
            path: target directory for downloads; created if missing.

        Returns:
            A DataFrame with columns ``company_name``, ``industry_name``,
            ``sector_name``, ``report_link`` and ``Title``, restricted to
            reports whose title contains "sustainability" (case-insensitive).
            The frame is empty (but has these columns) when nothing matched.
        """
        columns = ['company_name', 'industry_name', 'sector_name', 'report_link', 'Title']

        # Collect plain rows first and build the frame once; DataFrame.append
        # was removed in pandas 2.0 and was quadratic anyway.
        rows = []
        for ticker in self.get_tickers():
            company_name, company_link, industry_name, sector_name = self.get_company_details(ticker)
            if company_name is None:
                continue
            for report_link, title in self.get_report_links(company_link):
                rows.append({
                    'company_name': company_name,
                    'industry_name': industry_name,
                    'sector_name': sector_name,
                    'report_link': report_link,
                    'Title': title,
                })

        sustain_df = pd.DataFrame(rows, columns=columns).dropna()
        if not sustain_df.empty:
            is_sustainability = sustain_df['Title'].str.contains(
                "sustainability", case=False, na=False
            )
            sustain_df = sustain_df[is_sustainability]
        sustain_df = sustain_df.reset_index(drop=True)

        if download:
            os.makedirs(path, exist_ok=True)
            for report in sustain_df['report_link'].tolist():
                target = os.path.join(path, os.path.basename(report))
                urllib.request.urlretrieve(report, target)
            print(f"Downloaded files to {path}")

        return sustain_df

In [None]:
sd = SustainabilityDataCollection()

## Uncomment the line below to also download every report PDF into the given
## directory (it is created if missing); the call still returns the dataframe.

#sd.get_sustainability_reports(download=True,path='/content/sustainability_reports')

## Build the dataframe of sustainability-report PDF links without downloading anything.
df = sd.get_sustainability_reports(download=False)
## Bare expression: the notebook renders the dataframe as the cell output.
df

Unnamed: 0,company_name,industry_name,sector_name,report_link,Title
0,3M Corporation,Conglomerates,Conglomerates,https://www.responsibilityreports.com/HostedDa...,View 2021 Sustainability Report (PDF)
1,3M Corporation,Conglomerates,Conglomerates,https://www.responsibilityreports.com/HostedDa...,View 2020 Sustainability Report (PDF)
2,3M Corporation,Conglomerates,Conglomerates,https://www.responsibilityreports.com/HostedDa...,View 2019 Sustainability Report (PDF)
3,3M Corporation,Conglomerates,Conglomerates,https://www.responsibilityreports.com/HostedDa...,View 2018 Sustainability Report (PDF)
4,3M Corporation,Conglomerates,Conglomerates,https://www.responsibilityreports.com/HostedDa...,View 2017 Sustainability Report (PDF)
...,...,...,...,...,...
839,Zimmer Biomet,Medical Appliances & Equipment,Healthcare,https://www.responsibilityreports.com/HostedDa...,View 2020 Sustainability Report (PDF)
840,Zimmer Biomet,Medical Appliances & Equipment,Healthcare,https://www.responsibilityreports.com/HostedDa...,View 2018 Sustainability Report (PDF)
841,Zimmer Biomet,Medical Appliances & Equipment,Healthcare,https://www.responsibilityreports.com/HostedDa...,View 2017 Sustainability Report (PDF)
842,Zimmer Biomet,Medical Appliances & Equipment,Healthcare,https://www.responsibilityreports.com/HostedDa...,View 2016 Sustainability Report (PDF)
