# Objective

The goal of this notebook is to get the [GICS](https://www.msci.com/gics) (Global Industry Classification Standard) codes for the companies in the S&P 500.

To do that, we will use the [GICS dataset](https://www.kaggle.com/merlos/gics-global-industry-classification-standard) and the list of S&P 500 companies from Wikipedia, which already includes the "sub-industry" name.



In [None]:
# Libraries used by this cell
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Print the full path of every file found under the Kaggle input directory
import os
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load the GICS map into a dataframe
gics_csv = "/kaggle/input/gics-global-industry-classification-standard/gics-map-2018.csv"
gics = pd.read_csv(gics_csv)

# Sanity check: display the rows whose sub-industry is "Tobacco"
gics.loc[gics["SubIndustry"] == "Tobacco"]


# Get the list of S&P 500

To do that, we will scrape the Wikipedia page of the S&P 500.

In [None]:
# Import Packages
import bs4 as bs
import requests 
import pandas as pd
import re

wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

def get_sp500_tickers(url=wiki_url, timeout=30):
    """Scrape the rows of the S&P 500 constituents table.

    Args:
        url: Page to download. Defaults to the Wikipedia list of S&P 500
            companies.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list with one entry per table row after the header row. Each entry
        is a list holding the stripped text of every ``<td>`` cell in the row,
        followed by the ``href`` of every ``<a>`` link found in that row (in
        document order).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``<table id="constituents">``.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page
    resp.raise_for_status()

    soup = bs.BeautifulSoup(resp.text, "lxml")
    table = soup.find('table', {'id': 'constituents'})
    if table is None:
        raise ValueError(f"No table with id 'constituents' found at {url}")

    tickers = []
    # Skip the first <tr>, which is the header row
    for row in table.find_all('tr')[1:]:
        # Cell texts first, then the link targets of the same row
        ticker = [cell.text.strip() for cell in row.find_all('td')]
        ticker.extend(link.get('href') for link in row.find_all('a'))
        tickers.append(ticker)

    return tickers


# Scrape the constituents table: one list of fields per company row
data = get_sp500_tickers()

# Names for the leading fields of each scraped row (cell texts first, then link targets)
columns = ["Ticker", "CompanyName", "Reports", "Sector", "SubIndustry", "HQLocation", "DateFirstAdded", "CIK", "FoundedYear", "QuoteURL", "WikiPage", "EDGARURL", "HQLocationWikiPage"]

# Keep only as many leading fields as there are column names (13);
# tying the slice to the list keeps the two from drifting apart
data = [row[:len(columns)] for row in data]

# Create Pandas dataframe
df = pd.DataFrame(data=data, columns=columns)

# Remove "Reports" column because it only has the text "reports" 
df = df.drop('Reports', axis=1)

print(len(df))



# Merge S&P500 with the gics

In [None]:
# Add gics code columns
#
# Every company must receive exactly one GICS row, because the two frames are
# glued together side by side below. A company whose sub-industry has no
# match, or an ambiguous one (the name appears more than once in the GICS
# map), gets an all-NaN GICS row and is reported, so the remaining companies
# stay aligned with their own codes.

# GICS rows whose sub-industry name is present and appears exactly once
gics_unique = (
    gics
    .dropna(subset=["SubIndustry"])
    .drop_duplicates(subset="SubIndustry", keep=False)
)

# One GICS row per company, in company order; unmatched names yield NaN rows.
# drop=False keeps "SubIndustry" as a regular column in the result.
gics_df = gics_unique.set_index("SubIndustry", drop=False).reindex(df["SubIndustry"])
# Share df's index so the column-wise concat pairs rows one to one
gics_df.index = df.index

# Report every company that did not get exactly one GICS match
unmatched = gics_df["SubIndustry"].isna()
for i in df.index[unmatched]:
    print(f"**** Error in index {i}\n\n", df.loc[i])

sp500_df = pd.concat([df, gics_df], axis=1)

sp500_df.to_csv('sp500-with-gics.csv', index = True) 

# Preview the merged result
sp500_df.head()