In [1]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import json

In [2]:
# pretty column name
def rename_column(name):
    """Normalize a raw column name into a lowercase snake_case identifier.

    Steps, in order: lowercase, drop one leading ``data.`` prefix, turn every
    non-word character into ``_``, collapse runs of ``_`` into one, and trim
    ``_`` from both ends.

    Args:
        name: Raw column label, e.g. ``'data.Revenues ($M)'``.

    Returns:
        The cleaned name, e.g. ``'revenues_m'``.
    """
    prefix = 'data.'
    name = name.lower()

    # Slice the prefix off instead of using str.replace(): replace() also
    # deletes 'data.' wherever it occurs later in the name, so
    # 'data.metadata.id' used to come out as 'metaid'.
    if name.startswith(prefix):
        name = name[len(prefix):]

    # Replace non-alphanumeric characters with underscores
    name = re.sub(r'[^\w]', '_', name)

    # Replace multiple underscores with a single underscore
    name = re.sub(r'__+', '_', name)

    # Remove leading and trailing underscores
    return name.strip('_')

# Cache tickers by slug so each company page is requested only once
company_name_cache = {}

def convert_company_name_to_ticker(slug, timeout=30):
    """Look up a company's ticker from its fortune.com company page.

    Results are memoized in ``company_name_cache`` keyed by slug, so a slug
    that has already been resolved is returned without another request.

    Args:
        slug: Site-relative path of the company page, e.g.
            ``'/company/walmart'``. It is appended directly to
            ``https://fortune.com``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``Ticker`` value found in the page's embedded ``__NEXT_DATA__``
        JSON.

    Raises:
        requests.HTTPError: The page answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
        ValueError: The page contains no ``__NEXT_DATA__`` script tag.
        KeyError: The embedded JSON lacks the expected ticker path.
    """
    if slug in company_name_cache:
        return company_name_cache[slug]

    url = f'https://fortune.com{slug}'
    # Without a timeout a single stalled connection blocks the whole scrape.
    response = requests.get(url, timeout=timeout)
    # Fail on error pages here rather than while parsing their HTML.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The page embeds its data as JSON inside this script tag.
    next_data_tag = soup.find('script', id='__NEXT_DATA__')
    if next_data_tag is None:
        raise ValueError(f'No __NEXT_DATA__ script tag found at {url}')

    company_data = json.loads(next_data_tag.text)
    ticker = company_data['props']['pageProps']['company']['companyInfo']['Ticker']

    # Only successful lookups are cached; failures raise before this point.
    company_name_cache[slug] = ticker
    return ticker
    

# This function gets the Fortune 500 ranking search results from fortune.com for the given year
def fortuneGetRankingYear(year, timeout=30):
    """Fetch one year's Fortune 500 ranking-search results as a DataFrame.

    Calls the ``getRankingSearchYear`` endpoint on fortune.com, flattens the
    ``items`` list of the JSON response, normalizes the column names with
    ``rename_column``, and adds two columns: ``ticker`` (resolved from each
    row's ``slug`` via ``convert_company_name_to_ticker``) and ``year``.

    Args:
        year: Ranking year to request; interpolated into the URL as-is.
        timeout: Seconds to wait for the ranking API response.

    Returns:
        pandas.DataFrame with one row per entry of ``items``.

    Raises:
        requests.HTTPError: The API answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
        KeyError: The response JSON has no ``items`` key.
    """
    url = f'https://fortune.com/api/getRankingSearchYear/fortune500/{year}/'
    # Bound the wait and reject error responses before trying to decode them.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # load dataframe
    df = pd.json_normalize(data['items'])
    df.columns = [rename_column(col) for col in df.columns]

    # One ticker lookup per row; slugs already resolved are served from the
    # lookup function's cache instead of being requested again.
    df['ticker'] = df['slug'].apply(convert_company_name_to_ticker)

    df['year'] = year

    return df

In [3]:
# Smoke test: resolve the ticker for a single company page.
# This performs a real HTTP request unless the slug is already cached.
convert_company_name_to_ticker("/company/walmart")

'WMT'

In [4]:
# This cell will take a long time to run: each fortuneGetRankingYear() call
# requests the yearly ranking and then resolves a ticker for every row,
# fetching a company page for each slug not yet in company_name_cache.
# NOTE(review): the saved output shows a row-count line per year that this
# code does not print — the output looks stale; re-run to refresh it.

# One DataFrame per year is collected here and concatenated in a later cell.
all_years_df = []

# range(2014, 2025) covers the years 2014 through 2024 inclusive.
for year in range(2014, 2025):
    yearly_df = fortuneGetRankingYear(year)
    all_years_df.append(yearly_df)
    print(f"Successfully retrieved data for year: {year}")

1000
Successfully retrieved data for year: 2014
998
Successfully retrieved data for year: 2015
998
Successfully retrieved data for year: 2016
998
Successfully retrieved data for year: 2017
998
Successfully retrieved data for year: 2018
999
Successfully retrieved data for year: 2019
999
Successfully retrieved data for year: 2020
1000
Successfully retrieved data for year: 2021
1000
Successfully retrieved data for year: 2022
1000
Successfully retrieved data for year: 2023
1000
Successfully retrieved data for year: 2024


In [7]:
# Stack the per-year frames into one table, renumbering the rows from 0.
merged_df = pd.concat(all_years_df, ignore_index=True)

# Drop every row whose ticker is missing (presumably private companies and
# the like); the surviving rows keep their index labels from merged_df.
filtered_df = merged_df.dropna(subset=['ticker'])



In [10]:
# Write the combined table to merged_df.csv, omitting the index column.
# NOTE(review): this saves merged_df (all rows), not filtered_df, so rows
# without a ticker end up in the file — confirm that this is intended.
merged_df.to_csv('merged_df.csv', index=False)
