In [None]:
!pip install requests beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# -------------------------------------------------------------------
# FUNCTION TO SCRAPE TECH SECTOR TICKERS FROM STOCKANALYSIS.COM
# -------------------------------------------------------------------
def get_tickers(timeout=30):
    """Scrape ticker symbols from the stockanalysis.com technology-sector page.

    The first <table> on the page is read row by row (the first row is
    skipped as the header) and the stripped text of the second cell of each
    row is collected as the ticker symbol.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: Ticker symbols in page order. An empty list is returned
        (after printing the error) if the request or the parsing fails.
    """
    url = "https://stockanalysis.com/stocks/sector/technology/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        table = soup.find('table')
        if table is None:
            raise ValueError("Could not find table on webpage")

        tickers = []
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            # Rows with fewer than two cells carry no symbol; skipping them
            # avoids an IndexError that would discard every ticker collected
            # so far via the except branch below.
            if len(cols) < 2:
                continue
            ticker = cols[1].text.strip()
            if ticker:
                tickers.append(ticker)

        return tickers

    except Exception as e:
        print(f"Error scraping tickers: {str(e)}")
        return []

# -------------------------------------------------------------------
# EXECUTE SCRAPING FUNCTION AND SAVE OUTPUT
# -------------------------------------------------------------------
tickers = get_tickers()
df = pd.DataFrame({'Ticker': tickers})
if tickers:
    df.to_csv('tickers.csv', index=False)
else:
    # get_tickers() returns [] when scraping fails; writing that out would
    # replace any existing tickers.csv with a header-only file.
    print("No tickers scraped; 'tickers.csv' was not written.")

In [None]:
import requests
import pandas as pd
import time
from datetime import datetime
from tqdm import tqdm

# -------------------------------------------------------------------
# CLASS FOR FETCHING CASH FLOW DATA FROM SEC EDGAR API
# -------------------------------------------------------------------
class SECDataFetcher:
    """Fetch 10-K cash-flow figures from the SEC EDGAR XBRL companyfacts API."""

    # Seconds to wait for an SEC endpoint before abandoning the request.
    REQUEST_TIMEOUT = 30

    def __init__(self, email):
        """Store the request headers (User-Agent embeds `email`) and endpoint URLs."""
        self.headers = {
            'User-Agent': f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) {email}'
        }
        self.base_url = "https://data.sec.gov/api/xbrl/companyfacts/CIK{}.json"
        self.company_tickers_url = "https://www.sec.gov/files/company_tickers.json"

    def get_cik_mapping(self):
        """Return {TICKER: CIK} with upper-cased tickers and 10-digit zero-padded CIK strings.

        Raises:
            requests.HTTPError: if the ticker file cannot be downloaded.
        """
        response = requests.get(
            self.company_tickers_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail with the HTTP status instead of a JSON decode error on a non-JSON error page.
        response.raise_for_status()
        data = response.json()
        # Callers look tickers up in upper case, so normalise the keys the same way.
        return {entry['ticker'].upper(): str(entry['cik_str']).zfill(10) for entry in data.values()}

    def get_cash_flows(self, cik):
        """Collect operating/investing/financing cash flows for the last five calendar years.

        Args:
            cik: 10-digit zero-padded CIK string.

        Returns:
            dict mapping year -> {'Operating_CF', 'Investing_CF', 'Financing_CF'}
            (values are None where no 10-K figure was found), or None when the
            request fails, the company has no 'us-gaap' facts, or an error occurs.
        """
        try:
            url = self.base_url.format(cik)
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

            if response.status_code != 200:
                return None

            data = response.json()
            # Some filers have no 'us-gaap' section; treat that as "no data"
            # rather than letting a KeyError be reported as a fetch error.
            facts = data.get('facts', {}).get('us-gaap')
            if facts is None:
                return None

            current_year = datetime.now().year
            years = range(current_year - 5, current_year)
            cash_flows = {year: {'Operating_CF': None, 'Investing_CF': None, 'Financing_CF': None} for year in years}

            for tag, key in [
                ('NetCashProvidedByUsedInOperatingActivities', 'Operating_CF'),
                ('NetCashProvidedByUsedInInvestingActivities', 'Investing_CF'),
                ('NetCashProvidedByUsedInFinancingActivities', 'Financing_CF')
            ]:
                for entry in facts.get(tag, {}).get('units', {}).get('USD', []):
                    if entry.get('form') != '10-K':
                        continue
                    # The year is taken from the period end date; when several
                    # entries share a year, the last one listed wins.
                    year = int(entry['end'][:4])
                    if year in cash_flows:
                        cash_flows[year][key] = entry['val']

            return cash_flows

        except Exception as e:
            print(f"Error fetching data for CIK {cik}: {str(e)}")
            return None

# -------------------------------------------------------------------
# FUNCTION TO CLASSIFY LIFE CYCLE STAGES BASED ON CASH FLOW SIGNS
# -------------------------------------------------------------------
def classify_lifecycle(cash_flows):
    """Assign a life-cycle stage to each year from the signs of its cash flows.

    Args:
        cash_flows: mapping year -> dict with the keys 'Operating_CF',
            'Investing_CF' and 'Financing_CF'.

    Returns:
        dict mapping year -> stage name. A year where any value is None is
        labelled "Data Missing".
    """
    # (operating, investing, financing) sign pattern -> stage name.
    stage_for_signs = {
        ('-', '-', '+'): "Introduction",
        ('+', '-', '+'): "Growth",
        ('+', '-', '-'): "Maturity",
        ('+', '+', '-'): "Shake-out",
        ('+', '+', '+'): "Shake-out",
        ('-', '-', '-'): "Shake-out",
        ('-', '+', '-'): "Decline",
        ('-', '+', '+'): "Decline",
    }

    stages = {}
    for year, flows in cash_flows.items():
        if None in flows.values():
            stages[year] = "Data Missing"
        else:
            amounts = (flows['Operating_CF'], flows['Investing_CF'], flows['Financing_CF'])
            # Only strictly positive amounts count as '+'; zero is grouped with '-'.
            signs = tuple('+' if amount > 0 else '-' for amount in amounts)
            stages[year] = stage_for_signs[signs]

    return stages

# -------------------------------------------------------------------
# MAIN EXECUTION FUNCTION
# -------------------------------------------------------------------
def main():
    """Build the per-ticker, per-year life-cycle table and save it.

    Reads tickers from 'tickers.csv', maps them to CIKs, fetches the cash
    flows for each mapped ticker, classifies every year and writes the rows
    to 'company_lifecycles.csv'. Tickers without a CIK or without cash-flow
    data are skipped.

    Returns:
        pandas.DataFrame with the columns Ticker, Year, Life_Cycle_Stage,
        Operating_CF, Investing_CF and Financing_CF (possibly with no rows).
    """
    # NOTE(review): the contact e-mail is hard-coded; consider moving it to configuration.
    email = "sofyasavina3@gmail.com"
    fetcher = SECDataFetcher(email)

    # keep_default_na=False keeps symbols that look like NA markers
    # (e.g. "NA") as strings instead of turning them into NaN.
    tickers_df = pd.read_csv("tickers.csv", keep_default_na=False)
    tickers = [t for t in tickers_df['Ticker'].str.upper().unique().tolist() if t]

    print("Getting CIK mapping...")
    cik_map = fetcher.get_cik_mapping()

    results = []
    print("Fetching cash flow data for last 5 years...")

    for ticker in tqdm(tickers):
        if ticker in cik_map:
            cik = cik_map[ticker]
            cash_flows = fetcher.get_cash_flows(cik)

            if cash_flows:
                classifications = classify_lifecycle(cash_flows)
                for year in cash_flows.keys():
                    results.append({
                        'Ticker': ticker,
                        'Year': year,
                        'Life_Cycle_Stage': classifications[year],
                        **cash_flows[year]
                    })

            time.sleep(0.1)  # Respect SEC rate limits

    # Explicit columns keep the CSV header and the groupby below valid even
    # when no rows were collected (an empty frame would otherwise have no
    # 'Year' column and raise KeyError).
    columns = ['Ticker', 'Year', 'Life_Cycle_Stage', 'Operating_CF', 'Investing_CF', 'Financing_CF']
    results_df = pd.DataFrame(results, columns=columns)
    results_df.to_csv('company_lifecycles.csv', index=False)

    print("\nLife Cycle Stage Distribution:")
    print(results_df.groupby(['Year', 'Life_Cycle_Stage']).size())

    return results_df

# -------------------------------------------------------------------
# RUN MAIN FUNCTION
# -------------------------------------------------------------------
if __name__ == "__main__":
    results_df = main()

Getting CIK mapping...
Fetching cash flow data for last 5 years...


  1%|          | 5/775 [00:02<06:30,  1.97it/s]

Error fetching data for CIK 0001046179: 'us-gaap'


  1%|          | 7/775 [00:03<06:07,  2.09it/s]

Error fetching data for CIK 0001000184: 'us-gaap'


  5%|▌         | 41/775 [00:24<07:41,  1.59it/s]

Error fetching data for CIK 0001067491: 'us-gaap'


  8%|▊         | 63/775 [00:38<06:39,  1.78it/s]

Error fetching data for CIK 0001123799: 'us-gaap'


  9%|▊         | 66/775 [00:39<06:16,  1.88it/s]

Error fetching data for CIK 0000924613: 'us-gaap'


 10%|▉         | 76/775 [00:45<06:08,  1.90it/s]

Error fetching data for CIK 0001061574: 'us-gaap'


 12%|█▏        | 91/775 [00:54<05:43,  1.99it/s]

Error fetching data for CIK 0001709048: 'us-gaap'


 12%|█▏        | 93/775 [00:55<05:04,  2.24it/s]

Error fetching data for CIK 0001122411: 'us-gaap'


 13%|█▎        | 97/775 [00:58<07:41,  1.47it/s]

Error fetching data for CIK 0001033767: 'us-gaap'


 21%|██        | 164/775 [01:32<04:24,  2.31it/s]

Error fetching data for CIK 0001836470: 'us-gaap'


 25%|██▌       | 196/775 [01:48<04:35,  2.10it/s]

Error fetching data for CIK 0001557860: 'us-gaap'


 33%|███▎      | 257/775 [02:18<03:13,  2.68it/s]

Error fetching data for CIK 0001867729: 'us-gaap'


 34%|███▍      | 264/775 [02:22<03:47,  2.25it/s]

Error fetching data for CIK 0001712807: 'us-gaap'


 34%|███▍      | 267/775 [02:23<03:45,  2.25it/s]

Error fetching data for CIK 0001846832: 'us-gaap'


 38%|███▊      | 298/775 [02:40<03:29,  2.27it/s]

Error fetching data for CIK 0001799983: 'us-gaap'


 41%|████      | 318/775 [02:51<03:28,  2.19it/s]

Error fetching data for CIK 0001901279: 'us-gaap'


 42%|████▏     | 322/775 [02:53<03:21,  2.25it/s]

Error fetching data for CIK 0001899123: 'us-gaap'


 42%|████▏     | 325/775 [02:54<02:59,  2.51it/s]

Error fetching data for CIK 0001823306: 'us-gaap'


 43%|████▎     | 332/775 [02:57<03:10,  2.32it/s]

Error fetching data for CIK 0001828102: 'us-gaap'


 45%|████▌     | 352/775 [03:07<03:25,  2.06it/s]

Error fetching data for CIK 0001656081: 'us-gaap'


 47%|████▋     | 364/775 [03:13<03:00,  2.28it/s]

Error fetching data for CIK 0001793663: 'us-gaap'


 51%|█████     | 394/775 [03:27<02:45,  2.30it/s]

Error fetching data for CIK 0001868995: 'us-gaap'


 51%|█████     | 397/775 [03:29<03:16,  1.92it/s]

Error fetching data for CIK 0001825155: 'us-gaap'


 62%|██████▏   | 477/775 [04:05<02:22,  2.09it/s]

Error fetching data for CIK 0001091223: 'us-gaap'


 62%|██████▏   | 478/775 [04:05<02:16,  2.17it/s]

Error fetching data for CIK 0001866030: 'us-gaap'


 65%|██████▌   | 505/775 [04:18<02:05,  2.15it/s]

Error fetching data for CIK 0001859690: 'us-gaap'


 66%|██████▌   | 511/775 [04:20<01:46,  2.49it/s]

Error fetching data for CIK 0001915403: 'us-gaap'


 66%|██████▌   | 513/775 [04:21<01:36,  2.71it/s]

Error fetching data for CIK 0001899830: 'us-gaap'


 67%|██████▋   | 518/775 [04:24<02:00,  2.13it/s]

Error fetching data for CIK 0001753368: 'us-gaap'


 71%|███████   | 551/775 [04:39<01:26,  2.59it/s]

Error fetching data for CIK 0001866501: 'us-gaap'


 75%|███████▍  | 580/775 [04:50<01:12,  2.67it/s]

Error fetching data for CIK 0001836934: 'us-gaap'


 79%|███████▉  | 611/775 [05:04<01:07,  2.42it/s]

Error fetching data for CIK 0001383395: 'us-gaap'


 84%|████████▍ | 651/775 [05:19<00:41,  2.95it/s]

Error fetching data for CIK 0001875609: 'us-gaap'


 85%|████████▌ | 659/775 [05:22<00:41,  2.82it/s]

Error fetching data for CIK 0001963439: 'us-gaap'


 86%|████████▌ | 666/775 [05:26<00:47,  2.31it/s]

Error fetching data for CIK 0001930179: 'us-gaap'


 87%|████████▋ | 674/775 [05:29<00:48,  2.10it/s]

Error fetching data for CIK 0001905660: 'us-gaap'


 88%|████████▊ | 685/775 [05:34<00:33,  2.65it/s]

Error fetching data for CIK 0001965143: 'us-gaap'


 90%|████████▉ | 697/775 [05:39<00:27,  2.81it/s]

Error fetching data for CIK 0001981462: 'us-gaap'


 92%|█████████▏| 712/775 [05:44<00:22,  2.79it/s]

Error fetching data for CIK 0001681348: 'us-gaap'


 93%|█████████▎| 717/775 [05:46<00:20,  2.86it/s]

Error fetching data for CIK 0001964630: 'us-gaap'


 93%|█████████▎| 723/775 [05:49<00:19,  2.65it/s]

Error fetching data for CIK 0001976443: 'us-gaap'


100%|██████████| 775/775 [06:11<00:00,  2.09it/s]

Error fetching data for CIK 0001888151: 'us-gaap'

Life Cycle Stage Distribution:
Year  Life_Cycle_Stage
2020  Data Missing        200
      Decline              25
      Growth              125
      Introduction        140
      Maturity            173
      Shake-out            49
2021  Data Missing        154
      Decline              25
      Growth              148
      Introduction        163
      Maturity            183
      Shake-out            39
2022  Data Missing        146
      Decline              47
      Growth               98
      Introduction        133
      Maturity            196
      Shake-out            92
2023  Data Missing        141
      Decline              58
      Growth               95
      Introduction         91
      Maturity            247
      Shake-out            80
2024  Data Missing        180
      Decline              52
      Growth               88
      Introduction         79
      Maturity            239
      Shake-out          




In [None]:
import requests
import pandas as pd
import time
from datetime import datetime
from tqdm import tqdm

# -------------------------------------------------------------------
# CLASS FOR FETCHING MULTIPLE FINANCIAL TAGS FROM SEC EDGAR API
# -------------------------------------------------------------------
class SECDataFetcher:
    """Fetch annual (10-K, fiscal-year) USD facts from the SEC EDGAR companyfacts API."""

    # Seconds to wait for an SEC endpoint before abandoning the request.
    REQUEST_TIMEOUT = 30

    def __init__(self, email):
        """Store the request headers (User-Agent embeds `email`) and endpoint URLs."""
        self.headers = {
            'User-Agent': f'Mozilla/5.0 (compatible; ThesisResearchBot/1.0; {email})'
        }
        self.base_url = "https://data.sec.gov/api/xbrl/companyfacts/CIK{}.json"
        self.tickers_url = "https://www.sec.gov/files/company_tickers.json"
        # (cik, us-gaap facts) of the most recent successful download. Only one
        # company is kept so memory stays bounded while consecutive calls for
        # the same CIK reuse a single download.
        self._last_facts = (None, None)

    def get_cik_mapping(self):
        """Return {TICKER: CIK} with upper-cased tickers and 10-digit zero-padded CIK strings."""
        response = requests.get(self.tickers_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        data = response.json()
        return {entry['ticker'].upper(): str(entry['cik_str']).zfill(10) for entry in data.values()}

    def _get_us_gaap_facts(self, cik):
        """Return the 'us-gaap' facts dict for `cik` ({} if absent), or None on a non-200 response."""
        cached_cik, cached_facts = self._last_facts
        if cached_facts is not None and cached_cik == cik:
            return cached_facts

        url = self.base_url.format(cik)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None

        facts = response.json().get('facts', {}).get('us-gaap', {})
        self._last_facts = (cik, facts)
        return facts

    def get_multi_tag_data(self, cik, tag_list, min_year=2019, max_year=2024):
        """Return one value per fiscal year taken from the first usable tags in `tag_list`.

        Only USD entries with form '10-K' and fp 'FY' are considered. All tags
        share one result table: for each fiscal year (`fy`) the entry with the
        latest period end is kept, and on equal end dates the entry seen
        first (earlier tag, earlier position) is kept.

        Args:
            cik: 10-digit zero-padded CIK string.
            tag_list: us-gaap tag names to search, in priority order.
            min_year: first fiscal year to keep (inclusive).
            max_year: last fiscal year to keep (inclusive).

        Returns:
            dict mapping fiscal year -> value; {} when the request fails.
        """
        facts = self._get_us_gaap_facts(cik)
        if facts is None:
            return {}

        results = {}
        for tag in tag_list:
            entries = facts.get(tag, {}).get('units', {}).get('USD', [])
            for entry in entries:
                if entry.get('form') != '10-K' or entry.get('fp') != 'FY':
                    continue
                year = entry.get('fy')
                val = entry.get('val')
                end = entry.get('end')
                # Compare against None explicitly: a reported value of 0 is
                # real data and must not be dropped as "missing".
                if year is None or val is None or end is None:
                    continue
                if not (min_year <= year <= max_year):
                    continue
                end_dt = datetime.strptime(end, "%Y-%m-%d")
                if year not in results or end_dt > results[year]['end']:
                    results[year] = {'val': val, 'end': end_dt}

        return {year: item['val'] for year, item in results.items()}

# -------------------------------------------------------------------
# LOAD TICKERS AND INITIALIZE FETCHER
# -------------------------------------------------------------------
# keep_default_na=False keeps symbols that look like NA markers (e.g. "NA")
# as strings instead of letting read_csv turn them into NaN.
tickers_df = pd.read_csv("tickers.csv", keep_default_na=False)
# Blank cells stay as empty strings with this setting, so drop them here.
tickers = [t for t in tickers_df['Ticker'].str.upper().unique().tolist() if t]

email = "sofyasavina3@gmail.com"
fetcher = SECDataFetcher(email)
cik_map = fetcher.get_cik_mapping()

# -------------------------------------------------------------------
# DEFINE TAGS FOR FINANCIAL VARIABLES
# -------------------------------------------------------------------
# Maps each financial variable to the us-gaap tag names that are passed
# together, as one list, to fetcher.get_multi_tag_data.
tags = dict(
    # Income statement
    revenue=[
        'RevenueFromContractWithCustomerExcludingAssessedTax',
        'Revenues',
        'SalesRevenueNet',
    ],
    net_income=['NetIncomeLoss'],
    # Balance sheet
    total_assets=['Assets'],
    current_assets=['AssetsCurrent'],
    current_liabilities=['LiabilitiesCurrent'],
    short_term_debt=[
        'DebtCurrent',
        'ShortTermBorrowings',
        'CurrentPortionOfLongTermDebtAndCapitalLeaseObligations',
    ],
    long_term_debt=[
        'LongTermDebtNoncurrent',
        'LongTermBorrowings',
        'LongTermDebt',
    ],
    other_liabilities=[
        'OtherLiabilities',
        'OperatingLeaseLiabilityNoncurrent',
        'DeferredRevenueNoncurrent',
        'DeferredTaxLiabilitiesNoncurrent',
    ],
    cash=['CashAndCashEquivalentsAtCarryingValue'],
    # Expenses and cash flows
    r_and_d=['ResearchAndDevelopmentExpense'],
    ocf=['NetCashProvidedByUsedInOperatingActivities'],
    capex=[
        'CapitalExpenditures',
        'PaymentsToAcquirePropertyPlantAndEquipment',
        'PaymentsForCapitalExpenditures',
        'CapitalExpendituresIncurredButNotYetPaid',
    ],
)

# -------------------------------------------------------------------
# FETCH AND CALCULATE FINANCIAL RATIOS
# -------------------------------------------------------------------
results = []


def _ratio(numerator, denominator):
    """Return numerator / denominator, or None if either is missing or the denominator is 0.

    A numerator of 0 is valid data and yields 0.0 rather than None.
    """
    if numerator is None or denominator is None or denominator == 0:
        return None
    return numerator / denominator


for ticker in tqdm(tickers):
    cik = cik_map.get(ticker)
    if not cik:
        continue

    # One get_multi_tag_data call per variable defined in `tags`.
    data = {key: fetcher.get_multi_tag_data(cik, tag) for key, tag in tags.items()}

    for year in range(2020, 2025):
        try:
            rev = data['revenue'].get(year)
            prev_rev = data['revenue'].get(year - 1)
            ni = data['net_income'].get(year)
            ta = data['total_assets'].get(year)
            ta_prev = data['total_assets'].get(year - 1)
            avg_assets = (ta + ta_prev) / 2 if ta and ta_prev else None

            ca = data['current_assets'].get(year)
            cl = data['current_liabilities'].get(year)
            # Missing debt components default to 0 so the sum below still works.
            std = data['short_term_debt'].get(year, 0)
            ltd = data['long_term_debt'].get(year, 0)
            other_liab = data['other_liabilities'].get(year, 0)
            cash = data['cash'].get(year)  # fetched but not used in any ratio below
            rnd = data['r_and_d'].get(year)
            ocf = data['ocf'].get(year)
            capex = data['capex'].get(year)

            total_debt = sum(x for x in [std, ltd, other_liab] if x is not None)

            row = {
                'ticker': ticker,
                'year': year,
                'revenue': rev,
                'net_income': ni,
                'total_assets': ta,
                'current_ratio': _ratio(ca, cl),
                # A zero sum cannot be told apart from "no component reported"
                # (see the 0 defaults above), so it stays None.
                'debt_assets': _ratio(total_debt, ta) if total_debt else None,
                'asset_turnover': _ratio(rev, avg_assets),
                'roa': _ratio(ni, avg_assets),
                'revenue_growth': (
                    _ratio(rev - prev_rev, prev_rev)
                    if rev is not None and prev_rev is not None else None
                ),
                'r_and_d_ratio': _ratio(rnd, rev),
                'capex_ratio': _ratio(capex, ta),
                'net_income_margin': _ratio(ni, rev),
                'ocf_margin': _ratio(ocf, rev)
            }
            results.append(row)
        except Exception as e:
            print(f"Error processing {ticker} {year}: {e}")

    time.sleep(0.1)

# -------------------------------------------------------------------
# SAVE OUTPUT TO CSV
# -------------------------------------------------------------------
df = pd.DataFrame(results)
df.to_csv("raw_financial

100%|██████████| 775/775 [39:52<00:00,  3.09s/it]

✅ Financial ratio dataset saved to 'raw_financial_ratios.csv'





In [None]:
import requests
import pandas as pd
import time
import yfinance as yf
from tqdm import tqdm
from datetime import datetime

# -------------------------------------------------------------------
# CLASS FOR FETCHING SHARES OUTSTANDING FROM SEC EDGAR API
# -------------------------------------------------------------------
class SECDataFetcher:
    """Fetch shares-outstanding figures from the SEC EDGAR companyfacts API."""

    # Seconds to wait for an SEC endpoint before abandoning the request.
    REQUEST_TIMEOUT = 30

    def __init__(self, email):
        """Store the request headers (User-Agent embeds `email`) and endpoint URLs."""
        self.headers = {
            'User-Agent': f'Mozilla/5.0 (compatible; {email})'
        }
        self.base_url = "https://data.sec.gov/api/xbrl/companyfacts/CIK{}.json"
        self.company_tickers_url = "https://www.sec.gov/files/company_tickers.json"

    def get_cik_mapping(self):
        """Return {TICKER: CIK} with upper-cased tickers and 10-digit zero-padded CIK strings."""
        response = requests.get(
            self.company_tickers_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        data = response.json()
        # Callers look tickers up in upper case, so normalise the keys the same way.
        return {entry['ticker'].upper(): str(entry['cik_str']).zfill(10) for entry in data.values()}

    def get_shares_outstanding(self, cik):
        """Average the 10-K share counts reported for each fiscal year 2020-2024.

        Both share-count tags are looked up in the 'us-gaap' and the 'dei'
        sections of the company facts; every 10-K value whose `fy` falls in
        2020-2024 and whose unit name contains "share" is pooled per `fy`.

        Args:
            cik: 10-digit zero-padded CIK string.

        Returns:
            dict mapping fiscal year -> mean of the pooled values, or None
            when the request fails, nothing is found, or an error occurs.
        """
        try:
            url = self.base_url.format(cik)
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return None

            data = response.json()
            all_facts = data.get('facts', {})
            shares_data = {}

            for tag in ['CommonStockSharesOutstanding', 'EntityCommonStockSharesOutstanding']:
                # EntityCommonStockSharesOutstanding is a 'dei' (cover page)
                # element, so searching 'us-gaap' alone can never find it.
                for taxonomy in ('us-gaap', 'dei'):
                    units = all_facts.get(taxonomy, {}).get(tag, {}).get('units', {})
                    for unit_type, entries in units.items():
                        if 'share' not in unit_type.lower():
                            continue
                        for entry in entries:
                            if entry.get('form') != '10-K':
                                continue
                            # `fy` may be present but null; treat it as out of range
                            # instead of letting int(None) abort the whole company.
                            fy = int(entry.get('fy') or 0)
                            if 2020 <= fy <= 2024:
                                shares_data.setdefault(fy, []).append(entry['val'])

            return {fy: sum(vals)/len(vals) for fy, vals in shares_data.items()} if shares_data else None

        except Exception as e:
            print(f"Error fetching shares for CIK {cik}: {e}")
            return None

# -------------------------------------------------------------------
# FUNCTION TO GET AVERAGE STOCK PRICE USING YFINANCE
# -------------------------------------------------------------------
def get_average_price(ticker, fiscal_year):
    """Return the mean closing price over the 30 days up to Dec 31 of `fiscal_year`.

    Args:
        ticker: symbol passed to yfinance.
        fiscal_year: year (int or int-like string); the window always ends on
            Dec 31 of that calendar year.

    Returns:
        Mean of the 'Close' column for the window, or None when no price
        history is returned or an error occurs.
    """
    try:
        fiscal_year = int(fiscal_year)
        end_date = datetime(fiscal_year, 12, 31)
        start_date = end_date - pd.DateOffset(days=30)
        # yfinance treats `end` as exclusive, so request one day past Dec 31
        # to keep the Dec 31 close inside the window.
        query_end = end_date + pd.DateOffset(days=1)

        history = yf.Ticker(ticker).history(
            start=start_date.strftime('%Y-%m-%d'),
            end=query_end.strftime('%Y-%m-%d'),
        )

        if history.empty:
            return None

        return history['Close'].mean()

    except Exception as e:
        print(f"Error fetching average price for {ticker} in {fiscal_year}: {e}")
        return None

# -------------------------------------------------------------------
# FUNCTION TO CALCULATE MARKET CAP
# -------------------------------------------------------------------
def main():
    """Estimate market capitalisation per ticker and fiscal year and save it.

    For every ticker in 'tickers.csv' that has a CIK, the averaged share
    count of each fiscal year is multiplied by the average price returned by
    get_average_price. Rows are sorted by ticker and fiscal year and written
    to 'market_cap.csv'.

    Returns:
        pandas.DataFrame with the columns Ticker, Fiscal Year,
        Avg_Shares_Outstanding, Avg_Stock_Price and Market_Capitalization
        (possibly with no rows).
    """
    # NOTE(review): the contact e-mail is hard-coded; consider moving it to configuration.
    email = "sofyasavina3@gmail.com"
    fetcher = SECDataFetcher(email)
    cik_map = fetcher.get_cik_mapping()

    # keep_default_na=False keeps symbols that look like NA markers
    # (e.g. "NA") as strings instead of turning them into NaN.
    tickers_df = pd.read_csv("tickers.csv", keep_default_na=False)
    tickers = [t for t in tickers_df['Ticker'].str.upper().unique().tolist() if t]

    results = []
    for ticker in tqdm(tickers):
        if ticker not in cik_map:
            continue

        cik = cik_map[ticker]
        shares_dict = fetcher.get_shares_outstanding(cik)

        if shares_dict:
            for fy, avg_shares in shares_dict.items():
                avg_price = get_average_price(ticker, fy)
                # Market cap stays None when no price could be retrieved.
                market_cap = avg_shares * avg_price if avg_price is not None else None
                results.append({
                    'Ticker': ticker,
                    'Fiscal Year': fy,
                    'Avg_Shares_Outstanding': avg_shares,
                    'Avg_Stock_Price': avg_price,
                    'Market_Capitalization': market_cap,
                })

        time.sleep(0.1)

    # Explicit columns keep the sort and the CSV header valid even when no
    # rows were collected (an empty frame would otherwise raise KeyError).
    columns = ['Ticker', 'Fiscal Year', 'Avg_Shares_Outstanding', 'Avg_Stock_Price', 'Market_Capitalization']
    df = pd.DataFrame(results, columns=columns).sort_values(by=['Ticker', 'Fiscal Year'])
    df.to_csv("market_cap.csv", index=False)
    print("✅ Saved to 'market_cap.csv'")
    return df

# -------------------------------------------------------------------
# RUN MAIN FUNCTION
# -------------------------------------------------------------------
if __name__ == "__main__":
    main()

 30%|███       | 236/775 [02:54<05:38,  1.59it/s]ERROR:yfinance:$SOUN: possibly delisted; no price data found  (1d 2021-12-01 -> 2021-12-31) (Yahoo error = "Data doesn't exist for startDate = 1638334800, endDate = 1640926800")
 33%|███▎      | 253/775 [03:07<03:54,  2.23it/s]ERROR:yfinance:$LIF: possibly delisted; no price data found  (1d 2022-12-01 -> 2022-12-31) (Yahoo error = "Data doesn't exist for startDate = 1669870800, endDate = 1672462800")
ERROR:yfinance:$LIF: possibly delisted; no price data found  (1d 2023-12-01 -> 2023-12-31) (Yahoo error = "Data doesn't exist for startDate = 1701406800, endDate = 1703998800")
 37%|███▋      | 284/775 [03:31<05:12,  1.57it/s]ERROR:yfinance:$CORZ: possibly delisted; no price data found  (1d 2022-12-01 -> 2022-12-31) (Yahoo error = "Data doesn't exist for startDate = 1669870800, endDate = 1672462800")
ERROR:yfinance:$CORZ: possibly delisted; no price data found  (1d 2023-12-01 -> 2023-12-31) (Yahoo error = "Data doesn't exist for startDate = 

✅ Saved to 'market_cap.csv'





In [None]:
import requests
import pandas as pd
import time
import yfinance as yf
from datetime import datetime
from tqdm import tqdm

# -------------------------------------------------------------------
# CLASS FOR FETCHING FINANCIAL DATA FROM SEC EDGAR API
# -------------------------------------------------------------------
class SECDataFetcher:
    """Fetch debt and cash figures from the SEC EDGAR companyfacts API."""

    # Seconds to wait for an SEC endpoint before abandoning the request.
    REQUEST_TIMEOUT = 30

    def __init__(self, email):
        """Store the request headers (User-Agent embeds `email`) and endpoint URLs."""
        self.headers = {
            'User-Agent': f'Mozilla/5.0 (compatible; {email})'
        }
        self.base_url = "https://data.sec.gov/api/xbrl/companyfacts/CIK{}.json"
        self.company_tickers_url = "https://www.sec.gov/files/company_tickers.json"

    def get_cik_mapping(self):
        """Return {TICKER: CIK} with upper-cased tickers and 10-digit zero-padded CIK strings."""
        response = requests.get(
            self.company_tickers_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        data = response.json()
        # Callers look tickers up in upper case, so normalise the keys the same way.
        return {entry['ticker'].upper(): str(entry['cik_str']).zfill(10) for entry in data.values()}

    @staticmethod
    def _latest_10k_values(facts, tag, years):
        """Return {fy: val} for `tag`, keeping per fiscal year the 10-K USD entry with the latest period end."""
        latest = {}
        for entry in facts.get(tag, {}).get('units', {}).get('USD', []):
            if entry.get('form') != '10-K' or entry.get('fy') is None:
                continue
            fy = int(entry['fy'])
            if fy not in years:
                continue
            # ISO dates sort correctly as strings; ">=" lets a later-listed
            # entry with the same end date replace an earlier one.
            end = entry.get('end') or ''
            if fy not in latest or end >= latest[fy][0]:
                latest[fy] = (end, entry['val'])
        return {fy: val for fy, (_, val) in latest.items()}

    def get_financials(self, cik):
        """Return total debt and cash per fiscal year 2020-2024.

        TotalDebt is LongTermDebtNoncurrent plus DebtCurrent, and
        CashEquivalents is CashAndCashEquivalentsAtCarryingValue, each taken
        from the 10-K entry with the latest period end for that fiscal year.
        Summing every entry tagged with a fiscal year would count the
        prior-period figures carrying the same `fy` a second time.

        Args:
            cik: 10-digit zero-padded CIK string.

        Returns:
            dict mapping year -> {'TotalDebt', 'CashEquivalents'} (None where
            nothing was found), or None when the request fails, the company
            has no 'us-gaap' facts, or an error occurs.
        """
        try:
            url = self.base_url.format(cik)
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return None

            data = response.json()
            # Some filers have no 'us-gaap' section; treat that as "no data"
            # rather than letting a KeyError be reported as a fetch error.
            facts = data.get('facts', {}).get('us-gaap')
            if facts is None:
                return None

            financials = {year: {'TotalDebt': None, 'CashEquivalents': None} for year in range(2020, 2025)}

            for tag in ('LongTermDebtNoncurrent', 'DebtCurrent'):
                for fy, val in self._latest_10k_values(facts, tag, financials).items():
                    financials[fy]['TotalDebt'] = (financials[fy]['TotalDebt'] or 0) + val

            cash_by_year = self._latest_10k_values(
                facts, 'CashAndCashEquivalentsAtCarryingValue', financials
            )
            for fy, val in cash_by_year.items():
                financials[fy]['CashEquivalents'] = val

            return financials

        except Exception as e:
            print(f"Error fetching financials for CIK {cik}: {e}")
            return None

# -------------------------------------------------------------------
# FUNCTION TO CALCULATE ENTERPRISE VALUE
# -------------------------------------------------------------------
def main():
    """Compute enterprise value per ticker and year and save it.

    Enterprise value is market capitalisation (from 'market_cap.csv') plus
    total debt minus cash equivalents (from the SEC fetcher) for 2020-2024.
    It is None when any of the three inputs is None. Rows are written to
    'enterprise_value.csv'.

    Returns:
        pandas.DataFrame with the columns Ticker, Year, Market_Capitalization,
        Total_Debt, Cash_Equivalents and Enterprise_Value (possibly with no rows).
    """
    # NOTE(review): the contact e-mail is hard-coded; consider moving it to configuration.
    email = "sofyasavina3@gmail.com"
    fetcher = SECDataFetcher(email)

    # keep_default_na=False keeps symbols that look like NA markers
    # (e.g. "NA") as strings instead of turning them into NaN.
    tickers_df = pd.read_csv("tickers.csv", keep_default_na=False)
    tickers = [t for t in tickers_df['Ticker'].str.upper().unique().tolist() if t]

    print("Getting CIK mapping...")
    cik_map = fetcher.get_cik_mapping()

    print("Loading Market Cap data...")
    market_cap_df = pd.read_csv('market_cap.csv')
    # (ticker, fiscal year) -> market capitalisation, built column-wise
    # instead of with the slower row-by-row iterrows().
    market_cap_dict = dict(zip(
        zip(market_cap_df['Ticker'], market_cap_df['Fiscal Year']),
        market_cap_df['Market_Capitalization'],
    ))

    results = []
    print("Fetching Enterprise Value data for 2020–2024...")
    for ticker in tqdm(tickers):
        if ticker not in cik_map:
            continue

        cik = cik_map[ticker]
        financial_data = fetcher.get_financials(cik)

        if financial_data:
            for year in range(2020, 2025):
                market_cap = market_cap_dict.get((ticker, year))
                total_debt = financial_data[year]['TotalDebt']
                cash_eq = financial_data[year]['CashEquivalents']

                if market_cap is not None and total_debt is not None and cash_eq is not None:
                    enterprise_value = market_cap + total_debt - cash_eq
                else:
                    enterprise_value = None

                results.append({
                    'Ticker': ticker,
                    'Year': year,
                    'Market_Capitalization': market_cap,
                    'Total_Debt': total_debt,
                    'Cash_Equivalents': cash_eq,
                    'Enterprise_Value': enterprise_value
                })

        time.sleep(0.1)

    # Explicit columns guarantee a CSV header even when no rows were
    # collected, so the file can still be read back.
    columns = ['Ticker', 'Year', 'Market_Capitalization', 'Total_Debt', 'Cash_Equivalents', 'Enterprise_Value']
    df = pd.DataFrame(results, columns=columns)
    df.to_csv('enterprise_value.csv', index=False)
    print("✅ Data saved to 'enterprise_value.csv'")
    return df

# -------------------------------------------------------------------
# RUN MAIN FUNCTION
# -------------------------------------------------------------------
if __name__ == "__main__":
    main()

Getting CIK mapping...
Loading Market Cap data...
Fetching Enterprise Value data for 2020–2024...


  1%|          | 5/775 [00:02<06:02,  2.13it/s]

Error fetching financials for CIK 0001046179: 'us-gaap'


  1%|          | 7/775 [00:03<05:10,  2.47it/s]

Error fetching financials for CIK 0001000184: 'us-gaap'


  5%|▌         | 41/775 [00:21<07:15,  1.69it/s]

Error fetching financials for CIK 0001067491: 'us-gaap'


  8%|▊         | 63/775 [00:32<04:37,  2.57it/s]

Error fetching financials for CIK 0001123799: 'us-gaap'


  9%|▊         | 66/775 [00:33<05:37,  2.10it/s]

Error fetching financials for CIK 0000924613: 'us-gaap'


 10%|▉         | 76/775 [00:38<05:00,  2.33it/s]

Error fetching financials for CIK 0001061574: 'us-gaap'


 12%|█▏        | 91/775 [00:46<05:22,  2.12it/s]

Error fetching financials for CIK 0001709048: 'us-gaap'


 12%|█▏        | 93/775 [00:46<04:50,  2.35it/s]

Error fetching financials for CIK 0001122411: 'us-gaap'


 13%|█▎        | 97/775 [00:48<05:31,  2.04it/s]

Error fetching financials for CIK 0001033767: 'us-gaap'


 21%|██        | 164/775 [01:19<04:08,  2.46it/s]

Error fetching financials for CIK 0001836470: 'us-gaap'


 25%|██▌       | 196/775 [01:34<03:46,  2.56it/s]

Error fetching financials for CIK 0001557860: 'us-gaap'


 33%|███▎      | 257/775 [02:02<03:10,  2.73it/s]

Error fetching financials for CIK 0001867729: 'us-gaap'


 34%|███▍      | 264/775 [02:05<03:04,  2.77it/s]

Error fetching financials for CIK 0001712807: 'us-gaap'


 34%|███▍      | 267/775 [02:06<03:05,  2.74it/s]

Error fetching financials for CIK 0001846832: 'us-gaap'


 38%|███▊      | 298/775 [02:19<02:42,  2.94it/s]

Error fetching financials for CIK 0001799983: 'us-gaap'


 41%|████      | 318/775 [02:29<03:20,  2.28it/s]

Error fetching financials for CIK 0001901279: 'us-gaap'


 42%|████▏     | 322/775 [02:31<02:51,  2.64it/s]

Error fetching financials for CIK 0001899123: 'us-gaap'


 42%|████▏     | 325/775 [02:32<02:29,  3.02it/s]

Error fetching financials for CIK 0001823306: 'us-gaap'


 43%|████▎     | 332/775 [02:35<03:15,  2.26it/s]

Error fetching financials for CIK 0001828102: 'us-gaap'


 45%|████▌     | 352/775 [02:44<02:47,  2.53it/s]

Error fetching financials for CIK 0001656081: 'us-gaap'


 47%|████▋     | 364/775 [02:50<03:06,  2.21it/s]

Error fetching financials for CIK 0001793663: 'us-gaap'


 51%|█████     | 394/775 [03:09<04:07,  1.54it/s]

Error fetching financials for CIK 0001868995: 'us-gaap'


 51%|█████     | 397/775 [03:13<06:55,  1.10s/it]

Error fetching financials for CIK 0001825155: 'us-gaap'


 62%|██████▏   | 477/775 [03:48<01:56,  2.56it/s]

Error fetching financials for CIK 0001091223: 'us-gaap'


 62%|██████▏   | 478/775 [03:49<01:59,  2.49it/s]

Error fetching financials for CIK 0001866030: 'us-gaap'


 65%|██████▌   | 505/775 [04:01<01:41,  2.66it/s]

Error fetching financials for CIK 0001859690: 'us-gaap'


 66%|██████▌   | 511/775 [04:03<01:30,  2.92it/s]

Error fetching financials for CIK 0001915403: 'us-gaap'


 66%|██████▌   | 513/775 [04:04<01:28,  2.96it/s]

Error fetching financials for CIK 0001899830: 'us-gaap'


 67%|██████▋   | 518/775 [04:06<01:33,  2.73it/s]

Error fetching financials for CIK 0001753368: 'us-gaap'


 71%|███████   | 551/775 [04:19<01:21,  2.74it/s]

Error fetching financials for CIK 0001866501: 'us-gaap'


 75%|███████▍  | 580/775 [04:31<01:24,  2.31it/s]

Error fetching financials for CIK 0001836934: 'us-gaap'


 79%|███████▉  | 611/775 [04:44<00:55,  2.94it/s]

Error fetching financials for CIK 0001383395: 'us-gaap'


 84%|████████▍ | 651/775 [05:00<00:41,  2.97it/s]

Error fetching financials for CIK 0001875609: 'us-gaap'


 85%|████████▌ | 659/775 [05:03<00:38,  3.01it/s]

Error fetching financials for CIK 0001963439: 'us-gaap'


 86%|████████▌ | 666/775 [05:06<00:44,  2.46it/s]

Error fetching financials for CIK 0001930179: 'us-gaap'


 87%|████████▋ | 674/775 [05:09<00:41,  2.44it/s]

Error fetching financials for CIK 0001905660: 'us-gaap'


 88%|████████▊ | 685/775 [05:13<00:29,  3.04it/s]

Error fetching financials for CIK 0001965143: 'us-gaap'


 90%|████████▉ | 697/775 [05:17<00:27,  2.83it/s]

Error fetching financials for CIK 0001981462: 'us-gaap'


 92%|█████████▏| 712/775 [05:23<00:22,  2.84it/s]

Error fetching financials for CIK 0001681348: 'us-gaap'


 93%|█████████▎| 717/775 [05:25<00:24,  2.39it/s]

Error fetching financials for CIK 0001964630: 'us-gaap'


 93%|█████████▎| 723/775 [05:27<00:19,  2.70it/s]

Error fetching financials for CIK 0001976443: 'us-gaap'


100%|██████████| 775/775 [05:47<00:00,  2.23it/s]

Error fetching financials for CIK 0001888151: 'us-gaap'
✅ Data saved to 'enterprise_value.csv'





In [None]:
import pandas as pd

# -------------------------------------------------------------------
# LOAD RAW FINANCIALS AND ENTERPRISE VALUE DATASETS
# -------------------------------------------------------------------
raw_df = pd.read_csv("raw_financials.csv")
ev_df = pd.read_csv("enterprise_value.csv")

print("Raw Financials Columns:", raw_df.columns.tolist())
print("Enterprise Value Columns:", ev_df.columns.tolist())

# -------------------------------------------------------------------
# STANDARDIZE COLUMN NAMES FOR MERGING
# -------------------------------------------------------------------
# str.capitalize() upper-cases the first character and lower-cases the rest,
# so 'ticker' -> 'Ticker' and 'Enterprise_Value' -> 'Enterprise_value'.
# Plain assignment (instead of inplace=True) keeps the cell easy to re-run.
raw_df = raw_df.rename(columns=lambda x: x.strip().capitalize())
ev_df = ev_df.rename(columns=lambda x: x.strip().capitalize())

# -------------------------------------------------------------------
# MERGE AND COMPUTE EV/SALES
# -------------------------------------------------------------------
# Every column the merge and the ratio rely on is verified up front, so a
# missing column is reported below instead of surfacing as a KeyError.
required_raw_cols = ['Ticker', 'Year', 'Revenue']
required_ev_cols = ['Ticker', 'Year', 'Enterprise_value']
missing_raw_cols = [col for col in required_raw_cols if col not in raw_df.columns]
missing_ev_cols = [col for col in required_ev_cols if col not in ev_df.columns]

if not missing_raw_cols and not missing_ev_cols:
    # Left join: every raw-financials row is kept; rows without a matching
    # (Ticker, Year) in the enterprise value data get NaN.
    merged = pd.merge(raw_df, ev_df[required_ev_cols], on=['Ticker', 'Year'], how='left')

    # Mask zero revenue so the ratio becomes NaN rather than +/-inf, which
    # would otherwise be written to the CSV as a literal 'inf'.
    nonzero_revenue = merged['Revenue'].where(merged['Revenue'] != 0)
    merged['Ev_sales'] = merged['Enterprise_value'] / nonzero_revenue
    merged = merged.drop(columns='Enterprise_value')

    merged.to_csv("financials_with_ev_sales.csv", index=False)
    print("✅ Merged dataset with EV/Sales saved as 'financials_with_ev_sales.csv'")
else:
    print(
        "❌ Required columns not found. "
        f"Missing from raw financials: {missing_raw_cols}; "
        f"missing from enterprise value: {missing_ev_cols}"
    )

Raw Financials Columns: ['ticker', 'year', 'revenue', 'net_income', 'total_assets', 'current_ratio', 'debt_assets', 'asset_turnover', 'roa', 'revenue_growth', 'r_and_d_ratio', 'capex_ratio', 'net_income_margin', 'ocf_margin']
Enterprise Value Columns: ['Ticker', 'Year', 'Market_Capitalization', 'Total_Debt', 'Cash_Equivalents', 'Enterprise_Value']
✅ Merged dataset with EV/Sales saved as 'financials_with_ev_sales.csv'


In [None]:
import pandas as pd

# -------------------------------------------------------------------
# READ THE LIFECYCLE DATA AND THE FINANCIALS THAT CARRY EV/SALES
# -------------------------------------------------------------------
lifecycles_df = pd.read_csv("company_lifecycles.csv")
financials_df = pd.read_csv("financials_with_ev_sales.csv")

# -------------------------------------------------------------------
# REMOVE COLUMNS THAT ARE NOT CARRIED INTO THE FINAL DATASET
# -------------------------------------------------------------------
columns_to_drop = ['Operating_CF', 'Investing_CF', 'Financing_CF']
# errors='ignore' skips any of the listed columns that are not present.
lifecycles_df = lifecycles_df.drop(columns=columns_to_drop, errors='ignore')
# 'Net_income' is dropped unconditionally; a KeyError is raised if it is absent.
financials_df = financials_df.drop(columns=['Net_income'])

# -------------------------------------------------------------------
# LEFT-JOIN LIFECYCLE DATA ONTO THE FINANCIALS BY TICKER AND YEAR
# -------------------------------------------------------------------
# Every financials row is kept; rows with no lifecycle match get NaN.
final_df = financials_df.merge(lifecycles_df, how='left', on=['Ticker', 'Year'])

# -------------------------------------------------------------------
# WRITE THE COMBINED DATASET TO DISK
# -------------------------------------------------------------------
final_df.to_csv("raw_dataset.csv", index=False)
print("✅ Final dataset saved as 'raw_dataset.csv'")

✅ Final dataset saved as 'raw_dataset.csv'
