In [None]:
# Imports used to pull US stock listings, prices and dividend data
import investpy
import yfinance as yf
import pandas as pd
import pytz



In [None]:
# Get list of all stocks in the US (via investpy) and the tickers from a local CSV.
# NOTE: `stocks` is rebound to the penny-stock CSV in a later cell; `russel_tick` is only
# referenced again in a commented-out example call.
stocks = investpy.stocks.get_stocks(country='united states')
# Presumably the Russell 1000 constituents, judging by the file name -- confirm the source.
russel = pd.read_csv("russel1000.csv")
# Plain Python list of the values in the CSV's 'Ticker' column.
russel_tick = russel['Ticker'].to_list()


In [None]:
# getting the dividend data for the above stocks


def get_dividend_data(tickers):
    """
    Fetch dividend history for each ticker and express every payment as a yield.

    For every ticker the dividend series and the last 5 years of daily closing
    prices are requested from yfinance. Each dividend is paired with a
    reference close and the yield is ``Dividend / Stock Price * 100``.

    The reference close is the second-to-last close on or before the dividend
    date (``iloc[-2]`` of the prices up to that date), i.e. the session before
    the latest one available at that date. A dividend with fewer than two
    closes on or before its date -- which includes every dividend older than
    the 5-year price window -- gets a missing 'Stock Price' and therefore a
    missing yield.

    Parameters:
    tickers (list): List of stock tickers.

    Returns:
    pd.DataFrame: DataFrame with columns 'Date', 'Ticker', 'Dividend', 'Stock Price', and
        'Dividend Yield (%)', sorted by 'Date' ascending with a fresh 0..n-1 index.
        An empty frame with the same columns is returned when no ticker has dividends.
    """
    all_dividends = []

    for ticker in tickers:
        try:
            stock = yf.Ticker(ticker)
            dividends = stock.dividends
            stock_prices = stock.history(period="5y")['Close']  # Get past 5 years of stock price data
        except Exception as exc:
            # Catch Exception rather than everything: a bare except would also swallow
            # KeyboardInterrupt, making this long per-ticker loop impossible to stop.
            print(f"Failed to fetch data for {ticker}: {exc}")
            continue

        if dividends.empty:
            print(f"No dividend data for {ticker}")
            continue

        def reference_close(dividend_date, prices=stock_prices):
            # Slice once per dividend; `prices` is bound as a default so the helper
            # always uses this ticker's series.
            window = prices.loc[:dividend_date]
            return window.iloc[-2] if len(window) > 1 else None

        # One row per dividend payment
        df = pd.DataFrame({
            'Date': dividends.index,
            'Ticker': ticker,
            'Dividend': dividends.values
        })

        df['Stock Price'] = df['Date'].apply(reference_close)

        # Calculate Dividend Yield (%)
        df['Dividend Yield (%)'] = (df['Dividend'] / df['Stock Price']) * 100

        all_dividends.append(df)

    if not all_dividends:
        print("No dividends found for any ticker.")
        return pd.DataFrame(columns=['Date', 'Ticker', 'Dividend', 'Stock Price', 'Dividend Yield (%)'])

    combined = pd.concat(all_dividends, ignore_index=True)
    return combined.sort_values(by='Date').reset_index(drop=True)

# Example usage:
#tickers = stocks['symbol'].to_list()
#df = get_dividend_data(russel_tick)

# Run the dividend pull for the symbols listed in penny_stocks.csv.
# NOTE: this rebinds `stocks`, replacing the investpy listing loaded in an earlier cell.
stocks = pd.read_csv('penny_stocks.csv')
tickers = stocks['Symbol'].to_list()
# get_dividend_data queries yfinance once per ticker, so this call scales with len(tickers).
df = get_dividend_data(tickers)

In [None]:
#df.to_csv('russel1000_dividend_last5years.csv')
# Persist the dividend table. to_csv is called with default arguments, so the
# DataFrame index is written as the first (unnamed) CSV column.
df.to_csv('penny_stocks_dividend_last5years.csv')

In [None]:
#df = pd.read_csv('russel1000_dividend_last5years.csv')
# Drop rows with any missing value (e.g. dividends that got no reference price),
# then order newest dividend first. Both calls mutate `df` in place.
df.dropna(inplace=True)
df.sort_values(by='Date', ascending=False, inplace=True)
# Bare last expression: renders the whole frame as the cell output.
df

In [None]:
# Compute, per ticker, the summed dividend yield inside a fixed date window and
# attach it to every row of `df` as "Last year Dividend Yield (%)".
df["Date"] = pd.to_datetime(df["Date"])

# Define the date range for filtering.
# NOTE(review): this window spans roughly 15 months, not one year -- confirm it is intended.
start_date = "2024-02-17"
end_date = "2025-05-17"

# Make the cell re-runnable: without this, a second run would merge the column
# again and produce "_x"/"_y" suffixed duplicates instead of the expected name.
df = df.drop(columns=["Last year Dividend Yield (%)"], errors="ignore")

# Filter data for the window (both bounds inclusive)
df_filtered = df[(df["Date"] >= start_date) & (df["Date"] <= end_date)]

# Group by Ticker and sum Dividend Yield (%) over the window
df_grouped = (
    df_filtered
    .groupby("Ticker", as_index=False)
    .agg({"Dividend Yield (%)": "sum"})
    .rename(columns={"Dividend Yield (%)": "Last year Dividend Yield (%)"})
)

# Left-merge back so tickers without dividends in the window are retained
df = df.merge(df_grouped, on="Ticker", how="left")

# Fill NaN with 0 for tickers with no dividends in the date range.
# Assign the result back: `df[col].fillna(..., inplace=True)` operates on a
# temporary Series and does not reliably update `df` (no-op under Copy-on-Write).
df["Last year Dividend Yield (%)"] = df["Last year Dividend Yield (%)"].fillna(0)

In [None]:
# Collect every ticker whose last-year yield exceeds Threshold into a heap,
# keyed by negated yield so the highest-yielding ticker pops first.
import heapq

Threshold = 20
stack = []
heapq.heapify(stack)
df.sort_values(by='Last year Dividend Yield (%)', ascending=False, inplace=True)
for gr, gr_df in df.groupby("Ticker"):
    # Renumber the group's rows from 0, then order them by date (index labels
    # keep the pre-sort numbering, so label 0 is the group's first row before sorting).
    gr_df = gr_df.reset_index(drop=True).sort_values(by="Date", ascending=True)
    dividend = gr_df.loc[0, 'Last year Dividend Yield (%)']
    if dividend > Threshold:
        entry = (-dividend, gr, gr_df)
        heapq.heappush(stack, entry)

In [None]:
# Number of tickers whose last-year dividend yield exceeded Threshold.
len(stack)

In [126]:
# balance sheet extractor
# pip install edgartools
import sys
from edgar import *
from edgar.financials import Financials
import pandas as pd

# Register the identity string with the edgar library before any filings are requested;
# the docstring of fetch_10K_and_10Q_filings lists this as a prerequisite.
# NOTE(review): a personal e-mail address is hard-coded here -- consider loading it from configuration.
set_identity("sarashs arash.sheikh65@gmail.com")

from edgar import Company

def fetch_10K_and_10Q_filings(ticker: str, start_date: str, end_date: str, form: list = None):
    """
    Fetches filings of the requested form type(s) for the given ticker within the specified date range.

    Note:
      - Make sure you have set your EDGAR identity (using set_identity) before calling this function.
      - The date filter should be in the form "YYYY-MM-DD:YYYY-MM-DD"; it is built here
        from start_date and end_date.

    Parameters:
        ticker (str): The stock ticker (e.g., "AAPL").
        start_date (str): The start date in "YYYY-MM-DD" format.
        end_date (str): The end date in "YYYY-MM-DD" format.
        form (list, optional): Form types to request, e.g. ["10-K", "10-Q"].
            Defaults to ["10-K"] when omitted or None.

    Returns:
        list: A list-like object of filing objects (or an empty list if no filings are found
        or any error occurs while fetching; errors are printed, not raised).
    """
    # Resolve the default here instead of using a mutable list as the default value,
    # which would be shared between calls.
    if form is None:
        form = ['10-K']
    form_label = form if isinstance(form, str) else " or ".join(form)

    try:
        # Create a Company object for the given ticker
        company = Company(ticker)
        # Retrieve the filings of the requested form type(s)
        filings = company.get_filings(form=form)
        # The filter date string uses the format "start_date:end_date"
        filtered_filings = filings.filter(date=f"{start_date}:{end_date}")

        if not filtered_filings:
            print(f"No {form_label} filings found for {ticker} between {start_date} and {end_date}.")
            return []

        return filtered_filings

    except Exception as e:
        # Best-effort helper: report the problem and hand back an empty result.
        print(f"An error occurred while fetching filings for {ticker}: {e}")
        return []


def extract_financials(filings):
    """
    Collect the financial statements found in a sequence of filings.

    Each filing is converted with ``filing.obj()``. Filings whose data object is
    None, or whose ``financials`` attribute is missing or None, are skipped with
    a printed message. From the remaining ones the balance sheet, income
    statement and cash flow statement are requested via
    ``get_balance_sheet()``, ``get_income_statement()`` and
    ``get_cash_flow_statement()`` (each only if the method exists), and every
    non-None result is appended to its own list. Any exception raised while
    handling a filing is printed and that filing is skipped.

    Parameters:
        filings (list): A list-like object of filing objects (e.g. from Company.get_filings()).

    Returns:
        tuple: Six items:
               (balance_sheets, income_statements, cashflow_statements,
                balance_sheets_str, income_statements_str, cashflow_statements_str).
               The first three are lists of statement objects; the last three are the
               same statements rendered as text, separated by blank lines.
               The three lists are filled independently, so their lengths can differ.
    """
    balance_sheets, income_statements, cashflow_statements = [], [], []
    # (getter name on the financials object, list collecting its results)
    targets = (
        ("get_balance_sheet", balance_sheets),
        ("get_income_statement", income_statements),
        ("get_cash_flow_statement", cashflow_statements),
    )

    for filing in filings:
        try:
            # Convert the filing to its data object (e.g., TenK or TenQ)
            data_obj = filing.obj()
            if data_obj is None:
                print("Filing has no data object. Skipping...")
                continue

            if not hasattr(data_obj, "financials") or data_obj.financials is None:
                print("Filing has no financials. Skipping...")
                continue

            financials = data_obj.financials

            # Request all three statements first, so an error in any getter
            # leaves none of this filing's statements stored.
            fetched = [
                getattr(financials, getter)() if hasattr(financials, getter) else None
                for getter, _ in targets
            ]
            for (_, bucket), statement in zip(targets, fetched):
                if statement is not None:
                    bucket.append(statement)

        except Exception as e:
            print(f"Error extracting financials from filing: {e}")
            continue

    # Optional statement attributes that are echoed as header lines when present
    metadata_fields = (
        ('period_end_date', "Period End Date"),
        ('filing_date', "Filing Date"),
        ('period_focus', "Period Focus"),
        ('fiscal_year', "Fiscal Year"),
        ('fiscal_period', "Fiscal Period"),
    )

    def format_statement_for_llm(statement, statement_type, index):
        """Render one statement as a titled text block (1-based numbering)."""
        lines = [f"=== {statement_type} {index + 1} ==="]

        for attr, label in metadata_fields:
            if hasattr(statement, attr):
                lines.append(f"{label}: {getattr(statement, attr)}")

        lines.append("")  # Empty line before data

        frame = statement.to_dataframe()
        if frame.empty:
            lines.append("No data available")
        else:
            # Show float64/int64 columns as thousands-separated whole numbers;
            # missing values become empty strings.
            rendered = frame.copy()
            for col in rendered.select_dtypes(include=['float64', 'int64']).columns:
                rendered[col] = rendered[col].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "")
            lines.append(rendered.to_string())

        return '\n'.join(lines)

    def render_all(statements, title):
        """Join the rendered blocks of one statement type with blank lines."""
        return '\n\n'.join(
            format_statement_for_llm(item, title, position)
            for position, item in enumerate(statements)
        )

    balance_sheets_str = render_all(balance_sheets, "BALANCE SHEET")
    income_statements_str = render_all(income_statements, "INCOME STATEMENT")
    cashflow_statements_str = render_all(cashflow_statements, "CASH FLOW STATEMENT")

    return (
        balance_sheets,
        income_statements,
        cashflow_statements,
        balance_sheets_str,
        income_statements_str,
        cashflow_statements_str,
    )

# Smoke test: fetch NVDA 10-Q/10-K filings for the window and extract their statements.
# The end date is zero-padded to match the "YYYY-MM-DD" format the fetch helper documents.
filings_list = fetch_10K_and_10Q_filings("nvda", "2023-01-01", "2025-02-15", form=["10-Q", "10-K"])
(
    balance_sheets,
    income_statements,
    cashflow_statements,
    balance_sheets_str,
    income_statements_str,
    cashflow_statements_str,
) = extract_financials(filings_list)

In [143]:
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
import re
from abc import ABC, abstractmethod

class BaseFinancialAnalyzer(ABC):
    """Base class for all financial analyzers.

    Subclasses provide a mapping of metric name -> candidate statement labels
    (``get_line_item_mappings``) and turn the extracted per-period values into
    industry metrics (``calculate_metrics``). This class supplies label
    matching, value extraction, per-period data assembly and a simple trend
    classifier.
    """

    # Scalar types treated as already-numeric cell values. NumPy integer types are
    # listed explicitly because np.int64 is not a subclass of the built-in int, so
    # cells read from integer-typed frames would otherwise be ignored.
    _NUMERIC_TYPES = (int, float, np.integer, np.floating)

    # company_type -> statement kind -> metrics that are looked up in that statement first.
    _PRIMARY_STATEMENT = {
        "REIT": {
            "income": ('interest_income', 'interest_expense', 'net_interest_income',
                       'net_income', 'dividends_per_share', 'shares_outstanding'),
            "cashflow": ('cfo',),
            "balance": ('total_assets', 'mortgage_securities', 'repurchase_agreements',
                        'total_equity', 'cash', 'total_liabilities'),
        },
        "Hardware/Semiconductor": {
            "income": ('revenue', 'cogs', 'gross_profit', 'rnd_expense',
                       'operating_income', 'net_income'),
            "cashflow": ('cfo', 'capex'),
            "balance": ('inventory', 'ppe', 'total_assets', 'accounts_receivable',
                        'accounts_payable'),
        },
    }

    def __init__(self):
        self.line_item_mappings = {}
        self.company_type = "base"

    @abstractmethod
    def get_line_item_mappings(self) -> Dict[str, List[str]]:
        """Return company-type specific line item mappings"""
        pass

    @abstractmethod
    def calculate_metrics(self, financial_data: Dict) -> Dict:
        """Calculate industry-specific metrics"""
        pass

    def clean_line_item_name(self, name: str) -> str:
        """Return `name` as a stripped string with whitespace runs collapsed; "" for missing values."""
        if pd.isna(name):
            return ""
        cleaned = str(name).strip()
        cleaned = re.sub(r'\s+', ' ', cleaned)
        return cleaned

    def find_line_item(self, df: pd.DataFrame, item_type: str, debug=False) -> Optional[pd.Series]:
        """Find the statement row for a metric.

        The frame's index labels are cleaned, then the candidate labels mapped to
        `item_type` are tried in order: first for a case-insensitive exact match
        against any row label, then for a case-insensitive substring match.

        Parameters:
            df: statement frame whose index holds the line item labels.
            item_type: metric key in ``self.line_item_mappings``.
            debug: print the search steps when True.

        Returns:
            The first matching row as a Series (named by its cleaned label), or
            None when the metric is unmapped or nothing matches. The row is
            selected by position, so a Series is returned even if several rows
            share the same cleaned label.
        """
        if item_type not in self.line_item_mappings:
            if debug:
                print(f"Item type '{item_type}' not in mappings")
            return None

        possible_labels = self.line_item_mappings[item_type]
        df_clean = df.copy()
        df_clean.index = [self.clean_line_item_name(idx) for idx in df.index]
        cleaned_labels = list(df_clean.index)

        if debug:
            print(f"Looking for {item_type} with labels: {possible_labels}")
            print(f"Available indices: {list(df_clean.index[:10])}")

        # Try exact matches first
        for label in possible_labels:
            wanted = label.lower()
            for pos, idx in enumerate(cleaned_labels):
                if idx.lower() == wanted:
                    if debug:
                        print(f"Found exact match: '{idx}' for '{label}'")
                    return df_clean.iloc[pos]

        # Row labels containing these words are rejected for the balance-sheet style
        # metrics below.
        excluded_words = ('purchase', 'sale', 'payment', 'proceeds')

        # Try partial matches with higher precision
        for label in possible_labels:
            wanted = label.lower()
            for pos, idx in enumerate(cleaned_labels):
                have = idx.lower()
                # For specific cases, use stricter matching
                if item_type == 'cfo' and 'operating activities' in wanted and 'operating activities' in have:
                    if debug:
                        print(f"Found CFO match: '{idx}' contains '{label}'")
                    return df_clean.iloc[pos]
                elif item_type in ('mortgage_securities', 'repurchase_agreements'):
                    # For balance sheet items, be more specific
                    if wanted in have and not any(word in have for word in excluded_words):
                        if debug:
                            print(f"Found balance sheet match: '{idx}' contains '{label}'")
                        return df_clean.iloc[pos]
                elif wanted in have:
                    if debug:
                        print(f"Found partial match: '{idx}' contains '{label}'")
                    return df_clean.iloc[pos]

        if debug:
            print(f"No match found for {item_type}")
        return None

    def extract_value_from_period(self, row: pd.Series, period_hint: str = None) -> Optional[float]:
        """Extract one numerical value from a statement row.

        Every cell is examined in column order. Python/NumPy numbers are taken
        as they are; strings are parsed after removing commas, whitespace and
        parentheses, and a fully parenthesised string such as "(1,234)" is read
        as a negative amount. The value of the LAST usable column is returned.

        NOTE(review): the last column is assumed to be the period of interest --
        confirm against the column order of the statement frames.

        Parameters:
            row: Series for one line item, or None.
            period_hint: accepted for interface compatibility; currently unused.

        Returns:
            The selected value, or None if `row` is None or has no usable cell.
        """
        if row is None:
            return None

        numeric_cols = []
        for col, val in row.items():
            if isinstance(val, self._NUMERIC_TYPES):
                if pd.notna(val):
                    numeric_cols.append((col, val))
            elif isinstance(val, str):
                text = val.strip()
                parenthesised = text.startswith('(') and text.endswith(')')
                clean_val = re.sub(r'[,\s\(\)]', '', text)
                if not clean_val.replace('-', '').replace('.', '').isdigit():
                    continue
                try:
                    parsed_val = float(clean_val)
                except ValueError:
                    # e.g. "1-2" or "1.2.3": digits only, but not a number
                    continue
                if parenthesised:
                    parsed_val = -abs(parsed_val)
                numeric_cols.append((col, parsed_val))

        if numeric_cols:
            return numeric_cols[-1][1]
        return None

    def _primary_statement(self, metric: str, inc_df, cf_df, bs_df):
        """Return the statement frame where `metric` is looked up first, or None if unrouted."""
        routing = self._PRIMARY_STATEMENT.get(self.company_type, {})
        frames = {"income": inc_df, "cashflow": cf_df, "balance": bs_df}
        for kind, metrics in routing.items():
            if metric in metrics:
                return frames[kind]
        return None

    def extract_financial_data(self, balance_sheets: List, income_statements: List,
                              cashflow_statements: List) -> Dict:
        """Extract financial data using company-specific mappings.

        The three lists are walked in parallel; position i yields the entry
        ``'period_{i}'`` mapping every metric in ``self.line_item_mappings`` to its
        extracted value (or None). Each metric is searched in its primary
        statement first (with debug output for the first period) and then in
        the income, cash flow and balance sheet frames in that order. For REITs
        'book_value_per_share' is added when equity and a positive share count
        are available. A statement set that raises is reported and skipped.
        """
        financial_data = {}

        counts = (len(balance_sheets), len(income_statements), len(cashflow_statements))
        if len(set(counts)) > 1:
            # zip() below stops at the shortest list, and unequal lengths mean the
            # statements at one position may not belong to the same filing.
            print(f"Warning: statement lists differ in length {counts}; "
                  f"only the first {min(counts)} sets are processed and periods may be misaligned")

        for i, (bs, inc, cf) in enumerate(zip(balance_sheets, income_statements, cashflow_statements)):
            try:
                bs_df = bs.to_dataframe() if hasattr(bs, 'to_dataframe') else bs
                inc_df = inc.to_dataframe() if hasattr(inc, 'to_dataframe') else inc
                cf_df = cf.to_dataframe() if hasattr(cf, 'to_dataframe') else cf

                period_data = {}

                for metric in self.line_item_mappings:
                    value = None

                    primary_df = self._primary_statement(metric, inc_df, cf_df, bs_df)
                    if primary_df is not None:
                        value = self.extract_value_from_period(
                            self.find_line_item(primary_df, metric, debug=(i == 0))
                        )

                    # If not found in primary statement, try others
                    if value is None:
                        for candidate_df in (inc_df, cf_df, bs_df):
                            value = self.extract_value_from_period(
                                self.find_line_item(candidate_df, metric, debug=False)
                            )
                            if value is not None:
                                break

                    period_data[metric] = value

                # Calculate book value per share for REITs if we have the components
                if self.company_type == "REIT":
                    total_equity = period_data.get('total_equity')
                    shares_outstanding = period_data.get('shares_outstanding')

                    if total_equity and shares_outstanding and shares_outstanding > 0:
                        period_data['book_value_per_share'] = total_equity / shares_outstanding

                financial_data[f'period_{i}'] = period_data

            except Exception as e:
                print(f"Error processing statement set {i}: {e}")
                continue

        return financial_data

    def analyze_trend(self, values_dict: Dict) -> str:
        """Classify a series of values, taken in the dict's iteration order.

        Returns 'insufficient_data' for fewer than two values. With exactly two
        values: 'improving' if the second is larger, else 'declining'. Otherwise
        the share of positive step-to-step changes decides: at least 60% ->
        'improving', at least 40% -> 'stable', else 'declining'.
        """
        if len(values_dict) < 2:
            return 'insufficient_data'

        values = list(values_dict.values())
        if len(values) == 2:
            return 'improving' if values[1] > values[0] else 'declining'

        # For more than 2 values, check overall trend
        changes = [later - earlier for earlier, later in zip(values, values[1:])]
        positive_changes = sum(1 for change in changes if change > 0)

        if positive_changes >= len(changes) * 0.6:
            return 'improving'
        elif positive_changes >= len(changes) * 0.4:
            return 'stable'
        else:
            return 'declining'

    def run_analysis(self, balance_sheets: List, income_statements: List,
                    cashflow_statements: List) -> Dict:
        """Run complete financial analysis: extract per-period data, then compute the metrics."""
        print(f"Running {self.company_type} analysis...")

        # Extract financial data
        financial_data = self.extract_financial_data(balance_sheets, income_statements, cashflow_statements)

        print(f"Extracted data for {len(financial_data)} periods")

        # Calculate industry-specific metrics
        results = self.calculate_metrics(financial_data)

        return results

class REITAnalyzer(BaseFinancialAnalyzer):
    """REIT Financial Analyzer based on REIT-specific metrics.

    Each ``_calculate_*`` method receives the per-period dict produced by
    ``extract_financial_data`` (keys such as 'period_0', 'period_1', ...) and
    returns a dict of intermediate values plus a boolean 'meets_threshold'.
    """

    def __init__(self):
        super().__init__()
        self.company_type = "REIT"
        self.line_item_mappings = self.get_line_item_mappings()

    @staticmethod
    def _period_sort_key(period: str):
        """Sort key that orders 'period_2' before 'period_10'.

        Plain string sorting is lexicographic and would place 'period_10'
        ahead of 'period_2'. Keys without a numeric suffix sort by their text.
        """
        prefix, _, suffix = str(period).rpartition('_')
        if suffix.isdigit():
            return (prefix, int(suffix))
        return (str(period), -1)

    def get_line_item_mappings(self) -> Dict[str, List[str]]:
        """REIT-specific line item mappings (metric name -> candidate statement labels)"""
        return {
            'interest_income': [
                'Interest income', 'Total interest income', 'Interest and dividend income'
            ],
            'interest_expense': [
                'Interest expense', 'Interest and debt expense', 'Interest expense, net'
            ],
            'net_interest_income': [
                'Net interest income', 'Net interest margin'
            ],
            'net_income': [
                'Net income', 'Net income (loss)', 'Net earnings',
                'Net income (loss) available (attributable) to common shareholders'
            ],
            'cfo': [
                'Net cash provided (used in) by operating activities',
                'Net cash provided by operating activities',
                'Net cash provided (used in) by operating',
                'Cash flows from operating activities',
                'Net cash provided (used in) operating activities'
            ],
            'total_assets': [
                'Total assets', 'Total Assets'
            ],
            'mortgage_securities': [
                'Agency securities, at fair value (including pledged securities',
                'Agency securities, at fair value',
                'Agency mortgage-backed securities', 
                'Agency securities'
            ],
            'repurchase_agreements': [
                'Repurchase Agreements',
                'Repurchase agreements',
                'Securities sold under repurchase agreements'
            ],
            'total_liabilities': [
                'Total liabilities', 'Total Liabilities'
            ],
            'total_equity': [
                'Total stockholders\' equity', 'Total stockholders equity',
                'Total shareholders\' equity'
            ],
            'cash': [
                'Cash and cash equivalents', 'Cash and cash equivalents at end of period',
                'Cash', 'Restricted cash'
            ],
            'dividends_per_share': [
                'Dividends declared per common share', 'Dividend per share'
            ],
            'shares_outstanding': [
                'Weighted average number of common shares outstanding - basic',
                'Shares outstanding', 'Common shares outstanding'
            ]
        }

    def calculate_metrics(self, financial_data: Dict) -> Dict:
        """Calculate REIT-specific metrics.

        Returns one entry per analysis. 'overall_reit_score' is a placeholder
        that is always None here.
        """
        return {
            'nim_consistency': self._calculate_nim_consistency(financial_data),
            'dividend_sustainability': self._calculate_dividend_sustainability(financial_data),
            'book_value_growth': self._calculate_book_value_growth(financial_data),
            'leverage_management': self._calculate_leverage_management(financial_data),
            'asset_growth_analysis': self._calculate_asset_growth(financial_data),
            'overall_reit_score': None  # Will be calculated after other metrics
        }

    def _calculate_nim_consistency(self, financial_data: Dict) -> Dict:
        """Net Interest Margin Consistency Analysis.

        NIM per period is net interest income divided by total assets; periods
        where either value is missing/zero or assets are not positive are
        skipped. With fewer than two usable periods no statistics are computed.
        """
        nim_values = {}

        for period, data in financial_data.items():
            net_interest_income = data.get('net_interest_income')
            total_assets = data.get('total_assets')

            if net_interest_income and total_assets and total_assets > 0:
                nim_values[period] = net_interest_income / total_assets

        if len(nim_values) < 2:
            return {
                'nim_values': nim_values,
                'avg_nim': None,
                'nim_volatility': None,
                'stable_nim': False,
                'adequate_nim': False,  # same keys as the full result below
                'meets_threshold': False
            }

        nim_list = list(nim_values.values())
        avg_nim = np.mean(nim_list)
        nim_volatility = np.std(nim_list)

        # REIT Threshold: NIM >1.0% and volatility <25bps (0.0025)
        stable_nim = nim_volatility < 0.0025
        adequate_nim = avg_nim > 0.01

        return {
            'nim_values': nim_values,
            'avg_nim': avg_nim,
            'nim_volatility': nim_volatility,
            'stable_nim': stable_nim,
            'adequate_nim': adequate_nim,
            'meets_threshold': stable_nim and adequate_nim and len(nim_values) >= 8  # 8 quarters
        }

    def _calculate_dividend_sustainability(self, financial_data: Dict) -> Dict:
        """Dividend Coverage Sustainability Analysis.

        Coverage per period is |net income| divided by dividends per share, for
        periods where both are present and the dividend is positive.

        NOTE(review): this divides a total amount by a per-share amount and
        treats a net loss like a profit (absolute value), so the 0.7/0.8/0.9
        thresholds are not true coverage multiples -- a per-share figure using
        'shares_outstanding' would be needed for that.
        """
        coverage_ratios = {}
        dividend_history = {}  # collected alongside the ratios; not part of the result

        for period, data in financial_data.items():
            net_income = data.get('net_income')
            dividends_per_share = data.get('dividends_per_share')

            # Simplified coverage calculation (would need shares outstanding for exact)
            if net_income and dividends_per_share and dividends_per_share > 0:
                # dividends_per_share is strictly positive here, so no zero guard is needed
                coverage_ratios[period] = abs(net_income) / abs(dividends_per_share)
                dividend_history[period] = dividends_per_share

        if not coverage_ratios:
            return {
                'coverage_ratios': {},
                'avg_coverage': None,
                'min_coverage': None,  # same keys as the full result below
                'coverage_consistency': False,
                'meets_threshold': False
            }

        coverage_values = list(coverage_ratios.values())
        avg_coverage = np.mean(coverage_values)
        min_coverage = min(coverage_values)

        # Consistency: every period above 0.8; sustainability: average above 0.9
        # with no period at or below 0.7
        coverage_consistency = min_coverage > 0.8
        sustainable_coverage = avg_coverage > 0.9 and min_coverage > 0.7

        return {
            'coverage_ratios': coverage_ratios,
            'avg_coverage': avg_coverage,
            'min_coverage': min_coverage,
            'coverage_consistency': coverage_consistency,
            'meets_threshold': sustainable_coverage and len(coverage_ratios) >= 8  # 8 quarters
        }

    def _calculate_book_value_growth(self, financial_data: Dict) -> Dict:
        """Book Value Growth Analysis.

        Computes a compound growth rate of book value per share between the
        lowest- and highest-numbered period that has a value; at least three
        such periods are required.

        NOTE(review): each period step is counted as one year and the
        lowest-numbered period is treated as the starting point -- confirm both
        against the filing frequency and order of the input.
        """
        book_values = {}

        for period, data in financial_data.items():
            book_value = data.get('book_value_per_share')
            if book_value:
                book_values[period] = book_value

        if len(book_values) < 3:
            return {
                'book_values': book_values,
                'book_value_cagr': None,
                'meets_threshold': False
            }

        # Numeric ordering of the period keys ('period_10' after 'period_2')
        sorted_periods = sorted(book_values.keys(), key=self._period_sort_key)
        start_bv = book_values[sorted_periods[0]]
        end_bv = book_values[sorted_periods[-1]]

        if start_bv > 0:
            years = len(sorted_periods) - 1
            book_value_cagr = (end_bv / start_bv) ** (1 / years) - 1
        else:
            book_value_cagr = None

        # REIT Threshold: Book value growth >3% annually
        meets_threshold = book_value_cagr is not None and book_value_cagr > 0.03

        return {
            'book_values': book_values,
            'book_value_cagr': book_value_cagr,
            'meets_threshold': meets_threshold
        }

    def _calculate_leverage_management(self, financial_data: Dict) -> Dict:
        """Leverage Cycle Management Analysis.

        Leverage per period is repurchase agreements (0 when missing) divided
        by total equity, for periods with positive equity.
        """
        debt_equity_ratios = {}

        for period, data in financial_data.items():
            repurchase_agreements = data.get('repurchase_agreements', 0) or 0
            total_equity = data.get('total_equity')

            if total_equity and total_equity > 0:
                debt_equity_ratios[period] = repurchase_agreements / total_equity

        if not debt_equity_ratios:
            return {
                'debt_equity_ratios': {},
                'avg_leverage': None,
                'max_leverage': None,  # same keys as the full result below
                'leverage_volatility': None,
                'within_range': False,
                'meets_threshold': False
            }

        de_ratios = list(debt_equity_ratios.values())
        avg_leverage = np.mean(de_ratios)
        leverage_volatility = np.std(de_ratios)
        max_leverage = max(de_ratios)

        # REIT Thresholds: D/E ratio 6:1-10:1 range
        within_range = 6.0 <= avg_leverage <= 10.0
        stable_leverage = leverage_volatility < 2.0  # Reasonable volatility

        return {
            'debt_equity_ratios': debt_equity_ratios,
            'avg_leverage': avg_leverage,
            'max_leverage': max_leverage,
            'leverage_volatility': leverage_volatility,
            'within_range': within_range,
            'meets_threshold': within_range and stable_leverage
        }

    def _calculate_asset_growth(self, financial_data: Dict) -> Dict:
        """Asset Growth Analysis.

        Growth is the relative change in total assets between the two
        highest-numbered periods that have a value.

        NOTE(review): this is a period-over-period change and assumes higher
        period numbers are later in time -- confirm against the input order.
        """
        asset_values = {}

        for period, data in financial_data.items():
            total_assets = data.get('total_assets')
            if total_assets:
                asset_values[period] = total_assets

        if len(asset_values) < 2:
            return {
                'asset_values': asset_values,
                'asset_growth_rate': None,
                'controlled_growth': False,  # same keys as the full result below
                'meets_threshold': False
            }

        # Numeric ordering of the period keys ('period_10' after 'period_2')
        sorted_periods = sorted(asset_values.keys(), key=self._period_sort_key)
        latest_assets = asset_values[sorted_periods[-1]]
        previous_assets = asset_values[sorted_periods[-2]]

        asset_growth_rate = (latest_assets / previous_assets - 1) if previous_assets > 0 else None

        # REIT Focus: Controlled growth 5-15%
        controlled_growth = asset_growth_rate is not None and 0.05 <= asset_growth_rate <= 0.15

        return {
            'asset_values': asset_values,
            'asset_growth_rate': asset_growth_rate,
            'controlled_growth': controlled_growth,
            'meets_threshold': controlled_growth
        }


class HardwareSemiconductorAnalyzer(BaseFinancialAnalyzer):
    """Financial analyzer for hardware and semiconductor companies.

    Supplies the statement-label mappings used to locate line items and computes
    five screening metrics: gross margin, R&D intensity, inventory turns, capital
    efficiency and working-capital efficiency. Each metric method takes
    ``financial_data`` (period label -> dict of line items) and returns a dict
    whose 'meets_threshold' entry is a plain ``bool``.
    """

    def __init__(self):
        super().__init__()
        self.company_type = "Hardware/Semiconductor"
        self.line_item_mappings = self.get_line_item_mappings()

    def get_line_item_mappings(self) -> Dict[str, List[str]]:
        """Return, per metric key, the statement row labels that may hold its value."""
        return {
            'revenue': [
                'Revenue', 'Total revenue', 'Net sales', 'Total net revenues', 'Sales'
            ],
            'cogs': [
                'Cost of revenue', 'Cost of goods sold', 'Cost of sales',
                'Cost of revenues', 'Total cost of revenue'
            ],
            'gross_profit': [
                'Gross profit', 'Gross income'
            ],
            'rnd_expense': [
                'Research and development', 'R&D', 'Research and development expenses',
                'Research and development costs'
            ],
            'operating_income': [
                'Operating income', 'Income from operations', 'Operating income (loss)'
            ],
            'net_income': [
                'Net income', 'Net income (loss)', 'Net earnings'
            ],
            'cfo': [
                'Net cash provided by operating activities',
                'Cash flows from operating activities'
            ],
            'capex': [
                'Purchases related to property and equipment and intangible assets',
                'Capital expenditures', 'Purchases of property and equipment',
                'Additions to property and equipment'
            ],
            'inventory': [
                'Inventories', 'Inventory', 'Total inventory'
            ],
            'ppe': [
                'Property and equipment, net', 'Property, plant and equipment',
                'Total property, plant and equipment'
            ],
            'total_assets': [
                'Total assets'
            ],
            'accounts_receivable': [
                'Accounts receivable, net', 'Accounts receivable', 'Trade receivables', 'Receivables'
            ],
            'accounts_payable': [
                'Accounts payable', 'Trade payables'
            ]
        }

    def calculate_metrics(self, financial_data: Dict) -> Dict:
        """Run every hardware/semiconductor metric and collect the results by name.

        'overall_hardware_score' is left as None here; it is filled in afterwards
        from the individual 'meets_threshold' flags.
        """
        return {
            'gross_margin_analysis': self._calculate_gross_margin_analysis(financial_data),
            'rnd_efficiency': self._calculate_rnd_efficiency(financial_data),
            'inventory_management': self._calculate_inventory_management(financial_data),
            'capital_efficiency': self._calculate_capital_efficiency(financial_data),
            'working_capital_efficiency': self._calculate_working_capital_efficiency(financial_data),
            'overall_hardware_score': None  # Will be calculated after other metrics
        }

    def _calculate_gross_margin_analysis(self, financial_data: Dict) -> Dict:
        """Gross margin level and resilience across periods.

        A period contributes when both revenue and COGS are truthy and revenue is
        positive. Passes when the average margin exceeds 40%, the lowest margin
        exceeds 30%, and at least 8 periods contributed.
        """
        gross_margins = {}

        for period, data in financial_data.items():
            revenue = data.get('revenue')
            cogs = data.get('cogs')

            if revenue and cogs and revenue > 0:
                # COGS may be reported as a negative number (shown in parentheses)
                gross_margins[period] = (revenue - abs(cogs)) / revenue

        if not gross_margins:
            return {
                'gross_margins': {},
                'avg_margin': None,
                'margin_volatility': None,
                'trough_margin': None,
                'peak_margin': None,
                'cycle_resilience': False,
                'meets_threshold': False
            }

        margin_values = list(gross_margins.values())
        avg_margin = np.mean(margin_values)
        margin_volatility = np.std(margin_values)
        trough_margin = min(margin_values)
        peak_margin = max(margin_values)

        # Hardware Threshold: Gross margins >40% for semiconductors, >25% for hardware
        # Using 40% threshold for semiconductors
        adequate_margins = bool(avg_margin > 0.40)
        cycle_resilience = bool(trough_margin > 0.30)  # Maintains 30%+ at trough
        enough_history = len(gross_margins) >= 8  # 8 quarters minimum

        return {
            'gross_margins': gross_margins,
            'avg_margin': avg_margin,
            'margin_volatility': margin_volatility,
            'trough_margin': trough_margin,
            'peak_margin': peak_margin,
            'cycle_resilience': cycle_resilience,
            'meets_threshold': adequate_margins and cycle_resilience and enough_history
        }

    def _calculate_rnd_efficiency(self, financial_data: Dict) -> Dict:
        """R&D spend as a share of revenue, and its trend.

        Needs at least 3 usable periods. Passes when the average intensity is
        within 15%-25% and ``self.analyze_trend`` reports 'improving' or 'stable'.
        """
        rnd_intensity = {}

        for period, data in financial_data.items():
            revenue = data.get('revenue')
            rnd_expense = data.get('rnd_expense')

            if revenue and rnd_expense and revenue > 0:
                # R&D expense may be reported as a negative number (shown in parentheses)
                rnd_intensity[period] = abs(rnd_expense) / revenue

        if len(rnd_intensity) < 3:
            return {
                'rnd_intensity': rnd_intensity,
                'avg_rnd_intensity': None,
                'rnd_trend': 'insufficient_data',
                'appropriate_intensity': False,
                'meets_threshold': False
            }

        avg_rnd_intensity = np.mean(list(rnd_intensity.values()))
        rnd_trend = self.analyze_trend(rnd_intensity)

        # Hardware Focus: R&D intensity 15-25% for leading-edge semiconductors
        appropriate_intensity = bool(0.15 <= avg_rnd_intensity <= 0.25)
        stable_trend = rnd_trend in ['improving', 'stable']

        return {
            'rnd_intensity': rnd_intensity,
            'avg_rnd_intensity': avg_rnd_intensity,
            'rnd_trend': rnd_trend,
            'appropriate_intensity': appropriate_intensity,
            'meets_threshold': appropriate_intensity and stable_trend
        }

    def _calculate_inventory_management(self, financial_data: Dict) -> Dict:
        """Inventory turns (|COGS| / inventory) per period.

        Passes when average turns are at least 4.0, their standard deviation is
        below 1.0, and no period falls under 3.0.
        """
        inventory_turns = {}

        for period, data in financial_data.items():
            cogs = data.get('cogs')
            inventory = data.get('inventory')

            if cogs and inventory and inventory > 0:
                # NOTE(review): COGS is not annualized here, while the working-capital
                # metric multiplies by 4 for quarterly data - confirm which basis the
                # 4x threshold expects.
                inventory_turns[period] = abs(cogs) / inventory

        if not inventory_turns:
            return {
                'inventory_turns': {},
                'avg_turns': None,
                'min_turns': None,
                'turns_volatility': None,
                'turns_consistency': False,
                'meets_threshold': False
            }

        turns_values = list(inventory_turns.values())
        avg_turns = np.mean(turns_values)
        min_turns = min(turns_values)
        turns_volatility = np.std(turns_values)

        # Thresholds: 4-6x turns for semiconductors
        adequate_turns = bool(avg_turns >= 4.0)
        consistent_turns = bool(turns_volatility < 1.0)  # Stable inventory management
        never_below_threshold = bool(min_turns >= 3.0)

        return {
            'inventory_turns': inventory_turns,
            'avg_turns': avg_turns,
            'min_turns': min_turns,
            'turns_volatility': turns_volatility,
            'turns_consistency': consistent_turns,
            'meets_threshold': adequate_turns and consistent_turns and never_below_threshold
        }

    def _calculate_capital_efficiency(self, financial_data: Dict) -> Dict:
        """Revenue / PP&E turnover and CapEx intensity per period.

        Passes when the average revenue-to-PP&E turnover exceeds 1.5. CapEx
        intensity (|CapEx| / revenue) is reported but does not affect the verdict.
        """
        asset_turnover = {}
        capex_intensity = {}

        for period, data in financial_data.items():
            revenue = data.get('revenue')
            ppe = data.get('ppe')
            capex = data.get('capex')

            if revenue and ppe and ppe > 0:
                asset_turnover[period] = revenue / ppe

            if revenue and capex and revenue > 0:
                # CapEx is typically reported as a cash outflow (negative)
                capex_intensity[period] = abs(capex) / revenue

        avg_turnover = None
        adequate_turnover = False
        if asset_turnover:
            avg_turnover = np.mean(list(asset_turnover.values()))
            # Threshold: Asset turnover >1.5x for semiconductors
            adequate_turnover = bool(avg_turnover > 1.5)

        avg_capex_intensity = None
        if capex_intensity:
            avg_capex_intensity = np.mean(list(capex_intensity.values()))

        return {
            'asset_turnover': asset_turnover,
            'capex_intensity': capex_intensity,
            'avg_asset_turnover': avg_turnover,
            'adequate_turnover': adequate_turnover,
            'avg_capex_intensity': avg_capex_intensity,
            # Always a bool: with no PP&E data this used to leak the empty dict itself.
            'meets_threshold': adequate_turnover
        }

    def _calculate_working_capital_efficiency(self, financial_data: Dict) -> Dict:
        """Days sales outstanding, days payable outstanding and a simplified cash cycle.

        Each period's revenue and COGS are multiplied by 4 before computing days,
        i.e. the inputs are assumed to be quarterly. The cash conversion cycle is
        DSO - DPO (inventory days are not included) over periods that have both.
        Passes when the average cycle is negative.
        """
        dso_values = {}  # Days Sales Outstanding
        dpo_values = {}  # Days Payable Outstanding

        for period, data in financial_data.items():
            revenue = data.get('revenue')
            accounts_receivable = data.get('accounts_receivable')
            cogs = data.get('cogs')
            accounts_payable = data.get('accounts_payable')

            if revenue and accounts_receivable and revenue > 0:
                # Assuming quarterly data, multiply by 4 for annual revenue equivalent
                annual_revenue_equiv = revenue * 4
                dso_values[period] = (accounts_receivable / annual_revenue_equiv) * 365

            if cogs and accounts_payable:
                # COGS may be negative; annualize the same way as revenue
                annual_cogs_equiv = abs(cogs) * 4
                dpo_values[period] = (accounts_payable / annual_cogs_equiv) * 365

        # Keep the original period order instead of iterating an unordered set.
        cash_conversion_cycle = {
            period: dso_values[period] - dpo_values[period]
            for period in dso_values
            if period in dpo_values
        }

        avg_ccc = None
        efficient_working_capital = False
        if cash_conversion_cycle:
            avg_ccc = np.mean(list(cash_conversion_cycle.values()))
            # Target: Negative cash conversion cycle preferred
            efficient_working_capital = bool(avg_ccc < 0)

        return {
            'dso_values': dso_values,
            'dpo_values': dpo_values,
            'cash_conversion_cycle': cash_conversion_cycle,
            'avg_cash_conversion_cycle': avg_ccc,
            'efficient_working_capital': efficient_working_capital,
            'meets_threshold': efficient_working_capital
        }


class FinancialAnalyzerFactory:
    """Creates the analyzer for a company type and guesses the type from statements."""

    @staticmethod
    def get_analyzer(company_type: str) -> BaseFinancialAnalyzer:
        """Return a new analyzer for ``company_type``.

        The name is lower-cased and stripped of '_', '-' and spaces before it is
        matched against the known aliases.

        Raises:
            ValueError: if the normalized name matches no known analyzer.
        """
        company_type = ''.join(
            ch for ch in company_type.lower() if ch not in ('_', '-', ' ')
        )

        reit_aliases = ('reit', 'realestate', 'mortgagereit')
        hardware_aliases = ('hardware', 'semiconductor', 'hardwaresemiconductor', 'chip', 'chipmaker')

        if company_type in reit_aliases:
            return REITAnalyzer()
        if company_type in hardware_aliases:
            return HardwareSemiconductorAnalyzer()
        raise ValueError(f"Analyzer for company type '{company_type}' not implemented yet")

    @staticmethod
    def auto_detect_company_type(balance_sheets: List, income_statements: List) -> str:
        """Guess 'reit' or 'hardware' from the row labels of the first statements.

        Counts how many indicator phrases of each type occur in the lower-cased
        index labels of the first income statement and first balance sheet. The
        best-scoring type is returned when it has at least 2 hits; otherwise, or
        on any error, 'unknown' is returned.
        """
        def as_frame(statement):
            # Statement objects expose to_dataframe(); plain frames are used as-is.
            if hasattr(statement, 'to_dataframe'):
                return statement.to_dataframe()
            return statement

        def index_text(frame):
            return ' '.join(str(label).lower() for label in frame.index)

        # Phrases whose presence points at each company type
        indicators = {
            'reit': [
                'interest income', 'net interest income', 'mortgage-backed securities',
                'repurchase agreements', 'agency securities', 'dividend income'
            ],
            'hardware': [
                'cost of goods sold', 'inventory', 'research and development',
                'gross profit', 'product sales', 'manufacturing'
            ]
        }

        try:
            inc_df = as_frame(income_statements[0])
            bs_df = as_frame(balance_sheets[0])
            combined_text = index_text(inc_df) + ' ' + index_text(bs_df)

            scores = {
                company_type: len([phrase for phrase in type_indicators if phrase in combined_text])
                for company_type, type_indicators in indicators.items()
            }

            # Ties resolve to the first type listed in `indicators`.
            best_type = max(scores, key=scores.get)
            if scores[best_type] < 2:  # Need at least 2 indicators to classify
                return 'unknown'
            return best_type

        except Exception as e:
            print(f"Error in auto-detection: {e}")
            return 'unknown'

    @staticmethod
    def list_available_analyzers() -> List[str]:
        """Return the canonical names of the analyzer types that can be created."""
        available = ['reit', 'hardware']
        return available


def print_extracted_data_debug(financial_data: Dict):
    """Dump the extracted line items of every period to stdout.

    Floats with magnitude above 1000 are shown with thousands separators and no
    decimals, other floats with four decimals; non-float values are printed
    as-is and missing values as 'None'.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("EXTRACTED FINANCIAL DATA DEBUG")
    print(banner)

    for period, data in financial_data.items():
        print(f"\n{period.upper()}:")
        print("-" * 30)
        for metric, value in data.items():
            if value is None:
                print(f"  {metric}: None")
            elif not isinstance(value, float):
                print(f"  {metric}: {value}")
            elif abs(value) > 1000:
                print(f"  {metric}: {value:,.0f}")
            else:
                print(f"  {metric}: {value:.4f}")


def analyze_company_financials(balance_sheets, income_statements, cashflow_statements, 
                             company_type: str = None, debug: bool = False):
    """
    Analyze company financials using the appropriate analyzer
    
    Args:
        balance_sheets: List of balance sheet data
        income_statements: List of income statement data  
        cashflow_statements: List of cash flow statement data
        company_type: Type of company ('reit', 'hardware', or any alias accepted by
            FinancialAnalyzerFactory.get_analyzer) or None for auto-detection
        debug: Whether to print debug information

    Returns:
        The analyzer's results dict with the matching overall score attached, or
        None when the company type is unknown or has no analyzer.
    """
    
    if company_type is None:
        # Auto-detect company type
        company_type = FinancialAnalyzerFactory.auto_detect_company_type(
            balance_sheets, income_statements
        )
        print(f"Auto-detected company type: {company_type}")
    
    if company_type == 'unknown':
        available = ', '.join(FinancialAnalyzerFactory.list_available_analyzers())
        print(f"Warning: Could not determine company type. Available types: {available}")
        return None
    
    try:
        analyzer = FinancialAnalyzerFactory.get_analyzer(company_type)
        results = analyzer.run_analysis(balance_sheets, income_statements, cashflow_statements)
        
        # Print debug info if requested
        if debug:
            financial_data = analyzer.extract_financial_data(balance_sheets, income_statements, cashflow_statements)
            print_extracted_data_debug(financial_data)
        
        # Pick the overall score from the analyzer actually built, not from the raw
        # company_type string: aliases such as 'semiconductor' or 'REIT' resolve to
        # an analyzer but would never match a literal 'hardware' / 'reit' comparison.
        if isinstance(analyzer, REITAnalyzer):
            results['overall_reit_score'] = calculate_reit_overall_score(results)
        elif isinstance(analyzer, HardwareSemiconductorAnalyzer):
            results['overall_hardware_score'] = calculate_hardware_overall_score(results)
        
        return results
    except ValueError as e:
        print(f"Error: {e}")
        return None


def _score_categories(results: Dict, categories: List[str]) -> Dict:
    """Count how many of ``categories`` have a truthy 'meets_threshold' in ``results``.

    A category that is missing, None, or not a dict counts as not met.
    """
    criteria_met = 0
    for category in categories:
        section = results.get(category) if results else None
        if isinstance(section, dict) and section.get('meets_threshold', False):
            criteria_met += 1

    total_criteria = len(categories)
    score = criteria_met / total_criteria

    return {
        'criteria_met': criteria_met,
        'total_criteria': total_criteria,
        'score': score,
        'pass': score >= 0.6,
        'grade': get_grade(score)
    }


def calculate_reit_overall_score(results: Dict) -> Dict:
    """Calculate overall REIT screening score.

    Returns a dict with 'criteria_met', 'total_criteria', 'score' (fraction of the
    five REIT criteria met), 'pass' (score >= 0.6) and the letter 'grade'.
    """
    return _score_categories(results, [
        'nim_consistency', 'dividend_sustainability', 'book_value_growth',
        'leverage_management', 'asset_growth_analysis'
    ])


def calculate_hardware_overall_score(results: Dict) -> Dict:
    """Calculate overall Hardware/Semiconductor screening score.

    Returns a dict with 'criteria_met', 'total_criteria', 'score' (fraction of the
    five hardware criteria met), 'pass' (score >= 0.6) and the letter 'grade'.
    """
    return _score_categories(results, [
        'gross_margin_analysis', 'rnd_efficiency', 'inventory_management',
        'capital_efficiency', 'working_capital_efficiency'
    ])


def get_grade(score: float) -> str:
    """Assign letter grade based on score: A >= 0.9, B >= 0.7, C >= 0.5, else D."""
    if score >= 0.9:
        return 'A'
    elif score >= 0.7:
        return 'B'
    elif score >= 0.5:
        return 'C'
    else:
        return 'D'


# Console summary printers for the analysis results
def _summary_section(results: Dict, key: str) -> Dict:
    """Return ``results[key]`` when it is a dict, otherwise an empty dict."""
    section = results.get(key) if isinstance(results, dict) else None
    return section if isinstance(section, dict) else {}


def _format_metric(section: Dict, key: str, spec: str, suffix: str = "", default=0) -> str:
    """Format ``section[key]`` with ``spec`` plus ``suffix``; a None value becomes 'N/A'.

    Metric dicts store None for values that could not be computed, and applying a
    numeric format spec to None raises TypeError.
    """
    value = section.get(key, default)
    if value is None:
        return "N/A"
    return format(value, spec) + suffix


def _threshold_mark(section: Dict) -> str:
    """Return the check mark for a met threshold, the cross otherwise."""
    return '✓' if section.get('meets_threshold', False) else '✗'


def print_reit_analysis_summary(results: Dict):
    """Print a summary of REIT analysis results.

    Args:
        results: Dict produced by the REIT analysis (metric name -> metric dict,
            plus 'overall_reit_score'). Metrics whose value is None are shown as
            'N/A'; if ``results`` is not a dict only a notice is printed.
    """
    print("\n" + "="*60)
    print("REIT FINANCIAL ANALYSIS SUMMARY")
    print("="*60)

    if not isinstance(results, dict):
        print("No analysis results available.")
        return

    overall = _summary_section(results, 'overall_reit_score')
    print(f"Overall Grade: {overall.get('grade', 'N/A')}")
    print(f"Score: {_format_metric(overall, 'score', '.1%')}")
    print(f"Criteria Met: {overall.get('criteria_met', 0)}/{overall.get('total_criteria', 5)}")
    print(f"Pass: {'YES' if overall.get('pass', False) else 'NO'}")

    print("\nDETAILED METRICS:")
    print("-" * 40)

    # NIM Consistency
    nim = _summary_section(results, 'nim_consistency')
    print("Net Interest Margin:")
    print(f"  Average NIM: {_format_metric(nim, 'avg_nim', '.2%')}")
    print(f"  Volatility: {_format_metric(nim, 'nim_volatility', '.4f')}")
    print(f"  Threshold Met: {_threshold_mark(nim)}")

    # Dividend Sustainability
    div = _summary_section(results, 'dividend_sustainability')
    print("\nDividend Sustainability:")
    print(f"  Average Coverage: {_format_metric(div, 'avg_coverage', '.2f', 'x')}")
    print(f"  Min Coverage: {_format_metric(div, 'min_coverage', '.2f', 'x')}")
    print(f"  Threshold Met: {_threshold_mark(div)}")

    # Book Value Growth (a genuine 0.0 is printed as 0.00%, only None/missing is N/A)
    bv = _summary_section(results, 'book_value_growth')
    print("\nBook Value Growth:")
    print(f"  CAGR: {_format_metric(bv, 'book_value_cagr', '.2%', default=None)}")
    print(f"  Threshold Met: {_threshold_mark(bv)}")

    # Leverage Management
    lev = _summary_section(results, 'leverage_management')
    print("\nLeverage Management:")
    print(f"  Average D/E: {_format_metric(lev, 'avg_leverage', '.1f', 'x')}")
    print(f"  Max D/E: {_format_metric(lev, 'max_leverage', '.1f', 'x')}")
    print(f"  Threshold Met: {_threshold_mark(lev)}")

    # Asset Growth
    asset = _summary_section(results, 'asset_growth_analysis')
    print("\nAsset Growth:")
    print(f"  Growth Rate: {_format_metric(asset, 'asset_growth_rate', '.2%', default=None)}")
    print(f"  Threshold Met: {_threshold_mark(asset)}")


def print_hardware_analysis_summary(results: Dict):
    """Print a summary of Hardware/Semiconductor analysis results.

    Args:
        results: Dict produced by the hardware analysis (metric name -> metric
            dict, plus 'overall_hardware_score'). Metrics whose value is None are
            shown as 'N/A'; if ``results`` is not a dict only a notice is printed.
    """
    print("\n" + "="*60)
    print("HARDWARE/SEMICONDUCTOR FINANCIAL ANALYSIS SUMMARY")
    print("="*60)

    if not isinstance(results, dict):
        print("No analysis results available.")
        return

    overall = _summary_section(results, 'overall_hardware_score')
    print(f"Overall Grade: {overall.get('grade', 'N/A')}")
    print(f"Score: {_format_metric(overall, 'score', '.1%')}")
    print(f"Criteria Met: {overall.get('criteria_met', 0)}/{overall.get('total_criteria', 5)}")
    print(f"Pass: {'YES' if overall.get('pass', False) else 'NO'}")

    print("\nDETAILED METRICS:")
    print("-" * 40)

    # Gross Margin Analysis
    gm = _summary_section(results, 'gross_margin_analysis')
    print("Gross Margin Analysis:")
    print(f"  Average Margin: {_format_metric(gm, 'avg_margin', '.1%')}")
    print(f"  Trough Margin: {_format_metric(gm, 'trough_margin', '.1%')}")
    print(f"  Peak Margin: {_format_metric(gm, 'peak_margin', '.1%')}")
    print(f"  Threshold Met: {_threshold_mark(gm)}")

    # R&D Efficiency
    rnd = _summary_section(results, 'rnd_efficiency')
    print("\nR&D Efficiency:")
    print(f"  Average R&D Intensity: {_format_metric(rnd, 'avg_rnd_intensity', '.1%')}")
    print(f"  Trend: {rnd.get('rnd_trend', 'N/A')}")
    print(f"  Threshold Met: {_threshold_mark(rnd)}")

    # Inventory Management
    inv = _summary_section(results, 'inventory_management')
    print("\nInventory Management:")
    print(f"  Average Turns: {_format_metric(inv, 'avg_turns', '.1f', 'x')}")
    print(f"  Min Turns: {_format_metric(inv, 'min_turns', '.1f', 'x')}")
    print(f"  Threshold Met: {_threshold_mark(inv)}")

    # Capital Efficiency
    cap = _summary_section(results, 'capital_efficiency')
    print("\nCapital Efficiency:")
    print(f"  Average Asset Turnover: {_format_metric(cap, 'avg_asset_turnover', '.1f', 'x')}")
    print(f"  Threshold Met: {_threshold_mark(cap)}")

    # Working Capital Efficiency (a genuine 0-day cycle is printed, only None/missing is N/A)
    wc = _summary_section(results, 'working_capital_efficiency')
    print("\nWorking Capital Efficiency:")
    print(f"  Cash Conversion Cycle: {_format_metric(wc, 'avg_cash_conversion_cycle', '.0f', ' days', default=None)}")
    print(f"  Threshold Met: {_threshold_mark(wc)}")

In [144]:
# Example usage: run the hardware/semiconductor screen on the statements loaded
# earlier in the notebook, then print the formatted summary.
results = analyze_company_financials(
    balance_sheets,
    income_statements,
    cashflow_statements,
    company_type='hardware',
)
print_hardware_analysis_summary(results)

Running Hardware/Semiconductor analysis...
Looking for revenue with labels: ['Revenue', 'Total revenue', 'Net sales', 'Total net revenues', 'Sales']
Available indices: ['Revenue', 'Cost of revenue', 'Gross profit', 'Research and development', 'Sales, general and administrative', 'Total operating expenses', 'Operating income', 'Interest income', 'Interest expense', 'Other, net']
Found exact match: 'Revenue' for 'Revenue'
Looking for cogs with labels: ['Cost of revenue', 'Cost of goods sold', 'Cost of sales', 'Cost of revenues', 'Total cost of revenue']
Available indices: ['Revenue', 'Cost of revenue', 'Gross profit', 'Research and development', 'Sales, general and administrative', 'Total operating expenses', 'Operating income', 'Interest income', 'Interest expense', 'Other, net']
Found exact match: 'Cost of revenue' for 'Cost of revenue'
Looking for gross_profit with labels: ['Gross profit', 'Gross income']
Available indices: ['Revenue', 'Cost of revenue', 'Gross profit', 'Research and 

In [145]:
# Inspect the raw metrics dictionary returned by analyze_company_financials.
results

{'gross_margin_analysis': {'gross_margins': {'period_0': 0.739514348785872,
   'period_1': 0.7005256533649219,
   'period_2': 0.646273637374861,
   'period_3': 0.6492903321691313,
   'period_4': 0.5356600910470409,
   'period_5': 0.4348150357995227,
   'period_6': 0.655284749034749,
   'period_7': 0.623448275862069},
  'avg_margin': np.float64(0.623101515429771),
  'margin_volatility': np.float64(0.09007144951394498),
  'trough_margin': 0.4348150357995227,
  'peak_margin': 0.739514348785872,
  'cycle_resilience': True,
  'meets_threshold': True},
 'rnd_efficiency': {'rnd_intensity': {'period_0': 0.12660044150110375,
   'period_1': 0.15103279780854373,
   'period_2': 0.260706340378198,
   'period_3': 0.19573456193802483,
   'period_4': 0.32793795312763446,
   'period_5': 0.2720763723150358,
   'period_6': 0.19522200772200773,
   'period_7': 0.2353223388305847},
  'avg_rnd_intensity': np.float64(0.2205791017026416),
  'rnd_trend': 'stable',
  'appropriate_intensity': np.True_,
  'meets_t

In [130]:
# Display the second balance sheet in the list (index 1) to check its rows and periods.
balance_sheets[1]

                                        [1;38;5;38mNVIDIA CORP[0m                                         
                                [1mConsolidated Balance Sheets[0m                                 
                                          Instant                                           
                                                                                            
 [1m [0m[1m                                                  [0m[1m [0m [1m        [0m[1m [0m [1mJul 28, 2024[0m[1m [0m [1mJan 28, 2024[0m[1m [0m 
 ────────────────────────────────────────────────────────────────────────────────────────── 
  [38;5;242m    Cash and cash equivalents                     [0m  [2;38;5;249mmillions[0m  [38;5;242m       8,563[0m  [38;5;242m       7,280[0m  
  [38;5;242m    Marketable securities                         [0m  [2;38;5;249mmillions[0m  [38;5;242m      26,237[0m  [38;5;242m      18,704[0m  
  [38;5;242m    Accounts receivable

In [131]:
# Display the second income statement in the list (index 1) to check its rows and periods.
income_statements[1]

                                        [1;38;5;38mNVIDIA CORP[0m                                         
                                     [1mIncome Statements[0m                                      
                                          3 Months                                          
                                                                                            
 [1m [0m[1m                                                  [0m[1m [0m [1m        [0m[1m [0m [1mJul 28, 2024[0m[1m [0m [1mJul 30, 2023[0m[1m [0m 
 ────────────────────────────────────────────────────────────────────────────────────────── 
  [38;5;242mRevenue                                           [0m  [2;38;5;249mmillions[0m  [38;5;242m      30,040[0m  [38;5;242m      13,507[0m  
  [38;5;242mCost of revenue                                   [0m  [2;38;5;249mmillions[0m  [38;5;242m  (   7,466)[0m  [38;5;242m  (   4,045)[0m  
  [1;38;5;32mGross profit          

In [132]:
# Display the second cash flow statement in the list (index 1).
# The trailing comment presumably lists the other statement lists that can be swapped in here.
cashflow_statements[1] #income_statements  balance_sheets

                                        [1;38;5;38mNVIDIA CORP[0m                                         
                            [1mConsolidated Statement of Cash Flows[0m                            
                                          6 Months                                          
                                                                                            
 [1m [0m[1m                                                  [0m[1m [0m [1m        [0m[1m [0m [1mJul 28, 2024[0m[1m [0m [1mJul 30, 2023[0m[1m [0m 
 ────────────────────────────────────────────────────────────────────────────────────────── 
  [38;5;242m  Net income                                      [0m  [2;38;5;249mmillions[0m  [38;5;242m      31,480[0m  [38;5;242m       8,232[0m  
  [38;5;242m    Stock-based compensation expense              [0m  [2;38;5;249mmillions[0m  [38;5;242m       2,164[0m  [38;5;242m       1,576[0m  
  [38;5;242m    Depreciation and am

In [None]:
# OpenAI financial health assessment

from pydantic import BaseModel
from openai import OpenAI

# Shared API client used by assess(). It is built with no explicit arguments, so
# credentials are presumably read from the environment (e.g. OPENAI_API_KEY) - confirm.
client = OpenAI()

# One reasoning step of an Assessment: a single free-text justification.
# Documented with comments rather than a docstring because this model is nested in
# the response_format schema; a docstring may be emitted into that schema - confirm.
class Step(BaseModel):
    justification: str

# Structured reply requested from the model in assess() (passed as response_format).
#   steps   - ordered Step entries, each holding one justification
#   trends  - free-text description of the trends the model observed
#   approve - the model's buy / don't-buy verdict
class Assessment(BaseModel):
    steps: list[Step]
    trends: str
    approve: bool
    
def assess(income_statements_str, cashflow_statements_str, balance_sheets_str):
    """Ask the model for a structured financial-health assessment of one company.

    Args:
        income_statements_str: Income statements rendered as text.
        cashflow_statements_str: Cash flow statements rendered as text.
        balance_sheets_str: Balance sheets rendered as text.

    Returns:
        Assessment: the parsed reply (steps, trends, approve).

    Raises:
        ValueError: if the reply carries no parsed Assessment (e.g. the model
            refused), instead of silently returning None.
    """
    completion = client.beta.chat.completions.parse(
        model="o3-mini",
        reasoning_effort="high",
        messages=[
            {"role": "system", "content": """You are a financial advisor. You are provided the last two years worth of financial data based on 10-K and 10-Q documents for a company.
            The goal is to buy the most promising dividend stock. Whether or not a stock is a dividend stock is not for you to extract but you are looking at the financial health of the company and your primary goal is to avoid risk.
            You must provide me with the trends over the past two years, your step by step justification of your assessment and whether or not you approve this purchase considering the potential risks to this company."""},
            {"role": "user", "content": f"""Here are the balance sheets, income statements, and cashflow statements for the past two years.\n\n
            Income statements:\n{income_statements_str}\n\nCashflow statements:\n{cashflow_statements_str}\n\nBalance sheets:\n{balance_sheets_str}
            """},
        ],
        response_format=Assessment,
    )
    
    message = completion.choices[0].message
    assessment = message.parsed
    if assessment is None:
        # Surface the reason here rather than letting callers hit an AttributeError on None.
        raise ValueError(f"Model returned no parsed assessment: {getattr(message, 'refusal', None)}")
    return assessment

In [None]:
from pprint import pp

In [None]:
# Run the model-based financial health assessment for every candidate in `stack`
# and split the candidates into approved / rejected.
from tqdm import tqdm
balance_sheet_approved = {}
balance_sheet_rejected = {}
for i, entry in enumerate(tqdm(stack)):
    # Fallback label for errors raised before the entry has been unpacked
    label = f"stack[{i}]"
    try:
        dividend, gr, gr_df = entry
        label = gr
        # NOTE(review): the end date is not zero-padded ("2025-5-15") unlike the start date;
        # confirm fetch_10K_and_10Q_filings accepts that form.
        filings_list = fetch_10K_and_10Q_filings(gr, "2023-01-01", "2025-5-15",form=["10-Q", "10-K"])
        _, _, _, balance_sheets_str, income_statements_str, cashflow_statements_str = extract_financials(filings_list)
        assessment = assess(income_statements_str, cashflow_statements_str, balance_sheets_str)
        if assessment.approve:
            balance_sheet_approved[gr] = (dividend, assessment.trends, gr_df)
        else:
            balance_sheet_rejected[gr] = (dividend, assessment.trends, gr_df)
    except Exception as e:
        # Best effort: skip the failing entry, but say which one failed and why.
        print(f"Skipping {label}: {type(e).__name__}: {e}")
    

In [None]:
import pickle

# Persist the approved candidates so they can be reloaded without re-running the assessments.
# NOTE: only unpickle this file from a trusted location.
with open('investment_grade_dividend_penny_stocks.pkl', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(balance_sheet_approved))

In [None]:
# Inspect the candidates the assessment rejected (key -> (dividend, trends, gr_df)).
balance_sheet_rejected

In [None]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt


ticker = 'GSL'

# Initialize empty DataFrames to accumulate dividend and price data
all_dividends = pd.DataFrame()
all_prices = pd.DataFrame()

stock = yf.Ticker(ticker)
    
# Get dividend history
dividends = stock.dividends.reset_index()
dividends['Ticker'] = ticker

# Get historical price data
prices = stock.history(period="max").reset_index()[['Date', 'Close']]
prices['Ticker'] = ticker
# Append to the main DataFrames
all_dividends = pd.concat([all_dividends, dividends], ignore_index=True)
all_prices = pd.concat([all_prices, prices], ignore_index=True)

# Set the date as the index
all_dividends.set_index('Date', inplace=True)
all_prices.set_index('Date', inplace=True)

div_data = all_dividends[all_dividends['Ticker'] == ticker]
price_data = all_prices[all_prices['Ticker'] == ticker]

# Align both series on the date index; dates without a dividend get NaN
combined_data = pd.merge(price_data, div_data[['Dividends']], left_index=True, right_index=True, how='outer')

fig, ax1 = plt.subplots(figsize=(12, 6))

# Stem only the dates that actually paid a dividend. Filling the gaps with 0 would
# draw a marker on the baseline for every trading day and bury the real payments.
dividend_points = combined_data['Dividends'].dropna()
if not dividend_points.empty:
    ax1.stem(dividend_points.index, dividend_points.values, linefmt='blue', markerfmt='bo', basefmt=" ", label='Dividends')
ax1.set_xlabel('Date')
ax1.set_ylabel('Dividends', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Plot price on the same graph with a different y-axis
ax2 = ax1.twinx()
ax2.plot(combined_data.index, combined_data['Close'], color='red', label='Close Price')
ax2.set_ylabel('Close Price', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# One legend covering both axes, so the labels set above are actually shown
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles1 + handles2, labels1 + labels2, loc='upper left')

# Add title and show the plot
plt.title(f'{ticker} Dividends and Close Price Over Time')
fig.tight_layout()
plt.show()