In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import database models and functions
from src.database.base import init_db, get_db_session
from src.database.companies import Company
from src.database.financial_concepts import FinancialConcept
from src.database.financial_values import FinancialValue

# Import ingestion functionality
from src.ingestion.edgar_db.accessors import get_company, get_10k_filing
from src.ingestion.ingestion_helpers import ingest_financial_data

# Import logging and config
from src.utils.logging import get_logger
from src.utils.config import settings

# Set up plotting: apply matplotlib's 'default' style, then seaborn's 'husl' palette.
# NOTE(review): presumably the order matters, because applying a style resets the
# colour cycle that set_palette configures — confirm before reordering.
plt.style.use('default')
sns.set_palette('husl')

# Notebook-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

In [None]:
# Initialize database connection.
# Every later cell queries the database, so a failed connection must stop a
# "Run All" right here instead of surfacing as an unrelated error further down.
try:
    engine, db_session = init_db(settings.database.url)
except Exception as e:
    print(f"❌ Failed to connect to database: {e}")
    print("Please check your database configuration")
    # Re-raise so the original traceback is preserved and execution halts.
    raise
else:
    print("✅ Database connection established successfully")

In [None]:
# Get basic counts from the database
session = get_db_session()

# One COUNT query per table, issued in the order companies, concepts, values.
# These three totals are the baseline that later cells compare against.
total_companies, total_financial_concepts, total_financial_values = (
    session.query(model).count()
    for model in (Company, FinancialConcept, FinancialValue)
)

print("📊 Database Overview:")
print(f"   Companies: {total_companies:,}")
print(f"   Financial Concepts: {total_financial_concepts:,}")
print(f"   Financial Values: {total_financial_values:,}")

# The average is only meaningful (and only computable) with at least one company.
if total_companies > 0:
    print(f"   Average Financial Values per Company: {total_financial_values/total_companies:.1f}")
else:
    print("   No companies found")


In [None]:
# Find companies with no financial values
try:
    session = get_db_session()

    # Anti-join: LEFT JOIN companies to their financial values and keep only the
    # companies for which no matching value row exists.
    companies_without_values = (
        session.query(Company)
        .outerjoin(FinancialValue, Company.id == FinancialValue.company_id)
        .filter(FinancialValue.company_id.is_(None))
        .all()
    )
    missing_count = len(companies_without_values)

    print("🔍 Companies Missing Financial Values:")
    print(f"   Count: {missing_count:,}")
    if total_companies > 0:
        print(f"   Percentage: {missing_count/total_companies*100:.1f}%")
    else:
        print("   No companies to analyze")

    if not companies_without_values:
        print("   All companies have financial values!")
        missing_values_df = pd.DataFrame()
    else:
        def _joined_or_none(items):
            """Comma-join a list of strings; None when the list is empty or missing."""
            return ', '.join(items) if items else None

        # Flatten each ORM object into a plain record for DataFrame analysis.
        missing_rows = []
        for missing_company in companies_without_values:
            missing_rows.append({
                'company_id': str(missing_company.id),
                'name': missing_company.name,
                'display_name': missing_company.display_name,
                'cik': missing_company.cik,
                'tickers': _joined_or_none(missing_company.tickers),
                'exchanges': _joined_or_none(missing_company.exchanges),
                'sic': missing_company.sic,
                'entity_type': missing_company.entity_type
            })
        missing_values_df = pd.DataFrame(missing_rows)

        print("\n📋 Sample of companies without financial values:")
        sample_columns = ['name', 'tickers', 'cik', 'sic']
        print(missing_values_df[sample_columns].head(10).to_string(index=False))

except Exception as e:
    # Keep downstream cells usable by always leaving a (possibly empty) frame behind.
    print(f"❌ Error analyzing companies without financial values: {e}")
    missing_values_df = pd.DataFrame()

In [None]:
# Find filings that don't have FinancialValues.
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so all dependencies are visible in one place.


from src.database.companies import list_all_companies
from src.database.filings import get_filings_by_company


# Load companies with offset=0 and limit=100. The filing scan in the next cell
# iterates only over this list, not over every company counted in the overview
# cells — presumably intentional to bound runtime; confirm.
all_companies = list_all_companies(offset=0, limit=100)


In [None]:
# Collect every filing of the loaded companies that has no FinancialValue rows.
# Tip: to debug a single company, temporarily narrow `all_companies` to one entry.
filings_without_values = []
for company in all_companies:
    # `tickers` may be empty or None (other cells guard for this too), so never
    # index it blindly; fall back to the CIK for the progress line.
    print(company.tickers[0] if company.tickers else f"(no ticker, CIK {company.cik})")
    all_filings = get_filings_by_company(company.id)
    print(len(all_filings))
    if not all_filings:
        continue

    # One query per company instead of one COUNT per filing: fetch the ids of
    # this company's filings that already have at least one financial value.
    filing_ids = [filing.id for filing in all_filings]
    filing_ids_with_values = {
        filing_id
        for (filing_id,) in session.query(FinancialValue.filing_id)
        .filter(FinancialValue.filing_id.in_(filing_ids))
        .distinct()
    }

    # Everything not in that set is a filing with zero financial values.
    filings_without_values.extend(
        filing for filing in all_filings if filing.id not in filing_ids_with_values
    )


In [None]:
# Display the list of filings collected so far that have no financial values.
filings_without_values

In [None]:
# Ingest financial data for every filing that currently has none.
#
# Each filing must be ingested against the company that owns it. Relying on a
# `company` variable left over from an earlier loop would attribute every filing
# to whichever company happened to be iterated last, so ownership is resolved
# explicitly here by walking the companies and matching their filings by id.
missing_filing_ids = {filing.id for filing in filings_without_values}

for company in all_companies:
    company_filings = [
        filing
        for filing in get_filings_by_company(company.id)
        if filing.id in missing_filing_ids
    ]
    if not company_filings:
        continue

    if not company.tickers:
        # Without a ticker there is nothing to look the company up by in EDGAR.
        print(f"⚠️  Skipping company {company.id}: no ticker available for EDGAR lookup")
        continue

    # The EDGAR company lookup is the same for all of this company's filings.
    edgar_company = get_company(company.tickers[0])

    for filing in company_filings:
        if filing.period_of_report is None:
            # The 10-K is selected by report year, so a missing period cannot be resolved.
            print(f"⚠️  Skipping filing {filing.id}: no period_of_report")
            continue
        edgar_filing = get_10k_filing(edgar_company, filing.period_of_report.year)
        ingest_financial_data(company.id, filing.id, edgar_filing)

In [None]:
# Number of filings in the collected list of filings without financial values.
len(filings_without_values)

In [None]:
# Re-run analysis after ingestion to see improvements
def analyze_ingestion_impact():
    """Compare the current database counts with the baseline captured earlier.

    Re-queries the company, concept and value counts plus the number of
    companies that still have no financial values, prints a summary, and
    reports the deltas against the notebook-level baselines
    ``total_financial_concepts`` and ``total_financial_values``.

    Coverage is computed from the *current* company count, because the
    "still missing" figure is also taken from the current database state;
    mixing it with the old baseline count would give a wrong percentage (or a
    division by zero when the baseline was taken on an empty table).

    Returns:
        dict | None: ``{'new_concepts', 'new_values', 'companies_still_missing'}``
        on success, or ``None`` if any query fails (the error is printed).
    """
    try:
        session = get_db_session()

        # Get updated counts
        new_total_companies = session.query(Company).count()
        new_total_concepts = session.query(FinancialConcept).count()
        new_total_values = session.query(FinancialValue).count()

        # Companies still missing financial values (LEFT JOIN anti-join)
        companies_still_missing = session.query(Company).outerjoin(
            FinancialValue, Company.id == FinancialValue.company_id
        ).filter(FinancialValue.company_id.is_(None)).count()

        new_concepts = new_total_concepts - total_financial_concepts
        new_values = new_total_values - total_financial_values

        print("🔄 Updated Database Statistics:")
        print(f"   Companies: {new_total_companies:,}")
        print(f"   Financial Concepts: {new_total_concepts:,} (+{new_concepts})")
        print(f"   Financial Values: {new_total_values:,} (+{new_values})")
        print(f"   Companies still missing financial data: {companies_still_missing:,}")

        # Only report coverage when values were actually added and there is at
        # least one company to compute a percentage over.
        if new_values > 0 and new_total_companies > 0:
            improvement = ((new_total_companies - companies_still_missing) / new_total_companies) * 100
            print(f"   📈 Financial data coverage improved to {improvement:.1f}%")

        return {
            'new_concepts': new_concepts,
            'new_values': new_values,
            'companies_still_missing': companies_still_missing
        }

    except Exception as e:
        print(f"❌ Error analyzing ingestion impact: {e}")
        return None

# Run the impact analysis. Note that this runs unconditionally: nothing here
# checks whether an ingestion step was actually executed beforehand.
print("\n📊 Analyzing impact of financial data ingestion...")
impact = analyze_ingestion_impact()

In [None]:
# Get comprehensive overview of financial concepts
session = get_db_session()

# Load every financial concept; usage statistics are gathered per concept below
concepts_query = session.query(FinancialConcept).all()

if not concepts_query:
    print("   No financial concepts found in database")
    concepts_df = pd.DataFrame()
else:
    print("💰 Financial Concepts Overview:")
    print(f"   Total financial concepts: {len(concepts_query):,}")

    # Build one record per concept with its usage counts
    concepts_data = []
    for concept in concepts_query:
        uses_concept = FinancialValue.concept_id == concept.id

        # Number of financial values recorded for this concept
        value_count = session.query(FinancialValue).filter(uses_concept).count()

        # Number of distinct companies reporting this concept
        company_count = (
            session.query(FinancialValue.company_id)
            .filter(uses_concept)
            .distinct()
            .count()
        )

        # Number of distinct (non-null) filings containing this concept
        filing_count = (
            session.query(FinancialValue.filing_id)
            .filter(uses_concept, FinancialValue.filing_id.isnot(None))
            .distinct()
            .count()
        )

        # Preview at most three labels, with an ellipsis when more exist
        if concept.labels:
            labels_count = len(concept.labels)
            labels_preview = ', '.join(concept.labels[:3]) + ('...' if len(concept.labels) > 3 else '')
        else:
            labels_count = 0
            labels_preview = None

        if total_companies > 0:
            coverage_pct = company_count / total_companies * 100
        else:
            coverage_pct = 0

        concepts_data.append({
            'id': str(concept.id),
            'name': concept.name,
            'description': concept.description,
            'labels_count': labels_count,
            'labels': labels_preview,
            'total_values': value_count,
            'companies_using': company_count,
            'filings_using': filing_count,
            'coverage_pct': coverage_pct
        })

    # Most-used concepts first
    concepts_df = pd.DataFrame(concepts_data).sort_values('total_values', ascending=False)
    usage_counts = concepts_df['total_values']

    print("\n📊 Financial Concepts Statistics:")
    print(f"   Concepts with no usage: {int((usage_counts == 0).sum())}")
    print(f"   Concepts with 1-10 values: {int(usage_counts.between(1, 10).sum())}")
    print(f"   Concepts with 11-100 values: {int(usage_counts.between(11, 100).sum())}")
    print(f"   Concepts with 100+ values: {int((usage_counts > 100).sum())}")
    print(f"   Average values per concept: {usage_counts.mean():.1f}")
    print(f"   Median values per concept: {usage_counts.median():.1f}")


In [None]:
# Categorize financial concepts by common patterns
if concepts_df.empty:
    print("⚠️  No concepts data available for categorization")
else:
    print("📋 Financial Concepts Categorization:")

    concept_names = concepts_df['name'].tolist()

    # Keyword lists per financial-statement category. A concept belongs to a
    # category when its lower-cased name contains any of the keywords, so one
    # concept can land in several categories.
    category_keywords = {
        'Assets': ['asset', 'cash', 'inventory', 'receivable', 'equipment', 'property'],
        'Liabilities': ['liability', 'liabilities', 'payable', 'debt', 'loan', 'obligation'],
        'Equity': ['equity', 'capital', 'retained', 'shareholder', 'stockholder'],
        'Revenue': ['revenue', 'sales', 'income', 'earning', 'proceed'],
        'Expenses': ['expense', 'cost', 'depreciation', 'amortization', 'paid', 'payment'],
        'Cash Flow': ['cash flow', 'operating activities', 'investing activities', 'financing activities'],
        'Shares': ['shares', 'share', 'outstanding', 'weighted'],
        'Per Share': ['per share', 'eps', 'earnings per'],
    }

    def _matches_any(name, keywords):
        """Return True when the lower-cased name contains at least one keyword."""
        lowered = name.lower()
        return any(keyword in lowered for keyword in keywords)

    categories = {
        category: [name for name in concept_names if _matches_any(name, keywords)]
        for category, keywords in category_keywords.items()
    }

    # Per-category usage summary (empty categories are skipped)
    for category, names in categories.items():
        if not names:
            continue

        category_concepts = concepts_df[concepts_df['name'].isin(names)]
        total_values = category_concepts['total_values'].sum()
        avg_usage = category_concepts['total_values'].mean()

        print(f"\n   📊 {category} ({len(names)} concepts):")
        print(f"      Total values: {total_values:,}")
        print(f"      Avg values per concept: {avg_usage:.1f}")
        if len(names) > 5:
            print(f"      Top concepts: {', '.join(names[:5])}...")
        else:
            print(f"      Concepts: {', '.join(names)}")

    # Anything that matched no keyword list at all
    all_categorized = set().union(*categories.values())
    uncategorized = [name for name in concept_names if name not in all_categorized]

    if uncategorized:
        uncategorized_concepts = concepts_df[concepts_df['name'].isin(uncategorized)]
        print(f"\n   ❓ Uncategorized ({len(uncategorized)} concepts):")
        print(f"      Total values: {uncategorized_concepts['total_values'].sum():,}")
        if len(uncategorized) > 10:
            # Too many to list: show the ten most-used ones
            top_uncategorized = uncategorized_concepts.nlargest(10, 'total_values')['name'].tolist()
            print(f"      Top by usage: {', '.join(top_uncategorized)}")
        else:
            print(f"      Concepts: {', '.join(uncategorized)}")

In [None]:
# Explore concepts with and without descriptions
if concepts_df.empty:
    print("⚠️  No concepts data available for documentation analysis")
else:
    print("📝 Financial Concepts Documentation:")

    # A concept counts as described when its description is neither null nor ''
    has_description = concepts_df['description'].notna() & (concepts_df['description'] != '')
    with_descriptions = concepts_df[has_description]
    without_descriptions = concepts_df[~has_description]
    total_concepts = len(concepts_df)

    print(f"   Concepts with descriptions: {len(with_descriptions)} ({len(with_descriptions)/total_concepts*100:.1f}%)")
    print(f"   Concepts without descriptions: {len(without_descriptions)} ({len(without_descriptions)/total_concepts*100:.1f}%)")

    # Concepts that carry at least one label
    with_labels = concepts_df[concepts_df['labels_count'] > 0]
    print(f"   Concepts with labels: {len(with_labels)} ({len(with_labels)/total_concepts*100:.1f}%)")

    if len(with_descriptions) > 0:
        print("\n📖 Sample Concepts with Descriptions:")
        sample_with_desc = with_descriptions.head(5)
        for _, concept in sample_with_desc.iterrows():
            description = concept['description']
            # Truncate long descriptions to a 100-character preview
            if len(str(description)) > 100:
                desc_preview = description[:100] + '...'
            else:
                desc_preview = description
            print(f"   • {concept['name']}: {desc_preview}")

    if len(without_descriptions) > 0:
        print("\n❓ High-Usage Concepts Missing Descriptions:")
        # "High usage" here means more than 10 recorded values
        missing_desc_high_usage = without_descriptions.loc[
            without_descriptions['total_values'] > 10
        ].head(10)
        if missing_desc_high_usage.empty:
            print("   All high-usage concepts have descriptions!")
        else:
            print(missing_desc_high_usage[['name', 'total_values', 'companies_using']].to_string(index=False))

    # Export detailed concepts analysis
    # NOTE(review): absolute, user-specific path; the export fails on any machine
    # where this directory does not exist — consider a configurable output dir.
    output_file = '/home/steven/symbology/outputs/financial_concepts_analysis.csv'
    concepts_df.to_csv(output_file, index=False)
    print(f"\n💾 Detailed concepts analysis exported to: {output_file}")

In [None]:
# Test the fix for the MARA ingestion error
# NOTE(review): `edgar_ready` and `ingest_company_financial_data` are not defined
# or imported in any cell visible above — confirm that whatever provides them is
# run first, otherwise this cell raises NameError on a fresh kernel.
print("🔧 Testing the fix for non-numeric value handling...")

# Try ingesting MARA again with the updated error handling.
# dry_run=False is passed, so this is presumably a real (writing) ingestion — confirm.
if edgar_ready:
    test_result_mara = ingest_company_financial_data('MARA', 2023, dry_run=False)

    print("\n📊 MARA Ingestion Results:")
    for key, value in test_result_mara.items():
        print(f"   {key}: {value}")

    # The result dict's keys are read defensively throughout (this cell already
    # used .get / key checks for the optional fields), so a result that lacks
    # 'success' or 'error' is reported instead of raising KeyError.
    if test_result_mara.get('success'):
        print("\n✅ Successfully fixed the numeric value validation issue!")
        print(f"   Financial values ingested: {test_result_mara.get('values_added', 0)}")
        for statement_type, count in test_result_mara.get('financial_data_counts', {}).items():
            print(f"   {statement_type}: {count} values")
    else:
        print(f"\n❌ Still having issues: {test_result_mara.get('error', 'no error message returned')}")
else:
    print("⚠️  EDGAR access not configured")

## 4. Analysis of Duplicate Financial Concept Names

Let's examine whether any financial concepts in the database share an identical name; such duplicates could indicate data quality issues or legitimate variations.

In [None]:
# Analyze duplicate financial concept names
if not concepts_df.empty:
    print("🔍 Analyzing Financial Concept Name Duplicates:")

    # Group by name and count occurrences; a count above 1 means an exact duplicate
    name_counts = concepts_df['name'].value_counts()
    duplicate_names = name_counts[name_counts > 1]

    print(f"   Total unique concept names: {len(name_counts)}")
    print(f"   Total financial concepts: {len(concepts_df)}")
    print(f"   Names appearing multiple times: {len(duplicate_names)}")

    if len(duplicate_names) > 0:
        print(f"   Concepts with duplicate names: {duplicate_names.sum()} out of {len(concepts_df)}")
        print(f"   Percentage of concepts with duplicate names: {(duplicate_names.sum() / len(concepts_df)) * 100:.1f}%")

        print("\n📊 Duplicate Names Summary:")
        for name, count in duplicate_names.head(10).items():
            print(f"   '{name}': {count} concepts")

        # Show detailed analysis for the five most duplicated names
        print("\n📋 Detailed Analysis of Duplicate Concepts:")
        for name in duplicate_names.head(5).index:
            duplicate_concepts = concepts_df[concepts_df['name'] == name]
            print(f"\n   📌 Name: '{name}' ({len(duplicate_concepts)} concepts)")

            for _, concept in duplicate_concepts.iterrows():
                # str() so a missing description renders as text instead of failing on slicing
                description_text = str(concept['description'])
                desc_preview = description_text[:80] + '...' if len(description_text) > 80 else description_text
                print(f"      ID: {concept['id'][:8]}...")
                print(f"      Description: {desc_preview}")
                print(f"      Labels: {concept['labels']}")
                print(f"      Usage: {concept['total_values']} values, {concept['companies_using']} companies")
                print("      ---")

        # Create summary DataFrame for export: one row per concept that shares a name
        duplicate_summary = []
        for name in duplicate_names.index:
            duplicate_concepts = concepts_df[concepts_df['name'] == name]
            for i, (_, concept) in enumerate(duplicate_concepts.iterrows()):
                duplicate_summary.append({
                    'concept_name': name,
                    'duplicate_number': i + 1,
                    'total_duplicates': len(duplicate_concepts),
                    'concept_id': concept['id'],
                    'description': concept['description'],
                    'labels': concept['labels'],
                    'total_values': concept['total_values'],
                    'companies_using': concept['companies_using']
                })

        duplicates_df = pd.DataFrame(duplicate_summary)
        # NOTE(review): absolute, user-specific path; the export fails on any machine
        # where this directory does not exist — consider a configurable output dir.
        output_file = '/home/steven/symbology/outputs/duplicate_financial_concepts.csv'
        duplicates_df.to_csv(output_file, index=False)
        print(f"\n💾 Detailed duplicate analysis exported to: {output_file}")

    else:
        print("\n✅ No duplicate concept names found!")
        print(f"   All {len(concepts_df)} financial concepts have unique names.")

    # Additional analysis: Similar names (potential typos or variations)
    print("\n🔤 Looking for Similar Names (potential variations):")
    concept_names = concepts_df['name'].tolist()

    # Group *distinct* names by a normalized key (lower-cased, with underscores,
    # spaces and hyphens removed). Exact duplicates are already reported above,
    # so each spelling is considered once; otherwise a name that merely occurs
    # twice would be listed as "similar" to itself.
    normalized_names = {}
    for name in dict.fromkeys(concept_names):  # distinct names, first-seen order
        normalized = name.lower().replace('_', '').replace(' ', '').replace('-', '')
        normalized_names.setdefault(normalized, []).append(name)

    similar_groups = {norm: names for norm, names in normalized_names.items() if len(names) > 1}

    if similar_groups:
        print(f"   Found {len(similar_groups)} groups of similar names:")
        for group_number, names in enumerate(list(similar_groups.values())[:5], start=1):
            print(f"      Group {group_number}: {names}")
        if len(similar_groups) > 5:
            print(f"      ... and {len(similar_groups) - 5} more groups")
    else:
        print("   No similar name patterns found.")

else:
    print("⚠️  No concepts data available for duplicate analysis")