In [3]:
import time as time

import numpy as np
import pandas as pd
import requests

In [5]:
# Load the delistings CSV and prepare it in one chained expression:
#   - cik:  rendered as text and left-padded with zeros to 10 characters
#   - date: parsed from string into datetime
#   - the remaining columns are empty placeholders for SEC data
sick_df = pd.read_csv(
    '/Users/sudz4/Desktop/SPS_local/sps/x_bio_weapon/all_delistings_since_2020.csv'
).assign(
    cik=lambda frame: frame['cik'].astype(str).str.zfill(10),
    date=lambda frame: pd.to_datetime(frame['date']),
    ticker='',
    sic='',
    sic_description='',
    is_biotech=False,
    biotech_identification_method='',
)

In [1]:
# Sanity check: number of rows, then a preview of the first five rows
print(sick_df.shape[0])
display(sick_df.head(n=5))

NameError: name 'sick_df' is not defined

In [None]:


# Substrings that flag a company name as potentially biotech-related
biotech_keywords = [
    'therapeutics',
    'pharma',
    'biotech',
    'bio',
    'health sciences',
    'medical',
    'genetics',
    'genomics',
    'life sciences',
    'biologics',
    'biosciences',
    'oncology',
    'diagnostic',
    'biomedical',
    'laboratory',
    'clinical',
    'research',
    'drug',
    'therapeutic',
    'cell therapy',
    'gene therapy',
    'molecular',
    'pharmaceutical',
]

# SIC code -> description for biotech and related industries.
# biotech_sic_codes (below) lists the codes in this same order.
biotech_sic_dict = dict([
    ('2833', 'Medicinal Chemicals and Botanical Products'),
    ('2834', 'Pharmaceutical Preparations'),
    ('2835', 'In Vitro and In Vivo Diagnostic Substances'),
    ('2836', 'Biological Products, Except Diagnostic Substances'),
    ('3826', 'Laboratory Analytical Instruments'),
    ('3841', 'Surgical and Medical Instruments'),
    ('3842', 'Orthopedic, Prosthetic, and Surgical Supplies'),
    ('3843', 'Dental Equipment and Supplies'),
    ('3844', 'X-Ray Apparatus and Tubes'),
    ('3845', 'Electromedical and Electrotherapeutic Apparatus'),
    ('3851', 'Ophthalmic Goods'),
    ('8071', 'Medical Laboratories'),
    ('8731', 'Commercial Physical and Biological Research'),
    ('8732', 'Commercial Nonphysical Research'),
    ('8733', 'Noncommercial Research Organizations'),
    ('8734', 'Testing Laboratories'),
    ('8999', 'Services, Not Elsewhere Classified (including Medical Research)'),
    ('6324', 'Hospital and Medical Service Plans'),
    ('5122', 'Drugs, Proprietaries, and Druggists Sundries'),
])

biotech_sic_codes = [*biotech_sic_dict]

# Headers sent with every SEC API request
headers = {}
headers['User-Agent'] = 'sanspeursystems matt@sanspeursystems.com'
headers['Accept-Encoding'] = 'gzip, deflate'
headers['Host'] = 'data.sec.gov'

# Process each company in sick_df:
#   1. flag it as biotech when its name contains any biotech keyword, then
#   2. fetch its SEC submissions record to fill ticker / SIC columns and flag it
#      as biotech when the SIC code is in biotech_sic_codes.
# Results are written back into sick_df in place.
total_companies = len(sick_df)
for position, (idx, row) in enumerate(sick_df.iterrows(), start=1):
    # str() guards against a missing (NaN) company name, which has no .lower()
    company = str(row['company'])
    # Count by position rather than idx + 1 so progress is right for any index
    print(f"Processing {company} ({position}/{total_companies})")

    # Identification tags are collected locally and the column is rewritten from
    # them, so re-running this cell does not append duplicate tags.
    methods = []

    # Check company name for biotech keywords (plain substring match)
    company_name_lower = company.lower()
    matching_keywords = [kw for kw in biotech_keywords if kw in company_name_lower]
    if matching_keywords:
        methods.append(f"name_match:{','.join(matching_keywords)}")
        sick_df.at[idx, 'is_biotech'] = True
        sick_df.at[idx, 'biotech_identification_method'] = methods[0]

    # Get SEC data
    cik = row['cik']
    url = f'https://data.sec.gov/submissions/CIK{cik}.json'

    try:
        # A timeout keeps one stalled connection from hanging the whole loop
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            data = response.json()

            # 'tickers' can be present but empty; fall back to '' instead of
            # raising IndexError and skipping the SIC handling for this company
            tickers = data.get('tickers') or ['']
            sick_df.at[idx, 'ticker'] = tickers[0]
            # `or ''` keeps a null SIC from being stored as the string 'None'
            sic_code = str(data.get('sic') or '').strip()
            sick_df.at[idx, 'sic'] = sic_code
            sick_df.at[idx, 'sic_description'] = data.get('sicDescription', '') or biotech_sic_dict.get(sic_code, '')

            # Check if SIC code matches biotech
            if sic_code in biotech_sic_codes:
                methods.append(f"sic_{sic_code}")
                sick_df.at[idx, 'is_biotech'] = True
                sick_df.at[idx, 'biotech_identification_method'] = ';'.join(methods)

            print(f"Company: {company}")
            print(f"SIC: {sic_code} - {sick_df.at[idx, 'sic_description']}")
            print(f"Ticker: {sick_df.at[idx, 'ticker']}")
            print(f"Is Biotech: {sick_df.at[idx, 'is_biotech']}")
            print("---")
        else:
            # Report non-200 responses so missing SEC data is visible in the log
            print(f"HTTP {response.status_code} for {company} (CIK {cik})")

    except (requests.RequestException, ValueError) as e:
        # Network failures and undecodable JSON are skipped (best effort);
        # programming errors are deliberately left to surface.
        print(f"Error for {company}: {str(e)}")

    time.sleep(0.1)  # Respect SEC rate limits

# Summarise the enrichment results held in sick_df: keep the rows flagged as
# biotech, print them, save them to CSV, and print summary breakdowns.

# Filter for biotech companies (copy so the subset is independent of sick_df)
biotech_df = sick_df[sick_df['is_biotech']].copy()

# Display results
print("\nPotential Biotech Companies Found:")
print(biotech_df[['date', 'company', 'ticker', 'sic', 'sic_description', 'biotech_identification_method']])

# Save results with detailed information
biotech_df.to_csv('biotech_delistings_detailed.csv', index=False)

# Print summary statistics
total_processed = len(sick_df)
print("\nSummary Statistics:")
print(f"Total companies processed: {total_processed}")
print(f"Companies identified as biotech: {len(biotech_df)}")
if total_processed > 0:  # avoid dividing by zero on an empty input file
    print(f"Percentage biotech: {(len(biotech_df) / total_processed) * 100:.1f}%")

# Print identification method breakdown
print("\nIdentification Method Breakdown:")
print(biotech_df['biotech_identification_method'].value_counts())

# Print SIC code distribution for identified biotech companies
print("\nSIC Code Distribution for Identified Biotech Companies:")
sic_distribution = biotech_df['sic'].value_counts()
for sic, company_count in sic_distribution.items():
    # Codes outside biotech_sic_dict (e.g. name-only matches) show as 'Unknown'
    description = biotech_sic_dict.get(sic, 'Unknown')
    print(f"SIC {sic}: {description} - {company_count} companies")