In [15]:
import pandas as pd
import numpy as np
from workalendar.usa import UnitedStates

cal = UnitedStates()

# 1. Read VBA sheet as main data framework
print("Reading ARKK VBA data...")
complete_data = pd.read_excel('ARKK521.xlsx', sheet_name='VBA')
complete_data['Date'] = pd.to_datetime(complete_data['Date'])

# 2. Extract company names and industry information for lookup
print("Getting company names and industry info...")
stocks_info = pd.read_excel('arkstocksinfo.xlsx', sheet_name='Sheet1')

# Use Name (B column) to match and get H column (long_comp_name) content
# B column = "Bloomberg Name", H column = "long_comp_name"
name_lookup = stocks_info.drop_duplicates('Bloomberg Name').set_index('Bloomberg Name')['long_comp_name'].to_dict()
industry_lookup = stocks_info.drop_duplicates('Bloomberg Name').set_index('Bloomberg Name')['gics_industry_name'].to_dict()

# Add columns by mapping Name to company info
complete_data['Company_Name'] = complete_data['Name'].map(name_lookup)
complete_data['Industry'] = complete_data['Name'].map(industry_lookup)

print(f"Company names found for {complete_data['Company_Name'].notna().sum():,} records")
print(f"Industry info found for {complete_data['Industry'].notna().sum():,} records")

# 3. Process price data
print("Processing price data...")
df_raw = pd.read_excel('ALLARK521Stocks.xlsx', sheet_name='Sheet1', header=None)

# Extract BBID and prices
price_lookup = {}
for i in range(2, df_raw.shape[1], 2):
    # Check if both date and price columns exist
    if i < df_raw.shape[1] and (i + 1) < df_raw.shape[1]:
        bbid = df_raw.iloc[0, i]
        if pd.notna(bbid) and str(bbid).startswith('BBG'):
            stock_prices = {}
            date_col = i
            price_col = i + 1
            
            for row in range(2, df_raw.shape[0]):
                # Add safety check for row access
                if row < df_raw.shape[0]:
                    date_val = df_raw.iloc[row, date_col]
                    price_val = df_raw.iloc[row, price_col]
                    
                    if pd.notna(date_val) and pd.notna(price_val):
                        try:
                            date_obj = pd.to_datetime(date_val)
                            stock_prices[date_obj] = float(price_val)
                        except:
                            continue
            
            if stock_prices:
                price_lookup[bbid] = stock_prices

print(f"Found price data for {len(price_lookup)} stocks")

# 4. Add stock price column
complete_data['Stock_Price'] = complete_data.apply(
    lambda row: price_lookup.get(row['BBID'], {}).get(row['Date'], np.nan), axis=1
)

# 5. Filter business days only
print("Filtering business days...")
original_count = len(complete_data)
complete_data = complete_data[complete_data['Date'].apply(lambda x: cal.is_working_day(x.date()))]
print(f"Filtered: {original_count:,} -> {len(complete_data):,} rows")

# 6. Set date as index
complete_data.set_index('Date', inplace=True)
complete_data.sort_index(inplace=True)

# 7. Display final results
print(f"\n" + "="*60)
print("📊 FINAL DATAFRAME")
print("="*60)
print(f"Shape: {complete_data.shape[0]:,} rows × {complete_data.shape[1]} columns")
print(f"Date range: {complete_data.index.min().strftime('%Y-%m-%d')} to {complete_data.index.max().strftime('%Y-%m-%d')}")

print(f"\nColumns:")
for i, col in enumerate(complete_data.columns, 1):
    non_null = complete_data[col].notna().sum()
    print(f"{i:2d}. {col:<20} ({non_null:,} non-null)")

print(f"\nSample data:")
print(complete_data[['Company_Name', 'Industry', 'Stock_Price']].head())

# Statistics
price_missing = complete_data['Stock_Price'].isnull().sum()
print(f"\nStock_Price missing: {price_missing:,} / {len(complete_data):,} ({price_missing/len(complete_data)*100:.1f}%)")

# Save results
complete_data.to_excel('merged_ark_data_clean.xlsx')
print(f"\n✅ Saved to merged_ark_data_clean.xlsx")

Reading ARKK VBA data...
Getting company names and industry info...
Company names found for 170,323 records
Industry info found for 161,049 records
Processing price data...


  warn(msg)


Found price data for 433 stocks
Filtering business days...
Filtered: 170,323 -> 116,862 rows

📊 FINAL DATAFRAME
Shape: 116,862 rows × 11 columns
Date range: 2014-10-31 to 2025-05-21

Columns:
 1. Name                 (116,862 non-null)
 2. Weight               (116,862 non-null)
 3. Position             (116,862 non-null)
 4. Market Value         (116,862 non-null)
 5. CUSIP                (104,900 non-null)
 6. BBID                 (116,862 non-null)
 7. ISIN                 (113,749 non-null)
 8. Fund Flows           (116,862 non-null)
 9. Company_Name         (116,862 non-null)
10. Industry             (110,478 non-null)
11. Stock_Price          (115,270 non-null)

Sample data:
                Company_Name                                  Industry  \
Date                                                                     
2014-10-31  MercadoLibre Inc                          Broadline Retail   
2014-10-31      Autodesk Inc                                  Software   
2014-10-31    

In [None]:
# Complete ARK Data Processing Pipeline
# 1. Fill missing prices → 2. Remove low quality dates → 3. Clean data & calculate weights

import yfinance as yf
from datetime import timedelta
import numpy as np

print("🚀 COMPLETE ARK DATA PROCESSING PIPELINE")
print("="*80)

# ============================================================================
# STEP 1: FILL MISSING STOCK PRICES USING YFINANCE
# ============================================================================

print("📊 STEP 1: FILLING MISSING STOCK PRICES")
print("="*60)

# Find records with missing stock prices
missing_mask = complete_data['Stock_Price'].isna()
missing_records = complete_data[missing_mask].copy()
missing_count = len(missing_records)

print(f"Found {missing_count:,} records with missing stock prices")

if missing_count > 0:
    # Extract ticker and create unique combinations
    print("Extracting unique ticker-date combinations...")
    missing_records['Ticker'] = missing_records['Name'].str.split().str[0]
    missing_records['Date'] = missing_records.index
    
    # Filter valid tickers
    def is_valid_ticker(ticker):
        if pd.isna(ticker):
            return False
        ticker_str = str(ticker).upper()
        if (ticker_str[0].isdigit() or 'XX' in ticker_str or len(ticker_str) > 6):
            return False
        return True
    
    valid_missing = missing_records[missing_records['Ticker'].apply(is_valid_ticker)].copy()
    unique_combinations = valid_missing[['Ticker', 'Date']].drop_duplicates()
    print(f"Valid (ticker, date) combinations to fetch: {len(unique_combinations):,}")
    
    # Fetch prices by ticker
    failed_tickers = set()
    filled_prices = {}
    successful_count = 0
    
    ticker_groups = unique_combinations.groupby('Ticker')
    
    for ticker, group in ticker_groups:
        if ticker in failed_tickers:
            continue
            
        print(f"Processing {ticker} ({len(group)} dates)...")
        
        try:
            dates = group['Date'].tolist()
            min_date = min(dates)
            max_date = max(dates)
            
            stock = yf.Ticker(ticker)
            hist = stock.history(start=min_date, end=max_date + timedelta(days=1))
            
            if hist.empty:
                failed_tickers.add(ticker)
                continue
            
            ticker_success = 0
            for date in dates:
                date_str = date.strftime('%Y-%m-%d')
                matching_rows = hist[hist.index.strftime('%Y-%m-%d') == date_str]
                
                if not matching_rows.empty:
                    price = matching_rows['Close'].iloc[0]
                    filled_prices[(ticker, date)] = price
                    ticker_success += 1
            
            if ticker_success > 0:
                successful_count += ticker_success
                print(f"   ✅ Found {ticker_success}/{len(dates)} prices for {ticker}")
            else:
                failed_tickers.add(ticker)
                
        except Exception as e:
            failed_tickers.add(ticker)
    
    # Update dataframe with filled prices
    if successful_count > 0:
        print("Updating dataframe with filled prices...")
        for idx, row in valid_missing.iterrows():
            ticker = row['Ticker']
            date = row['Date']
            if (ticker, date) in filled_prices:
                complete_data.loc[idx, 'Stock_Price'] = filled_prices[(ticker, date)]
    
    print(f"✅ Price filling complete: {successful_count:,} prices filled")

# ============================================================================
# STEP 2: ANALYZE AND REMOVE DATES WITH POOR PRICE COVERAGE
# ============================================================================

print(f"\n📊 STEP 2: ANALYZING PRICE COVERAGE BY DATE")
print("="*60)

# Calculate price coverage by date
daily_stats = complete_data.groupby(complete_data.index).agg({
    'Stock_Price': lambda x: x.notna().sum(),
    'Name': 'count'
})
daily_stats.columns = ['Tickers_With_Price', 'Total_Tickers']
daily_stats['Price_Coverage_Pct'] = (daily_stats['Tickers_With_Price'] / daily_stats['Total_Tickers'] * 100).round(1)

print(f"Overall price coverage: {daily_stats['Price_Coverage_Pct'].mean():.1f}%")

# Show worst dates
worst_dates = daily_stats.nsmallest(10, 'Price_Coverage_Pct')
print(f"\nWorst 10 dates (price coverage):")
for date, row in worst_dates.iterrows():
    print(f"   {date.strftime('%Y-%m-%d')}: {row['Tickers_With_Price']:.0f}/{row['Total_Tickers']:.0f} ({row['Price_Coverage_Pct']:.1f}%)")

# Remove dates with <50% coverage
threshold = 50.0
dates_to_remove = daily_stats[daily_stats['Price_Coverage_Pct'] < threshold].index

if len(dates_to_remove) > 0:
    print(f"\nRemoving {len(dates_to_remove)} dates with <{threshold}% price coverage...")
    original_count = len(complete_data)
    complete_data = complete_data[~complete_data.index.isin(dates_to_remove)]
    removed_count = original_count - len(complete_data)
    print(f"✅ Removed {removed_count:,} records from {len(dates_to_remove):,} dates")
else:
    print(f"✅ No dates with <{threshold}% coverage found")

# ============================================================================
# STEP 3: DATA CLEANING AND WEIGHT CALCULATION
# ============================================================================

print(f"\n📊 STEP 3: DATA CLEANING AND WEIGHT CALCULATION")
print("="*60)

# Remove USD entries
print("Removing USD Currency entries...")
original_count = len(complete_data)
complete_data = complete_data[complete_data['Name'] != 'USD Curncy']
usd_removed = original_count - len(complete_data)
print(f"Removed {usd_removed:,} USD Currency entries")

# Recalculate Market Value
print("Recalculating Market Value...")
complete_data['Market Value'] = np.nan
mask = (complete_data['Position'].notna()) & (complete_data['Stock_Price'].notna())
complete_data.loc[mask, 'Market Value'] = complete_data.loc[mask, 'Position'] * complete_data.loc[mask, 'Stock_Price']
calculated_count = complete_data['Market Value'].notna().sum()
print(f"Market Value calculated for: {calculated_count:,} records")

# Calculate ETF Market Value and Weights
print("Calculating ETF Market Value and Weights...")
complete_data['Weight'] = np.nan

# Daily ETF market values
daily_total_mv = complete_data.groupby(complete_data.index)['Market Value'].sum()
complete_data['ETF Market Value'] = complete_data.index.map(daily_total_mv)

# Calculate weights
weight_mask = ((complete_data['Market Value'].notna()) & 
              (complete_data['ETF Market Value'].notna()) & 
              (complete_data['ETF Market Value'] != 0))
complete_data.loc[weight_mask, 'Weight'] = (
    complete_data.loc[weight_mask, 'Market Value'] / 
    complete_data.loc[weight_mask, 'ETF Market Value']
)

weight_calculated = complete_data['Weight'].notna().sum()
print(f"Weight calculated for: {weight_calculated:,} records")

# Validate weights
weight_sums = complete_data.groupby(complete_data.index)['Weight'].sum()
tolerance = 0.001
problematic_dates = weight_sums[abs(weight_sums - 1.0) > tolerance]

if len(problematic_dates) > 0:
    print(f"⚠️  {len(problematic_dates)} dates with weight sum ≠ 1")
else:
    print("✅ All dates have weight sum ≈ 1.0")

# ============================================================================
# STEP 4: FINAL EXPORT TO EXCEL
# ============================================================================

print(f"\n📊 STEP 4: EXPORTING FINAL DATA")
print("="*60)

# Prepare for export
export_data = complete_data.reset_index()
export_data['Date'] = export_data['Date'].dt.date

# Remove unwanted columns
columns_to_remove = ['CUSIP', 'ISIN']
for col in columns_to_remove:
    if col in export_data.columns:
        export_data = export_data.drop(columns=[col])

# Reorder columns
column_order = ['Date', 'Name', 'Company_Name', 'Industry', 'Position', 'Stock_Price', 
                'Market Value', 'ETF Market Value', 'Weight', 'BBID', 'Fund']

existing_cols = [col for col in column_order if col in export_data.columns]
other_cols = [col for col in export_data.columns if col not in column_order]
final_column_order = existing_cols + other_cols
export_data = export_data[final_column_order]

# Export to Excel
filename = 'ark_data_complete_processed.xlsx'
with pd.ExcelWriter(filename, engine='openpyxl') as writer:
    export_data.to_excel(writer, sheet_name='ARK_Data_Final', index=False)
    
    # Auto-adjust column widths
    workbook = writer.book
    worksheet = writer.sheets['ARK_Data_Final']
    
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 50)
        worksheet.column_dimensions[column_letter].width = adjusted_width

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print(f"\n🎉 PROCESSING COMPLETE!")
print("="*80)
print(f"✅ Final data exported to '{filename}'")
print(f"📊 Final dataset: {len(export_data):,} rows × {len(export_data.columns)} columns")
print(f"📅 Date range: {export_data['Date'].min()} to {export_data['Date'].max()}")
print(f"💹 Records with prices: {complete_data['Stock_Price'].notna().sum():,}")
print(f"📈 Overall price coverage: {complete_data['Stock_Price'].notna().sum() / len(complete_data) * 100:.1f}%")
print(f"⚖️  Average daily weight sum: {weight_sums.mean():.6f}")



🚀 COMPLETE ARK DATA PROCESSING PIPELINE
📊 STEP 1: FILLING MISSING STOCK PRICES
Found 908 records with missing stock prices
Extracting unique ticker-date combinations...
Valid (ticker, date) combinations to fetch: 358
Processing ARMH (357 dates)...


$ARMH: possibly delisted; no price data found  (1d 2014-10-31 00:00:00 -> 2016-07-26 00:00:00) (Yahoo error = "Data doesn't exist for startDate = 1414728000, endDate = 1469505600")
$NVTAQ: possibly delisted; no timezone found


Processing NVTAQ (1 dates)...
✅ Price filling complete: 0 prices filled

📊 STEP 2: ANALYZING PRICE COVERAGE BY DATE
Overall price coverage: 99.1%

Worst 10 dates (price coverage):
   2024-02-06: 34/36 (94.4%)
   2023-03-30: 28/29 (96.6%)
   2023-03-31: 28/29 (96.6%)
   2023-04-03: 28/29 (96.6%)
   2023-04-04: 28/29 (96.6%)
   2023-04-05: 28/29 (96.6%)
   2023-04-06: 28/29 (96.6%)
   2023-04-10: 28/29 (96.6%)
   2023-04-11: 28/29 (96.6%)
   2023-04-12: 28/29 (96.6%)
✅ No dates with <50.0% coverage found

📊 STEP 3: DATA CLEANING AND WEIGHT CALCULATION
Removing USD Currency entries...
Removed 2,556 USD Currency entries
Recalculating Market Value...
Market Value calculated for: 112,742 records
Calculating ETF Market Value and Weights...
Weight calculated for: 112,686 records
⚠️  1 dates with weight sum ≠ 1

📊 STEP 4: EXPORTING FINAL DATA

🎉 PROCESSING COMPLETE!
✅ Final data exported to 'ark_data_complete_processed.xlsx'
📊 Final dataset: 113,650 rows × 11 columns
📅 Date range: 2014-10-31 to

In [None]:

print("DataFrame Info:")
complete_data.info()

print("\nFirst 5 rows:")
complete_data.head()

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 113650 entries, 2014-10-31 to 2025-05-21
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Name              113650 non-null  object 
 1   Weight            112686 non-null  float64
 2   Position          113650 non-null  float64
 3   Market Value      112742 non-null  float64
 4   CUSIP             104313 non-null  object 
 5   BBID              113650 non-null  object 
 6   ISIN              113115 non-null  object 
 7   Fund Flows        113650 non-null  float64
 8   Company_Name      113650 non-null  object 
 9   Industry          109860 non-null  object 
 10  Stock_Price       112742 non-null  float64
 11  ETF Market Value  113650 non-null  float64
dtypes: float64(6), object(6)
memory usage: 11.3+ MB

First 5 rows:


Unnamed: 0_level_0,Name,Weight,Position,Market Value,CUSIP,BBID,ISIN,Fund Flows,Company_Name,Industry,Stock_Price,ETF Market Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-10-31,MELI US Equity,0.01465,424.0,57727.6,58733R102,BBG000GQPB11,US58733R1023,0.0,MercadoLibre Inc,Broadline Retail,136.15,3940483.0
2014-10-31,ADSK US Equity,0.020443,1400.0,80556.0,52769106,BBG000BM7HL0,US0527691069,0.0,Autodesk Inc,Software,57.54,3940483.0
2014-10-31,PRLB US Equity,0.026278,1584.0,103546.08,743713109,BBG000BT13B3,US7437131094,0.0,Proto Labs Inc,Machinery,65.37,3940483.0
2014-10-31,WOLF US Equity,0.009714,1216.0,38279.68,977852102,BBG000BG14P4,US9778521024,0.0,Wolfspeed Inc,Semiconductors & Semiconductor Equipment,31.48,3940483.0
2014-10-31,NVDA US Equity,0.030784,248320.0,121304.32,67066G104,BBG000BBJQV0,US67066G1040,0.0,NVIDIA Corp,Semiconductors & Semiconductor Equipment,0.4885,3940483.0


In [30]:
# ARK Portfolio Analysis - Small Positions (<1%) Performance Study
# Phase 1: Statistical Analysis of Less Than 1% Positions

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

print("🔍 ARK SMALL POSITIONS ANALYSIS - PHASE 1")
print("="*80)

# Step 1: Filter positions with weight < 1%
print("📊 STEP 1: IDENTIFYING SMALL POSITIONS (<1%)")
print("="*60)

# Create small positions dataset
small_positions = complete_data[complete_data['Weight'] < 0.01].copy()
large_positions = complete_data[complete_data['Weight'] >= 0.01].copy()

print(f"Total records: {len(complete_data):,}")
print(f"Small positions (<1%): {len(small_positions):,} ({len(small_positions)/len(complete_data)*100:.1f}%)")
print(f"Large positions (≥1%): {len(large_positions):,} ({len(large_positions)/len(complete_data)*100:.1f}%)")

# Step 2: Calculate daily returns for analysis
print(f"\n📈 STEP 2: CALCULATING DAILY RETURNS")
print("="*60)

def calculate_returns(df):
    """Calculate daily returns for each stock using vectorized operations"""
    df = df.copy().sort_index()
    df['Price_Return'] = df.groupby('Name')['Stock_Price'].pct_change()
    return df

# Calculate returns for all data at once, then split
complete_data_with_returns = calculate_returns(complete_data)

# Filter out invalid returns
valid_returns_mask = (
    (complete_data_with_returns['Price_Return'].notna()) & 
    (abs(complete_data_with_returns['Price_Return']) < 2.0)  # Remove >200% daily moves
)

complete_data_clean = complete_data_with_returns[valid_returns_mask]

# Split into small and large positions
small_positions = complete_data_clean[complete_data_clean['Weight'] < 0.01].copy()
large_positions = complete_data_clean[complete_data_clean['Weight'] >= 0.01].copy()

print(f"Small positions with valid returns: {len(small_positions):,}")
print(f"Large positions with valid returns: {len(large_positions):,}")

# Step 3: Statistical Analysis - Mean, Median
print(f"\n📊 STEP 3: STATISTICAL ANALYSIS - MEAN & MEDIAN")
print("="*60)

small_returns = small_positions['Price_Return']
large_returns = large_positions['Price_Return']

print("SMALL POSITIONS (<1%) STATISTICS:")
print(f"  Mean daily return: {small_returns.mean():.4f} ({small_returns.mean()*100:.2f}%)")
print(f"  Median daily return: {small_returns.median():.4f} ({small_returns.median()*100:.2f}%)")
print(f"  Standard deviation: {small_returns.std():.4f} ({small_returns.std()*100:.2f}%)")

print(f"\nLARGE POSITIONS (≥1%) STATISTICS:")
print(f"  Mean daily return: {large_returns.mean():.4f} ({large_returns.mean()*100:.2f}%)")
print(f"  Median daily return: {large_returns.median():.4f} ({large_returns.median()*100:.2f}%)")
print(f"  Standard deviation: {large_returns.std():.4f} ({large_returns.std()*100:.2f}%)")

# Step 4: Distribution Analysis
print(f"\n📊 STEP 4: DISTRIBUTION ANALYSIS")
print("="*60)

percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]

print("SMALL POSITIONS DISTRIBUTION:")
for p in percentiles:
    value = np.percentile(small_returns, p)
    print(f"  {p:2d}th percentile: {value:.4f} ({value*100:.2f}%)")

print(f"\nLARGE POSITIONS DISTRIBUTION:")
for p in percentiles:
    value = np.percentile(large_returns, p)
    print(f"  {p:2d}th percentile: {value:.4f} ({value*100:.2f}%)")

# Step 5: Top 1% and Bottom 1% Returns
print(f"\n🏆 STEP 5: TOP 1% AND BOTTOM 1% RETURNS")
print("="*60)

small_top_1pct = small_returns.quantile(0.99)
small_bottom_1pct = small_returns.quantile(0.01)
large_top_1pct = large_returns.quantile(0.99)
large_bottom_1pct = large_returns.quantile(0.01)

print(f"SMALL POSITIONS:")
print(f"  Top 1% threshold: {small_top_1pct:.4f} ({small_top_1pct*100:.2f}%)")
print(f"  Bottom 1% threshold: {small_bottom_1pct:.4f} ({small_bottom_1pct*100:.2f}%)")

print(f"\nLARGE POSITIONS:")
print(f"  Top 1% threshold: {large_top_1pct:.4f} ({large_top_1pct*100:.2f}%)")
print(f"  Bottom 1% threshold: {large_bottom_1pct:.4f} ({large_bottom_1pct*100:.2f}%)")

# Step 6: Market Value Analysis
print(f"\n💰 STEP 6: MARKET VALUE ANALYSIS")
print("="*60)

# Calculate daily market value changes
print("📊 CALCULATING DAILY MARKET VALUE CHANGES")
print("-" * 40)

# Calculate daily market value change = Previous Market Value * Daily Return
small_positions['Daily_MV_Change'] = small_positions['Market Value'] * small_positions['Price_Return']
large_positions['Daily_MV_Change'] = large_positions['Market Value'] * large_positions['Price_Return']

# Remove extreme outliers and NaN values
small_mv_clean = small_positions['Daily_MV_Change'].dropna()
large_mv_clean = large_positions['Daily_MV_Change'].dropna()

# Filter out extreme values (likely data errors)
small_mv_clean = small_mv_clean[abs(small_mv_clean) < small_mv_clean.quantile(0.999)]
large_mv_clean = large_mv_clean[abs(large_mv_clean) < large_mv_clean.quantile(0.999)]

print(f"Small positions daily changes: {len(small_mv_clean):,} observations")
print(f"Large positions daily changes: {len(large_mv_clean):,} observations")

# Daily market value statistics
print(f"\n💸 DAILY MARKET VALUE IMPACT:")
print("-" * 40)

small_daily_avg = small_mv_clean.mean()
large_daily_avg = large_mv_clean.mean()
small_daily_median = small_mv_clean.median()
large_daily_median = large_mv_clean.median()

print(f"SMALL POSITIONS (<1%) DAILY MARKET VALUE IMPACT:")
print(f"  Average daily change: ${small_daily_avg:,.2f}")
print(f"  Median daily change: ${small_daily_median:,.2f}")
print(f"  Total days analyzed: {len(small_mv_clean):,}")

print(f"\nLARGE POSITIONS (≥1%) DAILY MARKET VALUE IMPACT:")
print(f"  Average daily change: ${large_daily_avg:,.2f}")
print(f"  Median daily change: ${large_daily_median:,.2f}")
print(f"  Total days analyzed: {len(large_mv_clean):,}")

# Cumulative gains/losses
print(f"\n📈 CUMULATIVE GAINS/LOSSES:")
print("-" * 40)

small_total_change = small_mv_clean.sum()
large_total_change = large_mv_clean.sum()

print(f"TOTAL MARKET VALUE IMPACT OVER ENTIRE PERIOD:")
print(f"  Small positions total change: ${small_total_change:,.2f}")
print(f"  Large positions total change: ${large_total_change:,.2f}")

if small_total_change > 0:
    print(f"  ✅ Small positions MADE money: ${small_total_change:,.2f}")
else:
    print(f"  ❌ Small positions LOST money: ${abs(small_total_change):,.2f}")

if large_total_change > 0:
    print(f"  ✅ Large positions MADE money: ${large_total_change:,.2f}")
else:
    print(f"  ❌ Large positions LOST money: ${abs(large_total_change):,.2f}")

# Win/Loss days analysis
print(f"\n🏆 WIN/LOSS DAYS ANALYSIS:")
print("-" * 40)

small_positive_days = (small_mv_clean > 0).sum()
small_negative_days = (small_mv_clean < 0).sum()
small_win_rate = small_positive_days / len(small_mv_clean)

large_positive_days = (large_mv_clean > 0).sum()
large_negative_days = (large_mv_clean < 0).sum()
large_win_rate = large_positive_days / len(large_mv_clean)

print(f"SMALL POSITIONS WIN/LOSS RECORD:")
print(f"  Positive days: {small_positive_days:,} ({small_win_rate*100:.1f}%)")
print(f"  Negative days: {small_negative_days:,} ({(1-small_win_rate)*100:.1f}%)")
print(f"  Average gain on positive days: ${small_mv_clean[small_mv_clean > 0].mean():,.2f}")
print(f"  Average loss on negative days: ${small_mv_clean[small_mv_clean < 0].mean():,.2f}")

print(f"\nLARGE POSITIONS WIN/LOSS RECORD:")
print(f"  Positive days: {large_positive_days:,} ({large_win_rate*100:.1f}%)")
print(f"  Negative days: {large_negative_days:,} ({(1-large_win_rate)*100:.1f}%)")
print(f"  Average gain on positive days: ${large_mv_clean[large_mv_clean > 0].mean():,.2f}")
print(f"  Average loss on negative days: ${large_mv_clean[large_mv_clean < 0].mean():,.2f}")

# Extreme days analysis
print(f"\n🎯 EXTREME MARKET VALUE DAYS:")
print("-" * 40)

# Best and worst days
small_best_day = small_mv_clean.max()
small_worst_day = small_mv_clean.min()
large_best_day = large_mv_clean.max()
large_worst_day = large_mv_clean.min()

print(f"EXTREME DAYS:")
print(f"  Small positions best day: ${small_best_day:,.2f}")
print(f"  Small positions worst day: ${small_worst_day:,.2f}")
print(f"  Large positions best day: ${large_best_day:,.2f}")
print(f"  Large positions worst day: ${large_worst_day:,.2f}")

# Top 1% and bottom 1% days
small_top_1pct_mv = small_mv_clean.quantile(0.99)
small_bottom_1pct_mv = small_mv_clean.quantile(0.01)
large_top_1pct_mv = large_mv_clean.quantile(0.99)
large_bottom_1pct_mv = large_mv_clean.quantile(0.01)

print(f"\nTOP 1% AND BOTTOM 1% THRESHOLDS:")
print(f"  Small positions - Top 1% days: ${small_top_1pct_mv:,.2f}")
print(f"  Small positions - Bottom 1% days: ${small_bottom_1pct_mv:,.2f}")
print(f"  Large positions - Top 1% days: ${large_top_1pct_mv:,.2f}")
print(f"  Large positions - Bottom 1% days: ${large_bottom_1pct_mv:,.2f}")

# Summary table
print(f"\n📋 MARKET VALUE SUMMARY TABLE:")
print("-" * 40)

summary_mv_data = {
    'Metric': ['Average Daily Change ($)', 'Median Daily Change ($)', 'Total Change ($)',
               'Win Rate (%)', 'Best Day ($)', 'Worst Day ($)'],
    'Small Positions (<1%)': [
        f"{small_daily_avg:,.2f}",
        f"{small_daily_median:,.2f}",
        f"{small_total_change:,.2f}",
        f"{small_win_rate*100:.1f}",
        f"{small_best_day:,.2f}",
        f"{small_worst_day:,.2f}"
    ],
    'Large Positions (≥1%)': [
        f"{large_daily_avg:,.2f}",
        f"{large_daily_median:,.2f}",
        f"{large_total_change:,.2f}",
        f"{large_win_rate*100:.1f}",
        f"{large_best_day:,.2f}",
        f"{large_worst_day:,.2f}"
    ]
}

summary_mv_df = pd.DataFrame(summary_mv_data)
print(summary_mv_df.to_string(index=False))

# Step 7: Phase 1 Summary
print(f"\n🎯 STEP 7: PHASE 1 SUMMARY")
print("="*80)

performance_diff = small_returns.mean() - large_returns.mean()

print(f"📈 HISTORICAL PERFORMANCE SUMMARY:")
print(f"  Small positions mean return: {small_returns.mean()*100:.3f}% daily")
print(f"  Large positions mean return: {large_returns.mean()*100:.3f}% daily")
print(f"  Performance difference: {performance_diff*100:.3f}% daily")

print(f"\n📊 DISTRIBUTION CHARACTERISTICS:")
print(f"  Small positions volatility: {small_returns.std()*100:.2f}%")
print(f"  Large positions volatility: {large_returns.std()*100:.2f}%")

print(f"\n🏆 EXTREME RETURNS:")
print(f"  Small positions - Top 1%: {small_top_1pct*100:.2f}%")
print(f"  Small positions - Bottom 1%: {small_bottom_1pct*100:.2f}%")
print(f"  Large positions - Top 1%: {large_top_1pct*100:.2f}%")
print(f"  Large positions - Bottom 1%: {large_bottom_1pct*100:.2f}%")

print(f"\n💰 MARKET VALUE CONCLUSION:")
if small_total_change > large_total_change:
    print(f"  ✅ SMALL POSITIONS OUTPERFORMED in absolute dollar terms!")
    print(f"     Small positions gained ${small_total_change - large_total_change:,.2f} more than large positions")
else:
    print(f"  ❌ SMALL POSITIONS UNDERPERFORMED in absolute dollar terms!")
    print(f"     Small positions lost ${abs(small_total_change - large_total_change):,.2f} compared to large positions")

print(f"\n💡 KEY INSIGHTS:")
print(f"   • Small positions average ${small_daily_avg:,.2f} per day")
print(f"   • Large positions average ${large_daily_avg:,.2f} per day") 
print(f"   • Small positions win rate: {small_win_rate*100:.1f}%")
print(f"   • Large positions win rate: {large_win_rate*100:.1f}%")

print(f"\n✅ Phase 1 Complete!")
print(f"📊 Ready for Phase 2: Holding Period and Stability Analysis")

🔍 ARK SMALL POSITIONS ANALYSIS - PHASE 1
📊 STEP 1: IDENTIFYING SMALL POSITIONS (<1%)
Total records: 113,650
Small positions (<1%): 35,890 (31.6%)
Large positions (≥1%): 76,796 (67.6%)

📈 STEP 2: CALCULATING DAILY RETURNS
Small positions with valid returns: 35,524
Large positions with valid returns: 76,385

📊 STEP 3: STATISTICAL ANALYSIS - MEAN & MEDIAN
SMALL POSITIONS (<1%) STATISTICS:
  Mean daily return: -0.0062 (-0.62%)
  Median daily return: 0.0000 (0.00%)
  Standard deviation: 0.1526 (15.26%)

LARGE POSITIONS (≥1%) STATISTICS:
  Mean daily return: 0.0020 (0.20%)
  Median daily return: 0.0004 (0.04%)
  Standard deviation: 0.0931 (9.31%)

📊 STEP 4: DISTRIBUTION ANALYSIS
SMALL POSITIONS DISTRIBUTION:
   1th percentile: -0.7133 (-71.33%)
   5th percentile: -0.0799 (-7.99%)
  10th percentile: -0.0472 (-4.72%)
  25th percentile: -0.0178 (-1.78%)
  50th percentile: 0.0000 (0.00%)
  75th percentile: 0.0145 (1.45%)
  90th percentile: 0.0389 (3.89%)
  95th percentile: 0.0637 (6.37%)
  99th 