# Feature Return Analysis

Analyze which features are predictive of future returns using:
- Information Coefficient (IC): correlation between feature and forward returns
- Spearman rank correlation (robust to outliers)
- Feature vs return scatter plots
- Correlation heatmaps
- IC curves over different horizons

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from scipy.stats import spearmanr, pearsonr
from datetime import datetime, timedelta

# Notebook-wide plotting defaults: seaborn's dark grid theme and a
# 12 x 6 default figure size for every figure that does not set its own.
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# config
import os

# The Polygon API key is a credential and must not live in the notebook.
# It is read from the POLYGON_API_KEY environment variable; when the variable
# is unset the key is an empty string and the API request will be rejected.
# NOTE(review): the key that used to be hard-coded here has been exposed in
# source control and should be revoked/rotated.
API_KEY = os.environ.get("POLYGON_API_KEY", "")

# Symbol and date range (YYYY-MM-DD strings) requested from the API.
TICKER = "AAPL"
START_DATE = "2025-10-01"
END_DATE = "2025-11-01"

In [None]:
def pull_polygon_data(ticker, start, end, api_key):
    """Download 1-minute aggregate bars for ``ticker`` from the Polygon REST API.

    The aggregates endpoint is paginated: each response carries at most
    ``limit`` bars plus a ``next_url`` when more are available. All pages are
    followed so that the full requested range is returned rather than only
    the first page.

    Args:
        ticker: Symbol inserted into the request path.
        start: Start of the range, inserted into the request path as given.
        end: End of the range, inserted into the request path as given.
        api_key: API key sent as the ``apiKey`` query parameter.

    Returns:
        DataFrame with columns ``timestamp``, ``open``, ``high``, ``low``,
        ``close``, ``volume`` (one row per bar, in the order received).
        ``timestamp`` is parsed from the ``t`` field as epoch milliseconds and
        is timezone-naive (presumably UTC -- confirm against the API docs).

    Raises:
        ValueError: if fewer than two bars are returned in total (this also
            covers error responses, which carry no ``results``).
    """
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{start}/{end}"
    # Ask for chronological order explicitly (forward-return shifts downstream
    # rely on it) and for the largest page size to minimise round trips.
    params = {'apiKey': api_key, 'sort': 'asc', 'limit': 50000}
    bars = []

    while url:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)
        data = response.json()
        bars.extend(data.get('results') or [])

        # next_url already encodes the query; only the key must be re-sent.
        url = data.get('next_url')
        params = {'apiKey': api_key}

    if len(bars) < 2:
        raise ValueError("not enough data")

    df = pd.DataFrame(bars)
    df['timestamp'] = pd.to_datetime(df['t'], unit='ms')
    df = df.rename(columns={'o':'open','h':'high','l':'low','c':'close','v':'volume'})
    df = df[['timestamp','open','high','low','close','volume']]
    return df

In [None]:
def calculate_features_with_horizons(df, forward_horizons=(1, 2, 3, 5)):
    """Calculate features and forward returns at multiple horizons.

    Args:
        df: Bar data with ``timestamp``, ``open``, ``close`` and ``volume``
            columns. Rows are assumed to be in chronological order, because
            every shift below is positional. The input is not modified.
        forward_horizons: Iterable of bar offsets ``h``; for each one a
            ``return_{h}min`` column is added. The default is an immutable
            tuple so it cannot be shared and mutated between calls.

    Returns:
        A copy of ``df`` with the feature columns ``momentum_1min``,
        ``volatility_1min``, ``price_direction``, ``vwap``, ``vwap_dev``,
        ``hour``, ``minute`` and one ``return_{h}min`` column per horizon.
        Rows containing any NaN are dropped (the first row, which has no
        previous close, and the trailing rows that lack a forward price).
    """
    df = df.copy()

    # features
    df['momentum_1min'] = df['close'].pct_change()
    # squared one-bar return used as a volatility proxy
    df['volatility_1min'] = df['momentum_1min'] ** 2
    df['price_direction'] = (df['close'] > df['open']).astype(int)
    # NOTE(review): this VWAP accumulates over the whole frame and is never
    # reset (e.g. per trading day) -- confirm that is the intended anchor.
    df['vwap'] = (df['close'] * df['volume']).cumsum() / df['volume'].cumsum()
    df['vwap_dev'] = (df['close'] - df['vwap']) / df['vwap']
    df['hour'] = df['timestamp'].dt.hour
    df['minute'] = df['timestamp'].dt.minute

    # forward returns at different horizons
    # NOTE(review): the shift is by row count, so a "h-minute" return spans
    # any gap between consecutive rows (overnight, missing bars).
    for h in forward_horizons:
        df[f'return_{h}min'] = df['close'].shift(-h) / df['close'] - 1

    df = df.dropna()
    return df

In [None]:
# Fetch the raw minute bars, then derive the features and forward returns
# that every later cell reads from `df`.
print(f"loading data for {TICKER}...")
df = calculate_features_with_horizons(
    pull_polygon_data(TICKER, START_DATE, END_DATE, API_KEY),
    forward_horizons=[1, 2, 3, 5],
)
print(f"loaded {len(df)} bars")
df.head()

## Information Coefficient Analysis

The Information Coefficient (IC) measures the correlation between a feature's values and the forward returns that follow them.
- Pearson IC: linear correlation
- Spearman IC: rank correlation (robust to outliers)

In [None]:
def calculate_ic(df, features, return_col='return_1min'):
    """Calculate the Information Coefficient of each feature.

    Args:
        df: DataFrame holding the feature columns and ``return_col``.
        features: Names of the feature columns to evaluate. May be empty.
        return_col: Name of the forward-return column to correlate against.

    Returns:
        DataFrame with one row per feature and the columns ``feature``,
        ``pearson_ic``, ``pearson_pvalue``, ``spearman_ic``,
        ``spearman_pvalue``, sorted by the absolute Pearson IC, largest
        first. The row order therefore generally differs from the order of
        ``features``. An empty ``features`` yields an empty frame with the
        same columns.
    """
    columns = ['feature', 'pearson_ic', 'pearson_pvalue', 'spearman_ic', 'spearman_pvalue']
    target = df[return_col]
    results = []

    for feat in features:
        # pearson correlation
        pearson_ic, p_pearson = pearsonr(df[feat], target)

        # spearman rank correlation (robust to outliers)
        spearman_ic, p_spearman = spearmanr(df[feat], target)

        results.append({
            'feature': feat,
            'pearson_ic': pearson_ic,
            'pearson_pvalue': p_pearson,
            'spearman_ic': spearman_ic,
            'spearman_pvalue': p_spearman
        })

    if not results:
        # Without rows the frame would have no 'pearson_ic' column to sort on.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(results, columns=columns).sort_values('pearson_ic', key=abs, ascending=False)

In [None]:
# Candidate predictor columns evaluated against forward returns.
features = [
    'momentum_1min',
    'volatility_1min',
    'price_direction',
    'vwap_dev',
    'hour',
    'minute',
]

# Rank the features by their IC against the next-bar return and show the table.
ic_results = calculate_ic(df, features, return_col='return_1min')
print("\nInformation Coefficient Rankings (1-min forward return):")
ranking_text = ic_results.to_string(index=False)
print(ranking_text)

## IC Curves Over Different Horizons

In [None]:
# One IC table per forward horizon, keyed by the horizon.
horizons = [1, 2, 3, 5]
ic_by_horizon = {
    h: calculate_ic(df, features, return_col=f'return_{h}min')
    for h in horizons
}

# Left panel shows the Pearson IC, right panel the Spearman IC.
# Each entry: (IC column, y-axis label, panel title).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    ('pearson_ic', 'Pearson IC', 'Pearson IC vs Horizon'),
    ('spearman_ic', 'Spearman IC', 'Spearman IC vs Horizon'),
)

# Draw one curve per feature on each panel.
for feat in features:
    for ax, (ic_col, _, _) in zip(axes, panels):
        curve = []
        for h in horizons:
            table = ic_by_horizon[h]
            curve.append(table.loc[table['feature'] == feat, ic_col].values[0])
        ax.plot(horizons, curve, marker='o', label=feat)

# Identical decoration for both panels; the dashed line marks IC = 0.
for ax, (_, y_label, title) in zip(axes, panels):
    ax.set_xlabel('Forward Horizon (minutes)')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()

## Feature vs Return Scatter Plots

In [None]:
# 2 x 3 grid, one scatter panel per feature against the 1-minute forward return.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Shared styling for the zero reference lines and the IC label box.
zero_line = {'color': 'red', 'linestyle': '--', 'alpha': 0.5}
label_box = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5}

for i, feat in enumerate(features):
    ax = axes[i]
    ax.scatter(df[feat], df['return_1min'], alpha=0.3, s=10)
    ax.set_xlabel(feat)
    ax.set_ylabel('1-min forward return')
    ax.set_title(f'{feat} vs return')
    ax.axhline(y=0, **zero_line)
    ax.axvline(x=0, **zero_line)

    # Annotate the panel with the feature's Pearson IC from ic_results.
    is_feat = ic_results['feature'] == feat
    ic_val = ic_results.loc[is_feat, 'pearson_ic'].values[0]
    ax.text(0.05, 0.95, f'IC={ic_val:.4f}', transform=ax.transAxes,
            verticalalignment='top', bbox=label_box)

plt.tight_layout()
plt.show()

## Feature Correlation Heatmap

In [None]:
# Pairwise correlations among the features and the 1-minute forward return.
heatmap_cols = features + ['return_1min']
corr_matrix = df[heatmap_cols].corr()

# Annotated heatmap with the colour scale centred on zero correlation.
plt.figure(figsize=(10, 8))
heatmap_style = {
    'annot': True,
    'fmt': '.3f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": 0.8},
}
sns.heatmap(corr_matrix, **heatmap_style)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## Winsorization Analysis

Check whether extreme outliers are skewing the IC: winsorize each feature (cap its values at the 1st and 99th percentiles), recalculate the Pearson IC, and compare it with the original.

In [None]:
def winsorize(series, lower=0.01, upper=0.99):
    """Clip a series to its own quantile range.

    Values below the ``lower`` quantile are raised to that quantile and
    values above the ``upper`` quantile are lowered to it; everything in
    between is returned unchanged.
    """
    floor, ceiling = (series.quantile(q) for q in (lower, upper))
    return series.clip(lower=floor, upper=ceiling)

# create winsorized features
df_wins = df.copy()
for feat in features:
    df_wins[f'{feat}_wins'] = winsorize(df[feat])

# exact mapping from each winsorized column name back to its source feature
# (avoids substring replacement touching a '_wins' that is part of a name)
wins_to_feature = {f'{feat}_wins': feat for feat in features}

# compare IC before and after winsorization
ic_original = calculate_ic(df, features, 'return_1min')
ic_winsorized = calculate_ic(df_wins, list(wins_to_feature), 'return_1min')
ic_winsorized['feature'] = ic_winsorized['feature'].map(wins_to_feature)

# calculate_ic returns its rows sorted by |pearson_ic|, and the two tables are
# sorted independently, so their rows must be aligned by feature name rather
# than by position before they are compared.
original_by_feature = ic_original.set_index('feature')['pearson_ic'].reindex(features)
winsorized_by_feature = ic_winsorized.set_index('feature')['pearson_ic'].reindex(features)

comparison = pd.DataFrame({
    'feature': features,
    'original_ic': original_by_feature.values,
    'winsorized_ic': winsorized_by_feature.values,
    'ic_change': (winsorized_by_feature - original_by_feature).values
})

print("\nImpact of Winsorization on IC:")
print(comparison.to_string(index=False))

## Feature Distribution Analysis

In [None]:
# 2 x 3 grid, one 50-bin histogram per feature with mean and median markers.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, feat in enumerate(features):
    ax = axes[i]
    values = df[feat]

    ax.hist(values, bins=50, alpha=0.7, edgecolor='black')
    ax.set_xlabel(feat)
    ax.set_ylabel('frequency')
    ax.set_title(f'{feat} distribution')

    # Dashed vertical markers: red for the mean, green for the median.
    ax.axvline(x=values.mean(), color='red', linestyle='--', label='mean')
    ax.axvline(x=values.median(), color='green', linestyle='--', label='median')
    ax.legend()

plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Descriptive statistics for the feature columns.
print("\nFeature Summary Statistics:")
feature_stats = df[features].describe()
print(feature_stats)

# Descriptive statistics for the forward-return columns at each horizon.
print("\nForward Return Summary Statistics:")
return_cols = [f'return_{h}min' for h in (1, 2, 3, 5)]
print(df[return_cols].describe())