# Feature Analysis
Compute the 45 engineered features, then explore them with a correlation heatmap, distribution plots, feature importance (Random Forest, permutation), and PCA/t-SNE projections.

In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path("..").resolve()))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data.data_loader import load_ohlcv
from src.feature_engineering import compute_all_features, FEATURE_COLUMNS
from src.data_preprocessing import drop_missing_or_fill

In [None]:
# Read the sample CSV through load_ohlcv, then run it through the project's
# feature computation followed by its missing-value handling.
df = load_ohlcv("../data/sample_data.csv")
df = drop_missing_or_fill(compute_all_features(df))

# Restrict the feature matrix to the declared feature columns that are
# actually present in the processed frame (order follows FEATURE_COLUMNS).
feat_cols = [name for name in FEATURE_COLUMNS if name in df.columns]
X = df.loc[:, feat_cols]
print("Features shape:", X.shape)

In [None]:
# Heatmap of the pairwise correlations between the feature columns.
# The colour scale is centred on 0 and clipped to [-0.5, 0.5], so every
# correlation beyond +/-0.5 is drawn at the saturated end of the palette.
heatmap_style = {"cmap": "RdBu_r", "center": 0, "vmin": -0.5, "vmax": 0.5}

plt.figure(figsize=(14, 12))
corr = X.corr()
sns.heatmap(corr, **heatmap_style)
plt.title("Feature correlation matrix")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forward one-step return: close[t+1] / close[t] - 1. The last row has no
# following close, so its forward return is NaN.
next_return = df["close"].shift(-1) / df["close"] - 1

# Drop rows without a valid forward return BEFORE binarising. `NaN >= 0`
# evaluates to False, so casting to int first would silently label the final
# row as 0 and leave dropna() with nothing to remove.
next_return = next_return.reindex(X.index).dropna()

# Target: 1 when the next close is at or above the current close, else 0.
y = (next_return >= 0).astype(int)

# Keep the feature matrix aligned with the rows that have a valid label.
X = X.loc[y.index]

# Fixed random_state keeps the importance ranking reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Index by the fitted matrix's own columns so names always line up with
# rf.feature_importances_.
imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
imp.head(15).plot(kind="barh", figsize=(8, 6))
plt.title("Top 15 feature importance (Random Forest)")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the feature matrix onto its first two principal components.
# NOTE(review): PCA is variance-driven; if the features are not already on
# comparable scales, PC1/PC2 will mostly reflect the largest-scale columns.
# Confirm whether upstream preprocessing standardises them.
pca = PCA(n_components=2)
Xp = pca.fit_transform(X)

# Keep the scatter's return value: it is the mappable that carries the colour
# mapping actually used for the points.
points = plt.scatter(Xp[:, 0], Xp[:, 1], c=y, alpha=0.5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA of features")

# Build the colorbar from the scatter itself. A freshly constructed
# ScalarMappable is not attached to any Axes (recent Matplotlib refuses to
# place a colorbar for it) and is not tied to the plotted data.
plt.colorbar(points, label="Direction")
plt.show()