**Comparing Feature Selection for Multi-Layer Perceptron (NN)**

- Price features only
- Aggregated sentiment only
- Price + sentiment features

In [31]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

def generate_rolling_mlp_data(df, window_size=14, ticker='AAPL', features=['open', 'high', 'low', 'close', 'volume', 'sentiment']):
    df = df[df['ticker'] == ticker].copy()
    df = df.sort_values(by='date')
    X, y = [], []

    for i in range(window_size, len(df) - 1):
        window = df.iloc[i - window_size:i]
        next_day = df.iloc[i + 1]
        flat_features = window[features].values.flatten()
        X.append(flat_features)
        y.append(next_day['label'])

    return np.array(X), np.array(y)

# Load dataset
df = pd.read_csv("data/dataset.csv", sep="\t")

# Define feature sets
feature_sets = {
    "Price-only MLP": ["open", "high", "low", "close", "volume"],
    "Sentiment-only MLP": ["sentiment"],
    "Price+Sentiment MLP": ["open", "high", "low", "close", "volume", "sentiment"],
    "Price + Technical Indicators": ["open", "high", "low", "close", "volume", "wma_close",	"sma_close", "lag_1_close", "lag_7_close", "volatility"]
}

# Number of randomized runs
N_RUNS = 10

for label, features in feature_sets.items():
    accuracies = []

    for run in range(N_RUNS):
        X, y = generate_rolling_mlp_data(df, window_size=30, ticker='AAPL', features=features)

        # Train/test split
        split_idx = int(0.8 * len(X))
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]


        # Train model with different random seed
        clf = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1500, random_state=run)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)

    # Summary stats
    mean_acc = np.mean(accuracies)
    std_acc = np.std(accuracies)

    print(f"\n=== {label} ===")
    print(f"Mean Accuracy over {N_RUNS} runs: {mean_acc:.4f}")
    print(f"Std Dev: {std_acc:.4f}")



=== Price-only MLP ===
Mean Accuracy over 10 runs: 0.6533
Std Dev: 0.0285

=== Sentiment-only MLP ===
Mean Accuracy over 10 runs: 0.5598
Std Dev: 0.0327

=== Price+Sentiment MLP ===
Mean Accuracy over 10 runs: 0.6543
Std Dev: 0.0187

=== Price + Technical Indicators ===
Mean Accuracy over 10 runs: 0.6511
Std Dev: 0.0293
