In [1]:
# Uncomment to install dependencies (%pip installs into the running kernel's environment):
# %pip install --quiet pandas numpy scikit-learn tensorflow pandas_ta

import pandas as pd
import numpy as np
import pandas_ta as ta  # For RSI, MACD, Moving Averages
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Record the library versions this notebook was executed with.
for lib_label, lib_module in (("TensorFlow", tf), ("Pandas", pd), ("Numpy", np)):
    print(f"{lib_label} version:", lib_module.__version__)




TensorFlow version: 2.18.0
Pandas version: 2.2.3
Numpy version: 1.26.4


In [None]:
# Read the merged Apple price + news-sentiment CSV and put it in
# chronological order with a clean 0..N-1 index.

csv_file = "merged_news_stock_data.csv"  # adjust if needed

df = (
    pd.read_csv(csv_file, parse_dates=["Date"])
    .sort_values("Date")
    .dropna(subset=["Date"])  # rows without a date cannot be placed in time
    .reset_index(drop=True)
)

print("Columns in dataset:", df.columns.tolist())
print("Row count:", len(df))
df.head()

## 2) Compute RSI, MACD, Moving Average

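Use **pandas_ta** to compute three technical indicators from the closing price:
1. **RSI** (14-day)
2. **MACD** (12, 26, 9) and its signal line
3. **50-day SMA** (simple moving average)

Rows where any indicator is still NaN (the warm-up period of the rolling calculations) are dropped afterwards.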

In [None]:

# Technical indicators derived from the closing price.
close_prices = df["Close"]

# ta.macd returns a frame with MACD_12_26_9, MACDh_12_26_9, MACDs_12_26_9
macd_data = ta.macd(close_prices)

indicator_cols = ["RSI", "MACD", "MACD_signal", "SMA_50"]

df = (
    df.assign(
        RSI=ta.rsi(close_prices, length=14),
        MACD=macd_data["MACD_12_26_9"],
        MACD_signal=macd_data["MACDs_12_26_9"],  # the signal line
        SMA_50=ta.sma(close_prices, length=50),  # 50-day moving average
    )
    # The rolling calculations leave NaNs at the start of the series; drop them.
    .dropna(subset=indicator_cols)
    .reset_index(drop=True)
)

print("After adding RSI, MACD, SMA_50:")
df.head()

In [None]:
# Binary target: 1 when the next row's close is strictly above this row's close.
next_close = df["Close"].shift(-1)

df = (
    df.assign(
        future_close=next_close,
        target_up=next_close.gt(df["Close"]).astype(int),
    )
    # The final row has no following close, so it cannot be labelled.
    .dropna(subset=["future_close"])
    .reset_index(drop=True)
)

df.head()

## 4) Select Feature Columns


In [None]:
# Model inputs: price/volume, technical indicators, then sentiment scores.
feature_cols = [
    "Close", "Volume",
    "RSI", "MACD", "MACD_signal", "SMA_50",
    "sentiment_polarity", "sentiment_neg", "sentiment_neu", "sentiment_pos",
]

# Keep only what the model needs and discard any row with a missing value.
model_columns = ["Date", *feature_cols, "target_up"]
df_model = df.loc[:, model_columns].dropna()

print("df_model columns:", df_model.columns.tolist())
df_model.head()

## 5) Build Sequences for LSTM


In [None]:
# Scale the features and turn the table into fixed-length windows for the LSTM.
#
# Two things are deliberate here:
#   * The scaler is fitted on the training rows only. Fitting it on the whole
#     table would let the mean/std of the later (test) period leak into the
#     training inputs and make the test score optimistic.
#   * `target_up` on a row already describes the move from that row's close to
#     the NEXT row's close. Each window is therefore labelled with the target
#     of its LAST row, so the model sees every day up to and including "today"
#     when predicting tomorrow's direction. Taking the label one row later
#     would skip a day between the window and the predicted move.

lookback = 5
train_fraction = 0.8  # must match the fraction used for the chronological train/test split

n_rows = len(df_model)
n_windows = max(n_rows - lookback + 1, 0)
n_train_windows = int(train_fraction * n_windows)
if n_train_windows < 1:
    raise ValueError(
        f"Need more rows to build training windows: got {n_rows} rows "
        f"with lookback={lookback}."
    )

# Training windows 0 .. n_train_windows-1 cover rows 0 .. n_train_windows+lookback-2.
n_fit_rows = n_train_windows + lookback - 1

# 1) Scale features (so RSI, Volume, Sentiment are on similar numeric scales),
#    using statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(df_model[feature_cols].iloc[:n_fit_rows])

df_model_scaled = df_model.copy()
df_model_scaled[feature_cols] = scaler.transform(df_model[feature_cols])

# 2) Convert to sequences
vals = df_model_scaled[feature_cols].values  # shape (N, num_features)
targets = df_model_scaled["target_up"].values

# Row indices of every window: window k covers rows k .. k+lookback-1.
window_idx = np.arange(n_windows)[:, None] + np.arange(lookback)[None, :]

X = vals[window_idx]                                   # (samples, lookback, num_features)
y = targets[lookback - 1 : lookback - 1 + n_windows]   # target of each window's last row

print("X shape:", X.shape, "y shape:", y.shape)
# e.g. X shape: (samples, 5, #features), y shape: (samples,)

## 6) Train/Test Split (Time-based)
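We split 80/20 by index rather than at random: the first 80% of the sequences (the earliest dates) are used for training and the last 20% for testing, so the model is always evaluated on data that comes after its training period.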

In [None]:
# Chronological split: everything before `train_size` trains, the rest tests.
train_size = int(0.8 * len(X))

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])


## 7) Build LSTM Model
We use a single LSTM layer with 64 units, followed by a `Dense(1)` layer with a sigmoid activation that outputs the probability of an "up" day (binary classification).

In [None]:
# Fix the random seeds before any weights are created so that weight
# initialisation and the batch shuffling during training are repeatable
# across "Restart & Run All". set_random_seed seeds Python's `random`,
# NumPy and the TensorFlow backend in one call.
# NOTE(review): some GPU kernels may still be non-deterministic; confirm
# whether op-level determinism is also required for this analysis.
SEED = 42
keras.utils.set_random_seed(SEED)

# Input is one window: (lookback time steps, one value per feature).
model = keras.Sequential(
    [
        layers.Input(shape=(lookback, len(feature_cols))),
        # return_sequences=False: only the final hidden state feeds the classifier.
        layers.LSTM(64, return_sequences=False),
        # Single sigmoid unit -> probability that the target is 1 ("up").
        layers.Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

model.summary()

## 8) Train the Model

In [None]:
# Training hyperparameters.
epochs, batch_size = 10, 32

# A fraction of the training data is held out by Keras for validation;
# per-epoch metrics are kept in `history`.
fit_options = {
    "validation_split": 0.2,
    "epochs": epochs,
    "batch_size": batch_size,
    "verbose": 1,
}
history = model.fit(X_train, y_train, **fit_options)

## 9) Evaluate on Test Data

In [None]:
# Predicted probabilities for the held-out windows, thresholded at 0.5
# to obtain hard 0/1 class predictions.
y_pred_prob = model.predict(X_test)
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

acc = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Test Accuracy:", acc)
print(report_text)

## 10) Next Steps
- Adjust **lookback** if you want a longer or shorter window.
- Tweak the LSTM layer size (64→128) or add dropout.
- Add more features or different sentiment transforms.
- Increase the **epochs** if underfitting, but watch for overfitting.
- Use a more advanced train/test methodology like walk-forward validation.
- Evaluate the model’s predictions with a trading strategy or backtest approach.