In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from datetime import datetime, timedelta

In [2]:
# --- 1. CONFIGURATION ---
TICKER = '^NSEI'  # Nifty 50 Index ticker on Yahoo Finance
YEARS_OF_DATA = 3 # Fetch 3 years of data (2 for training, 1 for testing)
TRAIN_RATIO = 0.8  # Use 80% of the historical data for training

In [3]:
# Calculate start and end dates for data retrieval
end_date = datetime.today().strftime('%Y-%m-%d')
start_date = (datetime.today() - timedelta(days=365 * YEARS_OF_DATA)).strftime('%Y-%m-%d')

print(f"Fetching Nifty 50 data ({TICKER}) from {start_date} to {end_date}...")

Fetching Nifty 50 data (^NSEI) from 2022-10-26 to 2025-10-25...


In [4]:
# --- 2. DATA RETRIEVAL ---
try:
    data = yf.download(TICKER, start=start_date, end=end_date, interval="1d")
    if data.empty:
        raise ValueError("No data returned for the specified period. Check the ticker or dates.")
except Exception as e:
    print(f"Error fetching data: {e}")
    exit()

  data = yf.download(TICKER, start=start_date, end=end_date, interval="1d")
[*********************100%***********************]  1 of 1 completed


In [11]:
# Filter only necessary columns (Open, Close, Volume)
df = data[['Open', 'Close', 'Volume','High','Low']].copy()
print(f"Data successfully loaded. Total days: {len(df)}")

Data successfully loaded. Total days: 740


In [12]:
# --- 3. FEATURE ENGINEERING & TARGET CREATION ---

# a) Create the Target Variable (Direction)
# Shift the 'Close' price backwards by 1 day to see what the next day's close was.
# If Next Day Close > Today's Close, Direction = 1 (Up)
# If Next Day Close <= Today's Close, Direction = 0 (Down/Flat)
df['Target'] = np.where(df['Close'].shift(-1) > df['Close'], 1, 0)

# b) Calculate Simple Technical Indicators
# 1. Moving Averages (MA)
df['SMA_10'] = df['Close'].rolling(window=10).mean()
df['SMA_50'] = df['Close'].rolling(window=50).mean()

In [13]:
print(df.columns)


MultiIndex([(  'Open', '^NSEI'),
            ( 'Close', '^NSEI'),
            ('Volume', '^NSEI'),
            (  'High', '^NSEI'),
            (   'Low', '^NSEI'),
            ('Target',      ''),
            ('SMA_10',      ''),
            ('SMA_50',      '')],
           names=['Price', 'Ticker'])


In [14]:
# 2. Momentum/Volatility Features
# The difference between short and long MAs is a common feature (like MACD line)
df['MA_Diff'] = df['SMA_10'] - df['SMA_50']
# Daily range normalized by the opening price
df['Day_Range_Ratio'] = (df['High'] - df['Low']) / df['Open']

In [15]:
# 3. Lagged Returns
# Use past performance as predictors
df['Lag_1_Day_Return'] = df['Close'].pct_change().shift(1)
df['Lag_5_Day_Return'] = df['Close'].pct_change().rolling(window=5).mean().shift(1)

In [16]:
# c) Create the list of features (excluding Open, Close, Target, etc.)
features = [col for col in df.columns if col not in ['Open', 'Close', 'Volume', 'Target', 'High', 'Low', 'Adj Close']]

In [17]:
# --- 4. DATA CLEANING and SPLIT ---

# Drop any rows with NaN values resulting from rolling calculations (usually the first 50 days)
df.dropna(inplace=True)
print(f"Data shape after cleaning (rows used): {len(df)}")

# Separate features (X) and target (y)
X = df[features]
y = df['Target']

# Time-Series Split: Use the initial part for training and the later part for testing
split_index = int(TRAIN_RATIO * len(df))
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"Training set size: {len(X_train)} days ({X_train.index.min().date()} to {X_train.index.max().date()})")
print(f"Testing set size: {len(X_test)} days ({X_test.index.min().date()} to {X_test.index.max().date()})")


Data shape after cleaning (rows used): 691
Training set size: 552 days (2023-01-05 to 2025-04-02)
Testing set size: 139 days (2025-04-03 to 2025-10-24)


In [18]:
# --- 5. MODEL TRAINING AND BACKTESTING ---

# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Train the model
model.fit(X_train, y_train)

# Backtest (predict on the unseen test set)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- BACKTESTING RESULTS (Test Set) ---")
print(f"Model Accuracy on unseen data: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Down (0)', 'Up (1)']))



--- BACKTESTING RESULTS (Test Set) ---
Model Accuracy on unseen data: 1.00

Classification Report:
               precision    recall  f1-score   support

    Down (0)       1.00      1.00      1.00        63
      Up (1)       1.00      1.00      1.00        76

    accuracy                           1.00       139
   macro avg       1.00      1.00      1.00       139
weighted avg       1.00      1.00      1.00       139



In [19]:
# --- 6. FUTURE PREDICTION ---

# The last row of the main dataframe contains the most recent data needed to predict the next day
last_day_data = df.iloc[-1]
# We need to drop the 'Target' column since we don't know the future yet
X_future = last_day_data[features].to_frame().T

# Check if the feature calculation resulted in NaNs for the last day
if X_future.isnull().values.any():
    print("\n--- PREDICTION ERROR ---")
    print("Cannot make a prediction. Some features for the last day are missing or incomplete. This usually happens if the most recent data is not complete or the rolling window calculation failed.")
else:
    # Make the prediction
    future_prediction = model.predict(X_future)[0]
    future_proba = model.predict_proba(X_future)[0]

    direction = "UP" if future_prediction == 1 else "DOWN/FLAT"
    up_proba = future_proba[1] * 100

    print("\n--- NEXT DAY PREDICTION ---")
    print(f"Prediction for the next trading day ({df.index[-1].date() + timedelta(days=1)}):")
    print(f"Direction: {direction}")
    print(f"Confidence (Probability of UP): {up_proba:.2f}%")
    print("\nNote: A 50% confidence level is essentially a coin flip.")

# --- 7. FEATURE IMPORTANCE (Insight) ---

print("\n--- FEATURE IMPORTANCE (Insights) ---")
feature_importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
print("Features ranked by importance to the model:")
print(feature_importances)




--- NEXT DAY PREDICTION ---
Prediction for the next trading day (2025-10-25):
Direction: DOWN/FLAT
Confidence (Probability of UP): 9.00%

Note: A 50% confidence level is essentially a coin flip.

--- FEATURE IMPORTANCE (Insights) ---
Features ranked by importance to the model:
(Target, )              0.849111
(Day_Range_Ratio, )     0.018256
(MA_Diff, )             0.017410
(SMA_50, )              0.016070
(Lag_1_Day_Return, )    0.015065
(Volume, ^NSEI)         0.014960
(Lag_5_Day_Return, )    0.014050
(Close, ^NSEI)          0.013300
(SMA_10, )              0.011776
(Open, ^NSEI)           0.010224
(Low, ^NSEI)            0.010135
(High, ^NSEI)           0.009642
dtype: float64


In [20]:
def predict_nifty_direction(date_input, verbose=True):
    """
    Predict Nifty direction (next trading day) for the given date using the trained model.
    date_input: str or datetime-like. The function will use the latest trading row on or before this date.
    Returns: dict with keys:
      - 'date_used' : Timestamp of the trading row used for features
      - 'predicted_for' : approximate next day (date_used + 1 day)
      - 'prediction' : 1 (UP) or 0 (DOWN/FLAT)
      - 'direction' : "UP" or "DOWN/FLAT"
      - 'prob_up' : probability of UP (0-1)
      - 'prob_down' : probability of DOWN/FLAT (0-1)
    """
    # normalize input to Timestamp (date-only)
    date_ts = pd.to_datetime(date_input)

    # find the last trading day on or before the provided date
    candidate_idx = df.index[df.index <= date_ts]
    if len(candidate_idx) == 0:
        raise ValueError("No trading data on or before the provided date. Provide a later date.")
    date_used = candidate_idx.max()

    # prepare feature row
    row = df.loc[date_used]
    X_row = row[features].to_frame().T

    # check for NaNs in features
    if X_row.isnull().values.any():
        raise ValueError(f"Features for the chosen date ({date_used.date()}) contain NaNs. Cannot make prediction.")

    # predict
    proba = model.predict_proba(X_row)[0]
    pred = model.predict(X_row)[0]
    prob_up = float(proba[1])
    prob_down = float(proba[0])
    direction = "UP" if pred == 1 else "DOWN/FLAT"

    # try to identify the next trading day index if available, otherwise approximate
    try:
        pos = df.index.get_loc(date_used)
        if pos < len(df) - 1:
            predicted_for = df.index[pos + 1]
        else:
            predicted_for = date_used + timedelta(days=1)
    except Exception:
        predicted_for = date_used + timedelta(days=1)

    result = {
        "date_used": date_used,
        "predicted_for": predicted_for,
        "prediction": int(pred),
        "direction": direction,
        "prob_up": prob_up,
        "prob_down": prob_down,
    }

    if verbose:
        print(f"Prediction for next trading day after {date_used.date()} (approx {predicted_for.date()}):")
        print(f"  Direction: {direction}")
        print(f"  Probability UP: {prob_up:.2%}")
        print(f"  Probability DOWN/FLAT: {prob_down:.2%}")

    return result

In [22]:
from datetime import timedelta

# Prepare predictions for 2025-10-15 through 2025-10-20 and compare with actual next-trading-day market moves.

dates_to_check = pd.date_range("2025-9-15", "2025-10-20", freq="D")
comparison_rows = []

def _close_on_or_before(ts):
    """Return close price for the latest trading day on or before ts (None if not available)."""
    if ts in df.index:
        return float(df.loc[ts, "Close"]), ts
    # fetch a small window around the date and pick the latest <= ts
    start = (ts - timedelta(days=5)).strftime("%Y-%m-%d")
    end = (ts + timedelta(days=1)).strftime("%Y-%m-%d")
    tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
    if tmp.empty:
        return None, None
    available = tmp.index[tmp.index <= ts]
    if len(available) == 0:
        return None, None
    idx = available.max()
    return float(tmp.loc[idx, "Close"]), idx

def _next_trading_close_after(ts):
    """Return (close, index) for the first trading day strictly after ts (None if not available)."""
    start = (ts + timedelta(days=1)).strftime("%Y-%m-%d")
    end = (ts + timedelta(days=10)).strftime("%Y-%m-%d")
    tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
    if tmp.empty:
        return None, None
    idx = tmp.index.min()
    return float(tmp.loc[idx, "Close"]), idx

for d in dates_to_check:
    try:
        pred = predict_nifty_direction(d, verbose=False)
    except Exception as e:
        comparison_rows.append({
            "date_requested": pd.to_datetime(d).date(),
            "error": str(e)
        })
        continue

    date_used = pred["date_used"]
    predicted_for = pred["predicted_for"]
    predicted_direction = pred["direction"]
    prob_up = pred["prob_up"]

    # get close on date_used (from df if available, else fetch)
    close_used, used_idx = _close_on_or_before(date_used)
    # get actual next trading day close (first trading day after date_used)
    close_next, next_idx = _next_trading_close_after(date_used)

    if close_used is None or close_next is None:
        actual_direction = None
        correct = None
    else:
        actual_direction = "UP" if close_next > close_used else "DOWN/FLAT"
        correct = (actual_direction == predicted_direction)

    comparison_rows.append({
        "date_requested": pd.to_datetime(d).date(),
        "date_used": date_used.date() if isinstance(date_used, (pd.Timestamp,)) else date_used,
        "used_index_in_df": bool(used_idx is not None and used_idx in df.index),
        "predicted_for": (next_idx.date() if next_idx is not None else (pd.to_datetime(predicted_for).date() if predicted_for is not None else None)),
        "predicted_direction": predicted_direction,
        "pred_prob_up": prob_up,
        "actual_direction": actual_direction,
        "close_used": close_used,
        "close_next_trading": close_next,
        "prediction_correct": correct,
        "error": None
    })

# Build DataFrame summary and show results
results_df = pd.DataFrame(comparison_rows)
print(results_df[[
    "date_requested", "date_used", "predicted_for",
    "predicted_direction", "pred_prob_up",
    "actual_direction", "prediction_correct"
]])
# Simple accuracy over rows where we could compare
valid = results_df["prediction_correct"].notnull()
if valid.any():
    accuracy = results_df.loc[valid, "prediction_correct"].mean()
    print(f"\nComparison accuracy on available rows: {accuracy:.2%} ({valid.sum()} rows)")
else:
    print("\nNo valid comparisons could be made (missing market data or feature NaNs).")

  return float(df.loc[ts, "Close"]), ts
  tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
  return float(tmp.loc[idx, "Close"]), idx
  return float(df.loc[ts, "Close"]), ts
  tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
  return float(tmp.loc[idx, "Close"]), idx
  return float(df.loc[ts, "Close"]), ts
  tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
  return float(tmp.loc[idx, "Close"]), idx
  return float(df.loc[ts, "Close"]), ts
  tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
  return float(tmp.loc[idx, "Close"]), idx
  return float(df.loc[ts, "Close"]), ts
  tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
  return float(tmp.loc[idx, "Close"]), idx
  return float(df.loc[ts, "Close"]), ts
  tmp = yf.download(TICKER, start=start, end=end, interval="1d", progress=False)
  return float(tmp.loc[idx, "Close"]), idx
  return float(d

   date_requested   date_used predicted_for predicted_direction  pred_prob_up  \
0      2025-09-15  2025-09-15    2025-09-16                  UP          0.90   
1      2025-09-16  2025-09-16    2025-09-17                  UP          0.89   
2      2025-09-17  2025-09-17    2025-09-18                  UP          0.94   
3      2025-09-18  2025-09-18    2025-09-19           DOWN/FLAT          0.08   
4      2025-09-19  2025-09-19    2025-09-22           DOWN/FLAT          0.09   
5      2025-09-20  2025-09-19    2025-09-22           DOWN/FLAT          0.09   
6      2025-09-21  2025-09-19    2025-09-22           DOWN/FLAT          0.09   
7      2025-09-22  2025-09-22    2025-09-23           DOWN/FLAT          0.07   
8      2025-09-23  2025-09-23    2025-09-24           DOWN/FLAT          0.07   
9      2025-09-24  2025-09-24    2025-09-25           DOWN/FLAT          0.03   
10     2025-09-25  2025-09-25    2025-09-26           DOWN/FLAT          0.10   
11     2025-09-26  2025-09-2

  return float(tmp.loc[idx, "Close"]), idx
