In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [20]:
axis_data = pd.read_csv('F:/Xai_traderx/data/raw/axis_raw.csv')
reliance_data = pd.read_csv('F:/Xai_traderx/data/raw/reliance_raw.csv')

In [21]:
axis_data.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,Stock
0,,AXISBANK.NS,AXISBANK.NS,AXISBANK.NS,AXISBANK.NS,AXISBANK.NS,
1,2020-05-04,401.0525817871094,422.16062520124706,397.9660543544019,422.16062520124706,37853978,Axis
2,2020-05-05,387.31243896484375,416.9333774203813,384.8232844727818,412.2039838854636,50981056,Axis
3,2020-05-06,387.1631164550781,392.73882889219556,366.4533317436688,383.3298122554856,67511628,Axis
4,2020-05-07,395.626220703125,400.50495742630875,381.0397814692974,385.3211150378657,41339309,Axis


In [22]:
reliance_data.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,Stock
0,,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,
1,2020-05-04,651.7192993164062,665.2513523679601,643.6590437820607,653.8989609027406,53456868,Reliance
2,2020-05-05,663.2760009765625,671.608706667689,657.1684258602176,660.1200900918712,44903763,Reliance
3,2020-05-06,663.321533203125,674.3106589376562,656.3965464371961,664.797335147017,40489094,Reliance
4,2020-05-07,684.3006591796875,687.3431400484432,656.1694269836476,660.7103834852225,50026966,Reliance


In [23]:
def clean_stock_data(df):
    # Drop the first two rows and reset the index
    df = df.drop([0, 1]).reset_index(drop=True)
    
    # Drop the 'Stock' column as it's not needed for prediction
    df = df.drop(columns=['Stock'])
    
    # Set the correct column names
    df.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
    
    # Convert 'Date' to datetime and drop rows with invalid 'Date'
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])
    
    # Convert numeric columns to the appropriate type
    numeric_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df

In [24]:
axis_df = clean_stock_data(axis_data)
reliance_df = clean_stock_data(reliance_data)

In [25]:
axis_df.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-05-05,387.312439,416.933377,384.823284,412.203984,50981056
1,2020-05-06,387.163116,392.738829,366.453332,383.329812,67511628
2,2020-05-07,395.626221,400.504957,381.039781,385.321115,41339309
3,2020-05-08,380.392609,407.673768,378.003026,401.749568,33505509
4,2020-05-11,377.903412,391.792906,375.215143,389.303751,30141935


In [26]:
reliance_df.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-05-05,663.276001,671.608707,657.168426,660.12009,44903763
1,2020-05-06,663.321533,674.310659,656.396546,664.797335,40489094
2,2020-05-07,684.300659,687.34314,656.169427,660.710383,50026966
3,2020-05-08,709.207947,717.427041,697.991719,701.579101,84262770
4,2020-05-11,716.019409,733.365887,713.975972,717.472477,67085063


In [27]:
def generate_features(df):
    feat_df = df.copy()
    # Price returns
    feat_df['Close_Return_1D'] = feat_df['Close'].pct_change(1)
    feat_df['Close_Return_3D'] = feat_df['Close'].pct_change(3)
    feat_df['Return'] = feat_df['Close'].pct_change()

    # Price volatility and spreads
    feat_df['High_Low_Spread'] = (feat_df['High'] - feat_df['Low']) / feat_df['Close'].replace(0, np.nan)
    feat_df['price_gap'] = feat_df['Open'] - feat_df['Close'].shift(1)
    feat_df['return_volatility'] = feat_df['Return'].rolling(window=20).std()
    feat_df['return_zscore'] = (feat_df['Return'] - feat_df['Return'].rolling(50).mean()) / feat_df['Return'].rolling(50).std()

    # Volume-based features
    feat_df['volume_ratio'] = feat_df['Volume'] / feat_df['Volume'].rolling(window=20).mean()
    feat_df['Volume_Z'] = (feat_df['Volume'] - feat_df['Volume'].rolling(20).mean()) / feat_df['Volume'].rolling(20).std()
    
    # On-Balance Volume (OBV)
    obv = (np.sign(feat_df['Close'].diff()) * feat_df['Volume']).fillna(0).cumsum()
    feat_df['OBV'] = obv

    # RSI Calculation
    delta = feat_df['Close'].diff()
    gain = delta.clip(lower=0).rolling(14).mean()
    loss = (-delta.clip(upper=0)).rolling(14).mean()
    rs = gain / loss.replace(0, np.nan)
    feat_df['RSI'] = 100 - (100 / (1 + rs))

    # MACD & Histogram
    ema12 = feat_df['Close'].ewm(span=12, adjust=False).mean()
    ema26 = feat_df['Close'].ewm(span=26, adjust=False).mean()
    macd = ema12 - ema26
    signal = macd.ewm(span=9, adjust=False).mean()
    feat_df['MACD_Hist'] = macd - signal

    # ATR (Average True Range)
    high_low = feat_df['High'] - feat_df['Low']
    high_close = np.abs(feat_df['High'] - feat_df['Close'].shift())
    low_close = np.abs(feat_df['Low'] - feat_df['Close'].shift())
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    feat_df['ATR'] = tr.rolling(14).mean()

    # Moving Averages
    feat_df['SMA_10'] = feat_df['Close'].rolling(window=10).mean()
    feat_df['SMA_20'] = feat_df['Close'].rolling(window=20).mean()
    feat_df['SMA_50'] = feat_df['Close'].rolling(window=50).mean()
    feat_df['EMA_12'] = feat_df['Close'].ewm(span=12, adjust=False).mean()
    feat_df['EMA_50'] = feat_df['Close'].ewm(span=50, adjust=False).mean()

    # Bollinger Bands
    rolling_std = feat_df['Close'].rolling(20).std()
    feat_df['bb_upper'] = feat_df['SMA_20'] + (2 * rolling_std)
    feat_df['bb_lower'] = feat_df['SMA_20'] - (2 * rolling_std)
    feat_df['bb_width'] = (feat_df['bb_upper'] - feat_df['bb_lower']) / feat_df['SMA_20'].replace(0, np.nan)
    feat_df['Bollinger_%B'] = (feat_df['Close'] - feat_df['bb_lower']) / (feat_df['bb_upper'] - feat_df['bb_lower']).replace(0, np.nan)

    # Other Features (Bear & Bull Indicators)
    feat_df['bear_score'] = (0.4 * (feat_df['RSI'] < 35).astype(float) +
                              0.3 * (feat_df['MACD_Hist'] < -5).astype(float) +
                              0.3 * (feat_df['Return'] < -0.03).astype(float))
    feat_df['bull_strength'] = ((feat_df['RSI'] > 60) & (feat_df['MACD_Hist'] > 5)).astype(float)
    feat_df['bear_strength'] = ((feat_df['RSI'] < 35) & (feat_df['MACD_Hist'] < -5)).astype(float)

    # Momentum & Volatility
    feat_df['Momentum'] = feat_df['Close'].diff(periods=5)
    feat_df['Volatility'] = feat_df['Close'].rolling(window=14).std()

    # Target Variable
    feat_df['next_close'] = feat_df['Close'].shift(-1)
   

    # Drop missing values from rolling and shifting
    feat_df = feat_df.dropna().reset_index(drop=True)

    return feat_df


In [28]:
axis_df = generate_features(axis_df)
reliance_df = generate_features(reliance_df)

In [29]:
axis_df.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,Close_Return_1D,Close_Return_3D,Return,High_Low_Spread,...,bb_upper,bb_lower,bb_width,Bollinger_%B,bear_score,bull_strength,bear_strength,Momentum,Volatility,next_close
0,2020-07-15,424.799133,442.521933,420.916058,422.160635,43250557,0.021427,-0.029459,0.021427,0.050861,...,454.303081,398.685371,0.130407,0.469522,0.0,0.0,0.0,-17.424042,14.353156,432.117218
1,2020-07-16,432.117218,434.257885,416.634689,430.076123,30183647,0.017227,-0.013748,0.017227,0.040783,...,453.637754,402.198291,0.120209,0.581634,0.0,0.0,0.0,-19.664337,14.293742,431.221161
2,2020-07-17,431.221161,437.991649,422.758035,432.11725,33464985,-0.002074,0.036869,-0.002074,0.035327,...,453.762583,403.671502,0.116839,0.549991,0.0,0.0,0.0,-6.471771,12.168083,432.117218
3,2020-07-20,432.117218,440.87903,427.437595,435.801179,30400586,0.002078,0.017227,0.002078,0.031106,...,453.998923,403.818494,0.116996,0.563939,0.0,0.0,0.0,-6.023743,9.547082,444.264313
4,2020-07-21,444.264313,449.043477,436.597705,438.091197,43147019,0.028111,0.028111,0.028111,0.028014,...,454.422846,403.648464,0.118345,0.799928,0.0,0.0,0.0,28.376343,9.983551,476.872223


In [30]:
axis_df.isnull().sum()

Date                 0
Close                0
High                 0
Low                  0
Open                 0
Volume               0
Close_Return_1D      0
Close_Return_3D      0
Return               0
High_Low_Spread      0
price_gap            0
return_volatility    0
return_zscore        0
volume_ratio         0
Volume_Z             0
OBV                  0
RSI                  0
MACD_Hist            0
ATR                  0
SMA_10               0
SMA_20               0
SMA_50               0
EMA_12               0
EMA_50               0
bb_upper             0
bb_lower             0
bb_width             0
Bollinger_%B         0
bear_score           0
bull_strength        0
bear_strength        0
Momentum             0
Volatility           0
next_close           0
dtype: int64

In [31]:
reliance_df.isnull().sum()

Date                 0
Close                0
High                 0
Low                  0
Open                 0
Volume               0
Close_Return_1D      0
Close_Return_3D      0
Return               0
High_Low_Spread      0
price_gap            0
return_volatility    0
return_zscore        0
volume_ratio         0
Volume_Z             0
OBV                  0
RSI                  0
MACD_Hist            0
ATR                  0
SMA_10               0
SMA_20               0
SMA_50               0
EMA_12               0
EMA_50               0
bb_upper             0
bb_lower             0
bb_width             0
Bollinger_%B         0
bear_score           0
bull_strength        0
bear_strength        0
Momentum             0
Volatility           0
next_close           0
dtype: int64

The features were chosen based on industry knowledge and research, focusing on technical indicators and trend tracking. We’ll check the correlation and feature importance later to drop unnecessary ones, keeping only the most relevant for predicting the next day’s price.

In [33]:
axis_df.to_csv('F:/Xai_traderx/data/processed/axis_processed.csv', index=False)
reliance_df.to_csv('F:/Xai_traderx/data/processed/reliance_processed.csv', index=False)