***FEATURE ENGINEERING***

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

import yfinance as yf

# Download settings: the ticker symbol and the date window to request.
stock = 'RELIANCE.NS'
start = '2015-01-01'
end = '2025-01-01'

# Fetch the daily price history for the ticker.
# NOTE(review): yfinance documents `end` as exclusive — confirm that the
# last wanted session is the day before `end`.
df = yf.download(tickers=stock, start=start, end=end)

# Keep only the first column level so columns are addressed by plain field
# name ('Close', 'High', ...). Presumably the download returns a
# (field, ticker) MultiIndex — verify against the installed yfinance version.
df.columns = df.columns.get_level_values(0)
df.head()


[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-01,194.85408,195.753849,193.921394,194.502962,2963643
2015-01-02,194.338364,196.642649,194.064053,194.897973,7331366
2015-01-05,192.209641,195.512446,191.825591,194.217655,10103941
2015-01-06,183.486282,191.584174,182.586513,190.925813,18627980
2015-01-07,187.480423,188.467972,183.683857,183.705802,20720312


**DERIVED FEATURES**

These help models understand momentum, volatility, trends, etc.


In [16]:
# Daily return: fractional change of Close versus the previous row.
# The first row has no predecessor, so it is NaN.
df['Daily Returns'] = df['Close'].pct_change()

# Price range (intraday volatility): session high minus session low.
df['Price Range'] = df['High'].sub(df['Low'])

# Range relative to the opening price. Despite the '%' in the column name,
# the value is stored as a fraction (e.g. 0.0094), not multiplied by 100.
df['Range %'] = df['Price Range'].div(df['Open'])

df.head()

Price,Close,High,Low,Open,Volume,Daily Returns,Price Range,Range %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-01,194.85408,195.753849,193.921394,194.502962,2963643,,1.832455,0.009421
2015-01-02,194.338364,196.642649,194.064053,194.897973,7331366,-0.002647,2.578596,0.01323
2015-01-05,192.209641,195.512446,191.825591,194.217655,10103941,-0.010954,3.686856,0.018983
2015-01-06,183.486282,191.584174,182.586513,190.925813,18627980,-0.045385,8.99766,0.047126
2015-01-07,187.480423,188.467972,183.683857,183.705802,20720312,0.021768,4.784115,0.026042


**MOVING AVERAGES (TREND INDICATOR)**

Smooths out the noise and reveals the trend direction


In [17]:
# Simple moving averages of Close, one column per look-back window.
# A window of n rows leaves the first n - 1 rows NaN (no full window yet).
for window in (5, 10, 20, 50, 100):
    df[f'MA{window}'] = df['Close'].rolling(window=window).mean()

df.head()

Price,Close,High,Low,Open,Volume,Daily Returns,Price Range,Range %,MA5,MA10,MA20,MA50,MA100
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-01-01,194.85408,195.753849,193.921394,194.502962,2963643,,1.832455,0.009421,,,,,
2015-01-02,194.338364,196.642649,194.064053,194.897973,7331366,-0.002647,2.578596,0.01323,,,,,
2015-01-05,192.209641,195.512446,191.825591,194.217655,10103941,-0.010954,3.686856,0.018983,,,,,
2015-01-06,183.486282,191.584174,182.586513,190.925813,18627980,-0.045385,8.99766,0.047126,,,,,
2015-01-07,187.480423,188.467972,183.683857,183.705802,20720312,0.021768,4.784115,0.026042,190.473758,,,,


In [18]:
# Count NaN entries per column. At this point the NaNs come from
# pct_change (first row) and the rolling-window warm-up rows.
missing = df.isnull().sum(axis=0)
print("Missing values in each column:\n", missing)


Missing values in each column:
 Price
Close             0
High              0
Low               0
Open              0
Volume            0
Daily Returns     1
Price Range       0
Range %           0
MA5               4
MA10              9
MA20             19
MA50             49
MA100            99
dtype: int64


**VOLATILITY FEATURES**

Helps us understand the uncertainty

In [19]:
# Rolling standard deviation of the Close price over short windows.
# This measures dispersion of the price level itself, not of the returns.
for window in (5, 10):
    df[f'StdDev{window}'] = df['Close'].rolling(window=window).std()


**LAG FEATURES (PREVIOUS DAYS' INFO)**

Makes historical info available for model training

In [20]:
# Lagged closes: Lag_k holds the Close from k rows (trading days) earlier,
# so the first k rows of each column are NaN.
for lag in (1, 2):
    df[f'Lag_{lag}'] = df['Close'].shift(lag)


**CUMULATIVE RETURNS**

Cumulative return from the start of the data

In [21]:
# Growth factor since the first row: running product of (1 + daily return).
# The first row stays NaN because its daily return is NaN.
df['Cumulative Return'] = df['Daily Returns'].add(1).cumprod()


**TIME-BASED FEATURES**

These allow the model to learn from seasonal patterns

In [22]:
# Calendar components read from the DatetimeIndex.
df['Day'] = df.index.day            # day of the month
df['Month'] = df.index.month        # month number, January = 1
df['Weekday'] = df.index.dayofweek  # Monday = 0 (same values as .weekday)


In [23]:
# Drop every row that contains at least one NaN. The longest warm-up is the
# 100-row moving average, so the first 99 rows are removed here.
df = df.dropna(axis=0, how='any')
df.head()


Price,Close,High,Low,Open,Volume,Daily Returns,Price Range,Range %,MA5,MA10,...,MA50,MA100,StdDev5,StdDev10,Lag_1,Lag_2,Cumulative Return,Day,Month,Weekday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-05-29,194.886902,196.98688,194.020239,195.531336,29277517,0.00137,2.966641,0.015172,196.317987,198.0813,...,192.583274,192.87586,1.673936,2.39927,194.620224,196.731308,1.000168,29,5,4
2015-06-01,200.497925,201.08682,195.109097,195.553539,19594976,0.028791,5.977724,0.030568,196.662424,198.319072,...,192.832432,192.932299,2.347679,2.518414,194.886902,194.620224,1.028964,1,6,0
2015-06-02,199.697937,204.175659,198.097955,201.275698,20178960,-0.00399,6.077705,0.030196,197.286859,198.387961,...,193.013799,192.985894,2.706641,2.548626,200.497925,194.886902,1.024859,2,6,1
2015-06-03,199.186859,202.32016,197.442438,199.77574,15546279,-0.002559,4.877722,0.024416,197.777969,198.110188,...,193.239809,193.055667,2.801742,2.249293,199.697937,200.497925,1.022236,3,6,2
2015-06-04,202.820129,203.597892,198.342392,200.931255,15931395,0.018241,5.2555,0.026156,199.41795,198.464629,...,193.547481,193.249005,2.889782,2.68954,199.186859,199.697937,1.040882,4,6,3


In [24]:
# Re-count NaN entries per column after the rows with missing values were
# dropped; every column is expected to report 0.
missing = df.isnull().sum(axis=0)
print("Missing values in each column:\n", missing)


Missing values in each column:
 Price
Close                0
High                 0
Low                  0
Open                 0
Volume               0
Daily Returns        0
Price Range          0
Range %              0
MA5                  0
MA10                 0
MA20                 0
MA50                 0
MA100                0
StdDev5              0
StdDev10             0
Lag_1                0
Lag_2                0
Cumulative Return    0
Day                  0
Month                0
Weekday              0
dtype: int64


In [25]:
# Print the index range, per-column non-null counts and dtypes of the
# cleaned frame as a final sanity check before saving.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2368 entries, 2015-05-29 to 2024-12-31
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Close              2368 non-null   float64
 1   High               2368 non-null   float64
 2   Low                2368 non-null   float64
 3   Open               2368 non-null   float64
 4   Volume             2368 non-null   int64  
 5   Daily Returns      2368 non-null   float64
 6   Price Range        2368 non-null   float64
 7   Range %            2368 non-null   float64
 8   MA5                2368 non-null   float64
 9   MA10               2368 non-null   float64
 10  MA20               2368 non-null   float64
 11  MA50               2368 non-null   float64
 12  MA100              2368 non-null   float64
 13  StdDev5            2368 non-null   float64
 14  StdDev10           2368 non-null   float64
 15  Lag_1              2368 non-null   float64
 16  Lag_2 

In [26]:
import os

# Output directory for the feature-engineered dataset; created if missing.
processed_dir = 'data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Persist the feature-engineered DataFrame as a pickle (keeps dtypes and the
# DatetimeIndex intact).
# NOTE(review): the extension is '.pk1' (digit one), which looks like a typo
# for '.pkl'. Left unchanged because other code may load the file by this
# exact name — confirm before renaming.
pickle_path = os.path.join(processed_dir, 'processed_reliance_data.pk1')
df.to_pickle(pickle_path)

In [27]:
import os

# Use a dedicated variable for this directory. Reusing `processed_dir` here
# would silently rebind it from 'data/processed' to 'data/raw' for anything
# that runs afterwards.
raw_dir = 'data/raw'
os.makedirs(raw_dir, exist_ok=True)

# Write the feature-engineered DataFrame as CSV; the Date index is written
# as the first column.
# NOTE(review): this is engineered (not raw) data stored under 'data/raw' —
# confirm that this location is intended.
df.to_csv(os.path.join(raw_dir, 'raw_processed_reliance_data.csv'))
