# Preprocessing - yfinance Input

In [37]:
import pandas as pd
import yfinance as yf
from sklearn.preprocessing import StandardScaler
import talib
import logging

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Download historical price data for one ticker from Yahoo Finance.

    Args:
        ticker: Yahoo Finance symbol, e.g. ``'TATAELXSI.NS'``.
        start_date: Start of the requested range, passed to ``yf.download``
            as ``start``.
        end_date: End of the requested range, passed to ``yf.download``
            as ``end``.

    Returns:
        The downloaded frame, indexed by a datetime ``Date`` index.

    Raises:
        ValueError: If the download produced no rows.
        Exception: Any error raised while downloading or re-indexing is
            logged and re-raised unchanged.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        data = yf.download(ticker, start=start_date, end=end_date)
        # yf.download can hand back an empty frame instead of raising (for
        # example when the symbol or date range yields nothing). Fail here
        # with a clear message rather than deep inside later processing.
        if data is None or data.empty:
            raise ValueError(
                f"No data returned for ticker {ticker!r} "
                f"between {start_date} and {end_date}"
            )
        # Normalise the index: make sure 'Date' is a datetime index.
        data.reset_index(inplace=True)
        data['Date'] = pd.to_datetime(data['Date'])
        data.set_index('Date', inplace=True)
        logging.info(f"Data downloaded successfully with {len(data)} records.")
        return data
    except Exception as e:
        logging.error(f"Failed to download stock data: {e}")
        raise

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Standardize the OHLCV columns and append technical indicators.

    The price/volume columns are z-scored with ``StandardScaler``; the
    indicators are then computed from the *scaled* ``Close`` column.
    Remaining NaNs (such as the rows at the start of each indicator) are
    back-filled from the next valid value.

    Args:
        data: Frame containing ``Open``, ``High``, ``Low``, ``Close``,
            ``Adj Close`` and ``Volume`` columns. It is modified in place.

    Returns:
        The same ``data`` object, with the six columns scaled and the
        ``SMA_10``, ``EMA_10``, ``RSI``, ``MACD`` and ``MACD_signal``
        columns added.

    Raises:
        KeyError: If any of the expected columns is missing.
    """
    logging.info("Starting preprocessing of data")
    feature_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    scaler = StandardScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])

    # Adding technical indicators (all derived from the scaled close price)
    close = data['Close']
    data['SMA_10'] = talib.SMA(close, timeperiod=10)
    data['EMA_10'] = talib.EMA(close, timeperiod=10)
    data['RSI'] = talib.RSI(close, timeperiod=14)
    # talib.MACD returns (macd, signal, histogram); the histogram is unused.
    data['MACD'], data['MACD_signal'], _ = talib.MACD(close)

    # DataFrame.bfill replaces the deprecated fillna(method='bfill').
    data.bfill(inplace=True)
    logging.info("Data preprocessing complete")
    return data

def main():
    """Download TATAELXSI.NS prices, preview them, then preprocess them."""
    symbol, first_day, last_day = 'TATAELXSI.NS', '2019-04-01', '2024-03-31'
    raw_frame = download_stock_data(symbol, first_day, last_day)

    # Preview of the raw download: first rows, then the column summary.
    logging.info("Initial data overview")
    print(raw_frame.head())
    raw_frame.info()

    processed_frame = preprocess_data(raw_frame)
    logging.info("Preprocessing complete")

    # Summary statistics of the preprocessed frame.
    summary = processed_frame.describe()
    print(summary)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


2024-05-01 05:00:33,176 - INFO - Downloading stock data for ticker: TATAELXSI.NS
[*********************100%%**********************]  1 of 1 completed
2024-05-01 05:00:33,244 - INFO - Data downloaded successfully with 1235 records.
2024-05-01 05:00:33,244 - INFO - Initial data overview
2024-05-01 05:00:33,252 - INFO - Starting preprocessing of data
2024-05-01 05:00:33,259 - INFO - Data preprocessing complete
2024-05-01 05:00:33,259 - INFO - Preprocessing complete


                  Open         High         Low       Close   Adj Close  \
Date                                                                      
2019-04-01  965.099976   976.500000  955.000000  960.750000  911.626160   
2019-04-02  964.650024   983.000000  964.650024  980.049988  929.939331   
2019-04-03  998.799988  1000.700012  971.150024  975.049988  925.195068   
2019-04-04  977.400024   977.400024  957.099976  961.400024  912.242981   
2019-04-05  965.650024   972.750000  961.900024  965.099976  915.753784   

             Volume  
Date                 
2019-04-01   492878  
2019-04-02  1126399  
2019-04-03  1053259  
2019-04-04   422661  
2019-04-05   295544  
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1235 entries, 2019-04-01 to 2024-03-28
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       1235 non-null   float64
 1   High       1235 non-null   float64
 2   Low        1235 non-null   floa

# Preprocessing - CSV Input

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import talib
import logging

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def read_stock_data(file_path: str) -> pd.DataFrame:
    """Load stock data from a CSV file and index it by its ``Date`` column.

    Args:
        file_path: Location of the CSV file; it must have a ``Date`` column.

    Returns:
        The loaded frame with ``Date`` parsed to datetimes and used as index.

    Raises:
        Exception: Any error raised while reading or parsing is logged and
            re-raised unchanged.
    """
    try:
        logging.info(f"Reading stock data from {file_path}")
        frame = pd.read_csv(file_path)
        frame['Date'] = pd.to_datetime(frame['Date'])
        frame = frame.set_index('Date')
        logging.info(f"Data loaded successfully with {len(frame)} records.")
    except Exception as e:
        logging.error(f"Failed to read stock data: {e}")
        raise
    else:
        return frame

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Standardize the OHLCV columns and append technical indicators.

    The price/volume columns are z-scored with ``StandardScaler``; the
    indicators are then computed from the *scaled* ``Close`` column.
    Remaining NaNs (such as the rows at the start of each indicator) are
    back-filled from the next valid value.

    Args:
        data: Frame containing ``Open``, ``High``, ``Low``, ``Close``,
            ``Adj Close`` and ``Volume`` columns. It is modified in place.

    Returns:
        The same ``data`` object, with the six columns scaled and the
        ``SMA_10``, ``EMA_10``, ``RSI``, ``MACD`` and ``MACD_signal``
        columns added.

    Raises:
        KeyError: If any of the expected columns is missing.
    """
    logging.info("Starting preprocessing of data")
    feature_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    scaler = StandardScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])

    # Adding technical indicators (all derived from the scaled close price)
    close = data['Close']
    data['SMA_10'] = talib.SMA(close, timeperiod=10)
    data['EMA_10'] = talib.EMA(close, timeperiod=10)
    data['RSI'] = talib.RSI(close, timeperiod=14)
    # talib.MACD returns (macd, signal, histogram); the histogram is unused.
    data['MACD'], data['MACD_signal'], _ = talib.MACD(close)

    # DataFrame.bfill replaces the deprecated fillna(method='bfill').
    data.bfill(inplace=True)
    logging.info("Data preprocessing complete")
    return data

def main(
    file_path: str = '/home/e1ba06db-f639-4812-9369-20fe39201d9e/Wells Fargo/TATA_ELXSI_INPUT_DATA.csv',
) -> None:
    """Read stock data from a CSV file, preview it, then preprocess it.

    Args:
        file_path: Location of the input CSV. The default is the original
            hard-coded path; pass your own path to run on another machine.
    """
    stock_data = read_stock_data(file_path)

    # The original `stock_data.head() , stock_data.info()` built a tuple and
    # threw it away, so the head preview was never shown. Print it explicitly;
    # info() writes its own report to stdout.
    print(stock_data.head())
    stock_data.info()

    preprocessed_data = preprocess_data(stock_data)

    logging.info("Preprocessing complete")
    # Display a brief summary for the preprocessed data
    print(preprocessed_data.describe())

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
