In [None]:
Code/Chunk:1

Data Preprocessing: 

import pandas as pd
import numpy as np
import yfinance as yf
import talib
import logging
import plotly.io as pio
import plotly.offline as py
import plotly.graph_objs as go
from IPython.display import display, HTML
from plotly.subplots import make_subplots


# Setting up logging.
# Every placeholder must be written as "%(name)s"; the previous string used
# "%(levelname=s" / "%(message=s", which cannot be formatted, so each log call
# failed while its record was being rendered.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Download price history for one ticker from Yahoo Finance.

    Args:
        ticker: Symbol handed to ``yf.download``, e.g. ``'TATAELXSI.NS'``.
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        The downloaded frame with a datetime index named ``Date``.

    Raises:
        ValueError: If the download comes back empty.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        data = yf.download(ticker, start=start_date, end=end_date)

        # Fail here with a clear message instead of letting an empty frame
        # break the indicator code further down the pipeline.
        # NOTE(review): yfinance appears to signal some failed downloads by
        # returning an empty frame rather than raising - confirm for the
        # installed version.
        if data is None or data.empty:
            raise ValueError(
                f"No data returned for ticker {ticker!r} between {start_date} and {end_date}"
            )

        # NOTE(review): some yfinance releases seem to return two-level
        # (field, ticker) columns even for a single ticker. Keeping only the
        # first level makes data['Close'] a Series again - confirm the level
        # order against the installed version.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)

        # Normalise the index directly; the previous reset/set round-trip
        # raised KeyError whenever the index was not already named 'Date'.
        data.index = pd.to_datetime(data.index)
        data.index.name = 'Date'
        return data
    except Exception as e:
        logging.error(f"Failed to download stock data: {e}")
        raise

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Add technical-indicator columns computed from ``data['Close']``.

    The frame is modified in place and also returned. Columns added:
    ``SMA_10``, ``EMA_10`` (both ``timeperiod=10``), ``RSI``
    (``timeperiod=14``), ``MACD`` and ``MACD_signal`` (talib defaults).

    Args:
        data: Price frame containing a ``Close`` column.

    Returns:
        The same frame, with indicator columns added and gaps back-filled.
    """
    logging.info("Starting preprocessing of data")

    close = data['Close']

    # Adding technical indicators without normalizing
    data['SMA_10'] = talib.SMA(close, timeperiod=10)
    data['EMA_10'] = talib.EMA(close, timeperiod=10)
    data['RSI'] = talib.RSI(close, timeperiod=14)
    # The third value returned by talib.MACD is not kept.
    data['MACD'], data['MACD_signal'], _ = talib.MACD(close)

    # Back-fill missing values: each gap takes the next valid value below it.
    # bfill() replaces the deprecated fillna(method='bfill') spelling.
    data.bfill(inplace=True)
    logging.info("Data preprocessing complete")
    return data

def main() -> pd.DataFrame:
    """Download TATAELXSI.NS for 2019-04-01 .. 2024-03-31 and add indicators.

    Returns:
        The frame produced by ``preprocess_data`` for the downloaded prices.
    """
    raw_prices = download_stock_data('TATAELXSI.NS', '2019-04-01', '2024-03-31')
    return preprocess_data(raw_prices)

# Execute the pipeline once; later cells read `preprocessed_data`
preprocessed_data = main()

# Render a titled HTML preview of the first and the last five rows
for heading, preview in (
    ('<h2>First 5 Rows of the DataFrame</h2>', preprocessed_data.head(5)),
    ('<h2>Last 5 Rows of the DataFrame</h2>', preprocessed_data.tail(5)),
):
    display(HTML(heading))
    display(HTML(preview.to_html()))


Code/Chunk:2

Data aggregation

def aggregate_to_monthly_average(data: pd.DataFrame) -> pd.DataFrame:
    """Aggregate a datetime-indexed frame into month-end averages.

    Args:
        data: Frame whose index can be resampled (a datetime-like index).

    Returns:
        A new frame with one row per month holding the mean of each column.

    Raises:
        Exception: Any resampling failure is logged and re-raised unchanged.
    """
    try:
        logging.info("Starting the aggregation of data to monthly averages.")
        # Use the offset object instead of the 'M' string alias: the alias is
        # deprecated in recent pandas releases, the object is not.
        monthly_data_avg = data.resample(pd.offsets.MonthEnd()).mean()
        logging.info("Data successfully aggregated to monthly averages.")
        return monthly_data_avg
    except Exception as e:
        logging.error(f"Failed to aggregate data: {e}")
        raise

def plot_trend_comparison(daily_data: pd.DataFrame, monthly_data: pd.DataFrame, title="Daily vs. Monthly Trend Comparison") -> None:
    """Draw the ``Close`` column of two frames in side-by-side Plotly panels.

    Args:
        daily_data: Frame plotted in the left panel (blue line).
        monthly_data: Frame plotted in the right panel (red line).
        title: Figure title.

    Raises:
        Exception: Any plotting failure is logged and re-raised unchanged.
    """
    try:
        logging.info("Starting to plot the trend comparison.")
        fig = make_subplots(rows=1, cols=2, subplot_titles=('Daily Trend', 'Monthly Average Trend'))

        # One (frame, legend name, colour) entry per panel, left to right
        panels = (
            (daily_data, 'Daily Close', 'blue'),
            (monthly_data, 'Monthly Close', 'red'),
        )
        for column, (frame, label, colour) in enumerate(panels, start=1):
            trace = go.Scatter(
                x=frame.index,
                y=frame['Close'],
                mode='lines',
                name=label,
                line=dict(color=colour),
            )
            fig.add_trace(trace, row=1, col=column)

        fig.update_layout(title_text=title, hovermode='x', showlegend=False)

        pio.show(fig)  # Display the plot inline
        logging.info("Successfully plotted the trend comparison.")
    except Exception as e:
        logging.error(f"Failed to plot the trend comparison: {e}")
        raise

# Build the monthly view and chart it next to the daily series
monthly_data = aggregate_to_monthly_average(preprocessed_data)
plot_trend_comparison(daily_data=preprocessed_data, monthly_data=monthly_data)


# Displaying the last 5 rows with a title
monthly_tail_html = monthly_data.tail(5).to_html()
display(HTML('<h2>Last 5 Rows of the DataFrame</h2>'))
display(HTML(monthly_tail_html))

Code/Chunk:3
Convert the series to a stationary form before fitting the model

from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries: pd.Series) -> None:
    """Run ``adfuller`` (``autolag='AIC'``) on the series and log the result.

    The first four values of the result and each critical value are collected
    into one Series, which is logged at INFO level. Nothing is returned.
    """
    logging.info("Testing the stationarity of the timeseries")
    adf_result = adfuller(timeseries, autolag='AIC')

    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    report = pd.Series(adf_result[:4], index=labels)

    # Element 4 of the result maps a significance level to its critical value
    for level, threshold in adf_result[4].items():
        report[f'Critical Value ({level})'] = threshold

    logging.info("Results of Dickey-Fuller Test:")
    logging.info(report)

def make_stationary(data: pd.DataFrame, target_column: str) -> pd.DataFrame:
    """Add log and log-difference columns for ``target_column``.

    The frame is modified in place and also returned. Two columns are added:
    ``<target_column>_log`` (natural log) and ``<target_column>_stationary``
    (first difference of the log column). The differenced series, with its
    NaN entries dropped, is then passed to ``test_stationarity``.

    Args:
        data: Frame containing ``target_column``.
        target_column: Name of the column to transform.

    Returns:
        The same frame with the two extra columns.

    Raises:
        ValueError: If ``target_column`` holds zero or negative values.
    """
    logging.info("Making data stationary")

    # The log of a non-positive number is -inf/NaN; reject such input before
    # touching the frame instead of failing later inside the stationarity test.
    if (data[target_column] <= 0).any():
        raise ValueError(
            f"Column {target_column!r} contains non-positive values; cannot apply log transform"
        )

    log_column = f'{target_column}_log'
    stationary_column = f'{target_column}_stationary'

    # Apply log transformation
    data[log_column] = np.log(data[target_column])

    # Apply differencing. The first row has no predecessor and stays NaN;
    # dropping it before assignment had no effect because the assignment
    # re-aligns on the frame's index.
    data[stationary_column] = data[log_column].diff()

    # Test stationarity
    test_stationarity(data[stationary_column].dropna())

    return data

# Example usage
def main():
    """Make the ``Close`` column of the module-level ``preprocessed_data`` stationary.

    Rows containing NaN are first dropped from ``preprocessed_data`` in place.

    Returns:
        The frame returned by ``make_stationary``.
    """
    # Stationarity testing needs a frame without NaN rows
    preprocessed_data.dropna(inplace=True)
    result = make_stationary(preprocessed_data, 'Close')

    logging.info("Stationarity transformation complete")
    return result

# Run main to perform stationarity transformation
stationary_data = main()

# Displaying the last 5 rows with a title.
# tail(5) matches the heading; the cell previously rendered head(5) under a
# "Last 5 Rows" title.
display(HTML('<h2>Last 5 Rows of the DataFrame</h2>'))
display(HTML(stationary_data.tail(5).to_html()))

Code/Chunk:5

ARIMA model Implementation - Univariate model

import pandas as pd
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAX
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from statsmodels.tsa.stattools import acf, pacf
from IPython.display import display, HTML

# Setting up logging.
# Placeholders must be written as "%(name)s"; the previous "%(levelname=s" /
# "%(message=s" spelling cannot be formatted and broke every log call.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Function to split data into training and testing datasets
def split_data(data: pd.DataFrame, start_train: str, end_train: str, start_test: str, end_test: str) -> (pd.DataFrame, pd.DataFrame):
    """Return ``(data[start_train:end_train], data[start_test:end_test])``."""
    train_window = slice(start_train, end_train)
    test_window = slice(start_test, end_test)
    return data[train_window], data[test_window]

# Function to implement SARIMAX model and return fitted model
def train_sarimax(train_data: pd.Series, p: int, d: int, q: int, seasonal_order=(0, 0, 0, 0)):
    """Build ``SARIMAX(train_data, order=(p, d, q), seasonal_order=...)`` and return its ``fit()`` result."""
    return SARIMAX(train_data, order=(p, d, q), seasonal_order=seasonal_order).fit()

# Function to visualize the train/test/forecast results with Plotly
def plot_train_test_forecast(train_data: pd.Series, test_data: pd.Series, forecast: pd.Series) -> None:
    """Show the train, test and forecast series as three lines on one Plotly figure."""
    fig = make_subplots(rows=1, cols=1)

    # One line trace per series, added in train / test / forecast order
    labelled_series = (
        ('Train Data', train_data),
        ('Test Data', test_data),
        ('Forecast', forecast),
    )
    for label, series in labelled_series:
        fig.add_trace(go.Scatter(x=series.index, y=series, mode='lines', name=label))

    layout = dict(
        title='Train, Test and Forecast Data',
        xaxis_title='Date',
        yaxis_title='Stock Price',
        hovermode='x',
        showlegend=True,
    )
    fig.update_layout(**layout)

    fig.show()

# Function to plot ACF and PACF with Plotly
def plot_acf_pacf(data: pd.Series, nlags: int = 40) -> None:
    """Plot the ACF and PACF of ``data`` as side-by-side Plotly bar charts.

    Args:
        data: Series to analyse.
        nlags: Maximum number of lags requested (default 40, the value that
            was previously hard-coded). It is reduced automatically for short
            series.

    Raises:
        ValueError: If ``data`` has fewer than four observations.
    """
    max_supported = len(data) // 2 - 1
    if max_supported < 1:
        raise ValueError("At least 4 observations are required to plot ACF/PACF")

    # NOTE(review): statsmodels' pacf appears to reject lag counts of half the
    # sample size or more - confirm against the installed version. Clamping
    # keeps short series from failing with the default of 40 lags.
    lags = max(1, min(nlags, max_supported))

    acf_values = acf(data, nlags=lags)
    pacf_values = pacf(data, nlags=lags)

    fig = make_subplots(rows=1, cols=2, subplot_titles=('ACF', 'PACF'))

    fig.add_trace(go.Bar(x=np.arange(len(acf_values)), y=acf_values, name='ACF'), row=1, col=1)
    fig.add_trace(go.Bar(x=np.arange(len(pacf_values)), y=pacf_values, name='PACF'), row=1, col=2)

    fig.update_layout(title_text='ACF and PACF Plots', showlegend=False)
    fig.show()

# Main function for SARIMAX implementation and forecasting
def main():
    """Fit SARIMAX on the training window and plot train, test and forecast.

    Reads the module-level ``stationary_data`` frame (column
    ``Close_stationary``), plots ACF/PACF for the training slice, fits the
    model, forecasts through the test window and the future window, plots the
    three series and prints the model summary.
    """
    # Assuming `stationary_data` is already loaded from previous steps
    target_column = 'Close_stationary'

    # ISO dates are unambiguous. The previous 'DD-MM-YYYY' strings were read
    # month-first by pandas ('01-04-2019' -> 4 January 2019), which made the
    # train and test windows overlap and stretched the forecast window.
    train_start, train_end = '2019-04-01', '2023-03-31'
    test_start, test_end = '2023-04-01', '2024-03-31'
    forecast_start, forecast_end = '2024-04-01', '2025-03-31'

    train_data, test_data = split_data(stationary_data[target_column].dropna(), train_start, train_end, test_start, test_end)

    # Plot ACF and PACF plots for SARIMAX parameter selection
    plot_acf_pacf(train_data)

    # Train SARIMAX model with assumed p, d, q values and seasonal_order (these should be fine-tuned)
    # NOTE(review): the series is already differenced and has one row per
    # trading day, so d=1 and a seasonal period of 12 deserve a second look.
    p, d, q, seasonal_order = 1, 1, 1, (1, 1, 1, 12)
    model_fit = train_sarimax(train_data, p, d, q, seasonal_order)

    # forecast() continues from the last training observation, one step per
    # row of the training data. To reach the future window the forecast must
    # therefore run through the test window first. Labelling a handful of
    # steps with 2024-25 month-end dates, as before, misplaced the values.
    future_index = pd.bdate_range(start=forecast_start, end=forecast_end, tz=test_data.index.tz)
    forecast = model_fit.forecast(steps=len(test_data) + len(future_index))

    # Label each step: test-window dates first, then future business days
    forecast.index = test_data.index.append(future_index)

    # Visualization
    plot_train_test_forecast(train_data, test_data, forecast)

    # Print SARIMAX results
    print(model_fit.summary())

# Run the main function
main()


Code/Chunk:6

Analyze Model Performance

import pandas as pd
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAX
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.stattools import acf, pacf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set up logging.
# The previous format used "%(levelname=s" / "%(message=s", which is not a
# valid placeholder and made every log call fail during formatting.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Function to split data into training and testing datasets
def split_data(data: pd.DataFrame, start_train: str, end_train: str, start_test: str, end_test: str) -> (pd.DataFrame, pd.DataFrame):
    """Slice ``data`` twice with ``data[start:end]`` and return the (train, test) pair."""
    windows = ((start_train, end_train), (start_test, end_test))
    train_part, test_part = (data[lower:upper] for lower, upper in windows)
    return train_part, test_part

# Function to implement SARIMAX model and return fitted model
def train_sarimax(train_data: pd.Series, p: int, d: int, q: int, seasonal_order=(0, 0, 0, 0)):
    """Fit a SARIMAX model with non-seasonal order ``(p, d, q)`` and return the fitted result."""
    order = (p, d, q)
    sarimax_model = SARIMAX(train_data, order=order, seasonal_order=seasonal_order)
    return sarimax_model.fit()

# Function to analyze residuals
def plot_residuals(residuals):
    """Draw three residual diagnostics in one matplotlib figure.

    Panels, left to right: residuals against their index, a histogram with a
    KDE overlay, and a normal Q-Q plot.
    """
    _, axes = plt.subplots(1, 3, figsize=(18, 5))
    time_ax, hist_ax, qq_ax = axes

    # Residuals vs Time
    sns.lineplot(x=residuals.index, y=residuals, ax=time_ax)
    time_ax.set_title('Residuals vs Time')

    # Histogram
    sns.histplot(residuals, kde=True, ax=hist_ax)
    hist_ax.set_title('Histogram of Residuals')

    # Q-Q Plot (title is set after plotting, as in the other panels)
    stats.probplot(residuals, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot')

    plt.tight_layout()
    plt.show()

# Function to calculate and print model accuracy metrics
def calculate_accuracy(test_data, forecast):
    """Compute MSE, MAE and RMSE, log each at INFO level, then print each."""
    squared_error = mean_squared_error(test_data, forecast)
    absolute_error = mean_absolute_error(test_data, forecast)
    root_error = np.sqrt(squared_error)

    report = (
        f'MSE: {squared_error:.3f}',
        f'MAE: {absolute_error:.3f}',
        f'RMSE: {root_error:.3f}',
    )

    # All three lines go to the log first, then all three to stdout
    for line in report:
        logging.info(line)
    for line in report:
        print(line)

# Main function to train and analyze SARIMAX model
def main():
    """Fit SARIMAX on the training window and evaluate it on the test window.

    Reads the module-level ``stationary_data`` frame (column
    ``Close_stationary``), fits the model, forecasts one step per test row,
    plots the fitted model's residuals, reports MSE/MAE/RMSE and prints the
    model summary.
    """
    # Assuming `stationary_data` is already loaded from previous steps
    target_column = 'Close_stationary'

    # ISO dates are unambiguous. The previous 'DD-MM-YYYY' strings were read
    # month-first by pandas ('01-04-2023' -> 4 January 2023), so the test
    # window started inside the training window.
    train_start, train_end = '2019-04-01', '2023-03-31'
    test_start, test_end = '2023-04-01', '2024-03-31'

    train_data, test_data = split_data(stationary_data[target_column].dropna(), train_start, train_end, test_start, test_end)

    # Train SARIMAX model with assumed p, d, q values and seasonal_order (these should be fine-tuned)
    p, d, q, seasonal_order = 1, 1, 1, (1, 1, 1, 12)
    model_fit = train_sarimax(train_data, p, d, q, seasonal_order)

    # Forecast for the test period
    forecast = model_fit.forecast(steps=len(test_data))

    # Plot residuals
    plot_residuals(model_fit.resid)

    # Evaluate accuracy metrics
    calculate_accuracy(test_data, forecast)

    # Print SARIMAX summary
    print(model_fit.summary())

# Run the main function
main()


Check the code end to end.
Does the code maintain a coherent flow?
Is the code advanced enough to meet industry standards?

In [41]:
!pip install yfinance

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels


# ARIMA MODEL FINAL OUTPUT

### Data Preprocessing

In [43]:
import pandas as pd
import numpy as np
import yfinance as yf
import talib
import logging

class DataPipeline:
    """Download one ticker's price history and add technical indicators.

    ``data`` is ``None`` until ``download_stock_data`` has succeeded.
    """

    def __init__(self, ticker: str, start_date: str, end_date: str):
        """Store the download parameters; no I/O happens here."""
        self.ticker = ticker
        self.start_date = start_date
        self.end_date = end_date
        self.data = None  # set by download_stock_data()

    def download_stock_data(self) -> pd.DataFrame:
        """Download stock data from Yahoo Finance and store it on ``self.data``.

        Returns:
            The downloaded frame with a datetime index named ``Date``.

        Raises:
            ValueError: If the download comes back empty.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            logging.info(f"Downloading stock data for ticker: {self.ticker}")
            data = yf.download(self.ticker, start=self.start_date, end=self.end_date)

            # Fail here with a clear message instead of letting an empty frame
            # break the indicator code.
            # NOTE(review): yfinance appears to signal some failed downloads
            # by returning an empty frame - confirm for the installed version.
            if data is None or data.empty:
                raise ValueError(
                    f"No data returned for ticker {self.ticker!r} "
                    f"between {self.start_date} and {self.end_date}"
                )

            # NOTE(review): some yfinance releases seem to return two-level
            # (field, ticker) columns even for a single ticker; keep the first
            # level so data['Close'] is a Series - confirm the level order.
            if isinstance(data.columns, pd.MultiIndex):
                data.columns = data.columns.get_level_values(0)

            # Normalise the index directly; the reset/set round-trip raised
            # KeyError whenever the index was not already named 'Date'.
            data.index = pd.to_datetime(data.index)
            data.index.name = 'Date'

            # Only publish the frame once it is fully prepared
            self.data = data
            return self.data
        except Exception as e:
            logging.error(f"Failed to download stock data: {e}")
            raise

    def preprocess_data(self) -> pd.DataFrame:
        """Add technical indicators to ``self.data`` in place and return it.

        Columns added: ``SMA_10``, ``EMA_10`` (``timeperiod=10``), ``RSI``
        (``timeperiod=14``), ``MACD`` and ``MACD_signal`` (talib defaults).

        Raises:
            RuntimeError: If no data has been downloaded yet.
        """
        if self.data is None:
            raise RuntimeError("No data loaded; call download_stock_data() first")

        logging.info("Starting preprocessing of data")

        close = self.data['Close']

        # Adding technical indicators
        self.data['SMA_10'] = talib.SMA(close, timeperiod=10)
        self.data['EMA_10'] = talib.EMA(close, timeperiod=10)
        self.data['RSI'] = talib.RSI(close, timeperiod=14)
        # The third value returned by talib.MACD is not kept.
        self.data['MACD'], self.data['MACD_signal'], _ = talib.MACD(close)

        # Fill missing values by backfilling; bfill() replaces the deprecated
        # fillna(method='bfill') spelling.
        self.data.bfill(inplace=True)
        logging.info("Data preprocessing complete")
        return self.data


ModuleNotFoundError: No module named 'yfinance'

In [None]:
if __name__ == "__main__":
    # HTML and display are used below but are not among the imports that
    # accompany DataPipeline, so bring them into scope here.
    from IPython.display import display, HTML

    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Initialize DataPipeline
    ticker_symbol = 'TATAELXSI.NS'
    start_date = '2019-04-01'
    end_date = '2024-03-31'
    pipeline = DataPipeline(ticker_symbol, start_date, end_date)

    # Download and preprocess data
    pipeline.download_stock_data()
    preprocessed_data = pipeline.preprocess_data()

    # Displaying the data
    display(HTML('<h2>First 5 Rows of the DataFrame</h2>'))
    display(HTML(preprocessed_data.head(5).to_html()))
    display(HTML('<h2>Last 5 Rows of the DataFrame</h2>'))
    display(HTML(preprocessed_data.tail(5).to_html()))


In [None]:
import pandas as pd
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Set up logging.
# "%(levelname=s" / "%(message=s" were malformed placeholders that made every
# log call fail during formatting; each field must be written "%(name)s".
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Function to split data into training and testing datasets
def split_data(data: pd.DataFrame, start_train: str, end_train: str, start_test: str, end_test: str) -> (pd.DataFrame, pd.DataFrame):
    """Cut a training and a testing slice out of ``data`` using ``data[start:end]``."""
    bounds = {
        'train': slice(start_train, end_train),
        'test': slice(start_test, end_test),
    }
    return data[bounds['train']], data[bounds['test']]

# Function to train SARIMAX model and return fitted model
def train_sarimax(train_data: pd.Series, p: int, d: int, q: int, seasonal_order=(0, 0, 0, 0)):
    """Construct SARIMAX from ``train_data`` and the given orders, fit it and return the result."""
    model_kwargs = dict(order=(p, d, q), seasonal_order=seasonal_order)
    fitted = SARIMAX(train_data, **model_kwargs).fit()
    return fitted

# Function to calculate and print model accuracy metrics
def calculate_accuracy(test_data, forecast):
    """Log (INFO) and then print the MSE, MAE and RMSE of ``forecast`` against ``test_data``."""
    mse_value = mean_squared_error(test_data, forecast)
    mae_value = mean_absolute_error(test_data, forecast)
    rmse_value = np.sqrt(mse_value)

    summary_lines = [
        f'MSE: {mse_value:.3f}',
        f'MAE: {mae_value:.3f}',
        f'RMSE: {rmse_value:.3f}',
    ]

    # Log every metric first, then echo the same three lines to stdout
    for text in summary_lines:
        logging.info(text)
    print(*summary_lines, sep='\n')

# Function to visualize train, test, and forecast data using Plotly
def plot_train_test_forecast(train_data: pd.Series, test_data: pd.Series, forecast: pd.Series) -> None:
    """Render train, test and forecast series as line traces in a single Plotly figure."""
    fig = make_subplots(rows=1, cols=1)

    def _line(series, label):
        # Each series is drawn against its own index
        return go.Scatter(x=series.index, y=series, mode='lines', name=label)

    # Trace order: training, testing, forecast
    fig.add_trace(_line(train_data, 'Train Data'))
    fig.add_trace(_line(test_data, 'Test Data'))
    fig.add_trace(_line(forecast, 'Forecast'))

    fig.update_layout(
        title='Train, Test and Forecast Data',
        xaxis_title='Date',
        yaxis_title='Stock Price',
        hovermode='x',
        showlegend=True,
    )

    fig.show()

# Main function for final model evaluation, visualization, and reporting
def main():
    """Fit SARIMAX, plot train/test/forecast and report test-window accuracy.

    Reads the module-level ``stationary_data`` frame (column
    ``Close_stationary``), fits the model on the training window, forecasts
    through the test window and the future window, plots the three series,
    reports MSE/MAE/RMSE for the test window and prints the model summary.
    """
    # Assuming `stationary_data` is already loaded from previous steps
    target_column = 'Close_stationary'

    # ISO dates are unambiguous. The previous 'DD-MM-YYYY' strings were read
    # month-first by pandas ('01-04-2019' -> 4 January 2019), which made the
    # train and test windows overlap and stretched the forecast window.
    train_start, train_end = '2019-04-01', '2023-03-31'
    test_start, test_end = '2023-04-01', '2024-03-31'
    forecast_start, forecast_end = '2024-04-01', '2025-03-31'

    train_data, test_data = split_data(stationary_data[target_column].dropna(), train_start, train_end, test_start, test_end)

    # Train SARIMAX model with assumed p, d, q values and seasonal_order
    # NOTE(review): the series is already differenced and has one row per
    # trading day, so d=1 and a seasonal period of 12 deserve a second look.
    p, d, q, seasonal_order = 1, 1, 1, (1, 1, 1, 12)
    model_fit = train_sarimax(train_data, p, d, q, seasonal_order)

    # forecast() continues from the last training observation, one step per
    # row of the training data. To reach the future window the forecast must
    # therefore run through the test window first. Labelling a handful of
    # steps with 2024-25 month-end dates, as before, misplaced the values.
    future_index = pd.bdate_range(start=forecast_start, end=forecast_end, tz=test_data.index.tz)
    forecast = model_fit.forecast(steps=len(test_data) + len(future_index))

    # Label each step: test-window dates first, then future business days
    forecast.index = test_data.index.append(future_index)

    # Plot and visualize the forecast along with train and test data
    plot_train_test_forecast(train_data, test_data, forecast)

    # Evaluate the model accuracy for the test period: the leading
    # len(test_data) steps of the forecast are exactly that period.
    forecast_test = forecast.iloc[:len(test_data)]
    calculate_accuracy(test_data, forecast_test)

    # Print SARIMAX summary
    print(model_fit.summary())

# Run the main function
main()


In [None]:
import pandas as pd
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy import stats

# Set up logging.
# Fixed the placeholders: "%(levelname=s" / "%(message=s" could not be
# formatted, so no log record was ever rendered correctly.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Function to split data into training and testing datasets
def split_data(data: pd.DataFrame, start_train: str, end_train: str, start_test: str, end_test: str) -> (pd.DataFrame, pd.DataFrame):
    """Return the training slice and the testing slice of ``data`` as a 2-tuple."""
    return (
        data[start_train:end_train],  # training window
        data[start_test:end_test],    # testing window
    )

# Function to implement SARIMAX model and return fitted model
def train_sarimax(train_data: pd.Series, p: int, d: int, q: int, seasonal_order=(0, 0, 0, 0)):
    """Return the result of fitting SARIMAX with ``order=(p, d, q)`` and the given seasonal order."""
    non_seasonal = (p, d, q)
    return SARIMAX(
        train_data,
        order=non_seasonal,
        seasonal_order=seasonal_order,
    ).fit()

# Function to analyze residuals
def plot_residuals(residuals):
    """Show residuals over their index, their histogram with KDE, and a normal Q-Q plot."""
    figure, panels = plt.subplots(1, 3, figsize=(18, 5))

    # Panel 0: residuals vs time
    sns.lineplot(x=residuals.index, y=residuals, ax=panels[0])
    # Panel 1: histogram with density overlay
    sns.histplot(residuals, kde=True, ax=panels[1])
    # Panel 2: Q-Q plot against the normal distribution
    stats.probplot(residuals, dist="norm", plot=panels[2])

    # Titles are applied after plotting so they are the last thing set on each panel
    titles = ('Residuals vs Time', 'Histogram of Residuals', 'Q-Q Plot')
    for panel, caption in zip(panels, titles):
        panel.set_title(caption)

    plt.tight_layout()
    plt.show()

# Function to calculate and print model accuracy metrics
def calculate_accuracy(test_data, forecast):
    """Report MSE, MAE and RMSE for ``forecast`` vs ``test_data`` via logging and stdout."""
    mse = mean_squared_error(test_data, forecast)
    metrics = {
        'MSE': f'MSE: {mse:.3f}',
        'MAE': f'MAE: {mean_absolute_error(test_data, forecast):.3f}',
        'RMSE': f'RMSE: {np.sqrt(mse):.3f}',
    }

    # Same order for both outputs: MSE, MAE, RMSE
    for rendered in metrics.values():
        logging.info(rendered)

    for rendered in metrics.values():
        print(rendered)

# Main function to train and analyze SARIMAX model
def main():
    """Fit SARIMAX on the training window and analyse it on the test window.

    Reads the module-level ``stationary_data`` frame (column
    ``Close_stationary``), fits the model, forecasts one step per test row,
    plots the fitted model's residuals, reports MSE/MAE/RMSE and prints the
    model summary.
    """
    # Assuming `stationary_data` is already loaded from previous steps
    target_column = 'Close_stationary'

    # ISO dates are unambiguous. The previous 'DD-MM-YYYY' strings were read
    # month-first by pandas ('01-04-2023' -> 4 January 2023), so the test
    # window started inside the training window.
    train_start, train_end = '2019-04-01', '2023-03-31'
    test_start, test_end = '2023-04-01', '2024-03-31'

    train_data, test_data = split_data(stationary_data[target_column].dropna(), train_start, train_end, test_start, test_end)

    # Train SARIMAX model with assumed p, d, q values and seasonal_order (these should be fine-tuned)
    p, d, q, seasonal_order = 1, 1, 1, (1, 1, 1, 12)
    model_fit = train_sarimax(train_data, p, d, q, seasonal_order)

    # Forecast for the test period
    forecast = model_fit.forecast(steps=len(test_data))

    # Plot residuals
    plot_residuals(model_fit.resid)

    # Evaluate accuracy metrics
    calculate_accuracy(test_data, forecast)

    # Print SARIMAX summary
    print(model_fit.summary())

# Run the main function
main()
