# Data Preprocessing

1) Convert 'Date' to datetime format.
2) Set 'Date' as the index.
3) Check for missing values.

###
Next steps in preprocessing might include:
1) Normalizing or Scaling the Data: Especially relevant for machine learning models like LSTM and XGBoost to ensure all features contribute equally.
2) Creating Lag Features: Useful for time series forecasting, particularly for machine learning models.
3) Partitioning the Data: Splitting the data into training and testing sets based on the specified dates.

###
The steps you've outlined are a good start for preprocessing the data, but there are 
some additional steps we could take to further prepare the data for your analysis, 
especially given the use of sophisticated models like LSTM and XGBoost. Here’s an enhanced 
preprocessing script that includes normalization and additional feature engineering to potentially improve model performance:

 1) Data Normalization - Standardize the stock prices and volumes.
 2) Feature Engineering - Add technical indicators and time features.
 3) Train-Test Split - Clearly split the data according to your specified dates.

 Here's how you can implement these steps:

 Adding technical indicators like SMA, EMA, RSI, and MACD to the data depends on your specific analysis and modeling goals. Here's why these indicators might be important:
 

SMA (Simple Moving Average):
Purpose: To smooth out price data and identify trends over a specified period.
Use Case: It's useful for identifying long-term trends and making investment decisions.
EMA (Exponential Moving Average):
Purpose: Similar to SMA, but gives more weight to recent data points.
Use Case: Helps in tracking more current price trends and short-term reversals.
RSI (Relative Strength Index):
Purpose: Measures the speed and change of price movements.
Use Case: Indicates overbought or oversold conditions, which can signal potential reversals.
MACD (Moving Average Convergence Divergence):
Purpose: Shows the relationship between two moving averages and identifies changes in the trend.
Use Case: Useful for spotting trend reversals and momentum shifts.

In [None]:
# Install every third-party package that the cells below import.
# NOTE(review): the TA-Lib wrapper may require the native TA-Lib C library to be installed first - confirm for your platform.
!pip install plotly pandas numpy yfinance TA-Lib statsmodels scikit-learn scipy matplotlib seaborn prophet xgboost

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import talib
import logging
import plotly.io as pio
import plotly.offline as py
import plotly.graph_objs as go
from IPython.display import display, HTML
from plotly.subplots import make_subplots


# Setting up logging
# Every %-style placeholder must be spelled %(name)s. The previous
# '%(levelname=s' / '%(message=s' fragments were malformed, so formatting a
# log record failed instead of printing the message.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price data for ``ticker`` from Yahoo Finance, indexed by 'Date'.

    ``start_date`` and ``end_date`` are forwarded to ``yf.download`` as its
    ``start`` / ``end`` arguments. Any exception is logged and re-raised.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        frame = yf.download(ticker, start=start_date, end=end_date)
        # Move the index into a column, coerce 'Date' to datetime, then index on it again.
        frame.reset_index(inplace=True)
        frame['Date'] = pd.to_datetime(frame['Date'])
        frame.set_index('Date', inplace=True)
    except Exception as err:
        logging.error(f"Failed to download stock data: {err}")
        raise
    else:
        return frame

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Add technical-indicator columns to the stock data and return it.

    The frame is modified in place. Columns derived from 'Close':
    'SMA_10', 'EMA_10', 'RSI' (period 14), 'MACD' and 'MACD_signal'.
    Missing values are then back-filled.

    Args:
        data: Price data containing a 'Close' column.

    Returns:
        The same DataFrame object with the indicator columns added.
    """
    logging.info("Starting preprocessing of data")

    close = data['Close']

    # Adding technical indicators without normalizing
    data['SMA_10'] = talib.SMA(close, timeperiod=10)
    data['EMA_10'] = talib.EMA(close, timeperiod=10)
    data['RSI'] = talib.RSI(close, timeperiod=14)
    # talib.MACD returns three series; the third one is not kept.
    data['MACD'], data['MACD_signal'], _ = talib.MACD(close)

    # DataFrame.bfill() replaces the deprecated fillna(method='bfill').
    # NOTE(review): back-filling copies later values into the leading NaN rows,
    # i.e. those rows see future data - confirm this is acceptable for modelling.
    data.bfill(inplace=True)
    logging.info("Data preprocessing complete")
    return data

def main() -> pd.DataFrame:
    """Download TATAELXSI.NS data (start 2019-04-01, end 2024-03-31) and add indicators."""
    raw_prices = download_stock_data('TATAELXSI.NS', '2019-04-01', '2024-03-31')
    return preprocess_data(raw_prices)

# Run main and get the preprocessed data
preprocessed_data = main()

# Show the first five and then the last five rows, each under its own heading
display(HTML('<h2>First 5 Rows of the DataFrame</h2>'))
first_rows = preprocessed_data.head(5)
display(HTML(first_rows.to_html()))
display(HTML('<h2>Last 5 Rows of the DataFrame</h2>'))
last_rows = preprocessed_data.tail(5)
display(HTML(last_rows.to_html()))


# Data aggregation

Convert the daily data into monthly data by averaging the values within each month.

In [None]:
def aggregate_to_monthly_average(data: pd.DataFrame) -> pd.DataFrame:
    """Aggregate daily data into monthly averages.

    Args:
        data: DataFrame with a datetime-like index.

    Returns:
        A new DataFrame with one row per month (labelled with the month-end
        date) holding the mean of each column.

    Raises:
        Exception: Any error raised while resampling is logged and re-raised.
    """
    try:
        logging.info("Starting the aggregation of data to monthly averages.")
        # Use the MonthEnd offset object rather than the string alias 'M':
        # the alias is deprecated in pandas >= 2.2 (replaced by 'ME'), whereas
        # the offset object means the same thing on every pandas version.
        monthly_data_avg = data.resample(pd.offsets.MonthEnd()).mean()
        logging.info("Data successfully aggregated to monthly averages.")
        return monthly_data_avg
    except Exception as e:
        logging.error(f"Failed to aggregate data: {e}")
        raise

def plot_trend_comparison(daily_data: pd.DataFrame, monthly_data: pd.DataFrame, title="Daily vs. Monthly Trend Comparison") -> None:
    """Draw the daily and monthly 'Close' series in two side-by-side Plotly panels.

    Errors raised while building or showing the figure are logged and re-raised.
    """
    try:
        logging.info("Starting to plot the trend comparison.")
        fig = make_subplots(rows=1, cols=2, subplot_titles=('Daily Trend', 'Monthly Average Trend'))

        # (source frame, trace name, line colour) for the left and right panel
        panels = (
            (daily_data, 'Daily Close', 'blue'),
            (monthly_data, 'Monthly Close', 'red'),
        )
        for col_no, (frame, label, colour) in enumerate(panels, start=1):
            trace = go.Scatter(x=frame.index, y=frame['Close'], mode='lines', name=label, line=dict(color=colour))
            fig.add_trace(trace, row=1, col=col_no)

        fig.update_layout(title_text=title, hovermode='x', showlegend=False)

        pio.show(fig)  # Display the plot inline
        logging.info("Successfully plotted the trend comparison.")
    except Exception as err:
        logging.error(f"Failed to plot the trend comparison: {err}")
        raise

# Build the monthly-average frame and chart it next to the daily series
monthly_data = aggregate_to_monthly_average(preprocessed_data)
plot_trend_comparison(daily_data=preprocessed_data, monthly_data=monthly_data)

# Render the final five monthly rows beneath a heading
display(HTML('<h2>Last 5 Rows of the DataFrame</h2>'))
display(
    HTML(monthly_data.tail(5).to_html())
)


# Convert the series to a stationary form before fitting the models

In [None]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries: pd.Series) -> None:
    """Run the Augmented Dickey-Fuller test on ``timeseries`` and log the results."""
    logging.info("Testing the stationarity of the timeseries")
    adf_result = adfuller(timeseries, autolag='AIC')

    # The first four entries of the result are the headline statistics.
    headline_labels = ('Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used')
    report = dict(zip(headline_labels, adf_result[:4]))
    # The fifth entry maps each significance level to its critical value.
    report.update(
        {'Critical Value (%s)' % level: threshold for level, threshold in adf_result[4].items()}
    )

    logging.info("Results of Dickey-Fuller Test:")
    logging.info(pd.Series(report))

def make_stationary(data: pd.DataFrame, target_column: str) -> pd.DataFrame:
    """Add log and log-differenced versions of ``target_column`` to ``data``.

    Two columns are written onto ``data`` itself: ``<target>_log`` and
    ``<target>_stationary``. The differenced series is passed to
    ``test_stationarity`` before ``data`` is returned.
    """
    logging.info("Making data stationary")

    log_col = f'{target_column}_log'
    stationary_col = f'{target_column}_stationary'

    # Log transform first, then first-order differencing of the logged values.
    data[log_col] = np.log(data[target_column])
    differenced = data[log_col].diff().dropna()
    # Assigning back aligns on the index, so rows removed by dropna() reappear as NaN.
    data[stationary_col] = differenced

    # The ADF test is run on the column without its NaN rows.
    test_stationarity(data[stationary_col].dropna())

    return data

# Example usage
def main():
    """Make the 'Close' column of the global ``preprocessed_data`` stationary.

    Rows containing NaN are dropped from ``preprocessed_data`` in place first;
    the frame returned by ``make_stationary`` is then returned.
    """
    # preprocessed_data is expected to exist already from the earlier steps
    preprocessed_data.dropna(inplace=True)  # NaN rows are removed before stationarity testing
    transformed = make_stationary(preprocessed_data, 'Close')

    logging.info("Stationarity transformation complete")
    return transformed

# Run main to perform stationarity transformation
stationary_data = main()

# Display the last 5 rows with a title.
# tail(5) is used so the rows shown match the "Last 5 Rows" heading;
# the previous head(5) displayed the first five rows under that heading.
display(HTML('<h2>Last 5 Rows of the DataFrame</h2>'))
display(HTML(stationary_data.tail(5).to_html()))


###
Explanation:
test_stationarity Function: Uses the Augmented Dickey-Fuller (ADF) test to check for stationarity and logs the result.
make_stationary Function: Applies log transformation and differencing to make the data stationary.
Advanced Code: Incorporates logging, clean structure, and leverages standard libraries for ADF testing and transformations.


After converting the data to a stationary series, the total number of rows is 1233; before the conversion it was 1235.
What is the reason?

The difference in the total number of rows before and after making the data stationary might be due to the removal of NaN values during the stationary transformation process.

# 1)  ARIMA model Implementation - Univariate model

In [None]:
## Second test with forecast the 12 months data

In [None]:
import pandas as pd
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAX
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from statsmodels.tsa.stattools import acf, pacf
from IPython.display import display, HTML

# Setting up logging
# Every %-style placeholder must be spelled %(name)s. The previous
# '%(levelname=s' / '%(message=s' fragments were malformed, so formatting a
# log record failed instead of printing the message.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Split a date-indexed dataset into a training window and a testing window
def split_data(data: pd.DataFrame, start_train: str, end_train: str, start_test: str, end_test: str) -> (pd.DataFrame, pd.DataFrame):
    """Return ``(train, test)``: the slices of ``data`` between the given start/end labels."""
    train_window = slice(start_train, end_train)
    test_window = slice(start_test, end_test)
    return data[train_window], data[test_window]

# Build a SARIMAX model for the training series and fit it
def train_sarimax(train_data: pd.Series, p: int, d: int, q: int, seasonal_order=(0, 0, 0, 0)):
    """Fit SARIMAX with order ``(p, d, q)`` and ``seasonal_order``; return the fit result."""
    non_seasonal_order = (p, d, q)
    return SARIMAX(train_data, order=non_seasonal_order, seasonal_order=seasonal_order).fit()

# Draw the training, testing and forecast series on a single Plotly chart
def plot_train_test_forecast(train_data: pd.Series, test_data: pd.Series, forecast: pd.Series) -> None:
    """Show one line per series (train, test, forecast) against each series' own index."""
    fig = make_subplots(rows=1, cols=1)

    # One line trace per series, added in train / test / forecast order
    labelled_series = (
        (train_data, 'Train Data'),
        (test_data, 'Test Data'),
        (forecast, 'Forecast'),
    )
    for series, label in labelled_series:
        fig.add_trace(go.Scatter(x=series.index, y=series, mode='lines', name=label))

    layout_options = dict(
        title='Train, Test and Forecast Data',
        xaxis_title='Date',
        yaxis_title='Stock Price',
        hovermode='x',
        showlegend=True,
    )
    fig.update_layout(**layout_options)

    fig.show()

# Bar charts of the autocorrelation and partial autocorrelation, side by side
def plot_acf_pacf(data: pd.Series) -> None:
    """Compute ACF and PACF of ``data`` (nlags=40) and show them as two Plotly bar charts."""
    correlograms = (
        ('ACF', acf(data, nlags=40)),
        ('PACF', pacf(data, nlags=40)),
    )

    fig = make_subplots(rows=1, cols=2, subplot_titles=('ACF', 'PACF'))

    # Column 1 holds the ACF bars, column 2 the PACF bars; x is the lag number.
    for col_no, (label, values) in enumerate(correlograms, start=1):
        lags = np.arange(len(values))
        fig.add_trace(go.Bar(x=lags, y=values, name=label), row=1, col=col_no)

    fig.update_layout(title_text='ACF and PACF Plots', showlegend=False)
    fig.show()

# Main function for SARIMAX implementation and forecasting
def main():
    """Fit SARIMAX on the stationary series, forecast, plot and print the summary.

    Reads the global ``stationary_data`` produced by the earlier steps, splits
    its 'Close_stationary' column into train/test windows, plots ACF/PACF for
    the training part, fits the model, produces a forecast labelled with
    month-end dates, and plots train/test/forecast together.
    """
    # Assuming `stationary_data` is already loaded from previous steps
    target_column = 'Close_stationary'
    # Dates are written in ISO (YYYY-MM-DD) form on purpose. Strings such as
    # '01-04-2023' are parsed month-first by pandas (4 January 2023), which made
    # the test window overlap the training window and stretched the forecast
    # range beyond twelve months.
    train_start, train_end = '2019-04-01', '2023-03-31'
    test_start, test_end = '2023-04-01', '2024-03-31'
    forecast_start, forecast_end = '2024-04-01', '2025-03-31'

    train_data, test_data = split_data(stationary_data[target_column].dropna(), train_start, train_end, test_start, test_end)

    # Plot ACF and PACF plots for SARIMAX parameter selection
    plot_acf_pacf(train_data)

    # Train SARIMAX model with assumed p, d, q values and seasonal_order (these should be fine-tuned)
    p, d, q, seasonal_order = 1, 1, 1, (1, 1, 1, 12)
    model_fit = train_sarimax(train_data, p, d, q, seasonal_order)

    # Month-end dates covering the forecast period. The MonthEnd offset object
    # is used instead of the string alias 'M', which is deprecated in pandas >= 2.2.
    forecast_index = pd.date_range(start=forecast_start, end=forecast_end, freq=pd.offsets.MonthEnd())

    # Forecast one step per month-end date
    # NOTE(review): the model is fitted on the rows of `stationary_data` as they
    # are (presumably daily), yet each forecast step is relabelled as a month -
    # confirm whether the series should be resampled to monthly first.
    forecast = model_fit.forecast(steps=forecast_index.size)

    # Set the forecast index to the expected date range
    forecast.index = forecast_index

    # Visualization
    plot_train_test_forecast(train_data, test_data, forecast)

    # Print SARIMAX results
    print(model_fit.summary())

# Run the main function
main()


# Analyze Model Performance

In [None]:
import pandas as pd
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAX
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.stattools import acf, pacf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set up logging
# Every %-style placeholder must be spelled %(name)s. The previous
# '%(levelname=s' / '%(message=s' fragments were malformed, so formatting a
# log record failed instead of printing the message.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Carve the training and testing windows out of a date-indexed dataset
def split_data(data: pd.DataFrame, start_train: str, end_train: str, start_test: str, end_test: str) -> (pd.DataFrame, pd.DataFrame):
    """Slice ``data`` by the train and test start/end labels and return ``(train, test)``."""
    windows = (slice(start_train, end_train), slice(start_test, end_test))
    train_part, test_part = (data[window] for window in windows)
    return train_part, test_part

# Construct and fit the SARIMAX model on the training series
def train_sarimax(train_data: pd.Series, p: int, d: int, q: int, seasonal_order=(0, 0, 0, 0)):
    """Return the result of fitting SARIMAX with ``order=(p, d, q)`` and ``seasonal_order``."""
    sarimax_model = SARIMAX(
        train_data,
        order=(p, d, q),
        seasonal_order=seasonal_order,
    )
    return sarimax_model.fit()

# Three diagnostic views of the model residuals
def plot_residuals(residuals):
    """Plot residuals over time, their histogram with a KDE curve, and a normal Q-Q plot."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    time_ax, hist_ax, qq_ax = axes

    # Left: residuals plotted against their index
    sns.lineplot(x=residuals.index, y=residuals, ax=time_ax)
    time_ax.set_title('Residuals vs Time')

    # Middle: distribution of the residuals
    sns.histplot(residuals, kde=True, ax=hist_ax)
    hist_ax.set_title('Histogram of Residuals')

    # Right: probability plot against the normal distribution
    stats.probplot(residuals, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot')

    plt.tight_layout()
    plt.show()

# Compute error metrics for a forecast and report them
def calculate_accuracy(test_data, forecast):
    """Compute MSE, MAE and RMSE of ``forecast`` against ``test_data``; log and print each."""
    mse = mean_squared_error(test_data, forecast)
    metrics = (
        ('MSE', mse),
        ('MAE', mean_absolute_error(test_data, forecast)),
        ('RMSE', np.sqrt(mse)),  # RMSE is the square root of the MSE
    )
    report_lines = [f'{label}: {value:.3f}' for label, value in metrics]

    # All three metrics go to the log first, then to stdout.
    for line in report_lines:
        logging.info(line)

    for line in report_lines:
        print(line)

# Main function to train and analyze SARIMAX model
def main():
    """Fit SARIMAX on the training window and evaluate it on the test window.

    Reads the global ``stationary_data``, splits its 'Close_stationary' column,
    fits the model, forecasts as many steps as there are test rows, plots the
    fit residuals, reports MSE/MAE/RMSE and prints the model summary.
    """
    # Assuming `stationary_data` is already loaded from previous steps
    target_column = 'Close_stationary'
    # Dates are written in ISO (YYYY-MM-DD) form on purpose. Strings such as
    # '01-04-2023' are parsed month-first by pandas (4 January 2023), which made
    # the test window start inside the training window (train/test leakage).
    train_start, train_end = '2019-04-01', '2023-03-31'
    test_start, test_end = '2023-04-01', '2024-03-31'

    train_data, test_data = split_data(stationary_data[target_column].dropna(), train_start, train_end, test_start, test_end)

    # Train SARIMAX model with assumed p, d, q values and seasonal_order (these should be fine-tuned)
    p, d, q, seasonal_order = 1, 1, 1, (1, 1, 1, 12)
    model_fit = train_sarimax(train_data, p, d, q, seasonal_order)

    # Forecast for the test period: one step per test observation
    forecast = model_fit.forecast(steps=len(test_data))

    # Plot residuals
    plot_residuals(model_fit.resid)

    # Evaluate accuracy metrics
    calculate_accuracy(test_data, forecast)

    # Print SARIMAX summary
    print(model_fit.summary())

# Run the main function
main()


####

Explanation of the Code:
Stationarity Transformation: The make_stationary function transforms the series to a stationary form and checks stationarity with the ADF test.
Parameter Tuning: The time_series_cv function applies cross-validation with grid search to select the best model parameters.
Residual Analysis: The plot_residuals function analyzes residuals for time-based patterns, normality, and other issues.
Accuracy Calculation: The calculate_accuracy function computes model evaluation metrics to assess accuracy.
Main Function: The main function orchestrates the data preprocessing, parameter tuning, model training, and final evaluation.

# implement the Prophet model for forecasting

### Data Preprocessing

# Preprocessing & Tuning Prophet model

### Final Code

# To further improve:

Additional Preprocessing: You might consider more advanced techniques for handling missing values or outliers specific to your dataset.
Parameter Tuning: Consider tuning the Prophet model parameters (like seasonality, holidays, etc.) to improve forecast accuracy.
Cross-Validation: Use cross-validation methods available in Prophet to validate and adjust the forecast model.



The following implementation focuses on additional preprocessing, parameter tuning, and cross-validation for the Prophet model. The improvements include handling seasonality, holidays, and automatic parameter tuning.

# Prophet Model Implementation

In [None]:
import pandas as pd
import yfinance as yf
import logging
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
import plotly.graph_objs as go

# Configure root logging: INFO level, "<time> - <level> - <message>" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price data for ``ticker`` from Yahoo Finance with the index reset to a column.

    Any exception is logged and re-raised.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        frame = yf.download(ticker, start=start_date, end=end_date)
        # The index is moved into the frame as a regular column.
        frame.reset_index(inplace=True)
    except Exception as err:
        logging.error(f"Failed to download stock data: {err}")
        raise
    else:
        return frame

def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean stock data by filling missing values and removing duplicate rows.

    The frame is modified in place: gaps are forward-filled, any gaps still
    left at the start are back-filled, and duplicate rows are dropped.

    Args:
        data: Stock data to clean.

    Returns:
        The same DataFrame object after cleaning.
    """
    logging.info("Cleaning data")
    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    data.ffill(inplace=True)
    # Leading NaNs have no earlier value to copy, so they are filled from the next valid row.
    data.bfill(inplace=True)
    data.drop_duplicates(inplace=True)
    logging.info("Data cleaning complete")
    return data

def preprocess_data_for_prophet(data: pd.DataFrame) -> pd.DataFrame:
    """Return a two-column copy of ``data``: 'Date' renamed to 'ds' and 'Close' to 'y'."""
    logging.info("Preparing data for Prophet model")
    # Keys are the source columns to keep (in order); values are the new names.
    column_map = {'Date': 'ds', 'Close': 'y'}
    selected = data[list(column_map)]
    prophet_data = selected.rename(columns=column_map)
    logging.info("Data preparation complete")
    return prophet_data

def tune_prophet_parameters() -> dict:
    """Return the fixed set of Prophet constructor keyword arguments used in this notebook."""
    return dict(
        seasonality_mode='multiplicative',
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=False,
    )

def prophet_cross_validation(data: pd.DataFrame, params: dict):
    """Fit Prophet with ``params`` on ``data`` and cross-validate it.

    Returns a pair: the cross-validation frame and the frame produced by
    ``performance_metrics`` for it.
    """
    logging.info("Performing cross-validation with Prophet")
    prophet_model = Prophet(**params)
    prophet_model.fit(data)

    # Window settings passed to Prophet's cross_validation helper
    cv_windows = dict(initial='730 days', period='180 days', horizon='365 days')
    cv_frame = cross_validation(prophet_model, **cv_windows)
    metrics_frame = performance_metrics(cv_frame)
    logging.info(f"Cross-validation metrics:\n{metrics_frame}")
    return cv_frame, metrics_frame

def plot_cross_validation(df_cv: pd.DataFrame):
    """Plot actual values, forecasts and dashed upper/lower bounds from ``df_cv``."""
    logging.info("Plotting cross-validation forecast")
    dates = df_cv['ds']
    # (column, legend label, extra Scatter options); the two bounds use dashed lines
    trace_specs = (
        ('y', 'Actual', {}),
        ('yhat', 'Forecast', {}),
        ('yhat_upper', 'Upper Confidence', {'line': dict(dash='dash')}),
        ('yhat_lower', 'Lower Confidence', {'line': dict(dash='dash')}),
    )
    traces = [
        go.Scatter(x=dates, y=df_cv[column], mode='lines', name=label, **extra)
        for column, label, extra in trace_specs
    ]
    layout = go.Layout(title='Prophet Cross-Validation Forecast', xaxis={'title': 'Date'}, yaxis={'title': 'Stock Price'})
    go.Figure(data=traces, layout=layout).show()

def main():
    """Download, clean and cross-validate TATAELXSI.NS data with Prophet, then summarise it."""
    # Fixed ticker and date window for this run
    ticker_symbol = 'TATAELXSI.NS'
    start_date, end_date = '2019-04-01', '2024-03-31'

    # Download, clean, and reshape into Prophet's ds/y layout
    cleaned = clean_data(download_stock_data(ticker_symbol, start_date, end_date))
    prophet_data = preprocess_data_for_prophet(cleaned)

    # Cross-validate; the metrics frame is not used here
    df_cv, _ = prophet_cross_validation(prophet_data, tune_prophet_parameters())

    plot_cross_validation(df_cv)

    # Summary statistics of the cleaned data: logged as text and shown as a table
    logging.info("Displaying summary of the stock data:")
    summary = cleaned.describe()
    logging.info("\n" + summary.to_string())
    display(summary)

# Run the main function
if __name__ == '__main__':
    main()


# Prophet Model Implementation

In [None]:
import pandas as pd
import logging
from prophet import Prophet
import plotly.graph_objs as go
from IPython.display import display, HTML

def detect_and_remove_outliers(data: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return the rows of ``data`` whose ``column`` value lies within 1.5 * IQR of the quartiles."""
    logging.info("Detecting and removing outliers")
    values = data[column]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # Keep values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends included.
    within_fences = values.between(lower_quartile - 1.5 * spread, upper_quartile + 1.5 * spread)
    kept_rows = data[within_fences]
    logging.info("Outlier removal complete")
    return kept_rows

def preprocess_data_for_prophet(data: pd.DataFrame) -> pd.DataFrame:
    """Prepare data for Prophet: a frame with the date as 'ds' and 'Close' as 'y'.

    Two input layouts are handled:

    * 'Date' is a regular column (as returned by a download step that resets
      the index): the 'Date' and 'Close' columns are used directly.
    * otherwise the index is moved into a column via ``reset_index()`` and a
      column named 'Date' (i.e. an index named 'Date') becomes 'ds'.

    Args:
        data: Stock data containing a 'Close' column.

    Returns:
        A new DataFrame with a fresh integer index.
    """
    logging.info("Preparing data for Prophet model")
    if 'Date' in data.columns:
        # Without this branch reset_index() would emit the old integer index
        # as an 'index' column and the result would have no 'ds' column.
        prophet_data = data[['Date', 'Close']].reset_index(drop=True)
    else:
        prophet_data = data[['Close']].reset_index()
    prophet_data = prophet_data.rename(columns={'Date': 'ds', 'Close': 'y'})
    logging.info("Data preparation complete")
    return prophet_data

def forecast_prophet(data: pd.DataFrame, periods: int) -> pd.DataFrame:
    """Forecast future stock prices using a default-configured Prophet model.

    Args:
        data: Training frame passed to ``Prophet.fit``.
        periods: Number of month-end steps to add to the future dataframe.

    Returns:
        The frame returned by ``model.predict`` for the future dataframe.
    """
    logging.info("Forecasting with Prophet")
    model = Prophet()
    model.fit(data)
    # Month-end frequency is given as an offset object rather than the string
    # alias 'M', which is deprecated in pandas >= 2.2 (replaced by 'ME').
    # NOTE(review): assumes make_future_dataframe forwards `freq` unchanged to
    # pandas date generation - confirm against the installed Prophet version.
    future = model.make_future_dataframe(periods=periods, freq=pd.offsets.MonthEnd())
    forecast = model.predict(future)
    return forecast

def plot_forecast(actual_data: pd.DataFrame, forecast_data: pd.DataFrame) -> None:
    """Show the actual 'y' series and the forecast 'yhat' series against 'ds' in one figure."""
    # (source frame, value column, legend label)
    series_specs = (
        (actual_data, 'y', 'Actual'),
        (forecast_data, 'yhat', 'Forecast'),
    )
    traces = [
        go.Scatter(x=frame['ds'], y=frame[column], mode='lines', name=label)
        for frame, column, label in series_specs
    ]
    layout = go.Layout(title='Prophet Model Forecast', xaxis={'title': 'Date'}, yaxis={'title': 'Stock Price'})
    go.Figure(data=traces, layout=layout).show()

def main():
    """Download TATAELXSI.NS data, remove outliers, forecast 12 periods with Prophet and report."""
    # Fixed ticker, date window and forecast length for this run
    ticker_symbol = 'TATAELXSI.NS'
    start_date, end_date = '2019-04-01', '2024-03-31'
    forecast_periods = 12

    # Download, clean, drop 'Close' outliers, then reshape for Prophet
    raw_data = download_stock_data(ticker_symbol, start_date, end_date)
    trimmed_data = detect_and_remove_outliers(clean_data(raw_data), 'Close')
    prophet_data = preprocess_data_for_prophet(trimmed_data)

    forecast = forecast_prophet(prophet_data, forecast_periods)

    # First rows of the forecast frame
    display(HTML('<h2>First 5 Rows of the Forecast</h2>'))
    display(HTML(forecast.head(5).to_html()))

    plot_forecast(prophet_data, forecast)

    # Descriptive statistics of the last `forecast_periods` forecast rows
    forecast_tail = forecast[['ds', 'yhat']].tail(forecast_periods)
    forecast_summary = forecast_tail.describe()
    logging.info("Summary of the 12-month forecasted stock prices:")
    display(HTML('<h2>12-Month Forecast Summary</h2>'))
    display(HTML(forecast_summary.to_html()))
    print(forecast_summary)

# Run the main function
if __name__ == '__main__':
    main()


# Prophet Model Implementation ---- Completed 

# XGBoost Model

### Pre Processing 

In [None]:
import logging
import pandas as pd
import numpy as np
import yfinance as yf
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt

# Set up logging
# Every %-style placeholder must be spelled %(name)s. The previous
# '%(message=s)' fragment was malformed, so formatting a log record failed
# instead of printing the message.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price data for ``ticker`` from Yahoo Finance, indexed by 'Date'.

    ``start_date`` and ``end_date`` are forwarded to ``yf.download`` as its
    ``start`` / ``end`` arguments. Any exception is logged and re-raised.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        prices = yf.download(ticker, start=start_date, end=end_date)
        # Move the index into a column, coerce 'Date' to datetime, then index on it again.
        prices.reset_index(inplace=True)
        prices['Date'] = pd.to_datetime(prices['Date'])
        prices.set_index('Date', inplace=True)
    except Exception as err:
        logging.error(f"Failed to download stock data: {err}")
        raise
    else:
        return prices

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop NaN and duplicate rows from ``df`` in place, log a summary, and return ``df``.

    The descriptive statistics are also written to 'statistical_summary.html'.
    """
    # Report missing values per column, then drop the affected rows
    logging.info(f"Missing Values per Column:\n{df.isnull().sum()}")
    df.dropna(inplace=True)

    # Report how many rows are duplicates, then drop them
    logging.info(f"Number of Duplicate Rows: {df.duplicated().sum()}")
    df.drop_duplicates(inplace=True)

    # Descriptive statistics of the cleaned frame
    stats_table = df.describe()
    logging.info(f"Statistical Summary:\n{stats_table}")

    # Render to HTML first, then write the table out
    stats_html = stats_table.to_html()
    with open('statistical_summary.html', 'w') as report_file:
        report_file.write(stats_html)

    return df

def stock_data_eda(df: pd.DataFrame):
    """Plot price lines over time and a correlation heatmap for ``df``.

    An 'Average' column, (High + Low) / 2, is added to ``df`` itself.
    """
    # One line per price column: (column, legend label)
    price_lines = (
        ('Close', 'Close Price'),
        ('Open', 'Open Price'),
        ('Adj Close', 'Adjusted Close Price'),
    )
    fig = go.Figure()
    for column, label in price_lines:
        fig.add_trace(go.Scatter(x=df.index, y=df[column], mode='lines', name=label))

    # Midpoint of the day's high and low, stored on the frame and plotted as well
    df['Average'] = (df['High'] + df['Low']) / 2
    fig.add_trace(go.Scatter(x=df.index, y=df['Average'], mode='lines', name='Average Price'))

    fig.update_layout(
        title='Stock Price Over Time',
        xaxis_title='Date',
        yaxis_title='Price',
    )
    fig.show()

    # Seaborn heatmap of the pairwise column correlations
    plt.figure(figsize=(10, 6))
    correlations = df.corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Feature Correlation Matrix')
    plt.show()

def main():
    """Download TATAELXSI.NS data, preprocess it and run the exploratory plots."""
    ticker_symbol = 'TATAELXSI.NS'
    start_date, end_date = '2019-04-01', '2024-03-31'

    raw_data = download_stock_data(ticker_symbol, start_date, end_date)
    stock_data_eda(preprocess_data(raw_data))
    logging.info("Data preprocessing and EDA completed.")

if __name__ == "__main__":
    main()



# Feature Engineering and Train & Test split

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Add lag, moving-average and volatility columns derived from 'Close'.

    Args:
        df (pd.DataFrame): Stock data with a 'Close' column; modified in place.

    Returns:
        pd.DataFrame: The same frame with 'Lag1', 'Lag5', 'Lag10', 'MA5',
        'MA10', 'MA20' and 'Volatility' added and NaN rows dropped.
    """
    close = df['Close']

    # Close shifted back by 1, 5 and 10 rows
    for lag in (1, 5, 10):
        df[f'Lag{lag}'] = close.shift(lag)

    # Rolling means over 5, 10 and 20 rows
    for window in (5, 10, 20):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Rolling standard deviation over 5 rows
    df['Volatility'] = close.rolling(window=5).std()

    # The shifts and rolling windows leave NaN in the leading rows; drop them
    df.dropna(inplace=True)

    return df

def generate_eda_report(df: pd.DataFrame):
    """Show an annotated heatmap of the pairwise correlations between the columns of ``df``.

    Args:
        df (pd.DataFrame): Feature-engineered stock data.
    """
    plt.figure(figsize=(10, 6))
    correlations = df.corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Feature Correlation Matrix')
    plt.show()

def split_data(df: pd.DataFrame, test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into train/test features and targets without shuffling.

    Args:
        df (pd.DataFrame): The feature-engineered stock data; 'Close' is the target.
        test_size (float): Fraction of the data to be used for testing.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: X_train, X_test, y_train, y_test.
    """
    # Every column except 'Close' is a feature; 'Close' itself is the target
    features = df.drop(columns=['Close'])
    target = df['Close']

    # shuffle=False keeps the rows in their original order on both sides of the split
    parts = train_test_split(features, target, test_size=test_size, shuffle=False)
    return tuple(parts)

def main():
    """Download TATAELXSI.NS data, engineer features, plot correlations and split train/test."""
    ticker_symbol = 'TATAELXSI.NS'
    start_date, end_date = '2019-04-01', '2024-03-31'

    raw_data = download_stock_data(ticker_symbol, start_date, end_date)

    # Clean the data, then add the engineered features
    engineered_data = feature_engineering(preprocess_data(raw_data))

    # Correlation heatmap of the engineered frame
    generate_eda_report(engineered_data)

    # 80/20 split; only the feature sets are needed for the size report
    X_train, X_test, _, _ = split_data(engineered_data, test_size=0.2)
    logging.info(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")
    logging.info("Data preprocessing, feature engineering, EDA report, and train-test splitting completed.")

if __name__ == "__main__":
    main()


# Model implementation to forecast the stock price

In [None]:
import logging
import pandas as pd
import numpy as np
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

# Configure root logging: INFO level, "<time> - <level> - <message>" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price data for ``ticker`` from Yahoo Finance, indexed by 'Date'.

    ``start_date`` and ``end_date`` are forwarded to ``yf.download`` as its
    ``start`` / ``end`` arguments. Any exception is logged and re-raised.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        history = yf.download(ticker, start=start_date, end=end_date)
        # Move the index into a column, coerce 'Date' to datetime, then index on it again.
        history.reset_index(inplace=True)
        history['Date'] = pd.to_datetime(history['Date'])
        history.set_index('Date', inplace=True)
    except Exception as err:
        logging.error(f"Failed to download stock data: {err}")
        raise
    else:
        return history

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop NaN rows and duplicate rows from ``df`` in place, logging counts, and return ``df``."""
    # Report missing values per column, then drop the affected rows
    logging.info(f"Missing Values per Column:\n{df.isnull().sum()}")
    df.dropna(inplace=True)

    # Report how many rows are duplicates, then drop them
    logging.info(f"Number of Duplicate Rows: {df.duplicated().sum()}")
    df.drop_duplicates(inplace=True)

    return df

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Append lag, moving-average and volatility columns derived from 'Close'.

    Args:
        df: Frame containing a 'Close' column; it is modified in place.

    Returns:
        The same ``df`` object with the new columns added and every row that
        contains a NaN removed.
    """
    close = df['Close']

    # Previous closes at 1, 5 and 10 rows back.
    for lag in (1, 5, 10):
        df[f'Lag{lag}'] = close.shift(lag)

    # Rolling means over 5, 10 and 20 rows.
    for window in (5, 10, 20):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Rolling standard deviation over 5 rows.
    df['Volatility'] = close.rolling(window=5).std()

    # The shifts and rolling windows leave NaNs in the leading rows.
    df.dropna(inplace=True)

    return df

def generate_eda_report(df: pd.DataFrame):
    """Display an annotated heatmap of the pairwise column correlations.

    Args:
        df: Feature-engineered frame whose ``corr()`` matrix is plotted.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Feature Correlation Matrix')
    plt.show()

def split_data(df: pd.DataFrame, test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the frame into ordered train/test features and targets.

    The target is the 'Close' column; every other column is a feature. Rows
    are not shuffled, so the test rows are the trailing rows of ``df``.

    Args:
        df (pd.DataFrame): The feature-engineered stock data.
        test_size (float): Fraction of the data to be used for testing.

    Returns:
        tuple: X_train, X_test, y_train, y_test. The two ``y`` items are the
        'Close' column slices.
    """
    target = df['Close']
    features = df.drop(columns=['Close'])

    return tuple(train_test_split(features, target, test_size=test_size, shuffle=False))

def train_xgboost_model(X_train, y_train):
    """Tune and fit an XGBoost regressor with a time-ordered grid search.

    Args:
        X_train: Training features, in chronological row order.
        y_train: Training targets aligned with ``X_train``.

    Returns:
        The best estimator found by the grid search, refit on all of
        ``X_train``/``y_train``.
    """
    # Imported here so the function does not depend on the cell's import list.
    from sklearn.model_selection import TimeSeriesSplit

    xg_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 1.0]
    }

    # An integer ``cv`` is an unshuffled K-fold, which fits on rows that come
    # after the validation fold. Time-series splits only validate on rows
    # later than the ones used for fitting.
    time_split = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=time_split, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)

    logging.info(f"Best Hyperparameters: {grid_search.best_params_}")
    return grid_search.best_estimator_

def forecast_stock_prices(model, X_test, steps=12):
    """Produce ``steps`` successive predictions starting from the last test row.

    Args:
        model: Fitted estimator exposing ``predict`` on a ``(1, n_features)`` array.
        X_test: Test features as a DataFrame or a 2-D array; only the last
            row is used.
        steps: Number of predictions to generate.

    Returns:
        list: One prediction per step, in order.
    """
    # ``X_test[-1, :]`` only works on ndarrays; converting first makes the
    # positional last-row lookup valid for DataFrames as well. A float copy
    # keeps predictions from being truncated and leaves X_test untouched.
    last_row = np.asarray(X_test, dtype=float)[-1, :].copy()
    current_input = last_row.reshape(1, -1)

    future_predictions = []
    for _ in range(steps):
        pred = model.predict(current_input)[0]
        future_predictions.append(pred)

        # Shift every feature one slot left and store the prediction in the
        # last slot.
        # NOTE(review): this treats the feature vector as a sliding window of
        # prices; confirm that matches the column layout of X_test.
        current_input = np.roll(current_input, -1)
        current_input[0, -1] = pred

    return future_predictions

def main():
    """Run the pipeline: download, clean, engineer, split, train and forecast."""
    ticker_symbol, start_date, end_date = 'TATAELXSI.NS', '2019-04-01', '2024-03-31'

    # Download -> clean -> engineer, each stage feeding the next.
    raw_prices = download_stock_data(ticker_symbol, start_date, end_date)
    features = feature_engineering(preprocess_data(raw_prices))

    # Correlation heatmap of the engineered features.
    generate_eda_report(features)

    # Ordered train/test split.
    X_train, X_test, y_train, y_test = split_data(features, test_size=0.2)
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    logging.info(f"Training set size: {n_train}, Testing set size: {n_test}")

    # Grid-searched XGBoost regressor.
    fitted_model = train_xgboost_model(X_train, y_train)

    # Twelve recursive forecast steps from the last test row.
    predictions = forecast_stock_prices(fitted_model, X_test, steps=12)
    logging.info(f"Predictions for the next 12 months: {predictions}")


if __name__ == "__main__":
    main()


In [None]:
import logging
import pandas as pd
import numpy as np
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb


def forecast_stock_prices(model, X_test, steps=12):
    """Produce ``steps`` successive predictions starting from the last test row.

    Args:
        model: Fitted estimator exposing ``predict`` on a ``(1, n_features)`` array.
        X_test: Test features as a DataFrame or a 2-D array; only the last
            row is used.
        steps: Number of predictions to generate.

    Returns:
        list: One prediction per step, in order.
    """
    # ``X_test[-1, :]`` only works on ndarrays; converting first makes the
    # positional last-row lookup valid for DataFrames as well. A float copy
    # keeps predictions from being truncated and leaves X_test untouched.
    last_row = np.asarray(X_test, dtype=float)[-1, :].copy()
    current_input = last_row.reshape(1, -1)

    future_predictions = []
    for _ in range(steps):
        pred = model.predict(current_input)[0]
        future_predictions.append(pred)

        # Shift every feature one slot left and store the prediction in the
        # last slot.
        # NOTE(review): this treats the feature vector as a sliding window of
        # prices; confirm that matches the column layout of X_test.
        current_input = np.roll(current_input, -1)
        current_input[0, -1] = pred

    return future_predictions

def plot_forecast(y_test, predictions):
    """Plot the actual test prices followed by the forecasted prices.

    Args:
        y_test: Actual prices; its index must support adding a
            ``pd.DateOffset`` (e.g. a DatetimeIndex).
        predictions: Forecast values, placed on consecutive month-end dates
            after the last ``y_test`` date.
    """
    # An offset object is used instead of the 'M' alias, which pandas has
    # deprecated in favour of 'ME'; the offset works across versions.
    forecast_dates = pd.date_range(
        start=y_test.index[-1] + pd.DateOffset(1),
        periods=len(predictions),
        freq=pd.offsets.MonthEnd(),
    )
    forecast_series = pd.Series(predictions, index=forecast_dates)

    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, y_test, label='Actual')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', linestyle='--')
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title('Actual vs Forecasted Stock Price')
    plt.legend()
    plt.show()

def main():
    """Run the pipeline and show the forecast as a table and a chart."""
    ticker_symbol, start_date, end_date = 'TATAELXSI.NS', '2019-04-01', '2024-03-31'

    # Download -> clean -> engineer, each stage feeding the next.
    raw_prices = download_stock_data(ticker_symbol, start_date, end_date)
    features = feature_engineering(preprocess_data(raw_prices))

    # Correlation heatmap of the engineered features.
    generate_eda_report(features)

    # Ordered train/test split.
    X_train, X_test, y_train, y_test = split_data(features, test_size=0.2)
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    logging.info(f"Training set size: {n_train}, Testing set size: {n_test}")

    # Grid-searched XGBoost regressor.
    fitted_model = train_xgboost_model(X_train, y_train)

    # Twelve recursive forecast steps from the last test row.
    predictions = forecast_stock_prices(fitted_model, X_test, steps=12)

    # Log the forecast table, then chart it after the actual test prices.
    forecast_table = pd.DataFrame({'Forecast': predictions})
    logging.info(f"Forecast Summary:\n{forecast_table}")
    plot_forecast(y_test, predictions)


if __name__ == "__main__":
    main()


In [None]:
import logging
import pandas as pd
import numpy as np
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price history for one ticker from Yahoo Finance.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Value forwarded as the ``start`` argument.
        end_date: Value forwarded as the ``end`` argument.

    Returns:
        The downloaded frame, re-indexed on a datetime ``Date`` column.

    Raises:
        Exception: Any error raised while downloading or re-indexing is
            logged and then re-raised unchanged.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        prices = yf.download(ticker, start=start_date, end=end_date)

        # Round-trip the index through a column so 'Date' is datetime typed.
        prices.reset_index(inplace=True)
        prices['Date'] = pd.to_datetime(prices['Date'])
        prices.set_index('Date', inplace=True)
    except Exception as exc:
        logging.error(f"Failed to download stock data: {exc}")
        raise
    else:
        return prices

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop incomplete and duplicated rows from ``df`` in place.

    Args:
        df: Stock data frame; it is modified in place.

    Returns:
        The same ``df`` object after cleaning.
    """
    # Report gaps before removing them so the log reflects the raw input.
    null_counts = df.isnull().sum()
    logging.info(f"Missing Values per Column:\n{null_counts}")
    df.dropna(inplace=True)

    # Duplicates are judged on column values only; the index is not compared.
    repeated_rows = df.duplicated()
    logging.info(f"Number of Duplicate Rows: {repeated_rows.sum()}")
    df.drop_duplicates(inplace=True)

    return df

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Append lag, moving-average and volatility columns derived from 'Close'.

    Args:
        df: Frame containing a 'Close' column; it is modified in place.

    Returns:
        The same ``df`` object with the new columns added and every row that
        contains a NaN removed.
    """
    close = df['Close']

    # Previous closes at 1, 5 and 10 rows back.
    for lag in (1, 5, 10):
        df[f'Lag{lag}'] = close.shift(lag)

    # Rolling means over 5, 10 and 20 rows.
    for window in (5, 10, 20):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Rolling standard deviation over 5 rows.
    df['Volatility'] = close.rolling(window=5).std()

    # The shifts and rolling windows leave NaNs in the leading rows.
    df.dropna(inplace=True)

    return df

def generate_eda_report(df: pd.DataFrame):
    """Display an annotated heatmap of the pairwise column correlations.

    Args:
        df: Feature-engineered frame whose ``corr()`` matrix is plotted.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Feature Correlation Matrix')
    plt.show()

def split_data(df: pd.DataFrame, test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the frame into ordered train/test features and targets.

    The target is the 'Close' column; every other column is a feature. Rows
    are not shuffled, so the test rows are the trailing rows of ``df``.

    Args:
        df (pd.DataFrame): The feature-engineered stock data.
        test_size (float): Fraction of the data to be used for testing.

    Returns:
        tuple: X_train, X_test, y_train, y_test. The two ``y`` items are the
        'Close' column slices.
    """
    target = df['Close']
    features = df.drop(columns=['Close'])

    return tuple(train_test_split(features, target, test_size=test_size, shuffle=False))

def train_xgboost_model(X_train, y_train):
    """Tune and fit an XGBoost regressor with a time-ordered grid search.

    Args:
        X_train: Training features, in chronological row order.
        y_train: Training targets aligned with ``X_train``.

    Returns:
        The best estimator found by the grid search, refit on all of
        ``X_train``/``y_train``.
    """
    # Imported here so the function does not depend on the cell's import list.
    from sklearn.model_selection import TimeSeriesSplit

    xg_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 1.0]
    }

    # An integer ``cv`` is an unshuffled K-fold, which fits on rows that come
    # after the validation fold. Time-series splits only validate on rows
    # later than the ones used for fitting.
    time_split = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=time_split, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)

    logging.info(f"Best Hyperparameters: {grid_search.best_params_}")
    return grid_search.best_estimator_

def forecast_stock_prices(model, X_test, steps=12):
    """Produce ``steps`` successive predictions starting from the last test row.

    Args:
        model: Fitted estimator exposing ``predict`` on a ``(1, n_features)`` array.
        X_test: Test features as a DataFrame; only the last row is used.
        steps: Number of predictions to generate.

    Returns:
        list: One prediction per step, in order.
    """
    # Seed the input with the final row of the test features, as one row.
    feature_row = X_test.iloc[-1].values[np.newaxis, :]
    forecasts = []

    for _ in range(steps):
        next_value = model.predict(feature_row)[0]
        forecasts.append(next_value)

        # Shift every feature one slot left and store the prediction in the
        # last slot.
        # NOTE(review): this treats the feature vector as a sliding window of
        # prices; confirm that matches the column layout of X_test.
        feature_row = np.roll(feature_row, -1, axis=1)
        feature_row[0, -1] = next_value

    return forecasts

def plot_forecast(y_test, predictions):
    """Plot the actual test prices followed by the forecasted prices.

    Args:
        y_test: Actual prices; its index must support adding a
            ``pd.DateOffset`` (e.g. a DatetimeIndex).
        predictions: Forecast values, placed on consecutive month-end dates
            after the last ``y_test`` date.
    """
    # An offset object is used instead of the 'M' alias, which pandas has
    # deprecated in favour of 'ME'; the offset works across versions.
    forecast_dates = pd.date_range(
        start=y_test.index[-1] + pd.DateOffset(1),
        periods=len(predictions),
        freq=pd.offsets.MonthEnd(),
    )
    forecast_series = pd.Series(predictions, index=forecast_dates)

    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, y_test, label='Actual')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', linestyle='--')
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title('Actual vs Forecasted Stock Price')
    plt.legend()
    plt.show()

def main():
    """Run the pipeline and show the forecast as a table and a chart."""
    ticker_symbol, start_date, end_date = 'TATAELXSI.NS', '2019-04-01', '2024-03-31'

    # Download -> clean -> engineer, each stage feeding the next.
    raw_prices = download_stock_data(ticker_symbol, start_date, end_date)
    features = feature_engineering(preprocess_data(raw_prices))

    # Correlation heatmap of the engineered features.
    generate_eda_report(features)

    # Ordered train/test split.
    X_train, X_test, y_train, y_test = split_data(features, test_size=0.2)
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    logging.info(f"Training set size: {n_train}, Testing set size: {n_test}")

    # Grid-searched XGBoost regressor.
    fitted_model = train_xgboost_model(X_train, y_train)

    # Twelve recursive forecast steps from the last test row.
    predictions = forecast_stock_prices(fitted_model, X_test, steps=12)

    # Log the forecast table, then chart it after the actual test prices.
    forecast_table = pd.DataFrame({'Forecast': predictions})
    logging.info(f"Forecast Summary:\n{forecast_table}")
    plot_forecast(y_test, predictions)


if __name__ == "__main__":
    main()


In [None]:
import logging
import pandas as pd
import numpy as np
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price history for one ticker from Yahoo Finance.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Value forwarded as the ``start`` argument.
        end_date: Value forwarded as the ``end`` argument.

    Returns:
        The downloaded frame, re-indexed on a datetime ``Date`` column.

    Raises:
        Exception: Any error raised while downloading or re-indexing is
            logged and then re-raised unchanged.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        prices = yf.download(ticker, start=start_date, end=end_date)

        # Round-trip the index through a column so 'Date' is datetime typed.
        prices.reset_index(inplace=True)
        prices['Date'] = pd.to_datetime(prices['Date'])
        prices.set_index('Date', inplace=True)
    except Exception as exc:
        logging.error(f"Failed to download stock data: {exc}")
        raise
    else:
        return prices

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop incomplete and duplicated rows from ``df`` in place.

    Args:
        df: Stock data frame; it is modified in place.

    Returns:
        The same ``df`` object after cleaning.
    """
    # Report gaps before removing them so the log reflects the raw input.
    null_counts = df.isnull().sum()
    logging.info(f"Missing Values per Column:\n{null_counts}")
    df.dropna(inplace=True)

    # Duplicates are judged on column values only; the index is not compared.
    repeated_rows = df.duplicated()
    logging.info(f"Number of Duplicate Rows: {repeated_rows.sum()}")
    df.drop_duplicates(inplace=True)

    return df

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Append lag, moving-average and volatility columns derived from 'Close'.

    Args:
        df: Frame containing a 'Close' column; it is modified in place.

    Returns:
        The same ``df`` object with the new columns added and every row that
        contains a NaN removed.
    """
    close = df['Close']

    # Previous closes at 1, 5 and 10 rows back.
    for lag in (1, 5, 10):
        df[f'Lag{lag}'] = close.shift(lag)

    # Rolling means over 5, 10 and 20 rows.
    for window in (5, 10, 20):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Rolling standard deviation over 5 rows.
    df['Volatility'] = close.rolling(window=5).std()

    # The shifts and rolling windows leave NaNs in the leading rows.
    df.dropna(inplace=True)

    return df

def generate_eda_report(df: pd.DataFrame):
    """Display an annotated heatmap of the pairwise column correlations.

    Args:
        df: Feature-engineered frame whose ``corr()`` matrix is plotted.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Feature Correlation Matrix')
    plt.show()

def split_data(df: pd.DataFrame, test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the frame into ordered train/test features and targets.

    The target is the 'Close' column; every other column is a feature. Rows
    are not shuffled, so the test rows are the trailing rows of ``df``.

    Args:
        df (pd.DataFrame): The feature-engineered stock data.
        test_size (float): Fraction of the data to be used for testing.

    Returns:
        tuple: X_train, X_test, y_train, y_test. The two ``y`` items are the
        'Close' column slices.
    """
    target = df['Close']
    features = df.drop(columns=['Close'])

    return tuple(train_test_split(features, target, test_size=test_size, shuffle=False))

def train_xgboost_model(X_train, y_train):
    """Tune and fit an XGBoost regressor with a time-ordered grid search.

    Args:
        X_train: Training features, in chronological row order.
        y_train: Training targets aligned with ``X_train``.

    Returns:
        The best estimator found by the grid search, refit on all of
        ``X_train``/``y_train``.
    """
    # Imported here so the function does not depend on the cell's import list.
    from sklearn.model_selection import TimeSeriesSplit

    xg_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 1.0]
    }

    # An integer ``cv`` is an unshuffled K-fold, which fits on rows that come
    # after the validation fold. Time-series splits only validate on rows
    # later than the ones used for fitting.
    time_split = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=time_split, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)

    logging.info(f"Best Hyperparameters: {grid_search.best_params_}")
    return grid_search.best_estimator_

def forecast_stock_prices(model, X_test, steps=12):
    """Produce ``steps`` successive predictions starting from the last test row.

    Args:
        model: Fitted estimator exposing ``predict`` on a ``(1, n_features)`` array.
        X_test: Test features as a DataFrame; only the last row is used.
        steps: Number of predictions to generate.

    Returns:
        list: One prediction per step, in order.
    """
    # Seed the input with the final row of the test features, as one row.
    feature_row = X_test.iloc[-1].values[np.newaxis, :]
    forecasts = []

    for _ in range(steps):
        next_value = model.predict(feature_row)[0]
        forecasts.append(next_value)

        # Shift every feature one slot left and store the prediction in the
        # last slot.
        # NOTE(review): this treats the feature vector as a sliding window of
        # prices; confirm that matches the column layout of X_test.
        feature_row = np.roll(feature_row, -1, axis=1)
        feature_row[0, -1] = next_value

    return forecasts

def plot_forecast(y_test, predictions):
    """Plot the actual test prices followed by the forecasted prices.

    Args:
        y_test: Actual prices; its index must support adding a
            ``pd.DateOffset`` (e.g. a DatetimeIndex).
        predictions: Forecast values, placed on consecutive month-end dates
            after the last ``y_test`` date.
    """
    # An offset object is used instead of the 'M' alias, which pandas has
    # deprecated in favour of 'ME'; the offset works across versions.
    forecast_dates = pd.date_range(
        start=y_test.index[-1] + pd.DateOffset(1),
        periods=len(predictions),
        freq=pd.offsets.MonthEnd(),
    )
    forecast_series = pd.Series(predictions, index=forecast_dates)

    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, y_test, label='Actual')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', linestyle='--')
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title('Actual vs Forecasted Stock Price')
    plt.legend()
    plt.show()

def main():
    """Run the pipeline and show the forecast as a table and a chart."""
    ticker_symbol, start_date, end_date = 'TATAELXSI.NS', '2019-04-01', '2024-03-31'

    # Download -> clean -> engineer, each stage feeding the next.
    raw_prices = download_stock_data(ticker_symbol, start_date, end_date)
    features = feature_engineering(preprocess_data(raw_prices))

    # Correlation heatmap of the engineered features.
    generate_eda_report(features)

    # Ordered train/test split.
    X_train, X_test, y_train, y_test = split_data(features, test_size=0.2)
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    logging.info(f"Training set size: {n_train}, Testing set size: {n_test}")

    # Grid-searched XGBoost regressor.
    fitted_model = train_xgboost_model(X_train, y_train)

    # Twelve recursive forecast steps from the last test row.
    predictions = forecast_stock_prices(fitted_model, X_test, steps=12)

    # Log the forecast table, then chart it after the actual test prices.
    forecast_table = pd.DataFrame({'Forecast': predictions})
    logging.info(f"Forecast Summary:\n{forecast_table}")
    plot_forecast(y_test, predictions)


if __name__ == "__main__":
    main()


# Final model code 

In [None]:
import logging
import pandas as pd
import numpy as np
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price history for one ticker from Yahoo Finance.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Value forwarded as the ``start`` argument.
        end_date: Value forwarded as the ``end`` argument.

    Returns:
        The downloaded frame, re-indexed on a datetime ``Date`` column.

    Raises:
        Exception: Any error raised while downloading or re-indexing is
            logged and then re-raised unchanged.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        prices = yf.download(ticker, start=start_date, end=end_date)

        # Round-trip the index through a column so 'Date' is datetime typed.
        prices.reset_index(inplace=True)
        prices['Date'] = pd.to_datetime(prices['Date'])
        prices.set_index('Date', inplace=True)
    except Exception as exc:
        logging.error(f"Failed to download stock data: {exc}")
        raise
    else:
        return prices

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop incomplete and duplicated rows from ``df`` in place.

    Args:
        df: Stock data frame; it is modified in place.

    Returns:
        The same ``df`` object after cleaning.
    """
    # Report gaps before removing them so the log reflects the raw input.
    null_counts = df.isnull().sum()
    logging.info(f"Missing Values per Column:\n{null_counts}")
    df.dropna(inplace=True)

    # Duplicates are judged on column values only; the index is not compared.
    repeated_rows = df.duplicated()
    logging.info(f"Number of Duplicate Rows: {repeated_rows.sum()}")
    df.drop_duplicates(inplace=True)

    return df

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Append lag, moving-average and volatility columns derived from 'Close'.

    Args:
        df: Frame containing a 'Close' column; it is modified in place.

    Returns:
        The same ``df`` object with the new columns added and every row that
        contains a NaN removed.
    """
    close = df['Close']

    # Previous closes at 1, 5 and 10 rows back.
    for lag in (1, 5, 10):
        df[f'Lag{lag}'] = close.shift(lag)

    # Rolling means over 5, 10 and 20 rows.
    for window in (5, 10, 20):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Rolling standard deviation over 5 rows.
    df['Volatility'] = close.rolling(window=5).std()

    # The shifts and rolling windows leave NaNs in the leading rows.
    df.dropna(inplace=True)

    return df

def generate_eda_report(df: pd.DataFrame):
    """Display an annotated heatmap of the pairwise column correlations.

    Args:
        df: Feature-engineered frame whose ``corr()`` matrix is plotted.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Feature Correlation Matrix')
    plt.show()

def split_data(df: pd.DataFrame, test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the frame into leading (train) and trailing (test) rows.

    The target is the 'Close' column; every other column is a feature.

    Args:
        df: The feature-engineered stock data.
        test_size: Fraction of rows placed in the test portion.

    Returns:
        tuple: X_train, X_test, y_train, y_test. The two ``y`` items are the
        'Close' column slices.
    """
    target = df['Close']
    features = df.drop(columns=['Close'])

    # Row position where the training portion ends (truncated toward zero).
    cutoff = int(len(features) * (1 - test_size))
    head, tail = slice(None, cutoff), slice(cutoff, None)

    return features[head], features[tail], target[head], target[tail]

def train_xgboost_model(X_train, y_train):
    """Tune and fit an XGBoost regressor with a time-series grid search.

    Args:
        X_train: Training features.
        y_train: Training targets aligned with ``X_train``.

    Returns:
        The best estimator found by the grid search.
    """
    search_space = {
        'n_estimators': [100, 300, 500],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1, 0.3],
        'subsample': [0.6, 0.8, 1.0]
    }

    # Every grid point is scored with five time-series splits on negative MSE.
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(objective='reg:squarederror', seed=42),
        param_grid=search_space,
        cv=TimeSeriesSplit(n_splits=5),
        scoring='neg_mean_squared_error',
        verbose=1,
    )
    search.fit(X_train, y_train)

    logging.info(f"Best Hyperparameters: {search.best_params_}")
    return search.best_estimator_

def forecast_stock_prices(model, X_test, steps=12):
    """Produce ``steps`` successive predictions starting from the last test row.

    Args:
        model: Fitted estimator exposing ``predict`` on a ``(1, n_features)`` array.
        X_test: Test features as a DataFrame; only the last row is used.
        steps: Number of predictions to generate.

    Returns:
        list: One prediction per step, in order.
    """
    # Seed the input with the final row of the test features, as one row.
    feature_row = X_test.iloc[-1].values[np.newaxis, :]
    forecasts = []

    for _ in range(steps):
        next_value = model.predict(feature_row)[0]
        forecasts.append(next_value)

        # Shift every feature one slot left and store the prediction in the
        # last slot.
        # NOTE(review): this treats the feature vector as a sliding window of
        # prices; confirm that matches the column layout of X_test.
        feature_row = np.roll(feature_row, -1, axis=1)
        feature_row[0, -1] = next_value

    return forecasts

def plot_forecast(y_test, predictions):
    """Plot the actual and forecasted stock prices using Plotly.

    Args:
        y_test: Actual prices; its index must support adding a
            ``pd.DateOffset`` (e.g. a DatetimeIndex).
        predictions: Forecast values, placed on consecutive month-end dates
            after the last ``y_test`` date.
    """
    # An offset object is used instead of the 'M' alias, which pandas has
    # deprecated in favour of 'ME'; the offset works across versions.
    forecast_dates = pd.date_range(
        start=y_test.index[-1] + pd.DateOffset(1),
        periods=len(predictions),
        freq=pd.offsets.MonthEnd(),
    )
    forecast_series = pd.Series(predictions, index=forecast_dates)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=y_test.index, y=y_test, mode='lines', name='Actual'))
    fig.add_trace(go.Scatter(x=forecast_series.index, y=forecast_series, mode='lines', name='Forecast'))
    fig.update_layout(title='Actual vs Forecasted Stock Price',
                      xaxis_title='Date', yaxis_title='Stock Price')
    fig.show()

def summarize_forecast(predictions):
    """Log descriptive statistics of the forecast values.

    Args:
        predictions: Sequence of forecast values.
    """
    stats = pd.Series(predictions).describe()
    logging.info(f"Forecast Summary:\n{stats}")

def evaluate_model_performance(y_test, predictions):
    """Print and log MAE, RMSE and R-squared for the forecast.

    The last ``k`` values of ``y_test`` are compared positionally with the
    first ``k`` predictions, where ``k`` is the shorter of the two lengths.
    NOTE(review): the predictions are generated for periods after the test
    set, so confirm that this positional comparison is the intended measure.

    Args:
        y_test: Actual prices (Series or array-like).
        predictions: Forecast values (sequence of numbers).

    Raises:
        ValueError: If either input is empty.
    """
    common_length = min(len(y_test), len(predictions))
    if common_length == 0:
        raise ValueError("y_test and predictions must each contain at least one value.")

    # Compare plain arrays so the differing date indexes play no part.
    actual = np.asarray(y_test, dtype=float)[len(y_test) - common_length:]
    forecast = np.asarray(predictions, dtype=float)[:common_length]

    # Metrics are computed with NumPy; the scikit-learn metric helpers the
    # earlier version called were never imported.
    errors = actual - forecast
    squared_error_sum = float(np.sum(errors ** 2))
    total_variation = float(np.sum((actual - actual.mean()) ** 2))

    mae = float(np.mean(np.abs(errors)))
    rmse = float(np.sqrt(squared_error_sum / common_length))
    if total_variation > 0:
        r2 = 1.0 - squared_error_sum / total_variation
    else:
        # Constant actuals: report a perfect score only for a perfect fit.
        r2 = 1.0 if squared_error_sum == 0 else 0.0

    print("\nModel Performance Summary:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R-squared (R2): {r2:.2f}")

    logging.info(f"Model Performance:\n"
                 f"Mean Absolute Error (MAE): {mae:.2f}\n"
                 f"Root Mean Squared Error (RMSE): {rmse:.2f}\n"
                 f"R-squared (R2): {r2:.2f}")

def main():
    """Run the pipeline, then summarize, chart and score the forecast."""
    ticker_symbol, start_date, end_date = 'TATAELXSI.NS', '2019-04-01', '2024-03-31'

    # Download -> clean -> engineer, each stage feeding the next.
    raw_prices = download_stock_data(ticker_symbol, start_date, end_date)
    features = feature_engineering(preprocess_data(raw_prices))

    # Correlation heatmap of the engineered features.
    generate_eda_report(features)

    # Ordered train/test split.
    X_train, X_test, y_train, y_test = split_data(features, test_size=0.2)
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    logging.info(f"Training set size: {n_train}, Testing set size: {n_test}")

    # Grid-searched XGBoost regressor.
    fitted_model = train_xgboost_model(X_train, y_train)

    # Twelve recursive forecast steps from the last test row.
    predictions = forecast_stock_prices(fitted_model, X_test, steps=12)

    # Descriptive statistics, chart, then error metrics.
    summarize_forecast(predictions)
    plot_forecast(y_test, predictions)
    evaluate_model_performance(y_test, predictions)


if __name__ == "__main__":
    main()


In [None]:
# Model without the accuracy evaluation step

In [None]:
import logging
import pandas as pd
import numpy as np
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

# Set up logging. The timestamp field must be written as %(asctime)s; the
# earlier '%(asctime=s' was not a valid format field.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

def download_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch price history for one ticker from Yahoo Finance.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Value forwarded as the ``start`` argument.
        end_date: Value forwarded as the ``end`` argument.

    Returns:
        The downloaded frame, re-indexed on a datetime ``Date`` column.

    Raises:
        Exception: Any error raised while downloading or re-indexing is
            logged and then re-raised unchanged.
    """
    try:
        logging.info(f"Downloading stock data for ticker: {ticker}")
        prices = yf.download(ticker, start=start_date, end=end_date)

        # Round-trip the index through a column so 'Date' is datetime typed.
        prices.reset_index(inplace=True)
        prices['Date'] = pd.to_datetime(prices['Date'])
        prices.set_index('Date', inplace=True)
    except Exception as exc:
        logging.error(f"Failed to download stock data: {exc}")
        raise
    else:
        return prices

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop incomplete and duplicated rows from ``df`` in place.

    Args:
        df: Stock data frame; it is modified in place.

    Returns:
        The same ``df`` object after cleaning.
    """
    # Report gaps before removing them so the log reflects the raw input.
    null_counts = df.isnull().sum()
    logging.info(f"Missing Values per Column:\n{null_counts}")
    df.dropna(inplace=True)

    # Duplicates are judged on column values only; the index is not compared.
    repeated_rows = df.duplicated()
    logging.info(f"Number of Duplicate Rows: {repeated_rows.sum()}")
    df.drop_duplicates(inplace=True)

    return df

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Append lag, moving-average and volatility columns derived from 'Close'.

    Args:
        df: Frame containing a 'Close' column; it is modified in place.

    Returns:
        The same ``df`` object with the new columns added and every row that
        contains a NaN removed.
    """
    close = df['Close']

    # Previous closes at 1, 5 and 10 rows back.
    for lag in (1, 5, 10):
        df[f'Lag{lag}'] = close.shift(lag)

    # Rolling means over 5, 10 and 20 rows.
    for window in (5, 10, 20):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Rolling standard deviation over 5 rows.
    df['Volatility'] = close.rolling(window=5).std()

    # The shifts and rolling windows leave NaNs in the leading rows.
    df.dropna(inplace=True)

    return df

def generate_eda_report(df: pd.DataFrame):
    """Display an annotated heatmap of the pairwise column correlations.

    Args:
        df: Feature-engineered frame whose ``corr()`` matrix is plotted.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Feature Correlation Matrix')
    plt.show()

def split_data(df: pd.DataFrame, test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the frame into ordered train/test features and targets.

    The target is the 'Close' column; every other column is a feature. Rows
    are not shuffled, so the test rows are the trailing rows of ``df``.

    Args:
        df (pd.DataFrame): The feature-engineered stock data.
        test_size (float): Fraction of the data to be used for testing.

    Returns:
        tuple: X_train, X_test, y_train, y_test. The two ``y`` items are the
        'Close' column slices.
    """
    target = df['Close']
    features = df.drop(columns=['Close'])

    return tuple(train_test_split(features, target, test_size=test_size, shuffle=False))

def train_xgboost_model(X_train, y_train):
    """Tune and fit an XGBoost regressor with a time-ordered grid search.

    Args:
        X_train: Training features, in chronological row order.
        y_train: Training targets aligned with ``X_train``.

    Returns:
        The best estimator found by the grid search, refit on all of
        ``X_train``/``y_train``.
    """
    # Imported here so the function does not depend on the cell's import list.
    from sklearn.model_selection import TimeSeriesSplit

    xg_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 1.0]
    }

    # An integer ``cv`` is an unshuffled K-fold, which fits on rows that come
    # after the validation fold. Time-series splits only validate on rows
    # later than the ones used for fitting.
    time_split = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=time_split, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)

    logging.info(f"Best Hyperparameters: {grid_search.best_params_}")
    return grid_search.best_estimator_

def forecast_stock_prices(model, X_test, steps=12):
    """Produce ``steps`` successive predictions starting from the last test row.

    Args:
        model: Fitted estimator exposing ``predict`` on a ``(1, n_features)`` array.
        X_test: Test features as a DataFrame; only the last row is used.
        steps: Number of predictions to generate.

    Returns:
        list: One prediction per step, in order.
    """
    # Seed the input with the final row of the test features, as one row.
    feature_row = X_test.iloc[-1].values[np.newaxis, :]
    forecasts = []

    for _ in range(steps):
        next_value = model.predict(feature_row)[0]
        forecasts.append(next_value)

        # Shift every feature one slot left and store the prediction in the
        # last slot.
        # NOTE(review): this treats the feature vector as a sliding window of
        # prices; confirm that matches the column layout of X_test.
        feature_row = np.roll(feature_row, -1, axis=1)
        feature_row[0, -1] = next_value

    return forecasts

def plot_forecast(y_test, predictions):
    """Plot the actual and forecasted stock prices using Plotly.

    Args:
        y_test: Actual prices; its index must support adding a
            ``pd.DateOffset`` (e.g. a DatetimeIndex).
        predictions: Forecast values, placed on consecutive month-end dates
            after the last ``y_test`` date.
    """
    # An offset object is used instead of the 'M' alias, which pandas has
    # deprecated in favour of 'ME'; the offset works across versions.
    forecast_dates = pd.date_range(
        start=y_test.index[-1] + pd.DateOffset(1),
        periods=len(predictions),
        freq=pd.offsets.MonthEnd(),
    )
    forecast_series = pd.Series(predictions, index=forecast_dates)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=y_test.index, y=y_test, mode='lines', name='Actual'))
    fig.add_trace(go.Scatter(x=forecast_series.index, y=forecast_series, mode='lines', name='Forecast'))
    fig.update_layout(title='Actual vs Forecasted Stock Price',
                      xaxis_title='Date', yaxis_title='Stock Price')
    fig.show()

def summarize_forecast(predictions):
    """Log descriptive statistics of the forecast values.

    Args:
        predictions: Sequence of forecast values.
    """
    stats = pd.Series(predictions).describe()
    logging.info(f"Forecast Summary:\n{stats}")

def main():
    """Run the pipeline, then summarize and chart the forecast."""
    ticker_symbol, start_date, end_date = 'TATAELXSI.NS', '2019-04-01', '2024-03-31'

    # Download -> clean -> engineer, each stage feeding the next.
    raw_prices = download_stock_data(ticker_symbol, start_date, end_date)
    features = feature_engineering(preprocess_data(raw_prices))

    # Correlation heatmap of the engineered features.
    generate_eda_report(features)

    # Ordered train/test split.
    X_train, X_test, y_train, y_test = split_data(features, test_size=0.2)
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    logging.info(f"Training set size: {n_train}, Testing set size: {n_test}")

    # Grid-searched XGBoost regressor.
    fitted_model = train_xgboost_model(X_train, y_train)

    # Twelve recursive forecast steps from the last test row.
    predictions = forecast_stock_prices(fitted_model, X_test, steps=12)

    # Descriptive statistics, then the chart.
    summarize_forecast(predictions)
    plot_forecast(y_test, predictions)


if __name__ == "__main__":
    main()


# Comparing Models