<a href="https://colab.research.google.com/github/sifatbhuiyan0909/Dhaka-Finance-Navigator/blob/main/notebooks/09_model_prep_and_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import files
uploaded= files.upload()

Saving Dhaka-Stock-Exchange-DSE-2021.csv to Dhaka-Stock-Exchange-DSE-2021.csv
Saving Dhaka-Stock-Exchange-DSE-2020.csv to Dhaka-Stock-Exchange-DSE-2020.csv
Saving Dhaka-Stock-Exchange-DSE-2019.csv to Dhaka-Stock-Exchange-DSE-2019.csv
Saving Dhaka-Stock-Exchange-DSE-2018.csv to Dhaka-Stock-Exchange-DSE-2018.csv
Saving Dhaka-Stock-Exchange-DSE-2017.csv to Dhaka-Stock-Exchange-DSE-2017.csv
Saving Dhaka-Stock-Exchange-DSE-2016.csv to Dhaka-Stock-Exchange-DSE-2016.csv
Saving Dhaka-Stock-Exchange-DSE-2015.csv to Dhaka-Stock-Exchange-DSE-2015.csv
Saving Dhaka-Stock-Exchange-DSE-2014.csv to Dhaka-Stock-Exchange-DSE-2014.csv
Saving Dhaka-Stock-Exchange-DSE-2013.csv to Dhaka-Stock-Exchange-DSE-2013.csv
Saving Dhaka-Stock-Exchange-DSE-2012.csv to Dhaka-Stock-Exchange-DSE-2012.csv
Saving Dhaka-Stock-Exchange-DSE-2011.csv to Dhaka-Stock-Exchange-DSE-2011.csv
Saving Dhaka-Stock-Exchange-DSE-2010.csv to Dhaka-Stock-Exchange-DSE-2010.csv
Saving Dhaka-Stock-Exchange-DSE-2009.csv to Dhaka-Stock-Exchange

In [1]:
!pip install pandas_ta



In [2]:
import pandas as pd
import glob
import os
import numpy as np
import pandas_ta as ta

# --- 1. Day 3: Data Acquisition and Merging ---
print("--- Day 3: Merging Data ---")
# Assumes CSV files are in the same directory
all_filenames = glob.glob(os.path.join('.', '*.csv'))
master_df = pd.concat([pd.read_csv(f, header=None) for f in all_filenames], ignore_index=True)
master_df.columns = ['Ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']
df = master_df.copy()

# --- 2. Day 4: Structuring, Cleaning Types, and Fixing Duplicates ---
print("\n--- Day 4: Structuring and Cleaning Types ---")
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Clean and convert numerical columns
for col in ['Open', 'Low', 'Volume']:
    df[col] = df[col].astype(str).str.replace('-', '', regex=False)
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Critical Fix: Drop duplicate Date/Ticker entries before setting index.
df = df.drop_duplicates(subset=['Date', 'Ticker'], keep='first')
df = df.set_index('Date')
print(f"Duplicates dropped. Data size: {df.shape}")

# --- 3. Day 5: Missing Data Imputation ---
print("\n--- Day 5: Missing Data Imputation ---")
cols_to_fill = ['Open', 'High', 'Low', 'Close', 'Volume']
df[cols_to_fill] = df[cols_to_fill].ffill()

# --- 4. Day 6: Outlier Detection and Smoothing ---
print("\n--- Day 6: Outlier Correction ---")
df['Daily_Return'] = df['Close'].pct_change()
mu = df['Daily_Return'].mean()
sigma = df['Daily_Return'].std()
outlier_mask = (df['Daily_Return'] < mu - 3 * sigma) | (df['Daily_Return'] > mu + 3 * sigma)
df.loc[outlier_mask, ['Open', 'High', 'Low', 'Close', 'Volume']] = None
df[['Open', 'High', 'Low', 'Close', 'Volume']] = df[['Open', 'High', 'Low', 'Close', 'Volume']].ffill()
df = df.drop(columns=['Daily_Return'])

# --- 5. Day 7: Base Feature Engineering ---
print("\n--- Day 7: Base Feature Engineering ---")
df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))
df['SMA_5'] = df['Close'].rolling(window=5).mean()
df['SMA_20'] = df['Close'].rolling(window=20).mean()

# --- 6. Day 8: Advanced Feature Engineering (Indicators & Lag) - USING GROUPBY FIX ---
print("\n--- Day 8: Advanced Feature Engineering (Indicators & Lag) ---")

# Prepare for Groupby: Reset index so Ticker is a column we can group by.
df = df.reset_index()

# Define the function to apply all final calculations to a single Ticker's data
def create_advanced_features(group):

    # Calculate RSI
    group['RSI'] = ta.rsi(close=group['Close'], length=14)

    # Calculate MACD
    macd_result = ta.macd(close=group['Close'])

    # --- CRITICAL FIX FOR ATTRIBUTE ERROR ---
    # Check if MACD calculation succeeded (it returns None if data is too short)
    if macd_result is not None:
        # Check for the primary MACD line, falling back if naming convention is different
        if 'MACD_12_26_9' in macd_result.columns:
            group['MACD'] = macd_result['MACD_12_26_9']
        else:
            # Fallback: selects the first column which is usually the main MACD line
            group['MACD'] = macd_result.iloc[:, 0]
    else:
        # If MACD calculation failed, fill with NaN
        group['MACD'] = np.nan

    # Lagged Log Return
    group['Lag_Log_Return'] = group['Log_Return'].shift(1)

    # Drop NaNs created by indicators/lagging *within this group*
    return group.dropna()

# Apply the function to every Ticker and concatenate the results back.
df = df.groupby('Ticker', group_keys=False).apply(create_advanced_features)

# Final Structuring: Set the Date back as the index
df = df.set_index('Date')

# --- 7. Final Summary ---
print("\n--- Final Summary ---")
print("DataFrame is completely cleaned and feature-engineered.")
print(f"Final Data Shape: {df.shape}")


--- Day 3: Merging Data ---

--- Day 4: Structuring and Cleaning Types ---
Duplicates dropped. Data size: (1198083, 6)

--- Day 5: Missing Data Imputation ---

--- Day 6: Outlier Correction ---

--- Day 7: Base Feature Engineering ---

--- Day 8: Advanced Feature Engineering (Indicators & Lag) ---

--- Final Summary ---
DataFrame is completely cleaned and feature-engineered.
Final Data Shape: (1185203, 12)


  df = df.groupby('Ticker', group_keys=False).apply(create_advanced_features)
  df = df.groupby('Ticker', group_keys=False).apply(create_advanced_features)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

print("--- Day 9: Target Variable Creation and Data Splitting ---")

# --- 1. Target Variable Creation (Binary Classifier) ---
def create_target(group):
    # Shift(-1) looks one period ahead to get the future price.
    future_price = group['Close'].shift(-1)

    # Target is 1 if the future price is higher than today's price (Close).
    target = (future_price > group['Close']).astype(int)

    # Drop the last row of each group, as it has no future price (NaN target).
    return group.assign(Target=target).dropna(subset=['Target'])

# Apply the function across all Tickers (using groupby to respect time sequence per stock)
df = df.groupby('Ticker', group_keys=False).apply(create_target)

# --- 2. Define Features (X) and Target (y) ---
# X includes all engineered features (Log_Return, SMAs, RSI, MACD, Lag_Log_Return)
X = df.drop(columns=['Target', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker'])
y = df['Target']

# --- 3. Split Data (Chronological Split) ---
# test_size=0.2 means 80% of data (older) is for training, 20% (newer) for testing.
# shuffle=False is critical for time-series data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False, stratify=None
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


--- Day 9: Target Variable Creation and Data Splitting ---
X_train shape: (948162, 6)
X_test shape: (237041, 6)


  df = df.groupby('Ticker', group_keys=False).apply(create_target)


though frustrating, we are going to learn machine learning babyy