<a href="https://colab.research.google.com/github/sifatbhuiyan0909/Dhaka-Finance-Navigator/blob/main/notebooks/Hyperparameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files
uploaded= files.upload()

Saving Dhaka-Stock-Exchange-DSE-2021.csv to Dhaka-Stock-Exchange-DSE-2021.csv
Saving Dhaka-Stock-Exchange-DSE-2020.csv to Dhaka-Stock-Exchange-DSE-2020.csv
Saving Dhaka-Stock-Exchange-DSE-2019.csv to Dhaka-Stock-Exchange-DSE-2019.csv
Saving Dhaka-Stock-Exchange-DSE-2018.csv to Dhaka-Stock-Exchange-DSE-2018.csv
Saving Dhaka-Stock-Exchange-DSE-2017.csv to Dhaka-Stock-Exchange-DSE-2017.csv
Saving Dhaka-Stock-Exchange-DSE-2016.csv to Dhaka-Stock-Exchange-DSE-2016.csv
Saving Dhaka-Stock-Exchange-DSE-2015.csv to Dhaka-Stock-Exchange-DSE-2015.csv
Saving Dhaka-Stock-Exchange-DSE-2014.csv to Dhaka-Stock-Exchange-DSE-2014.csv
Saving Dhaka-Stock-Exchange-DSE-2013.csv to Dhaka-Stock-Exchange-DSE-2013.csv
Saving Dhaka-Stock-Exchange-DSE-2012.csv to Dhaka-Stock-Exchange-DSE-2012.csv
Saving Dhaka-Stock-Exchange-DSE-2011.csv to Dhaka-Stock-Exchange-DSE-2011.csv
Saving Dhaka-Stock-Exchange-DSE-2010.csv to Dhaka-Stock-Exchange-DSE-2010.csv
Saving Dhaka-Stock-Exchange-DSE-2009.csv to Dhaka-Stock-Exchange

In [3]:
import pandas as pd
import glob
import os
import numpy as np
import pandas_ta as ta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# --- DATA PREPARATION: Days 3-8 ---
print("--- 1. Initializing Data Pipeline (Days 3-8) ---")
all_filenames = glob.glob(os.path.join('.', '*.csv'))
master_df = pd.concat([pd.read_csv(f, header=None) for f in all_filenames], ignore_index=True)
master_df.columns = ['Ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']
df = master_df.copy()

# Day 4: Cleaning and Structuring
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
for col in ['Open', 'Low', 'Volume']:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace('-', '', regex=False), errors='coerce')
df = df.drop_duplicates(subset=['Date', 'Ticker'], keep='first')
df = df.set_index('Date')

# Day 5 & 6: Imputation and Outlier Correction
cols_to_fill = ['Open', 'High', 'Low', 'Close', 'Volume']
df[cols_to_fill] = df[cols_to_fill].ffill()
df['Daily_Return'] = df['Close'].pct_change()
mu, sigma = df['Daily_Return'].mean(), df['Daily_Return'].std()
outlier_mask = (df['Daily_Return'] < mu - 3 * sigma) | (df['Daily_Return'] > mu + 3 * sigma)
df.loc[outlier_mask, cols_to_fill] = np.nan
df[cols_to_fill] = df[cols_to_fill].ffill()
df = df.drop(columns=['Daily_Return'])

# Day 7: Base Feature Engineering
df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))
df['SMA_5'] = df['Close'].rolling(window=5).mean()
df['SMA_20'] = df['Close'].rolling(window=20).mean()

# Day 8: Advanced Feature Engineering (Groupby Fix)
df = df.reset_index()
def create_advanced_features(group):
    group['RSI'] = ta.rsi(close=group['Close'], length=14)
    macd_result = ta.macd(close=group['Close'])
    if macd_result is not None:
        if 'MACD_12_26_9' in macd_result.columns:
            group['MACD'] = macd_result['MACD_12_26_9']
        else:
            group['MACD'] = macd_result.iloc[:, 0]
    else:
        group['MACD'] = np.nan
    group['Lag_Log_Return'] = group['Log_Return'].shift(1)
    return group.dropna()
df = df.groupby('Ticker', group_keys=False).apply(create_advanced_features)
df = df.set_index('Date')
print("--- Data Preparation (Days 3-8) Complete ---")

# --- MODEL PREPARATION: Day 9 ---
print("\n--- 2. Day 9: Target Creation and Data Splitting ---")
def create_target(group):
    future_price = group['Close'].shift(-1)
    target = (future_price > group['Close']).astype(int)
    return group.assign(Target=target).dropna(subset=['Target'])

df = df.groupby('Ticker', group_keys=False).apply(create_target)

# Define X (Features) and y (Target)
X = df.drop(columns=['Target', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker'])
y = df['Target']

# Chronological Split (shuffle=False is essential for time series)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False, stratify=None
)
print("--- Data Split (Day 9) Complete. ---")

# --- MODEL TRAINING: Day 10 ---
print("\n--- 3. Day 10: Model Training and Evaluation (Baseline) ---")

# 1. Initialize and Train the Model
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
print("Training Random Forest Classifier on historical data...")
model.fit(X_train, y_train)

# 2. Predict on the Test Set
y_pred = model.predict(X_test)

# 3. Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print("\n--- Evaluation Results ---")
print(f"Overall Accuracy on Test Set: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(report)

print("\n--- Pipeline Execution (Days 3-10) Complete. ---")

--- 1. Initializing Data Pipeline (Days 3-8) ---


  df = df.groupby('Ticker', group_keys=False).apply(create_advanced_features)
  df = df.groupby('Ticker', group_keys=False).apply(create_advanced_features)


--- Data Preparation (Days 3-8) Complete ---

--- 2. Day 9: Target Creation and Data Splitting ---


  df = df.groupby('Ticker', group_keys=False).apply(create_target)


--- Data Split (Day 9) Complete. ---

--- 3. Day 10: Model Training and Evaluation (Baseline) ---
Training Random Forest Classifier on historical data...

--- Evaluation Results ---
Overall Accuracy on Test Set: 57.78%

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.79      0.69    139885
           1       0.47      0.27      0.35     97156

    accuracy                           0.58    237041
   macro avg       0.54      0.53      0.52    237041
weighted avg       0.55      0.58      0.55    237041


--- Pipeline Execution (Days 3-10) Complete. ---


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
# X_train, y_train, X_test, y_test are already in memory from your previous run.

print("\n--- Day 11: Hyperparameter Tuning with GridSearchCV ---")

# 1. Define the Parameter Grid
# The search will test all 8 combinations of these parameters.
param_grid = {
    'n_estimators': [50, 100],      # Number of trees (voters)
    'max_depth': [5, 10],           # Max depth of each tree (limits complexity)
    'min_samples_leaf': [5, 10]     # Min samples required at a leaf (makes rules stricter)
}

# 2. Initialize Grid Search (The tuning mechanism)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='accuracy', # The metric to optimize during the search
    cv=3,               # Cross-validation folds
    verbose=1
)

print("Starting Grid Search to find optimal parameters...")
grid_search.fit(X_train, y_train)

# 3. Get the Best Model and Evaluate
best_model = grid_search.best_estimator_

print("\nBest Parameters Found:")
print(grid_search.best_params_)

y_pred_tuned = best_model.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

print(f"\nTuned Model Accuracy: {accuracy_tuned * 100:.2f}%")
print("\nTuned Classification Report:")
print(classification_report(y_test, y_pred_tuned, zero_division=0))


--- Day 11: Hyperparameter Tuning with GridSearchCV ---
Starting Grid Search to find optimal parameters...
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Best Parameters Found:
{'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 100}

Tuned Model Accuracy: 59.84%

Tuned Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.95      0.74    139885
           1       0.56      0.10      0.16     97156

    accuracy                           0.60    237041
   macro avg       0.58      0.52      0.45    237041
weighted avg       0.58      0.60      0.50    237041

