# Structured Modeling Notebook
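This notebook is organized into configuration, data preprocessing, three modeling sections (logistic regression with OLS, random forest, and XGBoost), and a final evaluation section that runs all models and plots confusion matrices.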

## Configuration

In [None]:
# Root directory that holds the input spreadsheets (placeholder: point it at
# the real data location before running the notebook).
data_folder = "./your_data_path_here"
# Broker identifier; it is interpolated into the workbook file names.
broker = "UBS"

## Data Preprocessing

In [None]:

import pandas as pd
import os

def load_and_preprocess_data(broker, data_folder):
    """Read the input workbooks and merge broker and stock data on ``日期``.

    Args:
        broker: Broker identifier used to build the workbook file names.
        data_folder: Root data directory. The stock and broker workbooks are
            read from its ``raw data`` sub-folder, the treasury workbook from
            the root itself.

    Returns:
        The outer merge of the broker workbook (left) with the stock workbook
        (right) on the ``日期`` column.
    """
    raw_dir = os.path.join(data_folder, "raw data")

    stock_df = pd.read_excel(
        os.path.join(raw_dir, f"stock_{broker}.xlsx"), engine="openpyxl"
    )
    broker_df = pd.read_excel(
        os.path.join(raw_dir, f"{broker}_{broker}.xlsx"), engine="openpyxl"
    )
    treasury_df = pd.read_excel(
        os.path.join(data_folder, "10year_treasury.xlsx"), engine="openpyxl"
    )

    # Normalise the treasury dates to ``YYYY-MM-DD`` strings.
    # NOTE(review): this frame is not part of the returned result; confirm
    # whether it is meant to be joined in as well.
    parsed_dates = pd.to_datetime(
        treasury_df["date"], format="%Y-%m-%d", yearfirst=True
    )
    treasury_df["date"] = parsed_dates.dt.strftime("%Y-%m-%d")

    # Add more cleaning if necessary
    return broker_df.merge(stock_df, on=["日期"], how="outer")


## Logistic and OLS Modeling

In [None]:

from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

def run_logistic_ols(df):
    """Fit a logistic up/down classifier and an OLS regression of the close.

    The binary label is 1 when ``收盤價`` is strictly greater than the previous
    row's ``收盤價`` and 0 otherwise. Rows whose previous close is missing
    (always the first row) have no defined label and are excluded, as are
    rows containing any NaN. The caller's DataFrame is not modified.

    Args:
        df: Frame with ``開盤價``, ``最高價``, ``最低價`` and ``收盤價`` columns.
            Rows are assumed to be in chronological order (TODO confirm:
            nothing in this function sorts them).

    Returns:
        ``(model_logit, model_ols)``: the fitted ``LogisticRegression`` and
        the fitted statsmodels OLS results for ``收盤價`` on the three price
        features plus a constant.
    """
    close = df['收盤價']
    prev_close = close.shift(1)

    # ``assign`` builds a new frame, so the input is left untouched and can be
    # safely reused by the other model functions.
    data = df.assign(label=(close > prev_close).astype(int))
    # A comparison against NaN is False and would silently become label 0;
    # drop those rows rather than train on a fabricated "down" move.
    data = data[prev_close.notna().to_numpy()].dropna()

    X = data[['開盤價', '最高價', '最低價']]
    y = data['label']

    # Logistic Regression
    model_logit = LogisticRegression()
    model_logit.fit(X, y)

    # OLS model
    X_ols = sm.add_constant(X)
    model_ols = sm.OLS(data['收盤價'], X_ols).fit()

    return model_logit, model_ols


## Random Forest Modeling

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def run_random_forest(df):
    """Train a random-forest classifier for the up/down close label.

    The binary label is 1 when ``收盤價`` is strictly greater than the previous
    row's ``收盤價`` and 0 otherwise. Rows whose previous close is missing
    (always the first row) have no defined label and are excluded, as are
    rows containing any NaN. The caller's DataFrame is not modified.

    Args:
        df: Frame with ``開盤價``, ``最高價``, ``最低價`` and ``收盤價`` columns.
            Rows are assumed to be in chronological order (TODO confirm:
            nothing in this function sorts them).

    Returns:
        ``(rf, X_test, y_test)``: the fitted classifier and the 20% hold-out
        features and labels from a split seeded with ``random_state=42``.
    """
    close = df['收盤價']
    prev_close = close.shift(1)

    # ``assign`` builds a new frame, so the input is left untouched and can be
    # safely reused by the other model functions.
    data = df.assign(label=(close > prev_close).astype(int))
    # A comparison against NaN is False and would silently become label 0;
    # drop those rows rather than train on a fabricated "down" move.
    data = data[prev_close.notna().to_numpy()].dropna()

    X = data[['開盤價', '最高價', '最低價']]
    y = data['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    return rf, X_test, y_test


## XGBoost Modeling

In [None]:

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def run_xgboost(df):
    """Train an XGBoost classifier for the up/down close label and score it.

    The binary label is 1 when ``收盤價`` is strictly greater than the previous
    row's ``收盤價`` and 0 otherwise. Rows whose previous close is missing
    (always the first row) have no defined label and are excluded, as are
    rows containing any NaN. The caller's DataFrame is not modified.

    Args:
        df: Frame with ``開盤價``, ``最高價``, ``最低價`` and ``收盤價`` columns.
            Rows are assumed to be in chronological order (TODO confirm:
            nothing in this function sorts them).

    Returns:
        ``(model, X_test, y_test, acc)``: the fitted classifier, the 20%
        hold-out features and labels from a split seeded with
        ``random_state=42``, and the accuracy on that hold-out set.
    """
    close = df['收盤價']
    prev_close = close.shift(1)

    # ``assign`` builds a new frame, so the input is left untouched and can be
    # safely reused by the other model functions.
    data = df.assign(label=(close > prev_close).astype(int))
    # A comparison against NaN is False and would silently become label 0;
    # drop those rows rather than train on a fabricated "down" move.
    data = data[prev_close.notna().to_numpy()].dropna()

    X = data[['開盤價', '最高價', '最低價']]
    y = data['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # NOTE(review): ``use_label_encoder`` appears to be deprecated in recent
    # XGBoost releases; confirm against the installed version before removing.
    model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    return model, X_test, y_test, acc



## Run All Models

In [None]:

# Load data
df = load_and_preprocess_data(broker, data_folder)

# Run models
logit_model, ols_model           = run_logistic_ols(df)
rf_model,  X_test, y_test        = run_random_forest(df)
xgb_model, X_xgb, y_xgb, xgb_acc = run_xgboost(df)

# The reporting and plotting cells read the random-forest hold-out split as
# X_rf / y_rf and its accuracy as rf_acc, none of which run_random_forest
# returns under those names. Bind them here (X_test / y_test are kept as well)
# so those cells do not fail with NameError.
X_rf, y_rf = X_test, y_test
rf_acc = rf_model.score(X_rf, y_rf)


In [None]:
# Show summary for OLS
print(ols_model.summary())

# Show summary for Random Forest
# run_random_forest returns only the model and its hold-out split, so the
# accuracy is computed here from that split before it is printed.
rf_acc = rf_model.score(X_test, y_test)
print(f"Random Forest Accuracy: {rf_acc:.4f}")

# Show summary for XGBoost
print(f"XGBoost Accuracy: {xgb_acc:.4f}")

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confmat(y_true, y_pred, title="Confusion Matrix"):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Title shown above the heatmap.
    """
    counts = confusion_matrix(y_true, y_pred)

    # ``heatmap`` returns the Axes it drew on; label that Axes directly.
    ax = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    plt.tight_layout()
    plt.show()

In [None]:
# The random-forest hold-out split is bound to X_test / y_test in the
# "Run All Models" cell, so use those names for the random-forest plot.
plot_confmat(y_test, rf_model.predict(X_test), title="Random Forest")
plot_confmat(y_xgb, xgb_model.predict(X_xgb), title="XGBoost")