# In [None]:
import os
import pandas as pd
import numpy as np
import zipfile
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

# === STEP 1: SET FILE PATHS ===
# Inputs: the zipped per-company dividend CSVs and the combined price CSV.
zip_path = "/content/dividend_data_per_company_5 (3) (1).zip"
yfinance_path = "/content/combined_stock_data.csv"
# Directory the archive is unpacked into, and the CSV written at the end of the script.
extract_dir = "/content/data_dividend_extracted"
out_file = "/content/ml_first_post_announcement_AugSep2025_LR_BESTFEAT.csv"

# === STEP 2: EXTRACT ZIP ===
# Unpack only when the target path does not exist yet; an existing path is reused as-is.
already_extracted = os.path.exists(extract_dir)
if not already_extracted:
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=extract_dir)
# This message is printed on every run, whether or not extraction happened this time.
print(f"‚úÖ Extracted ZIP to: {extract_dir}")

# === STEP 3: LOAD DIVIDEND FILES AND COMPUTE FEATURES ===
# Walk the extracted archive and collect (ticker, announcement_date, div_amt)
# rows from every CSV that has both required columns. The ticker is the name
# of the folder that directly contains the CSV.
# NOTE(review): the dividend column label below looks like a mis-decoded rupee
# sign; confirm it matches the CSV headers exactly, otherwise every file is skipped.
div_rows = []
for root, dirs, files in os.walk(extract_dir):
    ticker_folder = os.path.basename(root)
    for f in files:
        if not f.lower().endswith(".csv"):
            continue
        full_path = os.path.join(root, f)
        df = pd.read_csv(full_path)
        if "Announcement Date" not in df.columns or "Dividend (‚Çπ)" not in df.columns:
            continue
        # Rows missing either the date or the amount are discarded.
        df = df[["Announcement Date", "Dividend (‚Çπ)"]].dropna()
        df["ticker"] = ticker_folder
        # Unparseable dates become NaT rather than raising.
        df["announcement_date"] = pd.to_datetime(df["Announcement Date"], errors="coerce")
        df["div_amt"] = df["Dividend (‚Çπ)"]
        div_rows.append(df[["ticker", "announcement_date", "div_amt"]])

# pd.concat([]) fails with an unhelpful message; say what is actually wrong.
if not div_rows:
    raise ValueError(
        f"No dividend CSV with the expected columns was found under {extract_dir}"
    )

div_df = pd.concat(div_rows, ignore_index=True)
div_df = div_df.sort_values(["ticker", "announcement_date"])
div_df["log_dividend"] = np.log1p(div_df["div_amt"])

# Mean of all *earlier* dividends of the same ticker. The shift must happen
# inside each ticker group: shifting the concatenated groupby result would
# hand the first row of a ticker the last running mean of the previous ticker.
div_df["mean_div_so_far"] = div_df.groupby("ticker")["div_amt"].transform(
    lambda amounts: amounts.expanding().mean().shift(1)
)

# Ratio of the current dividend to the prior mean; 1 when there is no usable
# history (first dividend of a ticker, or a prior mean of exactly zero).
prior_mean = div_df["mean_div_so_far"]
no_history = prior_mean.isna() | (prior_mean == 0)
div_df["dividend_ratio"] = (div_df["div_amt"] / prior_mean).where(~no_history, 1.0)
print(f"‚úÖ Dividend records loaded: {len(div_df)}")

# === STEP 4: LOAD YFINANCE DATA ===
# Load the combined price history and derive a per-ticker daily percentage return.
yf = pd.read_csv(yfinance_path)
yf["date"] = pd.to_datetime(yf["Date"], errors="coerce")
# Ticker key: upper-cased, stripped "Symbol" when present; otherwise the
# upper-cased "Company Name" with all whitespace removed.
# NOTE(review): div_df tickers are raw folder names and are not normalised the
# same way; confirm both sides use identical spellings or the merge finds nothing.
yf["ticker"] = (
    yf["Symbol"].astype(str).str.upper().str.strip()
    if "Symbol" in yf.columns
    else yf["Company Name"].astype(str).str.upper().str.replace(r"\s+", "", regex=True)
)
yf = yf.sort_values(["ticker", "date"]).reset_index(drop=True)
yf["prev_close"] = yf.groupby("ticker")["Close"].shift(1)
yf["return_pct"] = (yf["Close"] - yf["prev_close"]) / yf["prev_close"] * 100

# Merge dividend features
# Dividend features are attached only to the price row whose date equals the
# announcement date of the same ticker.
# NOTE(review): several dividend rows for one (ticker, announcement_date) would
# duplicate the matching price row here; verify the dividend data has none.
yf = yf.merge(
    div_df[["ticker", "announcement_date", "log_dividend", "dividend_ratio"]],
    how="left",
    left_on=["ticker", "date"],
    right_on=["ticker", "announcement_date"]
)
# Assign the filled column back instead of `yf[col].fillna(..., inplace=True)`:
# that chained in-place form works on a temporary and leaves `yf` unchanged
# under pandas Copy-on-Write.
yf["log_dividend"] = yf["log_dividend"].fillna(0)
yf["dividend_ratio"] = yf["dividend_ratio"].fillna(1)

# === STEP 5: FEATURE ENGINEERING ===
# All return-based features are computed per ticker.
returns_by_ticker = yf.groupby("ticker")["return_pct"]

# Single-day lags of the daily return; missing history is filled with 0.
yf["lag1"] = returns_by_ticker.shift(1).fillna(0)
yf["lag2"] = returns_by_ticker.shift(2).fillna(0)
yf["lag3"] = returns_by_ticker.shift(3).fillna(0)

# Rolling statistics. groupby(...).shift(1) returns a plain Series on yf's
# index, so chaining .rolling() onto it would slide the window across ticker
# boundaries. Running shift + rolling inside transform keeps every window
# within one ticker.
yf["lag5"] = returns_by_ticker.transform(lambda r: r.shift(1).rolling(5).mean()).fillna(0)
yf["lag10"] = returns_by_ticker.transform(lambda r: r.shift(1).rolling(10).mean()).fillna(0)
# vol20 includes the current day's return (no shift), as in the original definition.
yf["vol20"] = returns_by_ticker.transform(lambda r: r.rolling(20).std()).fillna(0)
yf["momentum10"] = returns_by_ticker.transform(lambda r: r.shift(1).rolling(10).sum()).fillna(0)

# Target: 1 when the next day's return is >= 0, 0 when it is negative, and
# missing when there is no next-day return (last row of a ticker).
yf["return_next"] = returns_by_ticker.shift(-1)
yf["target"] = yf["return_next"].apply(lambda x: 1 if pd.notna(x) and x >= 0 else (0 if pd.notna(x) else pd.NA))
# A row counts as a dividend day when a positive log_dividend was merged onto it.
yf["isdividendday"] = (yf["log_dividend"] > 0).astype(int)
yf["nonannouncement"] = (yf["isdividendday"] == 0).astype(int)

# === CHOOSE BEST FEATURE COMBO (based on earlier tests) ===
features = ["lag1", "lag2", "lag3", "vol20", "log_dividend", "dividend_ratio"]

# === STEP 6: HELPER FUNCTION ===
def get_first_post_dividend(df, div_df_filtered):
    """Return, for each announcement, the first row of ``df`` dated after it.

    Args:
        df: DataFrame with at least ``ticker`` and ``date`` columns.
        div_df_filtered: DataFrame with at least ``ticker`` and
            ``announcement_date`` columns.

    Returns:
        A DataFrame with the columns of ``df`` and a fresh 0..n-1 index. It
        holds one row per announcement that has a later row for the same
        ticker, grouped by ticker (in groupby order) and, within a ticker, in
        the order the announcements appear in ``div_df_filtered``. Two
        announcements that resolve to the same row yield that row twice.
        Announcements with a missing date, or with no later row, contribute
        nothing. An empty frame with ``df``'s columns is returned when nothing
        matches.
    """
    # Group the announcement dates once instead of rescanning the whole
    # dividend frame for every ticker. Missing dates can never match.
    dates_by_ticker = {
        tkr: dates.dropna()
        for tkr, dates in div_df_filtered.groupby("ticker")["announcement_date"]
    }

    rows = []
    for tkr, sub in df.groupby("ticker"):
        div_dates = dates_by_ticker.get(tkr)
        if div_dates is None or div_dates.empty:
            continue
        # Rows without a date cannot be "after" anything. A stable sort keeps
        # same-date rows in their incoming order, so the choice is deterministic.
        sub = (
            sub.dropna(subset=["date"])
            .sort_values("date", kind="mergesort")
            .reset_index(drop=True)
        )
        if sub.empty:
            continue
        # side="right" gives the position of the first date strictly greater
        # than each announcement; len(sub) means there is no later row.
        positions = sub["date"].searchsorted(div_dates.to_numpy(), side="right")
        positions = positions[positions < len(sub)]
        if len(positions):
            rows.append(sub.iloc[positions])
    return pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=df.columns)

# === STEP 7: TRAIN/TEST SPLIT ===
# Train on announcements before 2025-08-01; test on announcements from
# 2025-08-01 through 2025-09-30. Rows without a target are dropped.
# NOTE(review): the selected rows are dated strictly after the announcement,
# while log_dividend / dividend_ratio were merged onto the announcement date
# itself, so on these rows they presumably hold the fill values (0 and 1).
# Confirm that this is intended.
announced = div_df["announcement_date"]
train_events = div_df[announced < "2025-08-01"]
test_events = div_df[(announced >= "2025-08-01") & (announced <= "2025-09-30")]

train_df = get_first_post_dividend(yf, train_events).dropna(subset=["target"])
test_df = get_first_post_dividend(yf, test_events).dropna(subset=["target"])

X_train = train_df[features]
y_train = train_df["target"].astype(int)
X_test = test_df[features]
y_test = test_df["target"].astype(int)

# === STEP 8: SCALING & CLASS BALANCING ===
# The scaler is fitted on the training features only and reused for the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Compute class weights to handle imbalance
classes = np.unique(y_train)
weights = compute_class_weight("balanced", classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

# === STEP 9: TRAIN LOGISTIC REGRESSION ===
model = LogisticRegression(max_iter=1000, random_state=42, class_weight=class_weights)
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

train_f1 = f1_score(y_train, y_train_pred)
print("\n=== Training Performance ===")
print(f"Training F1: {train_f1:.4f}")
print(classification_report(y_train, y_train_pred, digits=4))

test_f1 = f1_score(y_test, y_test_pred)
print("\n=== Test Performance ===")
print(f"Test F1: {test_f1:.4f}")
print(classification_report(y_test, y_test_pred, digits=4))

# === STEP 10: STRATEGY RETURNS ===
# Position is +1 when the predicted probability of class 1 is at least 0.5,
# otherwise -1. The strategy return is that sign times the next-day return.
positive_proba = model.predict_proba(X_test_scaled)[:, 1]
next_return = test_df["return_next"]
test_df["ml_predicted_sign"] = np.where(positive_proba >= 0.5, 1, -1)
test_df["ml_strategy_return"] = test_df["ml_predicted_sign"] * next_return
# The oracle always takes the realised sign, so its return is never negative.
test_df["oracle_strategy_return"] = np.sign(next_return) * next_return

ml_avg_return = test_df["ml_strategy_return"].mean()
oracle_avg_return = test_df["oracle_strategy_return"].mean()
actual_avg_return = test_df["return_next"].mean()

print("\n=== Strategy Return Summary ===")
print(f"ML strategy avg return: {ml_avg_return:.6f}")
print(f"Oracle (perfect) avg return: {oracle_avg_return:.6f}")
print(f"Actual avg return: {actual_avg_return:.6f}")

# === STEP 11: SAVE RESULTS ===
# Write the test rows, including the strategy columns added above, without the index.
test_df.to_csv(out_file, index=False)
print(f"\n‚úÖ Saved output: {out_file}")
print(f"üìÅ Rows in output: {len(test_df)}")
