# Imports

In [39]:
# === Standard Library ===
import os
from pathlib import Path

# === Third-Party Libraries ===
import numpy as np
import pandas as pd

# === Scikit-learn: Preprocessing ===
from sklearn.preprocessing import OneHotEncoder

# === Scikit-learn: Model Selection ===
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# === Scikit-learn: Models ===
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# === XGBoost ===
from xgboost import XGBClassifier

# === Scikit-learn: Evaluation ===
from sklearn.metrics import classification_report

# === Persistence ===
from joblib import dump

# === Imbalanced-learn ===
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Data Settings

In [40]:
tfidf_path = Path(f"../words_TFIDF/")
model_save_path = Path(f"./models/")

quarters = [
    "Q1-2023",
    "Q2-2023",
    "Q3-2023",
    "Q4-2023",
    "Q1-2024",
    "Q2-2024",
    "Q3-2024",
    "Q4-2024"
]

In [41]:
import pandas as pd
import os

# Load TF-IDF for Q1 2025
tfidf_df_q12025 = pd.read_csv(f"{tfidf_path}/tfidf_q1_2025.csv")

# Load TF-IDF for Q1 2023 - Q4 2024
tfidf_df_otherq = pd.read_csv(f"{tfidf_path}/tfidf_non_q1_2025.csv")

# Load Data


In [42]:
import pandas as pd
import glob
import os

# Recursively find all EPS CSVs under data/
eps_files = glob.glob("../../data/**/EPS-*.csv", recursive=True)

# Build company-to-DataFrame map (case-insensitive)
eps_data = {}

for filepath in eps_files:
    # Use parent folder name as company name
    company_name = os.path.basename(os.path.dirname(filepath))
    try:
        df = pd.read_csv(filepath)
        df['Quarter'] = df['Quarter'].str.strip().str.upper()
        df['Company'] = company_name
        eps_data[company_name.lower()] = df  # Store using lowercase key
    except Exception as e:
        print(f"Error loading {filepath}: {e}")

# === Load TF-IDF Data ===
tfidf_train = pd.read_csv(f"{tfidf_path}/tfidf_non_q1_2025.csv")  # Q1 2023 to Q4 2024
    

# Preprocess Train Data
Merge EPS data with the current TFIDF data and label.
1. Label = 1  if  EPS in current_quarter > EPS in past_quarter
2. Label = 0  otherwise (EPS decreased or stayed the same)

In [43]:
# === Function to get EPS label (EPS increase between two quarters) ===
def get_eps_label(company_name, past_quarter, current_quarter):
    key = company_name.lower()
    if key not in eps_data:
        return None
    df = eps_data[key]
    try:
        eps_before = df.loc[df['Quarter'] == past_quarter.upper(), 'EPS'].values[0]
        eps_current = df.loc[df['Quarter'] == current_quarter.upper(), 'EPS'].values[0]
        return int(eps_current > eps_before)
    except IndexError:
        return None
    
# List of TF-IDF quarters (dash format)
quarters_dash = [
    "Q1-2023",
    "Q2-2023",
    "Q3-2023",
    "Q4-2023",
    "Q1-2024",
    "Q2-2024",
    "Q3-2024",
    "Q4-2024"
]

# Convert to EPS-style format "Q12023, Q22023, Q32023, Q42023"
quarters_eps = [q.replace("-", "") for q in quarters_dash]

# Collect all labeled training rows
all_train_dfs = []

for idx in range(len(quarters_dash)):
    current_q_dash = quarters_dash[idx]
    current_q_eps = quarters_eps[idx]
    
    if idx == 0:
        past_q_eps = "Q42022"
    else:
        past_q_eps = quarters_eps[idx - 1]

    # Filter TF-IDF for current quarter
    df_train_q = tfidf_train[tfidf_train["quarter"] == current_q_dash].copy()
    
    # Label with EPS increase between past and current quarter
    df_train_q["Label"] = df_train_q["company"].apply(lambda c: get_eps_label(c, past_q_eps, current_q_eps))
    df_train_q = df_train_q.dropna(subset=["Label"])

    all_train_dfs.append(df_train_q)

# Final training data
train_df_combined = pd.concat(all_train_dfs, ignore_index=True)

# Classification
We will try to answer: "Will EPS increase in Q1 2025 compared to Q4 2024?"

- Features: All TFIDF words
- Target: Binary (0 = no increase, 1 = increase)

# Train Set Creation

In [44]:
from sklearn.decomposition import TruncatedSVD

# 1. Drop company and quarter
df = train_df_combined.drop(columns=["company", "quarter"])

# 2. Separate features and target
X_tfidf = df.drop(columns=["Sector", "Label"])  # TF-IDF (1000 cols)
y_train = df["Label"].astype(int)

# 3. Reduce dimensionality of TF-IDF
svd = TruncatedSVD(n_components=50, random_state=42)
X_tfidf_reduced = svd.fit_transform(X_tfidf)

# 4. One-hot encode sector
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
sector_encoded = encoder.fit_transform(df[["Sector"]])  # must be 2D

# 5. Concatenate reduced TF-IDF + Sector features
X_train = np.hstack([X_tfidf_reduced, sector_encoded])

In [45]:
# Define parameter grids
from sklearn.svm import SVC


param_grids = {
    "Logistic Regression": {
        "model": LogisticRegression(class_weight="balanced", max_iter=1000),
        "params": {
            "model__C": [0.001, 0.01, 0.1, 1, 10, 100]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(class_weight="balanced", random_state=42),
        "params": {
            "model__n_estimators": [100, 200, 300],
            "model__max_depth": [None, 10, 20, 30],
            "model__min_samples_split": [2, 5],
            "model__max_features": ["sqrt", "log2"]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(eval_metric="logloss", random_state=42, use_label_encoder=False),
        "params": {
            "model__n_estimators": [100, 200],
            "model__max_depth": [3, 6, 9],
            "model__learning_rate": [0.01, 0.1],
            "model__scale_pos_weight": [1, 2]
        }
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {
            "model__n_neighbors": [3, 5, 7],
            "model__weights": ["uniform", "distance"],
            "model__metric": ["euclidean", "manhattan"]
        }
    },
    "SVM (RBF)": {
        "model": SVC(class_weight="balanced", probability=True),
        "params": {
            "model__C": [0.1, 1, 10],
            "model__gamma": ["scale", 0.01, 0.001],
            "model__kernel": ["rbf"]
        }
    }
}

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run GridSearch for each classifier
best_models = []
for name, spec in param_grids.items():
    print(f"\n=== GridSearchCV for {name} ===")

    # Create pipeline with SMOTE and model
    pipeline = ImbPipeline([
        ("smote", SMOTE(random_state=42)),
        ("model", spec["model"])
    ])

    grid = GridSearchCV(pipeline, spec["params"], cv=skf, scoring="f1", n_jobs=-1)
    grid.fit(X_train, y_train)
    
    print(f"Best params: {grid.best_params_}")
    print(f"Best CV F1: {grid.best_score_:.4f}")
    
    best_model = grid.best_estimator_
    best_models.append((name, best_model))

    dump(best_model, f"{model_save_path}/best_model_{name.replace(' ', '_')}.joblib")
    print(f"Model saved: best_model_{name.replace(' ', '_')}.joblib")


=== GridSearchCV for Logistic Regression ===


Best params: {'model__C': 100}
Best CV F1: 0.5618
Model saved: best_model_Logistic_Regression.joblib

=== GridSearchCV for Random Forest ===
Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best CV F1: 0.5684
Model saved: best_model_Random_Forest.joblib

=== GridSearchCV for XGBoost ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best params: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__scale_pos_weight': 2}
Best CV F1: 0.7193
Model saved: best_model_XGBoost.joblib

=== GridSearchCV for KNN ===
Best params: {'model__metric': 'manhattan', 'model__n_neighbors': 5, 'model__weights': 'distance'}
Best CV F1: 0.5588
Model saved: best_model_KNN.joblib

=== GridSearchCV for SVM (RBF) ===
Best params: {'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
Best CV F1: 0.5634
Model saved: best_model_SVM_(RBF).joblib


# Preprocess Test Data

In [46]:
eps_data

{'welltower':          Date   EPS Quarter    Company
 0  2025-04-28  0.40  Q12025  Welltower
 1  2025-02-11  0.19  Q42024  Welltower
 2  2024-10-28  0.73  Q32024  Welltower
 3  2024-07-29  0.42  Q22024  Welltower
 4  2024-04-29  0.22  Q12024  Welltower
 5  2024-02-13  0.15  Q42023  Welltower
 6  2023-10-30  0.24  Q32023  Welltower
 7  2023-07-31  0.20  Q22023  Welltower
 8  2023-05-02  0.05  Q12023  Welltower
 9  2023-02-15 -0.01  Q42022  Welltower,
 'cbre':          Date   EPS Quarter Company
 0  2025-04-30  0.41  Q12025    CBRE
 1  2025-01-31  1.33  Q42024    CBRE
 2  2024-10-31  1.51  Q32024    CBRE
 3  2024-07-31  1.52  Q22024    CBRE
 4  2024-04-30  0.78  Q12024    CBRE
 5  2024-01-31  1.63  Q42023    CBRE
 6  2023-10-31  1.33  Q32023    CBRE
 7  2023-07-31  1.58  Q22023    CBRE
 8  2023-04-30  0.92  Q12023    CBRE
 9  2023-01-31  1.16  Q42022    CBRE,
 'qualcomm':          Date   EPS Quarter   Company
 0  2025-03-31  2.52  Q12025  Qualcomm
 1  2024-12-31  2.83  Q42024  Qualcomm
 

In [47]:
# === Function to get EPS label (EPS increase between two quarters) ===
def get_eps_label(company_name, past_quarter, current_quarter):
    key = company_name.lower()
    if key not in eps_data:
        return None
    df = eps_data[key]
    try:
        eps_before = df.loc[df['Quarter'] == past_quarter.upper(), 'EPS'].values[0]
        eps_current = df.loc[df['Quarter'] == current_quarter.upper(), 'EPS'].values[0]
        return int(eps_current > eps_before)
    except IndexError:
        return None
    
tfidf_test = pd.read_csv(f"{tfidf_path}/tfidf_q1_2025.csv")       # Q1 2025
df_test_q = tfidf_test.copy()

df_test_q["Label"] = df_test_q["company"].apply(lambda c: get_eps_label(c, "Q42024", "Q12025"))
df_test_q = df_test_q.dropna(subset=["Label"])

In [48]:
tfidf_test

Unnamed: 0,company,quarter,Sector,ability,able,accelerate,accelerated,accelerating,acceleration,access,...,window,won,wondering,working,workload,world,written,written consent,yield,york
0,Welltower,Q1-2025,RealEstate,0.005441,0.020699,0.00738,0.0,0.0,0.008116,0.007623,...,0.0,0.0,0.006822,0.010499,0.0,0.010613,0.0,0.0,0.007654,0.0
1,CBRE,Q1-2025,RealEstate,0.034911,0.00664,0.0,0.030979,0.0,0.0,0.019563,...,0.0,0.010459,0.008753,0.0,0.0,0.027235,0.0,0.0,0.0,0.0
2,Qualcomm,Q1-2025,Semiconductors,0.0,0.022867,0.0,0.0,0.0,0.0,0.022457,...,0.074147,0.0,0.0,0.0,0.016585,0.015632,0.0,0.0,0.0,0.0
3,Microsoft,Q1-2025,Tech,0.015446,0.034276,0.013966,0.007615,0.0,0.0384,0.014426,...,0.119077,0.007713,0.012909,0.014902,0.063925,0.015063,0.0,0.0,0.0,0.0
4,Visa,Q1-2025,Finance,0.0,0.015217,0.016275,0.005916,0.013344,0.011933,0.0,...,0.0,0.011985,0.025074,0.011577,0.0,0.062413,0.0,0.0,0.005627,0.0
5,Intel,Q1-2025,Semiconductors,0.006925,0.026344,0.009392,0.0,0.0,0.0,0.0,...,0.048048,0.020748,0.034727,0.060128,0.100311,0.013506,0.0,0.0,0.038966,0.0
6,WellsFargo,Q1-2025,Finance,0.0199,0.004731,0.0,0.0,0.0,0.0,0.0,...,0.0,0.014905,0.031184,0.0,0.0,0.009703,0.0,0.0,0.013997,0.0
7,TSMC,Q1-2025,Semiconductors,0.0,0.006526,0.0,0.006089,0.002289,0.0,0.0,...,0.0,0.0,0.005161,0.007944,0.0,0.0,0.052386,0.080653,0.003861,0.0
8,3M,Q1-2025,ConsumerGoods,0.006561,0.0,0.004449,0.004852,0.010943,0.004893,0.0,...,0.0,0.004914,0.0,0.022155,0.0,0.003199,0.131821,0.202949,0.0,0.0
9,United,Q1-2025,Airlines,0.030165,0.0,0.0,0.0,0.01677,0.0,0.014086,...,0.0,0.0,0.0,0.0,0.0,0.029416,0.0,0.0,0.0,0.08583


# Test Set Creation

In [49]:
# 1. Drop metadata columns
df_test = df_test_q.drop(columns=["company", "quarter"])

# 2. Separate TF-IDF and sector columns
X_test_tfidf = df_test.drop(columns=["Sector", "Label"])
y_test = df_test["Label"].astype(int)

# 3. Apply the same SVD transformation used in training
X_test_tfidf_reduced = svd.transform(X_test_tfidf)  # Use same svd from training

# 4. One-hot encode sector using the same encoder
sector_test_encoded = encoder.transform(df_test[["Sector"]])  # Use same encoder from training

# 5. Concatenate reduced TF-IDF and sector features
X_test = np.hstack([X_test_tfidf_reduced, sector_test_encoded])

# Load Trained Models

In [50]:
from joblib import load
import os

model_names = ["Logistic_Regression", "Random_Forest", "XGBoost", "KNN", "SVM_(RBF)"]
best_models_loaded = []

for name in model_names:
    model_path = f"{model_save_path}/best_model_{name}.joblib"
    if os.path.exists(model_path):
        model = load(model_path)
        best_models_loaded.append((name.replace("_", " "), model))
    else:
        print(f"[Warning] Model file not found: {model_path}")

# Evaluate

In [51]:
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, f1_score, roc_auc_score
)

results = []

for name, model in best_models_loaded:
    print(f"\n=== {name} Evaluation on Q1 2025 Test Set ===")
    y_pred = model.predict(X_test)

    # Get predicted probabilities or scores for AUC-ROC
    try:
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = model.decision_function(X_test)
    except Exception as e:
        print(f"Could not compute AUC-ROC for {name}: {e}")
        y_prob = None

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else float("nan")

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1 Score": f1,
        "AUC-ROC": auc
    })


=== Logistic Regression Evaluation on Q1 2025 Test Set ===
Accuracy: 0.4118
F1 Score: 0.5000
AUC-ROC: 0.5590
Classification Report:
              precision    recall  f1-score   support

           0     0.3333    0.2500    0.2857        16
           1     0.4545    0.5556    0.5000        18

    accuracy                         0.4118        34
   macro avg     0.3939    0.4028    0.3929        34
weighted avg     0.3975    0.4118    0.3992        34

Confusion Matrix:
[[ 4 12]
 [ 8 10]]

=== Random Forest Evaluation on Q1 2025 Test Set ===
Accuracy: 0.5294
F1 Score: 0.5789
AUC-ROC: 0.5174
Classification Report:
              precision    recall  f1-score   support

           0     0.5000    0.4375    0.4667        16
           1     0.5500    0.6111    0.5789        18

    accuracy                         0.5294        34
   macro avg     0.5250    0.5243    0.5228        34
weighted avg     0.5265    0.5294    0.5261        34

Confusion Matrix:
[[ 7  9]
 [ 7 11]]

=== XGBoost

In [52]:
import pandas as pd
results_df = pd.DataFrame(results).sort_values("F1 Score", ascending=False)
results_df

Unnamed: 0,Model,Accuracy,F1 Score,AUC-ROC
2,XGBoost,0.470588,0.64,0.454861
4,SVM (RBF),0.588235,0.611111,0.59375
3,KNN,0.588235,0.611111,0.583333
1,Random Forest,0.529412,0.578947,0.517361
0,Logistic Regression,0.411765,0.5,0.559028
