# Imports

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import os

# Data Settings

In [16]:
tfidf_path = Path(f"../words_TFIDF/")
model_save_path = Path(f"./models/")

quarters = [
    "Q1-2023",
    "Q2-2023",
    "Q3-2023",
    "Q4-2023",
    "Q1-2024",
    "Q2-2024",
    "Q3-2024",
    "Q4-2024"
]

In [3]:
import pandas as pd
import os

# Load TF-IDF for Q1 2025
tfidf_df_q12025 = pd.read_csv(f"{tfidf_path}/tfidf_q1_2025.csv")

# Load TF-IDF for Q1 2023 - Q4 2024
tfidf_df_otherq = pd.read_csv(f"{tfidf_path}/tfidf_non_q1_2025.csv")

# Load Data


In [None]:
import pandas as pd
import glob
import os

# Recursively find all EPS CSVs under data/
eps_files = glob.glob("../../data/**/EPS-*.csv", recursive=True)

# Build company-to-DataFrame map (case-insensitive)
eps_data = {}

for filepath in eps_files:
    # Use parent folder name as company name
    company_name = os.path.basename(os.path.dirname(filepath))
    try:
        df = pd.read_csv(filepath)
        df['Quarter'] = df['Quarter'].str.strip().str.upper()
        df['Company'] = company_name
        eps_data[company_name.lower()] = df  # Store using lowercase key
    except Exception as e:
        print(f"Error loading {filepath}: {e}")

# === Load TF-IDF Data ===
tfidf_train = pd.read_csv(f"{tfidf_path}/tfidf_non_q1_2025.csv")  # Q1 2023 to Q4 2024
    

# Preprocess Train Data
Merge EPS data with the current TFIDF data and label.
1. Label = 1  if  EPS in current_quarter > EPS in past_quarter
2. Label = 0  otherwise (EPS decreased or stayed the same)

In [None]:
# === Function to get EPS label (EPS increase between two quarters) ===
def get_eps_label(company_name, past_quarter, current_quarter):
    key = company_name.lower()
    if key not in eps_data:
        return None
    df = eps_data[key]
    try:
        eps_before = df.loc[df['Quarter'] == past_quarter.upper(), 'EPS'].values[0]
        eps_current = df.loc[df['Quarter'] == current_quarter.upper(), 'EPS'].values[0]
        return int(eps_current > eps_before)
    except IndexError:
        return None
    
# List of TF-IDF quarters (dash format)
quarters_dash = [
    "Q1-2023",
    "Q2-2023",
    "Q3-2023",
    "Q4-2023",
    "Q1-2024",
    "Q2-2024",
    "Q3-2024",
    "Q4-2024"
]

# Convert to EPS-style format "Q12023, Q22023, Q32023, Q42023"
quarters_eps = [q.replace("-", "") for q in quarters_dash]

# Collect all labeled training rows
all_train_dfs = []

for idx in range(len(quarters_dash)):
    current_q_dash = quarters_dash[idx]
    current_q_eps = quarters_eps[idx]
    
    if idx == 0:
        past_q_eps = "Q42022"
    else:
        past_q_eps = quarters_eps[idx - 1]

    # Filter TF-IDF for current quarter
    df_train_q = tfidf_train[tfidf_train["quarter"] == current_q_dash].copy()
    
    # Label with EPS increase between past and current quarter
    df_train_q["Label"] = df_train_q["company"].apply(lambda c: get_eps_label(c, past_q_eps, current_q_eps))
    df_train_q = df_train_q.dropna(subset=["Label"])

    all_train_dfs.append(df_train_q)

# Final training data
train_df_combined = pd.concat(all_train_dfs, ignore_index=True)

# Classification
We will try to answer: "Will EPS increase in Q1 2025 compared to Q4 2024?"

- Features: All TFIDF words
- Target: Binary (0 = no increase, 1 = increase)

# Train Set Creation

In [6]:
# 1. Drop company and quarter
df = train_df_combined.drop(columns=["company", "quarter"])

# 2. Separate features and target
X_tfidf = df.drop(columns=["Sector", "Label"])  # TF-IDF (1000 cols)
y_train = df["Label"].astype(int)

# 3. One-hot encode sector
encoder = OneHotEncoder(sparse_output = False, handle_unknown='ignore')
sector_encoded = encoder.fit_transform(df[["Sector"]])  # must be 2D

# 4. Concatenate TF-IDF + Sector features
X_train = np.hstack([X_tfidf.values, sector_encoded])

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from joblib import dump

# Define parameter grids
param_grids = {
    "Logistic Regression": {
        "model": LogisticRegression(class_weight="balanced", max_iter=1000),
        "params": {
            "C": [0.01, 0.1, 1, 10]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(class_weight="balanced", random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [3, 6],
            "scale_pos_weight": [1, 2]  # for imbalance
        }
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 7]
        }
    },
    "Linear SVC": {
        "model": LinearSVC(class_weight="balanced", max_iter=2000),
        "params": {
            "C": [0.01, 0.1, 1]
        }
    }
}

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run GridSearch for each classifier
best_models = []
for name, spec in param_grids.items():
    print(f"\n=== GridSearchCV for {name} ===")
    grid = GridSearchCV(spec["model"], spec["params"], cv=skf, scoring="f1", n_jobs=-1)
    grid.fit(X_train, y_train)
    
    print(f"Best params: {grid.best_params_}")
    print(f"Best CV F1: {grid.best_score_:.4f}")
    
    best_model = grid.best_estimator_
    best_models.append((name, best_model))

    # Save best model
    dump(best_model, f"{model_save_path}/best_model_{name.replace(' ', '_')}.joblib")
    print(f"Model saved: best_model_{name.replace(' ', '_')}.joblib")


=== GridSearchCV for Logistic Regression ===
Best params: {'C': 0.1}
Best CV F1: 0.5813
Model saved: best_model_Logistic_Regression.joblib

=== GridSearchCV for Random Forest ===
Best params: {'max_depth': 10, 'n_estimators': 200}
Best CV F1: 0.6785
Model saved: best_model_Random_Forest.joblib

=== GridSearchCV for XGBoost ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best params: {'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 2}
Best CV F1: 0.6427
Model saved: best_model_XGBoost.joblib

=== GridSearchCV for KNN ===
Best params: {'n_neighbors': 7}
Best CV F1: 0.6426
Model saved: best_model_KNN.joblib

=== GridSearchCV for Linear SVC ===
Best params: {'C': 0.1}
Best CV F1: 0.5821
Model saved: best_model_Linear_SVC.joblib


# Preprocess Test Data

In [12]:
eps_data

{'welltower':          Date   EPS Quarter    Company
 0  2025-04-28  0.40  Q12025  Welltower
 1  2025-02-11  0.19  Q42024  Welltower
 2  2024-10-28  0.73  Q32024  Welltower
 3  2024-07-29  0.42  Q22024  Welltower
 4  2024-04-29  0.22  Q12024  Welltower
 5  2024-02-13  0.15  Q42023  Welltower
 6  2023-10-30  0.24  Q32023  Welltower
 7  2023-07-31  0.20  Q22023  Welltower
 8  2023-05-02  0.05  Q12023  Welltower
 9  2023-02-15 -0.01  Q42022  Welltower,
 'cbre':          Date   EPS Quarter Company
 0  2025-04-30  0.41  Q12025    CBRE
 1  2025-01-31  1.33  Q42024    CBRE
 2  2024-10-31  1.51  Q32024    CBRE
 3  2024-07-31  1.52  Q22024    CBRE
 4  2024-04-30  0.78  Q12024    CBRE
 5  2024-01-31  1.63  Q42023    CBRE
 6  2023-10-31  1.33  Q32023    CBRE
 7  2023-07-31  1.58  Q22023    CBRE
 8  2023-04-30  0.92  Q12023    CBRE
 9  2023-01-31  1.16  Q42022    CBRE,
 'qualcomm':          Date   EPS Quarter   Company
 0  2025-03-31  2.52  Q12025  Qualcomm
 1  2024-12-31  2.83  Q42024  Qualcomm
 

In [13]:
# === Function to get EPS label (EPS increase between two quarters) ===
def get_eps_label(company_name, past_quarter, current_quarter):
    key = company_name.lower()
    if key not in eps_data:
        return None
    df = eps_data[key]
    try:
        eps_before = df.loc[df['Quarter'] == past_quarter.upper(), 'EPS'].values[0]
        eps_current = df.loc[df['Quarter'] == current_quarter.upper(), 'EPS'].values[0]
        return int(eps_current > eps_before)
    except IndexError:
        return None
    
tfidf_test = pd.read_csv(f"{tfidf_path}/tfidf_q1_2025.csv")       # Q1 2025
df_test_q = tfidf_test.copy()

df_test_q["Label"] = df_test_q["company"].apply(lambda c: get_eps_label(c, "Q42024", "Q12025"))
df_test_q = df_test_q.dropna(subset=["Label"])

In [11]:
tfidf_test

Unnamed: 0,company,quarter,Sector,ability,able,accelerate,accelerated,accelerating,acceleration,access,...,window,won,wondering,working,workload,world,written,written consent,yield,york
0,3M,Q1-2025,ConsumerGoods,0.006557,0.0,0.004447,0.004849,0.010937,0.004891,0.0,...,0.0,0.004912,0.0,0.022142,0.0,0.003197,0.131743,0.202829,0.0,0.0
1,Amazon,Q1-2025,,0.029227,0.050031,0.015855,0.0,0.019498,0.008719,0.0,...,0.0,0.008756,0.0,0.022557,0.096764,0.034201,0.0,0.0,0.0,0.012474
2,AMD,Q1-2025,Semiconductors,0.0,0.0,0.021317,0.023246,0.019662,0.0,0.0,...,0.0,0.0,0.0,0.030327,0.07318,0.015328,0.0,0.0,0.0,0.0
3,AmericanAirlines,Q1-2025,Airlines,0.010984,0.006964,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.021193,0.0,0.014281,0.125046,0.192519,0.0103,0.085945
4,AmTower,Q1-2025,RealEstate,0.005321,0.020242,0.0,0.00787,0.035501,0.015875,0.007455,...,0.0,0.0,0.020013,0.015401,0.022022,0.010378,0.0,0.0,0.007485,0.0
5,ASML,Q1-2025,Semiconductors,0.016382,0.020772,0.0,0.0,0.0,0.0,0.00765,...,0.0,0.0,0.027382,0.031608,0.0,0.005325,0.0,0.0,0.015363,0.0
6,Blackstone,Q1-2025,Finance,0.02956,0.023426,0.0,0.007286,0.032868,0.0,0.055215,...,0.011394,0.0,0.024704,0.014258,0.0,0.048042,0.0,0.0,0.034651,0.0
7,BoA,Q1-2025,Finance,0.011524,0.014612,0.0,0.0,0.0,0.011459,0.005381,...,0.008884,0.011509,0.004816,0.014823,0.0,0.037459,0.030869,0.0,0.0,0.0
8,CBRE,Q1-2025,RealEstate,0.034808,0.00662,0.0,0.030887,0.0,0.0,0.019505,...,0.0,0.010428,0.008727,0.0,0.0,0.027154,0.0,0.0,0.0,0.0
9,Citigroup,Q1-2025,Finance,0.019128,0.018191,0.006485,0.007072,0.007976,0.0,0.0,...,0.0,0.0,0.005995,0.023067,0.0,0.032643,0.0,0.0,0.0,0.0


# Test Set Creation

In [14]:
# Drop metadata columns
df_test = df_test_q.drop(columns=["company", "quarter"])

# Separate TF-IDF and sector columns
X_test_tfidf = df_test.drop(columns=["Sector", "Label"])
y_test = df_test["Label"].astype(int)

# One-hot encode sector using the same encoder from training
sector_test_encoded = encoder.transform(df_test[["Sector"]])  # encoder must be from training

# Combine TF-IDF and sector features
X_test = np.hstack([X_test_tfidf.values, sector_test_encoded])

# Load Trained Models

In [18]:
from joblib import load
import os

model_names = ["Logistic_Regression", "Random_Forest", "XGBoost", "KNN", "Linear_SVC"]
best_models_loaded = []

for name in model_names:
    model_path = f"{model_save_path}/best_model_{name}.joblib"
    if os.path.exists(model_path):
        model = load(model_path)
        best_models_loaded.append((name.replace("_", " "), model))
    else:
        print(f"[Warning] Model file not found: {model_path}")

# Evaluate

In [21]:
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, f1_score, roc_auc_score
)

results = []

for name, model in best_models_loaded:
    print(f"\n=== {name} Evaluation on Q1 2025 Test Set ===")
    y_pred = model.predict(X_test)

    # Get predicted probabilities or scores for AUC-ROC
    try:
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = model.decision_function(X_test)
    except Exception as e:
        print(f"Could not compute AUC-ROC for {name}: {e}")
        y_prob = None

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else float("nan")

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1 Score": f1,
        "AUC-ROC": auc
    })


=== Logistic Regression Evaluation on Q1 2025 Test Set ===
Accuracy: 0.5000
F1 Score: 0.4848
AUC-ROC: 0.5035
Classification Report:
              precision    recall  f1-score   support

           0     0.4737    0.5625    0.5143        16
           1     0.5333    0.4444    0.4848        18

    accuracy                         0.5000        34
   macro avg     0.5035    0.5035    0.4996        34
weighted avg     0.5053    0.5000    0.4987        34

Confusion Matrix:
[[ 9  7]
 [10  8]]

=== Random Forest Evaluation on Q1 2025 Test Set ===
Accuracy: 0.5000
F1 Score: 0.6531
AUC-ROC: 0.5312
Classification Report:
              precision    recall  f1-score   support

           0     0.3333    0.0625    0.1053        16
           1     0.5161    0.8889    0.6531        18

    accuracy                         0.5000        34
   macro avg     0.4247    0.4757    0.3792        34
weighted avg     0.4301    0.5000    0.3953        34

Confusion Matrix:
[[ 1 15]
 [ 2 16]]

=== XGBoost

In [22]:
import pandas as pd
results_df = pd.DataFrame(results).sort_values("F1 Score", ascending=False)
results_df

Unnamed: 0,Model,Accuracy,F1 Score,AUC-ROC
1,Random Forest,0.5,0.653061,0.53125
2,XGBoost,0.558824,0.634146,0.482639
3,KNN,0.529412,0.6,0.560764
4,Linear SVC,0.5,0.540541,0.541667
0,Logistic Regression,0.5,0.484848,0.503472
