# Imports

In [18]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import os

# Data Settings

In [None]:
# Folder holding the exported TF-IDF matrices, relative to this notebook.
tfidf_path = Path("../words_TFIDF/")

# Quarters covered by the training data, oldest first, in the dash format
# "Q<n>-<year>": Q1-2023 through Q4-2024.
quarters = [
    f"Q{number}-{year}"
    for year in (2023, 2024)
    for number in (1, 2, 3, 4)
]

In [4]:
import pandas as pd
import os

# TF-IDF matrix for Q1 2025
tfidf_df_q12025 = pd.read_csv(Path(tfidf_path, "tfidf_q1_2025.csv"))

# TF-IDF matrix for every other quarter (Q1 2023 - Q4 2024)
tfidf_df_otherq = pd.read_csv(Path(tfidf_path, "tfidf_non_q1_2025.csv"))

# Load Data


In [None]:
import pandas as pd
import glob
import os

# Recursively find all EPS CSVs under data/.
# glob returns matches in arbitrary order, so sort them: when one company
# folder holds several EPS-*.csv files the last one loaded wins, and sorting
# makes that choice the same on every run.
eps_files = sorted(glob.glob("../../data/**/EPS-*.csv", recursive=True))

# Map of lowercase company name -> EPS DataFrame (lowercase keys make the
# lookup case-insensitive).
eps_data = {}

for filepath in eps_files:
    # Use parent folder name as company name
    company_name = os.path.basename(os.path.dirname(filepath))
    company_key = company_name.lower()
    try:
        df = pd.read_csv(filepath)
        # Normalise quarter labels so they match upper-cased lookups.
        df['Quarter'] = df['Quarter'].str.strip().str.upper()
        df['Company'] = company_name
    except Exception as e:
        # Best effort: report the unreadable or malformed file and keep going.
        print(f"Error loading {filepath}: {e}")
        continue

    if company_key in eps_data:
        # The later file still replaces the earlier one, but no longer silently.
        print(f"Warning: {filepath} replaces EPS data already loaded for {company_name}")
    eps_data[company_key] = df

if not eps_data:
    # Without EPS data no row can be labeled, so make the cause visible here.
    print("Warning: no EPS files were loaded from ../../data/")

# === Load TF-IDF Data ===
tfidf_train = pd.read_csv(f"{tfidf_path}/tfidf_non_q1_2025.csv")  # Q1 2023 to Q4 2024
tfidf_test = pd.read_csv(f"{tfidf_path}/tfidf_q1_2025.csv")       # Q1 2025
    

# Preprocess Data
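Attach an EPS-based binary label to every TF-IDF row:
1. Label = 1 if EPS in the current quarter > EPS in the previous quarter
2. Label = 0 otherwise (EPS decreased or stayed the same)

Rows for which no label can be determined (company or quarter missing from the EPS data) are dropped.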

In [None]:
# === Function to get EPS label (EPS increase between two quarters) ===
def get_eps_label(company_name, past_quarter, current_quarter, eps_lookup=None):
    """Label whether a company's EPS rose between two quarters.

    Parameters
    ----------
    company_name : str
        Company to look up; matched case-insensitively (keys are lowercase).
    past_quarter, current_quarter : str
        Quarter labels; upper-cased before being compared with the
        ``Quarter`` column of the company's EPS frame.
    eps_lookup : dict, optional
        Mapping of lowercase company name -> DataFrame with ``Quarter`` and
        ``EPS`` columns. Defaults to the notebook-level ``eps_data``.

    Returns
    -------
    int or None
        1 if EPS in ``current_quarter`` is strictly greater than EPS in
        ``past_quarter``, 0 otherwise. None when the company is unknown,
        either quarter has no row, or either EPS value is missing (NaN).
    """
    frames = eps_data if eps_lookup is None else eps_lookup
    df = frames.get(company_name.lower())
    if df is None:
        return None

    try:
        eps_before = df.loc[df['Quarter'] == past_quarter.upper(), 'EPS'].values[0]
        eps_current = df.loc[df['Quarter'] == current_quarter.upper(), 'EPS'].values[0]
    except IndexError:
        # No row for one of the quarters.
        return None

    # Any comparison with NaN is False, which would silently label a missing
    # EPS as "no increase"; report it as unlabeled instead.
    if pd.isna(eps_before) or pd.isna(eps_current):
        return None

    return int(eps_current > eps_before)
    
# Training quarters as they appear in the TF-IDF "quarter" column, oldest first
quarters_dash = [
    "Q1-2023", "Q2-2023", "Q3-2023", "Q4-2023",
    "Q1-2024", "Q2-2024", "Q3-2024", "Q4-2024",
]

# The same quarters without the dash, e.g. "Q12023", as used for EPS lookups
quarters_eps = [label.replace("-", "") for label in quarters_dash]

# Quarter preceding each entry of quarters_eps; the first training quarter
# is compared against Q4 2022.
previous_quarters_eps = ["Q42022"] + quarters_eps[:-1]

# One labeled DataFrame per training quarter
all_train_dfs = []

for dash_label, eps_label, prior_eps_label in zip(
    quarters_dash, quarters_eps, previous_quarters_eps
):
    # TF-IDF rows that belong to this quarter
    quarter_rows = tfidf_train.loc[tfidf_train["quarter"] == dash_label].copy()

    # Per company: did EPS rise from the preceding quarter to this one?
    quarter_rows["Label"] = quarter_rows["company"].apply(
        lambda company, prev=prior_eps_label, cur=eps_label: get_eps_label(company, prev, cur)
    )

    # Keep only the rows that received a label
    all_train_dfs.append(quarter_rows.dropna(subset=["Label"]))

# Final training data: all quarters stacked with a fresh index
train_df_combined = pd.concat(all_train_dfs, ignore_index=True)

# # === Label test data (Q4 2024 ➜ Q1 2025) ===
# test_df = tfidf_test.copy()
# test_df["Label"] = test_df["company"].apply(lambda c: get_eps_label(c, "Q42024", "Q12025"))
# test_df = test_df.dropna(subset=["Label"])

# X_test = test_df.iloc[:, 3:]
# y_test = test_df["Label"].astype(int)

# X_train.head()

# Classification
We will try to answer: "Will EPS increase in Q1 2025 compared to Q4 2024?"

- Features: all TF-IDF word weights, plus the one-hot encoded sector
- Target: Binary (0 = no increase, 1 = increase)

# Train Set Creation

In [23]:
# 1. Drop the identifier columns; they are not model inputs
df = train_df_combined.drop(columns=["company", "quarter"])

# The TF-IDF CSV header spells the sector metadata column "Sector". Prefer
# that exact name so a TF-IDF term that happens to be called "sector" is never
# mistaken for it; fall back to lowercase for exports that use "sector".
sector_col = "Sector" if "Sector" in df.columns else "sector"

# 2. Separate features and target
X_tfidf = df.drop(columns=[sector_col, "Label"])  # TF-IDF term columns only
y_train = df["Label"].astype(int)

# 3. One-hot encode sector. handle_unknown='ignore' encodes a sector that was
#    not seen during fit as all zeros at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
sector_encoded = encoder.fit_transform(df[[sector_col]])  # must be 2D

# 4. Concatenate TF-IDF + Sector features column-wise
X_train = np.hstack([X_tfidf.values, sector_encoded])

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from joblib import dump

# Candidate classifiers, each paired with the hyper-parameter values to search
param_grids = {
    "Logistic Regression": {
        "model": LogisticRegression(class_weight="balanced", max_iter=1000),
        "params": {"C": [0.01, 0.1, 1, 10]},
    },
    "Random Forest": {
        "model": RandomForestClassifier(class_weight="balanced", random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20],
        },
    },
    "XGBoost": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [3, 6],
            "scale_pos_weight": [1, 2],  # for imbalance
        },
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {"n_neighbors": [3, 5, 7]},
    },
    "Linear SVC": {
        "model": LinearSVC(class_weight="balanced", max_iter=2000),
        "params": {"C": [0.01, 0.1, 1]},
    },
}

# Shared 5-fold stratified splitter (shuffled, fixed seed) so every
# classifier is scored on the same folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid-search each classifier on F1, keep and persist its best estimator
best_models = []
for name, spec in param_grids.items():
    print(f"\n=== GridSearchCV for {name} ===")
    grid = GridSearchCV(
        estimator=spec["model"],
        param_grid=spec["params"],
        cv=skf,
        scoring="f1",
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    print(f"Best params: {grid.best_params_}")
    print(f"Best CV F1: {grid.best_score_:.4f}")

    best_model = grid.best_estimator_
    best_models.append((name, best_model))

    # Save best model; spaces in the display name become underscores
    model_file = f"best_model_{name.replace(' ', '_')}.joblib"
    dump(best_model, model_file)
    print(f"Model saved: {model_file}")

# Test Set Creation

In [None]:
# For test set
df_test = test_df.drop(columns=["company", "quarter"])
X_test_tfidf = df_test.drop(columns=["sector", "Label"])
y_test = df_test["Label"].astype(int)

sector_test_encoded = encoder.transform(df_test[["sector"]])
X_test = np.hstack([X_test_tfidf.values, sector_test_encoded])


AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [8]:
# Show every column name of the training TF-IDF matrix as a plain list
print(list(tfidf_train.columns))

['company', 'quarter', 'Sector', 'ability', 'able', 'accelerate', 'accelerated', 'accelerating', 'acceleration', 'access', 'account', 'achieve', 'achieved', 'acquisition', 'action', 'active', 'activity', 'actual result', 'added', 'adding', 'addition', 'additionally', 'address', 'adhesive', 'adjusted', 'adjustment', 'adoption', 'advanced', 'advantage', 'advertiser', 'advertising', 'advisor', 'advisory', 'aerospace', 'affiliated', 'affiliated company', 'affo', 'ag', 'ag kgaa', 'ago', 'agreement', 'ai', 'air', 'air line', 'aircraft', 'akash', 'alastair', 'allocation', 'allow', 'allows', 'alternative', 'amazon', 'america', 'analysis', 'answer', 'answer session', 'anticipate', 'anticipated', 'app', 'application', 'approach', 'approximately', 'apps', 'architecture', 'area', 'asia', 'asked', 'asking', 'asset', 'associate', 'associated', 'assume', 'assumption', 'attractive', 'audience', 'auto', 'automation', 'automotive', 'availability', 'average', 'away', 'aws', 'backdrop', 'backlog', 'balanc