In [5]:
import pandas as pd
import numpy as np


In [6]:
df = pd.read_csv("final_ml_dataset.csv")
df.head()


Unnamed: 0,quarter,eps,revenue,pat_margin,ebitda,ebitda_margin,pat,company,ret_1m,ret_3m,vol_30d,vol_60d,abnormal_volume,qtr_Q2,qtr_Q3,qtr_Q4,covid_dummy
0,2017Q2,2.8,3598.12,4.86,326.05,9.06,174.77,AVENUE,0.093874,0.297413,0.013936,0.025774,0.711724,True,False,False,0
1,2017Q3,3.06,3506.92,5.45,339.14,9.67,191.04,AVENUE,0.037171,0.323089,0.026681,0.022271,2.377819,False,True,False,0
2,2017Q4,4.03,4093.89,6.15,435.4,10.64,251.77,AVENUE,0.049576,0.131616,0.012112,0.019751,0.816669,False,False,True,0
3,2018Q1,2.68,3809.96,4.39,309.38,8.12,167.1,AVENUE,0.026022,0.15185,0.021122,0.019811,0.786405,False,False,False,0
4,2018Q2,4.02,4559.42,5.5,436.93,9.58,250.61,AVENUE,-0.055179,0.094379,0.024817,0.020871,0.281471,True,False,False,0


In [7]:
# check dataframes now
[var for var in globals().keys() if not var.startswith("_")]


['In',
 'Out',
 'get_ipython',
 'exit',
 'quit',
 'open',
 'json',
 'getpass',
 'hashlib',
 'import_pandas_safely',
 'is_data_frame',
 'dataframe_columns',
 'dtypes_str',
 'dataframe_hash',
 'get_dataframes',
 'pd',
 'np',
 'df']

In [8]:
print("Shape:", df.shape)
df.head()


Shape: (712, 17)


Unnamed: 0,quarter,eps,revenue,pat_margin,ebitda,ebitda_margin,pat,company,ret_1m,ret_3m,vol_30d,vol_60d,abnormal_volume,qtr_Q2,qtr_Q3,qtr_Q4,covid_dummy
0,2017Q2,2.8,3598.12,4.86,326.05,9.06,174.77,AVENUE,0.093874,0.297413,0.013936,0.025774,0.711724,True,False,False,0
1,2017Q3,3.06,3506.92,5.45,339.14,9.67,191.04,AVENUE,0.037171,0.323089,0.026681,0.022271,2.377819,False,True,False,0
2,2017Q4,4.03,4093.89,6.15,435.4,10.64,251.77,AVENUE,0.049576,0.131616,0.012112,0.019751,0.816669,False,False,True,0
3,2018Q1,2.68,3809.96,4.39,309.38,8.12,167.1,AVENUE,0.026022,0.15185,0.021122,0.019811,0.786405,False,False,False,0
4,2018Q2,4.02,4559.42,5.5,436.93,9.58,250.61,AVENUE,-0.055179,0.094379,0.024817,0.020871,0.281471,True,False,False,0


In [4]:
# sort properly (VERY IMPORTANT)
df = df.sort_values(["company", "quarter"]).reset_index(drop=True)

# create eps lag of 4 quarters (YoY)
df["eps_lag4"] = df.groupby("company")["eps"].shift(4)

# target variable: earnings surprise
df["surprise"] = (df["eps"] > df["eps_lag4"]).astype(int)

df[["company", "quarter", "eps", "eps_lag4", "surprise"]].head(10)


NameError: name 'df' is not defined

In [4]:
# create lagged EPS features
df["eps_lag1"] = df.groupby("company")["eps"].shift(1)
df["eps_lag2"] = df.groupby("company")["eps"].shift(2)

df[[
    "company", "quarter",
    "eps", "eps_lag1", "eps_lag2", "eps_lag4", "surprise"
]].head(12)


Unnamed: 0,company,quarter,eps,eps_lag1,eps_lag2,eps_lag4,surprise
0,AVENUE,2017Q2,2.8,,,,0
1,AVENUE,2017Q3,3.06,2.8,,,0
2,AVENUE,2017Q4,4.03,3.06,2.8,,0
3,AVENUE,2018Q1,2.68,4.03,3.06,,0
4,AVENUE,2018Q2,4.02,2.68,4.03,2.8,1
5,AVENUE,2018Q3,3.62,4.02,2.68,3.06,1
6,AVENUE,2018Q4,4.12,3.62,4.02,4.03,1
7,AVENUE,2019Q1,3.25,4.12,3.62,2.68,1
8,AVENUE,2019Q2,5.37,3.25,4.12,4.02,1
9,AVENUE,2019Q3,5.34,5.37,3.25,3.62,1


In [5]:
model_df = df.dropna(subset=[
    "eps_lag1", "eps_lag2", "eps_lag4",
    "ret_1m", "ret_3m", "vol_30d", "vol_60d", "abnormal_volume"
]).reset_index(drop=True)

model_df.shape


(632, 21)

In [6]:
# target
y = model_df["surprise"]

# features to EXCLUDE
exclude_cols = [
    "surprise",
    "company",
    "quarter",
    "eps"   # current EPS must NOT be a feature
]

X = model_df.drop(columns=exclude_cols)

X.shape, y.shape


((632, 17), (632,))

In [7]:
# get unique quarters in chronological order
quarters = sorted(model_df["quarter"].unique())

# helper to extract year
def q_year(q):
    return int(q[:4])

# define test years (we ensure real OOS testing)
test_years = sorted(set(q_year(q) for q in quarters))[3:]  
# we skip early years to ensure enough training data

splits = []

for year in test_years:
    test_quarters = [q for q in quarters if q.startswith(str(year))]
    train_quarters = [q for q in quarters if q < test_quarters[0]]

    if len(test_quarters) == 4 and len(train_quarters) > 0:
        splits.append((train_quarters, test_quarters))

len(splits)


6

In [8]:
# inspect first split
train_q, test_q = splits[0]

print("TRAIN QUARTERS:")
print(train_q[:5], "...", train_q[-5:])

print("\nTEST QUARTERS:")
print(test_q)


TRAIN QUARTERS:
['2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4'] ... ['2017Q4', '2018Q1', '2018Q2', '2018Q3', '2018Q4']

TEST QUARTERS:
['2019Q1', '2019Q2', '2019Q3', '2019Q4']


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# get first split
train_quarters, test_quarters = splits[0]

# split data
train_df = model_df[model_df["quarter"].isin(train_quarters)]
test_df  = model_df[model_df["quarter"].isin(test_quarters)]

X_train = train_df[X.columns]
y_train = train_df["surprise"]

X_test = test_df[X.columns]
y_test = test_df["surprise"]

# train logistic regression
logit = LogisticRegression(
    max_iter=1000,
    solver="lbfgs"
)

logit.fit(X_train, y_train)

# predictions
y_pred = logit.predict(X_test)
y_prob = logit.predict_proba(X_test)[:, 1]

# evaluation
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

acc, auc


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.8289473684210527, 0.6257575757575758)

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# get first split
train_quarters, test_quarters = splits[0]

train_df = model_df[model_df["quarter"].isin(train_quarters)]
test_df  = model_df[model_df["quarter"].isin(test_quarters)]

X_train = train_df[X.columns]
y_train = train_df["surprise"]

X_test = test_df[X.columns]
y_test = test_df["surprise"]

# SCALE (fit only on training data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# train logistic regression (scaled)
logit = LogisticRegression(
    max_iter=2000,
    solver="lbfgs"
)

logit.fit(X_train_scaled, y_train)

# predictions
y_pred = logit.predict(X_test_scaled)
y_prob = logit.predict_proba(X_test_scaled)[:, 1]

# evaluation
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

acc, auc


(0.8421052631578947, 0.5818181818181818)

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

results = []

for i, (train_quarters, test_quarters) in enumerate(splits):
    train_df = model_df[model_df["quarter"].isin(train_quarters)]
    test_df  = model_df[model_df["quarter"].isin(test_quarters)]

    X_train = train_df[X.columns]
    y_train = train_df["surprise"]

    X_test = test_df[X.columns]
    y_test = test_df["surprise"]

    # scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # model
    logit = LogisticRegression(max_iter=2000, solver="lbfgs")
    logit.fit(X_train_scaled, y_train)

    # predict
    y_pred = logit.predict(X_test_scaled)
    y_prob = logit.predict_proba(X_test_scaled)[:, 1]

    # metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    results.append({
        "fold": i + 1,
        "test_year": test_quarters[0][:4],
        "accuracy": acc,
        "auc": auc
    })

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,fold,test_year,accuracy,auc
0,1,2019,0.842105,0.581818
1,2,2020,0.623377,0.701351
2,3,2021,0.625,0.5795
3,4,2022,0.65,0.757735
4,5,2023,0.725,0.807827
5,6,2024,0.7375,0.717091


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

rf_results = []

for i, (train_quarters, test_quarters) in enumerate(splits):
    train_df = model_df[model_df["quarter"].isin(train_quarters)]
    test_df  = model_df[model_df["quarter"].isin(test_quarters)]

    X_train = train_df[X.columns]
    y_train = train_df["surprise"]

    X_test = test_df[X.columns]
    y_test = test_df["surprise"]

    rf = RandomForestClassifier(
        n_estimators=300,
        min_samples_leaf=20,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced"
    )

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    rf_results.append({
        "fold": i + 1,
        "test_year": test_quarters[0][:4],
        "accuracy": acc,
        "auc": auc
    })

rf_results_df = pd.DataFrame(rf_results)
rf_results_df


Unnamed: 0,fold,test_year,accuracy,auc
0,1,2019,0.710526,0.684848
1,2,2020,0.636364,0.695946
2,3,2021,0.625,0.51251
3,4,2022,0.7125,0.654378
4,5,2023,0.7375,0.760307
5,6,2024,0.6875,0.68


In [14]:
!pip install xgboost



Collecting xgboost
  Downloading xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 7.2 MB/s eta 0:00:10
   -- ------------------------------------- 4.5/72.0 MB 16.1 MB/s eta 0:00:05
   ---- ----------------------------------- 8.9/72.0 MB 18.6 MB/s eta 0:00:04
   ------- -------------------------------- 12.8/72.0 MB 18.6 MB/s eta 0:00:04
   --------- ------------------------------ 17.6/72.0 MB 19.0 MB/s eta 0:00:03
   ------------ --------------------------- 22.0/72.0 MB 19.2 MB/s eta 0:00:03
   -------------- ------------------------- 26.2/72.0 MB 19.3 MB/s eta 0:00:03
   ----------------- ---------------------- 30.7/72.0 MB 19.4 MB/s eta 0:00:03
   ------------------- -------------------- 35.7/72.0 MB 19.9 MB/s eta 0:00:02
   ---------------------- ----------------- 40.1/72.0 MB 19.6 MB/s eta 0:00:

In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

xgb_results = []

for i, (train_quarters, test_quarters) in enumerate(splits):
    train_df = model_df[model_df["quarter"].isin(train_quarters)]
    test_df  = model_df[model_df["quarter"].isin(test_quarters)]

    X_train = train_df[X.columns]
    y_train = train_df["surprise"]

    X_test = test_df[X.columns]
    y_test = test_df["surprise"]

    xgb = XGBClassifier(
        n_estimators=300,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="auc",
        random_state=42,
        n_jobs=-1
    )

    xgb.fit(X_train, y_train)

    y_pred = xgb.predict(X_test)
    y_prob = xgb.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    xgb_results.append({
        "fold": i + 1,
        "test_year": test_quarters[0][:4],
        "accuracy": acc,
        "auc": auc
    })

xgb_results_df = pd.DataFrame(xgb_results)
xgb_results_df


Unnamed: 0,fold,test_year,accuracy,auc
0,1,2019,0.815789,0.474242
1,2,2020,0.506494,0.724324
2,3,2021,0.7375,0.60452
3,4,2022,0.7,0.747202
4,5,2023,0.775,0.827393
5,6,2024,0.75,0.727273


In [16]:
# scaled earnings surprise
model_df["scaled_surprise"] = (
    (model_df["eps"] - model_df["eps_lag4"]) / model_df["eps_lag4"].abs()
)

model_df[["company", "quarter", "eps", "eps_lag4", "scaled_surprise"]].head()


Unnamed: 0,company,quarter,eps,eps_lag4,scaled_surprise
0,AVENUE,2018Q2,4.02,2.8,0.435714
1,AVENUE,2018Q3,3.62,3.06,0.183007
2,AVENUE,2018Q4,4.12,4.03,0.022333
3,AVENUE,2019Q1,3.25,2.68,0.212687
4,AVENUE,2019Q2,5.37,4.02,0.335821


In [17]:
# compute global quantile cutoffs
upper_q = model_df["scaled_surprise"].quantile(0.70)
lower_q = model_df["scaled_surprise"].quantile(0.30)

upper_q, lower_q


(np.float64(0.24200913242009145), np.float64(0.006430868167202578))

In [18]:
# initialize as NaN
model_df["surprise_q"] = np.nan

# assign labels
model_df.loc[model_df["scaled_surprise"] >= upper_q, "surprise_q"] = 1
model_df.loc[model_df["scaled_surprise"] <= lower_q, "surprise_q"] = 0

# drop noisy middle
model_df_q = model_df.dropna(subset=["surprise_q"]).reset_index(drop=True)

model_df_q["surprise_q"].value_counts(), model_df_q.shape


(surprise_q
 1.0    190
 0.0    190
 Name: count, dtype: int64,
 (380, 23))

In [19]:
# GST dummy: 1 from 2017Q3 onwards
model_df_q["gst_dummy"] = (
    model_df_q["quarter"] >= "2017Q3"
).astype(int)

model_df_q[["quarter", "gst_dummy"]].drop_duplicates().sort_values("quarter").head(10)


Unnamed: 0,quarter,gst_dummy
14,2016Q4,0
15,2017Q1,0
40,2017Q2,0
64,2017Q3,1
16,2017Q4,1
17,2018Q1,1
0,2018Q2,1
19,2018Q3,1
20,2018Q4,1
21,2019Q1,1


In [20]:
model_df_q.columns


Index(['quarter', 'eps', 'revenue', 'pat_margin', 'ebitda', 'ebitda_margin',
       'pat', 'company', 'ret_1m', 'ret_3m', 'vol_30d', 'vol_60d',
       'abnormal_volume', 'qtr_Q2', 'qtr_Q3', 'qtr_Q4', 'covid_dummy',
       'eps_lag4', 'surprise', 'eps_lag1', 'eps_lag2', 'scaled_surprise',
       'surprise_q', 'gst_dummy'],
      dtype='object')

In [21]:
# YoY growth features (within company)
model_df_q = model_df_q.sort_values(["company", "quarter"]).reset_index(drop=True)

model_df_q["revenue_yoy"] = (
    model_df_q["revenue"] / model_df_q.groupby("company")["revenue"].shift(4) - 1
)

model_df_q["ebitda_yoy"] = (
    model_df_q["ebitda"] / model_df_q.groupby("company")["ebitda"].shift(4) - 1
)

model_df_q["pat_yoy"] = (
    model_df_q["pat"] / model_df_q.groupby("company")["pat"].shift(4) - 1
)

model_df_q[
    ["company", "quarter", "revenue_yoy", "ebitda_yoy", "pat_yoy"]
].head(10)


Unnamed: 0,company,quarter,revenue_yoy,ebitda_yoy,pat_yoy
0,AVENUE,2018Q2,,,
1,AVENUE,2019Q2,,,
2,AVENUE,2019Q3,,,
3,AVENUE,2019Q4,,,
4,AVENUE,2020Q1,0.358403,0.037374,0.144687
5,AVENUE,2020Q2,-0.336872,-0.736461,-0.852196
6,AVENUE,2020Q3,-0.122854,-0.273834,-0.368571
7,AVENUE,2021Q1,0.081634,0.115155,0.103206
8,AVENUE,2021Q2,-0.18758,-0.432048,-0.598668
9,AVENUE,2021Q3,0.995612,3.389139,8.057708


In [22]:
# helper: subtract quarter-wise median
def relative_to_median(df, col):
    return df[col] - df.groupby("quarter")[col].transform("median")

# create peer-relative features
model_df_q["revenue_yoy_rel"] = relative_to_median(model_df_q, "revenue_yoy")
model_df_q["ebitda_yoy_rel"]  = relative_to_median(model_df_q, "ebitda_yoy")
model_df_q["pat_yoy_rel"]     = relative_to_median(model_df_q, "pat_yoy")

model_df_q["ebitda_margin_rel"] = relative_to_median(model_df_q, "ebitda_margin")
model_df_q["pat_margin_rel"]    = relative_to_median(model_df_q, "pat_margin")

model_df_q[
    ["quarter", "revenue_yoy_rel", "ebitda_yoy_rel", "pat_yoy_rel"]
].head()


Unnamed: 0,quarter,revenue_yoy_rel,ebitda_yoy_rel,pat_yoy_rel
0,2018Q2,,,
1,2019Q2,,,
2,2019Q3,,,
3,2019Q4,,,
4,2020Q1,0.437696,0.0,0.211781


In [23]:
# define target
y = model_df_q["surprise_q"]

# columns to exclude from features
exclude_cols = [
    "surprise",        # old target
    "surprise_q",      # new target
    "scaled_surprise", # continuous target proxy
    "company",
    "quarter",
    "eps"              # never use current EPS
]

X = model_df_q.drop(columns=exclude_cols)

X.shape, y.value_counts()


((380, 26),
 surprise_q
 1.0    190
 0.0    190
 Name: count, dtype: int64)

In [24]:
# rebuild splits using the new dataset
quarters_q = sorted(model_df_q["quarter"].unique())

def q_year(q):
    return int(q[:4])

test_years_q = sorted(set(q_year(q) for q in quarters_q))[2:]  # ensure enough training data

splits_q = []

for year in test_years_q:
    test_quarters = [q for q in quarters_q if q.startswith(str(year))]
    train_quarters = [q for q in quarters_q if q < test_quarters[0]]

    if len(test_quarters) == 4 and len(train_quarters) > 0:
        splits_q.append((train_quarters, test_quarters))

len(splits_q)


7

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

final_results = []

for i, (train_quarters, test_quarters) in enumerate(splits_q):
    train_df = model_df_q[model_df_q["quarter"].isin(train_quarters)]
    test_df  = model_df_q[model_df_q["quarter"].isin(test_quarters)]

    X_train = train_df[X.columns]
    y_train = train_df["surprise_q"]

    X_test = test_df[X.columns]
    y_test = test_df["surprise_q"]

    # scale (fit only on training)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # logistic regression
    logit = LogisticRegression(max_iter=2000, solver="lbfgs")
    logit.fit(X_train_scaled, y_train)

    # predictions
    y_prob = logit.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_prob >= 0.5).astype(int)

    # metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    final_results.append({
        "fold": i + 1,
        "test_year": test_quarters[0][:4],
        "accuracy": acc,
        "auc": auc
    })

final_results_df = pd.DataFrame(final_results)
final_results_df


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [26]:
# drop rows with any missing feature values
model_df_q_clean = model_df_q.dropna().reset_index(drop=True)

model_df_q_clean.shape


(300, 32)

In [27]:
model_df_q_clean["surprise_q"].value_counts()


surprise_q
0.0    157
1.0    143
Name: count, dtype: int64

In [28]:
# target
y = model_df_q_clean["surprise_q"]

# columns to exclude from features
exclude_cols = [
    "surprise",
    "surprise_q",
    "scaled_surprise",
    "company",
    "quarter",
    "eps"
]

X = model_df_q_clean.drop(columns=exclude_cols)

X.shape, y.value_counts()


((300, 26),
 surprise_q
 0.0    157
 1.0    143
 Name: count, dtype: int64)

In [29]:
# rebuild rolling splits on cleaned dataset
quarters_clean = sorted(model_df_q_clean["quarter"].unique())

def q_year(q):
    return int(q[:4])

test_years_clean = sorted(set(q_year(q) for q in quarters_clean))[2:]

splits_clean = []

for year in test_years_clean:
    test_quarters = [q for q in quarters_clean if q.startswith(str(year))]
    train_quarters = [q for q in quarters_clean if q < test_quarters[0]]

    if len(test_quarters) == 4 and len(train_quarters) > 0:
        splits_clean.append((train_quarters, test_quarters))

len(splits_clean)


5

In [30]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

final_results = []

for i, (train_quarters, test_quarters) in enumerate(splits_clean):
    train_df = model_df_q_clean[model_df_q_clean["quarter"].isin(train_quarters)]
    test_df  = model_df_q_clean[model_df_q_clean["quarter"].isin(test_quarters)]

    X_train = train_df[X.columns]
    y_train = train_df["surprise_q"]

    X_test = test_df[X.columns]
    y_test = test_df["surprise_q"]

    # scale (fit only on training)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # logistic regression
    logit = LogisticRegression(max_iter=3000, solver="lbfgs")
    logit.fit(X_train_scaled, y_train)

    # predictions
    y_prob = logit.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_prob >= 0.5).astype(int)

    # metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    final_results.append({
        "fold": i + 1,
        "test_year": test_quarters[0][:4],
        "accuracy": acc,
        "auc": auc
    })

final_results_df = pd.DataFrame(final_results)
final_results_df


Unnamed: 0,fold,test_year,accuracy,auc
0,1,2020,0.530612,0.651709
1,2,2021,0.490196,0.5
2,3,2022,0.685185,0.737728
3,4,2023,0.686275,0.737578
4,5,2024,0.770833,0.841739


In [1]:
import pandas as pd
import numpy as np

print("Shape:", df.shape)
print("\nTarget summary:")
print(df['earnings_surprise'].describe())

print("\nMissing values (top 15):")
df.isna().sum().sort_values(ascending=False).head(15)


NameError: name 'df' is not defined

In [2]:
# List all variables in memory
[var for var in globals().keys() if not var.startswith("_")]


['In',
 'Out',
 'get_ipython',
 'exit',
 'quit',
 'open',
 'json',
 'getpass',
 'hashlib',
 'import_pandas_safely',
 'is_data_frame',
 'dataframe_columns',
 'dtypes_str',
 'dataframe_hash',
 'get_dataframes',
 'pd',
 'np']