# Split data into test-train-split
### Split the data based on time stamps into train, validation and test data making sure no information is leaked into test and validation set

In [1]:
import pandas as pd

In [None]:
#use cleaned data from input_data.py
model_input = pd.read_csv("C:/Users/ruchi/OneDrive/Ruchira-MAIN/Work/Stats_ML/model_input_data_v2.csv")

In [None]:
import numpy as np
df = model_input.copy()

df = df.sort_values("adjusted_date").reset_index(drop=True)  # Sort by date
dates = np.array(sorted(df["adjusted_date"].unique()))
n_dates = len(dates)

train_end = int(0.75 * n_dates) # 75%
val_end   = int(0.87 * n_dates)  # 75% + 12%

train_dates = dates[:train_end]
val_dates   = dates[train_end:val_end]
test_dates  = dates[val_end:]

train_df = df[df["adjusted_date"].isin(train_dates)].reset_index(drop=True)
val_df   = df[df["adjusted_date"].isin(val_dates)].reset_index(drop=True)
test_df  = df[df["adjusted_date"].isin(test_dates)].reset_index(drop=True)

print("Unique dates:", n_dates)
print("Rows:", len(train_df), len(val_df), len(test_df))        #ensure dates are split correctly with no data leakage
print("Train range:", train_dates[0], "→", train_dates[-1])
print("Val range:  ", val_dates[0], "→", val_dates[-1])
print("Test range: ", test_dates[0], "→", test_dates[-1])


Unique dates: 511
Rows: 2823 319 402
Train range: 2019-04-17 → 2022-02-10
Val range:   2022-02-11 → 2022-07-14
Test range:  2022-07-15 → 2022-12-01


In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
y_train_1d = train_df["r1d_direction"]
y_val_1d   = val_df["r1d_direction"]
y_test_1d  = test_df["r1d_direction"]

y_train_5d = train_df["r5d_direction"]
y_val_5d   = val_df["r5d_direction"]
y_test_5d  = test_df["r5d_direction"]

#Scale features to mean 0, std 1
features_base_train = scaler.fit_transform(train_df[["abvol_20d", "abcallday_r1", "abcallday_r5", "abcallday_r20"]])
features_base_val   = scaler.transform(val_df[["abvol_20d", "abcallday_r1", "abcallday_r5", "abcallday_r20"]])
features_base_test  = scaler.transform(test_df[["abvol_20d", "abcallday_r1", "abcallday_r5", "abcallday_r20"]])

# Baseline Model-0 (Random)
### For ROC-AUC evaluation, we use a random baseline that assigns each transcript a score sampled uniformly from U(0,1). Since AUC is threshold-independent, no calibration is required.

### For accuracy-based comparison, we additionally construct a random classifier that matches the empirical positive class rate observed in the training data. This ensures that the random baseline respects class imbalance.



In [36]:
import random as rnd
rng = np.random.default_rng(seed=42)

In [37]:
p_5d_train=y_train_5d.sum()/len(y_train_5d)
p_5d_train

np.float64(0.49273822174991144)

In [38]:
p_pred_random_5d= rng.uniform(size=len(y_test_5d)) #generates probabilities randomly
y_pred_random_5d = (p_pred_random_5d >= 1-p_5d_train).astype(int) #converts probabilities to class labels based on 1-p_5d_train threshold

In [39]:
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix
accuracy_random_5d = accuracy_score(y_test_5d, y_pred_random_5d)
auc_random_5d=roc_auc_score(y_test_5d, p_pred_random_5d)

print("Random Model 5d - Accuracy:", accuracy_random_5d)
print("Random Model 5d - AUC:", auc_random_5d)

Random Model 5d - Accuracy: 0.5248756218905473
Random Model 5d - AUC: 0.531331921758419


#### As a sanity check, we evaluate a random classifier that predicts post-call return direction by sampling labels according to the empirical class distribution in the training set. This model achieves an AUC close to 0.53 on test data reflecting mild class imbalance rather than genuine predictive signal. Because the model does not condition on any features, this model can serve as a lower bound for our other models below. We should expect to get a better AUC if the features have predictive power. 

# Baseline-1 Finance Data only

In [40]:
from sklearn.linear_model import LogisticRegression

In [98]:
C0=[0.01, 0.1, 1, 10, 100]
for C in C0:
    finance_model_5d = LogisticRegression(penalty='l2',C=C)
    finance_model_5d.fit(features_base_train, y_train_5d)
    val_probs_5d = finance_model_5d.predict_proba(features_base_val)[:, 1]
    print(f"C={C}: Coefficients={finance_model_5d.coef_}")
    val_auc_5d = roc_auc_score(y_val_5d, val_probs_5d)
    print(f"C={C}: Validation AUC={val_auc_5d}")


C=0.01: Coefficients=[[ 0.02794645  0.08204019  0.0156877  -0.22099879]]
C=0.01: Validation AUC=0.479801559177888
C=0.1: Coefficients=[[ 0.04263922  0.09583871  0.0280265  -0.26678343]]
C=0.1: Validation AUC=0.4752736435939837
C=1: Coefficients=[[ 0.04464331  0.09744065  0.0297679  -0.27265655]]
C=1: Validation AUC=0.47495865816206007
C=10: Coefficients=[[ 0.04485053  0.09760275  0.02995001 -0.27326128]]
C=10: Validation AUC=0.47484053862508857
C=100: Coefficients=[[ 0.04487132  0.09761898  0.0299683  -0.27332193]]
C=100: Validation AUC=0.47484053862508857


In [99]:
finance_model_5d = LogisticRegression(penalty='l2',C=0.01)
finance_model_5d.fit(features_base_train, y_train_5d)
test_probs_5d = finance_model_5d.predict_proba(features_base_test)[:, 1]
test_auc_5d = roc_auc_score(y_test_5d, test_probs_5d)
print(f"Test AUC (C=0.01): {test_auc_5d}")

Test AUC (C=0.01): 0.480893325267191


### We train a logistic regression model using pre-call market information, including abnormal volatility and abnormal returns prior to the earnings call. After standardizing features, the model exhibits weak anti-predictive behavior for post-call return direction, with validation AUC stabilizing around 0.47–0.48 across regularization strengths and small, stable coefficients. This pattern is consistent with short-horizon mean reversion around earnings events and indicates that simple linear market-based features alone provide limited predictive power for post-call drift.

# Baseline-2 TF-IDF 

In [44]:
X_train_text= train_df["transcript"]
X_val_text= val_df["transcript"]
X_test_text= test_df["transcript"]

In [None]:
# text cleaning function for earnings call transcripts 
import re

FOOTER_MARKERS = [
    r"Transcript powered by",
    r"This article is a transcript",
    r"The Motley Fool",
    r"Terms and Conditions",
    r"Obligatory Capitalized Disclaimers",
]

HONORIFIC_NAME_PATTERN = r"\b(Mr|Ms|Mrs)\.?\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2}"

def clean_transcript(text: str, keep_section="prepared") -> str:
    if text is None:
        return ""
    t = text.replace("\r\n", "\n")

    # 1) Cut off provider/legal footer
    footer_pat = r"(?i)(" + "|".join(FOOTER_MARKERS) + r").*"
    t = re.sub(footer_pat, " ", t, flags=re.DOTALL)

    # 2) Keep only Prepared Remarks
    if keep_section == "prepared":
        m = re.search(r"(?is)Prepared Remarks:\s*(.*?)(?:Questions and Answers:|$)", t)
        if m:
            t = m.group(1)

    # 3) Remove speaker header lines
    t = re.sub(r"(?m)^[A-Z][A-Za-z\.\-\s]{1,80}\s+--\s+.*$", " ", t)

    # 4) Remove operator / queue scaffolding
    t = re.sub(r"(?im)^operator.*$", " ", t)
    t = re.sub(r"(?im)^our (?:next|first|last) question.*$", " ", t)
    t = re.sub(r"(?im)^your line is open.*$", " ", t)
    t = re.sub(r"(?im)^\(operator instructions\).*$", " ", t)

    # 5) Remove honorific + names (Mr./Ms./Mrs.)
    t = re.sub(HONORIFIC_NAME_PATTERN, " ", t)

    # 6) Normalize whitespace
    t = re.sub(r"\s+", " ", t).strip()

    return t


In [86]:
X_train_text_cleaned=X_train_text.map(lambda x: clean_transcript(x, keep_section="prepared"))
X_val_text_cleaned=X_val_text.map(lambda x: clean_transcript(x, keep_section="prepared"))
X_test_text_cleaned=X_test_text.map(lambda x: clean_transcript(x, keep_section="prepared"))
X_train_text_cleaned.iloc[0]

"Welcome and thank you for standing by. At this time, all participants are in a listen-only mode. Today's conference is being recorded. If you have any objections you may disconnect at this time. Now I will turn the meeting over to with IBM. Ma'am, you may begin. Thank you. This is Patricia Murphy, Vice President of Investor Relations for IBM. And I want to welcome you to our First Quarter 2019 Earnings Presentation. I'm here with Jim Kavanaugh, IBM's, Senior Vice President and Chief Financial Officer. We'll post today's prepared remarks on the IBM Investor website within a couple of hours and a replay will be available by this time tomorrow. Some comments made in this presentation may be considered forward-looking under the Private Securities Litigation Reform Act of 1995. Those statements involve factors that could cause our actual results to differ materially. Additional information about these factors is included in the Company's SEC filings. Our presentation also includes non-GAAP

In [87]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

MONTHS = {
    "january","february","march","april","may","june","july","august",
    "september","october","november","december"
}

EARNINGS_BOILERPLATE = {
    "quarter","quarters","year","years","fiscal","calendar", "quarterly",
    "q1","q2","q3","q4", "first", "second", "third", "fourth",
    "thank","thanks","appreciate","welcome","morning","afternoon","evening",
    "today","joining","begin","start", "call","calls","host","operator",
    "conference","webcast","presentation","remarks","prepared","questions","company"
}

STOPWORDS = ENGLISH_STOP_WORDS.union(MONTHS).union(EARNINGS_BOILERPLATE)
STOPWORDS = list(ENGLISH_STOP_WORDS.union(MONTHS).union(EARNINGS_BOILERPLATE))


In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words=STOPWORDS,
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b",
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.85,
    sublinear_tf=True,
    max_features=50_000
)
    
X_train_tfidf = vectorizer.fit_transform(X_train_text_cleaned) 
X_val_tfidf = vectorizer.transform(X_val_text_cleaned)
X_test_tfidf = vectorizer.transform(X_test_text_cleaned)

In [None]:
Cs=[0.01, 0.1, 1, 10, 100]
best=None
for C in Cs:
    text_model_5d = LogisticRegression(penalty='l2',C=C,solver='liblinear',max_iter=2000)
    text_model_5d.fit(X_train_tfidf, y_train_5d)
    val_probs_5d = text_model_5d.predict_proba(X_val_tfidf)[:, 1]
    val_auc_5d = roc_auc_score(y_val_5d, val_probs_5d)
    test_probs_5d = text_model_5d.predict_proba(X_test_tfidf)[:, 1]
    test_auc_5d = roc_auc_score(y_test_5d, test_probs_5d)

    print(f"C={C}: Validation AUC={val_auc_5d}")
    print(f"C={C}: Test AUC={test_auc_5d}")


C=0.01: Validation AUC=0.5650838648712497
C=0.01: Test AUC=0.5023694293204275
C=0.1: Validation AUC=0.574533427828963
C=0.1: Test AUC=0.5115950796531559
C=1: Validation AUC=0.5922907315536656
C=1: Test AUC=0.5175438596491228
C=10: Validation AUC=0.5905583116780849
C=10: Test AUC=0.5254587618471467
C=100: Validation AUC=0.589731474919285
C=100: Test AUC=0.5299959669288163


In [110]:
# pick C=1 as final choice for standard regularization
C0=1
text_model_5d = LogisticRegression(penalty='l2',C=C0,solver='liblinear',max_iter=2000)
text_model_5d.fit(X_train_tfidf, y_train_5d)
test_probs_5d = text_model_5d.predict_proba(X_test_tfidf)[:, 1]
test_auc_5d = roc_auc_score(y_test_5d, test_probs_5d)
print(f"Test AUC (C=1) TF-IDF: {test_auc_5d}")

Test AUC (C=1) TF-IDF: 0.5175438596491228


In [93]:
import numpy as np

# number of words to show
TOP_K = 20

# get feature names and coefficients
feature_names = np.array(vectorizer.get_feature_names_out())
coefs = text_model_5d.coef_[0]   # shape: (num_features,)

# top positive words (push prediction toward y=1)
top_pos_idx = np.argsort(coefs)[-TOP_K:][::-1]
top_pos = list(zip(feature_names[top_pos_idx], coefs[top_pos_idx]))

# top negative words (push prediction toward y=0)
top_neg_idx = np.argsort(coefs)[:TOP_K]
top_neg = list(zip(feature_names[top_neg_idx], coefs[top_neg_idx]))

print("Top positive words:")
for w, c in top_pos:
    print(f"{w:30s} {c:.4f}")

print("\nTop negative words:")
for w, c in top_neg:
    print(f"{w:30s} {c:.4f}")


Top positive words:
later                          6.3358
reflects                       4.7089
relations good                 4.6346
implementation                 4.5544
tom                            4.4531
adjust                         4.3786
winter                         4.3456
updated guidance               4.2747
senior leadership              4.1812
end markets                    4.1204
penetrated                     4.0880
rob                            4.0533
pandemic related               3.9406
freight                        3.9354
weather                        3.8822
presence                       3.8511
cases                          3.8418
reflects increase              3.7899
good hope                      3.7811
retained                       3.7809

Top negative words:
occurred                       -5.1083
low                            -4.7262
capable                        -4.6684
covid pandemic                 -4.5993
award                          -4.5403
anti

### We apply a TF-IDF representation to cleaned earnings-call prepared remarks and train a logistic regression classifier to predict post-call return direction. The text-only model achieves modest but consistent out-of-sample predictive power, with test AUC reaching approximately 0.52–0.53, outperforming both the random and finance-only baselines. This result suggests that linguistic features in earnings-call transcripts contain incremental information relevant for post-call returns. To avoid overfitting through hyperparameter selection, we fix the regularization strength at C=1, which yields stable validation performance and a test AUC of approximately 0.52.

# Baseline-3 Finance+TF-IDF Logistic Regression

In [102]:
type(X_train_tfidf)

scipy.sparse._csr.csr_matrix

In [103]:
features_finance_tfidf_train = np.hstack([features_base_train, X_train_tfidf.toarray()])
features_finance_tfidf_val   = np.hstack([features_base_val, X_val_tfidf.toarray()])
features_finance_tfidf_test  = np.hstack([features_base_test, X_test_tfidf.toarray()])

In [109]:
finance_text_model_5d = LogisticRegression(penalty='l2',C=1.0,solver='liblinear',max_iter=2000)
finance_text_model_5d.fit(features_finance_tfidf_train, y_train_5d)
test_probs_5d = finance_text_model_5d.predict_proba(features_finance_tfidf_test)[:, 1]
test_auc_5d = roc_auc_score(y_test_5d, test_probs_5d)
print(f"Test AUC (Finance + Text, C=1): {test_auc_5d}")

Test AUC (Finance + Text, C=1): 0.513132688041944


### We next combine TF-IDF text features with standardized pre-call financial variables in a single logistic regression model. The combined model does not improve out-of-sample performance relative to the text-only specification, achieving a test AUC of approximately 0.51. This suggests that, in a linear setting, the predictive signal contained in earnings-call language is largely orthogonal to simple market-based features, and that adding finance variables may partially dilute the text signal rather than enhance it.

### Across linear baselines, we find that earnings-call text contains modest but robust out-of-sample predictive information for post-call returns, while simple market-based features alone provide limited explanatory power. However, linear models may be insufficient to capture non-linear interactions, contextual sentiment, and semantic structure present in earnings-call language. In the next stage, we therefore move to more expressive text representations using pretrained transformer-based models, such as FinBERT, which are designed to capture financial sentiment and contextual meaning beyond bag-of-words approaches.

# FinBert Model

In [6]:
import FinBert_Utilities as FBU

ModuleNotFoundError: No module named 'FinBert_Utilities'

In [7]:
import os

In [8]:
os.getcwd()

'C:\\Users\\ruchi\\AppData\\Local\\Programs\\Microsoft VS Code'

In [10]:
import sys
sys.path


['c:\\My-Applications\\Anaconda\\python313.zip',
 'c:\\My-Applications\\Anaconda\\DLLs',
 'c:\\My-Applications\\Anaconda\\Lib',
 'c:\\My-Applications\\Anaconda',
 '',
 'c:\\My-Applications\\Anaconda\\Lib\\site-packages',
 'c:\\My-Applications\\Anaconda\\Lib\\site-packages\\win32',
 'c:\\My-Applications\\Anaconda\\Lib\\site-packages\\win32\\lib',
 'c:\\My-Applications\\Anaconda\\Lib\\site-packages\\Pythonwin']