In [1]:
%pip install -r ../requirements.txt

Collecting fredapi (from -r ../requirements.txt (line 3))
  Downloading fredapi-0.5.2-py3-none-any.whl.metadata (5.0 kB)
Collecting pyodbc (from -r ../requirements.txt (line 7))
  Downloading pyodbc-5.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting python-dotenv (from -r ../requirements.txt (line 19))
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow->-r ../requirements.txt (line 11))
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow->-r ../requirements.txt (line 11))
  Downloading opentelemetry_api-1.31.1-py3-none-any.whl.metadata (1.6 kB)
Collecting importlib_metadata!=4.7.0,<9,>=3.7.0 (from mlflow-skinny==3.1.1->mlflow->-r ../requirements.txt (line 11))
  Downloading importlib_metadata-8.6.1-py3-none-any.whl.metadata (4.7 kB)
INFO: pip is looking at multiple versions of azureml-mlflow to

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables from .env file
# Note: Ensure your .env file is in the root of your user folder in Azure ML
load_dotenv()

SYNAPSE_CONN_STRING = os.getenv("SYNAPSE_CONN_STRING")

# Verify that the connection string was loaded
if not SYNAPSE_CONN_STRING:
    raise ValueError("SYNAPSE_CONN_STRING environment variable not found. Please check your .env file.")

In [2]:
engine = create_engine(SYNAPSE_CONN_STRING, connect_args={'timeout': 60})

query = "SELECT * FROM v_financials_quarterly"

with engine.connect() as connection:
    df = pd.read_sql_query(text(query), connection)

print("Data loaded successfully.")
df.info()

Data loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   prediction_quarter                          243 non-null    object 
 1   ticker                                      243 non-null    object 
 2   report_date                                 243 non-null    object 
 3   revenues                                    243 non-null    int64  
 4   research_and_development_expense            243 non-null    int64  
 5   selling_general_and_administrative_expense  243 non-null    int64  
 6   net_income                                  243 non-null    int64  
 7   assets                                      243 non-null    int64  
 8   liabilities                                 243 non-null    int64  
 9   gdp                                         243 non-null    f

It is observed that prediction quarter and report_date are objects that need to be converted to dateTime and ticker is also object that needs to be converted to string maybe in future.

In [3]:
df['prediction_quarter']=pd.to_datetime(df['prediction_quarter'])
df['report_date']=pd.to_datetime(df['report_date'])

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
revenues,243.0,36810090000.0,29276460000.0,1384495000.0,13435000000.0,26470000000.0,55751500000.0,124300000000.0
research_and_development_expense,243.0,3678233000.0,3070424000.0,95772000.0,1367000000.0,2783000000.0,5430500000.0,13808000000.0
selling_general_and_administrative_expense,243.0,2129588000.0,1697939000.0,104851000.0,1007500000.0,1425000000.0,2979000000.0,7175000000.0
net_income,243.0,9393598000.0,8358866000.0,-6302000000.0,3059500000.0,6558000000.0,14430500000.0,36330000000.0
assets,243.0,203511200000.0,140054800000.0,4497718000.0,71138500000.0,180098000000.0,331553500000.0,619003000000.0
liabilities,243.0,100899100000.0,91314860000.0,543861000.0,25398500000.0,64909000000.0,174137500000.0,309259000000.0
gdp,243.0,19312.7,4920.852,12922.66,14980.19,18279.78,21751.24,30485.73
cpi,243.0,244.8372,33.81961,193.6667,217.9343,237.4783,258.8177,320.8003
unemployment,243.0,5.783128,2.12774,3.533333,4.133333,4.966667,7.233333,13.0


In [5]:
df.head()

Unnamed: 0,prediction_quarter,ticker,report_date,revenues,research_and_development_expense,selling_general_and_administrative_expense,net_income,assets,liabilities,gdp,cpi,unemployment
0,2025-07-01,AAPL,2025-06-30,94036000000,8866000000,6650000000,23434000000,331495000000,265665000000,30485.729,320.800333,4.166667
1,2025-04-01,AAPL,2025-03-31,95359000000,8550000000,6728000000,24780000000,331233000000,264437000000,30042.113,319.492,4.1
2,2025-01-01,AAPL,2024-12-31,124300000000,8268000000,7175000000,36330000000,344085000000,277327000000,29825.182,316.538667,4.133333
3,2024-10-01,AAPL,2024-09-30,94930000000,7765000000,6523000000,14736000000,364980000000,308030000000,29511.664,314.182667,4.166667
4,2024-07-01,AAPL,2024-06-30,85777000000,8006000000,6320000000,21448000000,331612000000,264904000000,29147.044,313.095667,4.0


In [6]:
# CRITICAL: Sort the data by company and then by date.
# This is essential for all time-series operations.
df.sort_values(by=['ticker', 'prediction_quarter'], inplace=True)
df.reset_index(drop=True, inplace=True)

print("Data sorted and prepared.")
df.head()

Data sorted and prepared.


Unnamed: 0,prediction_quarter,ticker,report_date,revenues,research_and_development_expense,selling_general_and_administrative_expense,net_income,assets,liabilities,gdp,cpi,unemployment
0,2005-07-01,AAPL,2005-06-30,3520000000,145000000,472000000,319000000,10488000000,3667000000,12922.656,193.666667,5.1
1,2005-10-01,AAPL,2005-09-30,3678000000,147000000,470000000,430000000,11551000000,4085000000,13142.642,196.6,4.966667
2,2006-01-01,AAPL,2005-12-31,5749000000,182000000,632000000,565000000,14181000000,5801000000,13324.204,198.433333,4.966667
3,2006-04-01,AAPL,2006-03-31,4359000000,176000000,592000000,410000000,13911000000,5229000000,13599.16,199.466667,4.733333
4,2006-07-01,AAPL,2006-06-30,4370000000,175000000,584000000,472000000,15114000000,5784000000,13753.424,201.266667,4.633333


In [7]:
# --- Create Shareholder's Equity ---
df['shareholder_equity'] = df['assets'] - df['liabilities']

# --- Create the Target Variable (y) ---
df['target_revenue_next_q'] = df.groupby('ticker')['revenues'].shift(-1)

# --- Create Time-Based Features (X) ---
df['prediction_quarter'] = pd.to_datetime(df['prediction_quarter']) # Ensure datetime type before using .dt
df['quarter'] = df['prediction_quarter'].dt.quarter
df['year'] = df['prediction_quarter'].dt.year

# --- Create Lag and Rolling Features (X) ---
features_to_engineer = {
    'revenues': {'lags': [1, 2, 3, 4], 'rolling_avg': [4]},
    'net_income': {'lags': [2, 3, 4], 'rolling_avg': [4]},
    'research_and_development_expense': {'lags': [4, 5, 6, 7, 8]},
    'selling_general_and_administrative_expense': {'lags': [1, 2], 'rolling_avg': [4]},
    'assets': {'lags': [2, 3, 4]},
    'liabilities': {'lags': [2, 3, 4]},
    'shareholder_equity': {'lags': [2, 3, 4]},
    'gdp': {'lags': [1, 2]},
    'cpi': {'lags': [1]},
    'unemployment': {'lags': [1, 2]}
}

for col, params in features_to_engineer.items():
    if 'lags' in params:
        for lag in params['lags']:
            df[f'{col}_lag_{lag}'] = df.groupby('ticker')[col].shift(lag)
    if 'rolling_avg' in params:
        for window in params['rolling_avg']:
            # ✅ --- FIX: Use .transform() for a robust, guaranteed alignment ---
            df[f'{col}_rolling_avg_{window}'] = df.groupby('ticker')[col].transform(
                lambda x: x.rolling(window=window).mean()
            )

df['revenue_seasonal_diff'] = df['revenues'] - df['revenues_rolling_avg_4']

print("Robust feature engineering complete.")

Robust feature engineering complete.


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 48 columns):
 #   Column                                                    Non-Null Count  Dtype         
---  ------                                                    --------------  -----         
 0   prediction_quarter                                        243 non-null    datetime64[ns]
 1   ticker                                                    243 non-null    object        
 2   report_date                                               243 non-null    datetime64[ns]
 3   revenues                                                  243 non-null    int64         
 4   research_and_development_expense                          243 non-null    int64         
 5   selling_general_and_administrative_expense                243 non-null    int64         
 6   net_income                                                243 non-null    int64         
 7   assets                                      

In [8]:
# Drop rows with any NaN values created by the feature engineering
df_model = df.dropna().copy()
df_model.set_index('prediction_quarter', inplace=True)

# --- Define X and y from the now-indexed DataFrame ---
feature_cols = [c for c in df_model.columns if '_lag_' in c or '_rolling_avg_' in c or '_seasonal_diff_' in c]
time_cols = ['year', 'quarter']
X = df_model[feature_cols + time_cols].copy()

# y is now a DataFrame containing the target and the ticker ---
y = df_model[['target_revenue_next_q', 'ticker']]

# Convert quarter to a categorical type for the model
X['quarter'] = X['quarter'].astype('category')

print("Final feature set (X) columns:")
print(X.columns)
print(f"\nFinal shape of feature data (X): {X.shape}")
print(f"Final shape of target data (y): {y.shape}")

Final feature set (X) columns:
Index(['revenues_lag_1', 'revenues_lag_2', 'revenues_lag_3', 'revenues_lag_4',
       'revenues_rolling_avg_4', 'net_income_lag_2', 'net_income_lag_3',
       'net_income_lag_4', 'net_income_rolling_avg_4',
       'research_and_development_expense_lag_4',
       'research_and_development_expense_lag_5',
       'research_and_development_expense_lag_6',
       'research_and_development_expense_lag_7',
       'research_and_development_expense_lag_8',
       'selling_general_and_administrative_expense_lag_1',
       'selling_general_and_administrative_expense_lag_2',
       'selling_general_and_administrative_expense_rolling_avg_4',
       'assets_lag_2', 'assets_lag_3', 'assets_lag_4', 'liabilities_lag_2',
       'liabilities_lag_3', 'liabilities_lag_4', 'shareholder_equity_lag_2',
       'shareholder_equity_lag_3', 'shareholder_equity_lag_4', 'gdp_lag_1',
       'gdp_lag_2', 'cpi_lag_1', 'unemployment_lag_1', 'unemployment_lag_2',
       'year', 'quarter'],

In [9]:
# Define the time periods for our three datasets
train_end_date = pd.to_datetime('2022-12-31')
validation_end_date = pd.to_datetime('2023-12-31')

# Create the training set (all data before 2023)
X_train = X[X.index <= train_end_date]
y_train = y[X.index <= train_end_date]

# Create the validation set (all data in 2023)
X_val = X[(X.index > train_end_date) & (X.index <= validation_end_date)]
y_val = y[(X.index > train_end_date) & (X.index <= validation_end_date)]

# Create the test set (all data from 2024 onwards)
X_test = X[X.index > validation_end_date]
y_test = y[X.index > validation_end_date]

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 186
Validation set size: 12
Test set size: 18


In [10]:
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import itertools

# Define the Hyperparameter Grid
param_grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.7, 1.0],
}

# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

best_score = float('inf')
best_params = None

print(f"Starting hyperparameter search across {len(all_params)} combinations...")

for params in all_params:
    model = xgb.XGBRegressor(objective='reg:squarederror', enable_categorical=True, random_state=42, **params)
    
    # ✅ --- FIX: Use the specific target column for fitting ---
    model.fit(X_train, y_train['target_revenue_next_q'], verbose=False)
    
    val_preds = model.predict(X_val)
    
    # ✅ --- FIX: Use the specific target column for scoring ---
    score = mean_absolute_error(y_val['target_revenue_next_q'], val_preds)
    
    if score < best_score:
        best_score = score
        best_params = params

print("\nHyperparameter tuning complete.")
print(f"Best MAE on Validation Set: ${best_score:,.2f}")
print("Best hyperparameters found:")
print(best_params)

Starting hyperparameter search across 16 combinations...

Hyperparameter tuning complete.
Best MAE on Validation Set: $4,061,859,877.33
Best hyperparameters found:
{'n_estimators': 500, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.7}


In [11]:
# Combine the original training and validation sets for training the final model on all available data before the test set
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])


In [13]:
# 1. Define the quantiles for our interval
lower_alpha = 0.1
median_alpha = 0.5
upper_alpha = 0.9

# --- Train a separate model for each quantile ---

model_lower = xgb.XGBRegressor(objective='reg:quantileerror', quantile_alpha=lower_alpha, enable_categorical=True, random_state=42, **best_params)
model_median = xgb.XGBRegressor(objective='reg:quantileerror', quantile_alpha=median_alpha, enable_categorical=True, random_state=42, **best_params)
model_upper = xgb.XGBRegressor(objective='reg:quantileerror', quantile_alpha=upper_alpha, enable_categorical=True, random_state=42, **best_params)

print("Training 3 separate models for the prediction interval...")
model_lower.fit(X_train_full, y_train_full['target_revenue_next_q'])
model_median.fit(X_train_full, y_train_full['target_revenue_next_q'])
model_upper.fit(X_train_full, y_train_full['target_revenue_next_q'])

# --- Generate the prediction interval on the test set ---

lower_bound = model_lower.predict(X_test)
median_pred = model_median.predict(X_test)
upper_bound = model_upper.predict(X_test)

# --- Create a DataFrame to show the raw results ---

interval_results = pd.DataFrame({
    'Ticker': y_test['ticker'],
    'Actual_Revenue': y_test['target_revenue_next_q'],
    'Raw_Lower_Bound': lower_bound,
    'Raw_Prediction': median_pred,
    'Raw_Upper_Bound': upper_bound
}, index=y_test.index)

# --- Enforce the logical order to prevent quantile crossing ---

interval_results['Corrected_Lower_Bound'] = interval_results[['Raw_Lower_Bound', 'Raw_Prediction', 'Raw_Upper_Bound']].min(axis=1)
interval_results['Corrected_Upper_Bound'] = interval_results[['Raw_Lower_Bound', 'Raw_Prediction', 'Raw_Upper_Bound']].max(axis=1)
interval_results['Final_Prediction'] = interval_results['Raw_Prediction']

# --- Display the final, corrected results ---

# Select and reorder columns for a clean final view
final_display = interval_results[[
    'Ticker',
    'Actual_Revenue', 
    'Corrected_Lower_Bound', 
    'Final_Prediction', 
    'Corrected_Upper_Bound'
]]

# ✅ --- FIX: Use the correct syntax to sort by column, then by index ---
final_display = final_display.sort_values(by='Ticker').sort_index()

pd.options.display.float_format = '${:,.2f}'.format
print("\nFinal, Corrected Prediction Interval Results:")
display(final_display)

Training 3 separate models for the prediction interval...

Final, Corrected Prediction Interval Results:


Unnamed: 0_level_0,Ticker,Actual_Revenue,Corrected_Lower_Bound,Final_Prediction,Corrected_Upper_Bound
prediction_quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-01,AAPL,"$90,753,000,000.00","$71,455,866,880.00","$105,416,228,864.00","$105,416,228,864.00"
2024-01-01,MSFT,"$61,858,000,000.00","$61,654,548,480.00","$62,827,401,216.00","$62,827,401,216.00"
2024-01-01,GOOGL,"$80,539,000,000.00","$73,269,002,240.00","$76,510,314,496.00","$82,438,799,360.00"
2024-04-01,AAPL,"$85,777,000,000.00","$72,671,166,464.00","$80,046,374,912.00","$83,774,087,168.00"
2024-04-01,MSFT,"$64,727,000,000.00","$60,824,547,328.00","$60,824,547,328.00","$64,200,187,904.00"
2024-04-01,GOOGL,"$84,742,000,000.00","$74,159,136,768.00","$76,299,681,792.00","$79,457,681,408.00"
2024-07-01,AAPL,"$94,930,000,000.00","$68,181,692,416.00","$88,014,585,856.00","$91,170,447,360.00"
2024-07-01,MSFT,"$65,585,000,000.00","$60,347,969,536.00","$60,347,969,536.00","$65,855,942,656.00"
2024-07-01,GOOGL,"$88,268,000,000.00","$71,121,428,480.00","$83,732,430,848.00","$88,747,229,184.00"
2024-10-01,MSFT,"$69,632,000,000.00","$61,757,747,200.00","$62,853,713,920.00","$75,512,692,736.00"


In [14]:
# Go back to our full feature-engineered dataframe (before rows were dropped)
# and get the very last row for each company.
future_input_df = df.groupby('ticker').last()

# The 'prediction_quarter' for our forecast is the next quarter after the last known one
future_input_df['prediction_quarter'] = future_input_df['prediction_quarter'] + pd.DateOffset(months=3)

# Ensure the feature columns match the model's training columns exactly
X_future = future_input_df[X_train.columns]

# Convert quarter to the correct categorical type
X_future['quarter'] = X_future['quarter'].astype('category')

print("Feature set for the upcoming quarter:")
display(X_future)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_future['quarter'] = X_future['quarter'].astype('category')


Unnamed: 0_level_0,revenues_lag_1,revenues_lag_2,revenues_lag_3,revenues_lag_4,revenues_rolling_avg_4,net_income_lag_2,net_income_lag_3,net_income_lag_4,net_income_rolling_avg_4,research_and_development_expense_lag_4,...,shareholder_equity_lag_2,shareholder_equity_lag_3,shareholder_equity_lag_4,gdp_lag_1,gdp_lag_2,cpi_lag_1,unemployment_lag_1,unemployment_lag_2,year,quarter
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAPL,"$95,359,000,000.00","$124,300,000,000.00","$94,930,000,000.00","$85,777,000,000.00","$102,156,250,000.00","$36,330,000,000.00","$14,736,000,000.00","$21,448,000,000.00","$24,820,000,000.00","$8,006,000,000.00",...,"$66,758,000,000.00","$56,950,000,000.00","$66,708,000,000.00","$30,042.11","$29,825.18",$319.49,$4.10,$4.13,2025,3
GOOGL,"$90,234,000,000.00","$96,469,000,000.00","$88,268,000,000.00","$84,742,000,000.00","$92,849,750,000.00","$26,536,000,000.00","$26,301,000,000.00","$23,619,000,000.00","$28,893,250,000.00","$11,860,000,000.00",...,"$325,084,000,000.00","$314,119,000,000.00","$300,753,000,000.00","$30,042.11","$29,825.18",$319.49,$4.10,$4.13,2025,3
MSFT,"$70,066,000,000.00","$69,632,000,000.00","$65,585,000,000.00","$64,727,000,000.00","$70,431,000,000.00","$24,108,000,000.00","$24,667,000,000.00","$22,036,000,000.00","$25,458,000,000.00","$8,056,000,000.00",...,"$302,695,000,000.00","$287,723,000,000.00","$268,477,000,000.00","$30,042.11","$29,825.18",$319.49,$4.10,$4.13,2025,3


In [15]:
df.tail(10)

Unnamed: 0,prediction_quarter,ticker,report_date,revenues,research_and_development_expense,selling_general_and_administrative_expense,net_income,assets,liabilities,gdp,...,liabilities_lag_4,shareholder_equity_lag_2,shareholder_equity_lag_3,shareholder_equity_lag_4,gdp_lag_1,gdp_lag_2,cpi_lag_1,unemployment_lag_1,unemployment_lag_2,revenue_seasonal_diff
233,2023-04-01,MSFT,2023-03-31,52857000000,6984000000,1643000000,18299000000,380088000000,185405000000,"$27,216.44",...,"$181,683,000,000.00","$173,566,000,000.00","$166,542,000,000.00","$162,924,000,000.00","$26,770.51","$26,336.30",$298.50,$3.57,$3.53,"$959,250,000.00"
234,2023-07-01,MSFT,2023-06-30,56189000000,6739000000,2197000000,20081000000,411976000000,205753000000,"$27,530.06",...,"$198,298,000,000.00","$183,136,000,000.00","$173,566,000,000.00","$166,542,000,000.00","$27,216.44","$26,770.51",$301.19,$3.53,$3.57,"$3,210,250,000.00"
235,2023-10-01,MSFT,2023-09-30,56517000000,6659000000,1474000000,22291000000,445785000000,225071000000,"$28,074.85",...,"$186,218,000,000.00","$194,683,000,000.00","$183,136,000,000.00","$173,566,000,000.00","$27,530.06","$27,216.44",$303.42,$3.53,$3.53,"$1,939,500,000.00"
236,2024-01-01,MSFT,2023-12-31,62020000000,7142000000,1977000000,21870000000,470558000000,232290000000,"$28,424.72",...,"$181,416,000,000.00","$206,223,000,000.00","$194,683,000,000.00","$183,136,000,000.00","$28,074.85","$27,530.06",$306.04,$3.67,$3.53,"$5,124,250,000.00"
237,2024-04-01,MSFT,2024-03-31,61858000000,7653000000,1912000000,21939000000,484275000000,231123000000,"$28,708.16",...,"$185,405,000,000.00","$220,714,000,000.00","$206,223,000,000.00","$194,683,000,000.00","$28,424.72","$28,074.85",$308.16,$3.80,$3.67,"$2,712,000,000.00"
238,2024-07-01,MSFT,2024-06-30,64727000000,8056000000,2246000000,22036000000,512163000000,243686000000,"$29,147.04",...,"$205,753,000,000.00","$238,268,000,000.00","$220,714,000,000.00","$206,223,000,000.00","$28,708.16","$28,424.72",$310.97,$3.83,$3.80,"$3,446,500,000.00"
239,2024-10-01,MSFT,2024-09-30,65585000000,7544000000,1673000000,24667000000,523013000000,235290000000,"$29,511.66",...,"$225,071,000,000.00","$253,152,000,000.00","$238,268,000,000.00","$220,714,000,000.00","$29,147.04","$28,708.16",$313.10,$4.00,$3.83,"$2,037,500,000.00"
240,2025-01-01,MSFT,2024-12-31,69632000000,7917000000,1823000000,24108000000,533898000000,231203000000,"$29,825.18",...,"$232,290,000,000.00","$268,477,000,000.00","$253,152,000,000.00","$238,268,000,000.00","$29,511.66","$29,147.04",$314.18,$4.17,$4.00,"$4,181,500,000.00"
241,2025-04-01,MSFT,2025-03-31,70066000000,8198000000,1737000000,25824000000,562624000000,240733000000,"$30,042.11",...,"$231,123,000,000.00","$287,723,000,000.00","$268,477,000,000.00","$253,152,000,000.00","$29,825.18","$29,511.66",$316.54,$4.13,$4.17,"$2,563,500,000.00"
242,2025-07-01,MSFT,2025-06-30,76441000000,8829000000,1990000000,27233000000,619003000000,275524000000,"$30,485.73",...,"$243,686,000,000.00","$302,695,000,000.00","$287,723,000,000.00","$268,477,000,000.00","$30,042.11","$29,825.18",$319.49,$4.10,$4.13,"$6,010,000,000.00"


In [16]:
# Combine all historical data
X_all_history = pd.concat([X_train_full, X_test])
y_all_history = pd.concat([y_train_full, y_test])

print(f"Re-training models on all {len(X_all_history)} available historical data points...")

# Re-train all three quantile models on the complete dataset
model_lower.fit(X_all_history, y_all_history['target_revenue_next_q'])
model_median.fit(X_all_history, y_all_history['target_revenue_next_q'])
model_upper.fit(X_all_history, y_all_history['target_revenue_next_q'])

print("Final models are ready.")

Re-training models on all 216 available historical data points...
Final models are ready.


In [18]:
# Generate the prediction interval for the future quarter
future_lower = model_lower.predict(X_future)
future_median = model_median.predict(X_future)
future_upper = model_upper.predict(X_future)

# Create a DataFrame to show the forecast
forecast_df = pd.DataFrame({
    'Ticker': X_future.index,
    # ✅ --- FIX: Get the date from 'future_input_df', not 'X_future' ---
    'Forecast_Quarter': future_input_df['prediction_quarter'],
    'Lower_Bound': future_lower,
    'Forecast': future_median,
    'Upper_Bound': future_upper
}).set_index('Ticker')

# Apply the same post-processing to prevent quantile crossing
forecast_df['Lower_Bound'] = forecast_df[['Lower_Bound', 'Forecast', 'Upper_Bound']].min(axis=1)
forecast_df['Upper_Bound'] = forecast_df[['Lower_Bound', 'Forecast', 'Upper_Bound']].max(axis=1)

print("\n--- FINAL REVENUE FORECAST ---")
display(forecast_df)


--- FINAL REVENUE FORECAST ---


Unnamed: 0_level_0,Forecast_Quarter,Lower_Bound,Forecast,Upper_Bound
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,2025-10-01,"$77,680,574,464.00","$96,196,132,864.00","$96,196,132,864.00"
GOOGL,2025-10-01,"$78,687,600,640.00","$100,290,977,792.00","$102,920,724,480.00"
MSFT,2025-10-01,"$74,857,472,000.00","$77,237,428,224.00","$77,237,428,224.00"
