In [8]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import joblib
import lightgbm as lgb
import kaggle_evaluation.default_inference_server

# CONFIG STUFF

# Directory holding the competition files: train.csv is read from here, and the
# same path is handed to the local gateway when not running as a competition rerun.
KAGGLE_INPUT_PATH = "/kaggle/input/hull-tactical-market-prediction/"
# File the {"model", "features"} bundle is written to after training and read back from by predict().
MODEL_SAVE_PATH = "/tmp/final_lgbm_model.pkl"
# Name of the train.csv column used as the regression target.
TARGET_COL = "market_forward_excess_returns"

def convert_predictions_to_weights(preds, center=None, scale=None):
    """Map raw model predictions to portfolio weights in the range [0, 2].

    Each prediction is standardised to a z-score and squashed with ``tanh``,
    so a prediction equal to the reference mean maps to the neutral weight
    1.0 and extreme predictions saturate smoothly towards 0 or 2.

    Args:
        preds: Array-like of raw predictions.
        center: Reference mean used for standardisation. Defaults to the mean
            of ``preds`` itself.
        scale: Reference standard deviation used for standardisation.
            Defaults to the standard deviation of ``preds`` itself.

    Returns:
        A NumPy float array with the same shape as ``preds`` holding weights
        in [0, 2].

    Note:
        With the default ``center``/``scale`` a one-element (or constant)
        input always yields 1.0, because every value equals the batch mean.
        Pass explicit reference statistics to score a single prediction.
    """
    preds = np.asarray(preds, dtype=float)
    if preds.size == 0:
        # Nothing to standardise; skip mean()/std(), which would warn and return NaN.
        return np.zeros_like(preds)

    if center is None:
        center = preds.mean()
    if scale is None:
        scale = preds.std()

    # The small epsilon keeps the division finite when the spread is zero.
    z_scores = (preds - center) / (scale + 1e-9)
    weights = 1.0 + np.tanh(z_scores)  # tanh is in [-1, 1], so this is in [0, 2]
    return np.clip(weights, 0.0, 2.0)  # guard against floating-point overshoot

# Global vars - not ideal but works for this use case.
# predict() declares all three as global and fills them in on its first successful load.
model_instance = None  # object whose .predict() is called; set from the saved bundle's "model" entry
feature_list = []  # feature column names; set during training and again from the saved bundle's "features" entry
is_model_loaded = False  # becomes True once predict() has loaded the saved bundle
# TRAINING PHASE - runs once when the notebook starts

# Fit a LightGBM regressor on train.csv and persist it together with the
# feature names. Any failure is reported and replaced by a fallback bundle
# whose model predicts zeros, so the saved file exists either way.
# NOTE(review): this runs before the inference server is started further down;
# the gateway output mentions a startup time limit, so keep this step fast.
try:
    training_file = Path(KAGGLE_INPUT_PATH).joinpath("train.csv")
    train_df = pd.read_csv(training_file)
    print(f"Loaded training data with {len(train_df)} rows")

    # Columns that are never offered to the model as inputs.
    cols_to_exclude = ["date_id", "forward_returns", "risk_free_rate", TARGET_COL]

    # Every remaining column with a numeric dtype becomes a feature.
    numeric_features = [
        col
        for col, dtype in train_df.dtypes.items()
        if col not in cols_to_exclude and np.issubdtype(dtype, np.number)
    ]
    feature_list = list(numeric_features)
    print(f"Selected {len(feature_list)} features for training")

    # Coerce each feature to a numeric dtype; unparseable entries become NaN.
    for feature in feature_list:
        train_df[feature] = pd.to_numeric(train_df[feature], errors="coerce")

    # Rows with no target value cannot be used for fitting.
    train_df = train_df.dropna(subset=[TARGET_COL])
    print(f"After cleaning: {len(train_df)} training samples")

    X_train = train_df[feature_list]
    y_train = train_df[TARGET_COL].astype(float)

    # Hyper-parameters gathered in one place for readability.
    lgbm_params = dict(
        objective="regression",
        metric="rmse",
        n_estimators=900,  # possibly more trees than necessary
        learning_rate=0.05,
        num_leaves=100,
        max_depth=10,
        n_jobs=-1,  # use every available core
        random_state=42,  # fixed seed for reproducibility
        verbose=-1,  # silence LightGBM's training log
    )
    lgbm_model = lgb.LGBMRegressor(**lgbm_params)

    print("Starting model training...")
    lgbm_model.fit(X_train, y_train)
    print("Training completed successfully!")

    # Persist the fitted model next to the feature order it was trained with.
    model_data = {"model": lgbm_model, "features": feature_list}
    joblib.dump(model_data, MODEL_SAVE_PATH)
    print(f"Model saved to: {MODEL_SAVE_PATH}")

except Exception as error:
    print(f"Training failed with error: {error}")

    class FallbackModel:
        """Stand-in model used when training fails; predicts zero for every row."""

        def predict(self, X):
            return np.zeros(len(X))

    # An empty feature list means predict() builds a frame with no columns.
    fallback_data = {"model": FallbackModel(), "features": []}
    joblib.dump(fallback_data, MODEL_SAVE_PATH)
    print("Created fallback model")
    
# PREDICTION FUNCTION - the callback registered with Kaggle's inference server

# Raw model outputs from earlier predict() calls, oldest first. They form the
# reference sample that each new prediction is standardised against.
_prediction_history = []


def predict(test_data: pl.DataFrame) -> float:
    """
    Return the portfolio weight for one evaluation step.

    The saved model bundle is loaded on the first call. The incoming frame is
    turned into a feature matrix with the columns the model was trained on
    (features missing from the input become NaN), the model is evaluated on
    it, and the first row's raw prediction is converted to a weight.

    A single prediction standardised against itself always has a z-score of
    zero, which would make every returned weight exactly 1.0. The raw
    prediction is therefore appended to a running history and standardised
    against all predictions seen so far. The first call still returns the
    neutral weight, because there is nothing to compare against yet.

    NOTE(review): the first few weights rest on very few samples and will be
    noisy; consider a warm-up period or reference statistics saved at
    training time.

    Args:
        test_data: Frame with the features for the current step. A polars
            frame is converted to pandas; anything else is used as-is.

    Returns:
        A weight in [0, 2]. The neutral weight 1.0 is returned when the model
        cannot be loaded, the input has no rows, or the model output is not
        finite.
    """
    global is_model_loaded, model_instance, feature_list

    # Load the model on first use; a failed load is retried on the next call.
    if not is_model_loaded:
        try:
            saved_data = joblib.load(MODEL_SAVE_PATH)
            model_instance = saved_data["model"]
            feature_list = saved_data["features"]
            is_model_loaded = True
            print("Model loaded successfully for inference")
        except Exception as load_error:
            print(f"Failed to load model: {load_error}")
            return 1.0  # neutral weight as fallback

    test_df = test_data.to_pandas() if isinstance(test_data, pl.DataFrame) else test_data

    # Without rows there is nothing to predict; indexing [0] below would raise.
    if len(test_df) == 0:
        return 1.0

    # Build the feature matrix in training column order. Features missing
    # from the input are filled with NaN, which LightGBM can handle.
    X_pred = pd.DataFrame(
        {
            feature_name: (
                pd.to_numeric(test_df[feature_name], errors="coerce")
                if feature_name in test_df.columns
                else np.nan
            )
            for feature_name in feature_list
        },
        index=test_df.index,
    )

    prediction = float(model_instance.predict(X_pred)[0])
    if not np.isfinite(prediction):
        # Never return a NaN/inf weight, and keep it out of the history.
        return 1.0

    # Standardise the newest prediction relative to everything seen so far.
    _prediction_history.append(prediction)
    portfolio_weight = convert_predictions_to_weights(_prediction_history)[-1]

    return float(portfolio_weight)
# STARTING THE INFERENCE SERVER
# Wrap predict() in the default inference server so it can be called as an endpoint.
server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Pick the run mode from the environment: when KAGGLE_IS_COMPETITION_RERUN is
# unset or empty, drive the server through the local gateway using the
# competition input directory; otherwise serve requests.
if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    print("Running in local test mode...")
    server.run_local_gateway((KAGGLE_INPUT_PATH,))
else:
    print("Running in competition mode...")
    server.serve()

Loaded training data with 9021 rows
Selected 94 features for training
After cleaning: 9021 training samples
Starting model training...
Training completed successfully!
Model saved to: /tmp/final_lgbm_model.pkl
Running in local test mode...


                This exceeds the startup time limit of 900 seconds that the gateway will enforce
                during the rerun on the hidden test set. Start the server before performing any time consuming steps.


Model loaded successfully for inference
