## Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

## Constants

In [None]:
# Plot styling: apply the ggplot theme to every figure in the notebook.
plt.style.use("ggplot")
# Allow pandas to render up to 100 columns when displaying wide DataFrames.
pd.set_option("display.max_columns", 100)

# --- File Paths ---
# Every input CSV is expected directly under BASE_PATH.
BASE_PATH = "./dataset/"

TRAIN_PATH = BASE_PATH + "train.csv"
LABELS_PATH = BASE_PATH + "train_labels.csv"
PAIRS_PATH = BASE_PATH + "target_pairs.csv"
TEST_PATH = BASE_PATH + "test.csv"

## Training

In [None]:
# --- Section 1: Analyze train.csv ---
print("--- Loading train.csv ---")
# Read the raw training features and convert `date_id` with pd.to_datetime.
# NOTE(review): if `date_id` is a plain integer day index, to_datetime will
# interpret it as nanoseconds since the epoch - confirm this is intended.
df_train = pd.read_csv(TRAIN_PATH).assign(
    date_id=lambda frame: pd.to_datetime(frame['date_id'])
)
print("Train data shape: " + str(df_train.shape))


# --- Visualization of Time Series ---
print("\n--- Generating Time Series Visualizations ---")
all_columns = df_train.columns.tolist()


def _first_column_with_prefix(columns, prefix):
    """Return the first column whose name starts with `prefix`, or None.

    The comparison ignores case: the feature-engineering cells of this notebook
    reference columns such as 'US_Stock_VTV_adj_close', which a case-sensitive
    'US_STOCK_' prefix would never match.
    """
    wanted = prefix.lower()
    return next((col for col in columns if str(col).lower().startswith(wanted)), None)


# One representative column per market. A missing market yields None (instead
# of an IndexError), which the plotting helper renders as "No data found".
lme_col_to_plot = _first_column_with_prefix(all_columns, 'LME_')
jpx_col_to_plot = _first_column_with_prefix(all_columns, 'JPX_')
us_stock_col_to_plot = _first_column_with_prefix(all_columns, 'US_STOCK_')
fx_col_to_plot = _first_column_with_prefix(all_columns, 'FX_')

fig, axes = plt.subplots(2, 2, figsize=(18, 10))
fig.suptitle('Sample Time Series from Different Markets', fontsize=16)


def plot_series(ax, column_name, title):
    """Plot one df_train column against date_id on `ax`.

    When `column_name` is falsy nothing is drawn and the title is suffixed
    with a "(No data found)" line. Axis labels are always set.
    """
    has_data = bool(column_name)
    if has_data:
        sns.lineplot(data=df_train, x='date_id', y=column_name, ax=ax)
    ax.set_title(title if has_data else f'{title}\n(No data found)')
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')


# (axes cell, column, title) for each of the four panels.
panels = [
    (axes[0, 0], lme_col_to_plot, f'LME: {lme_col_to_plot}'),
    (axes[0, 1], jpx_col_to_plot, f'JPX: {jpx_col_to_plot}'),
    (axes[1, 0], us_stock_col_to_plot, f'US Stock: {us_stock_col_to_plot or "N/A"}'),
    (axes[1, 1], fx_col_to_plot, f'FX: {fx_col_to_plot or "N/A"}'),
]
for panel_ax, panel_col, panel_title in panels:
    plot_series(panel_ax, panel_col, panel_title)

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


# --- Section 2: Analyze train_labels.csv ---
print("\n--- Loading train_labels.csv ---")
# Same date_id conversion as for train.csv so both frames share the key dtype.
df_labels = pd.read_csv(LABELS_PATH).assign(
    date_id=lambda frame: pd.to_datetime(frame['date_id'])
)
print("Train labels shape: " + str(df_labels.shape))

print("\n--- Generating Target Distribution Visualizations ---")
fig, axes = plt.subplots(2, 2, figsize=(16, 8))
fig.suptitle('Distribution of Sample Target Values', fontsize=16)

# One histogram (with KDE overlay) per sampled target, filling the grid row by row.
sample_targets = ['target_10', 'target_110', 'target_210', 'target_410']
for panel_ax, sample_target in zip(axes.flat, sample_targets):
    hist_ax = sns.histplot(df_labels[sample_target], kde=True, ax=panel_ax, bins=50)
    hist_ax.set_title(f'Distribution of {sample_target}')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


# --- Section 3: Analyze target_pairs.csv ---
print("\n--- Loading target_pairs.csv ---")
df_pairs = pd.read_csv(PAIRS_PATH)
print("Target pairs shape: " + str(df_pairs.shape))

print("\n--- Analyzing Target Recipes ---")
# A recipe counts as a "pair" when its text representation contains a '-'.
df_pairs['is_pair'] = ['-' in str(recipe) for recipe in df_pairs['pair']]
pair_counts = df_pairs['is_pair'].value_counts()

n_single = pair_counts.get(False, 0)
n_paired = pair_counts.get(True, 0)
print(f"Number of single-asset targets: {n_single}")
print(f"Number of paired-asset targets: {n_paired}")

# How often each lag value occurs among the target definitions.
lag_counts = df_pairs['lag'].value_counts()
print("\n--- Distribution of Time Lags ---")
print(lag_counts)

In [None]:


# --- Handling Missing Values ---

# Forward-fill propagates the last valid observation down each column.
# Anything still missing afterwards (leading NaNs with no earlier value)
# is replaced by 0.
# `DataFrame.ffill()` is used instead of the deprecated `fillna(method='ffill')`.
df_train_processed = df_train.ffill().fillna(0)


# Verify that there are no more missing values (expected output: 0)
print("Missing values after forward-filling and zero-filling:")
print(df_train_processed.isnull().sum().sum())



In [None]:


# --- Creating Lag Features ---

# Columns that receive lagged copies (one per market, for demonstration).
cols_to_lag = [
    'LME_AH_Close',
    'JPX_Gold_Standard_Futures_Close',
    'US_Stock_VTV_adj_close',
    'FX_USDJPY',
]

# For every selected column add 1-, 2- and 3-row shifted copies,
# named like 'LME_AH_Close_lag1'.
for col in cols_to_lag:
    source_series = df_train_processed[col]
    for lag in (1, 2, 3):
        df_train_processed[f'{col}_lag{lag}'] = source_series.shift(lag)

# Show the last rows of the frame. The first `lag` rows of each lag column
# are NaN because there is no earlier value to shift in.
print("DataFrame with new Lag Features (showing tail):")
display(df_train_processed.tail())



In [None]:
# --- Creating Rolling Window Features ---

# Same columns as the lag features, for consistency.
cols_to_roll = [
    'LME_AH_Close',
    'JPX_Gold_Standard_Futures_Close',
    'US_Stock_VTV_adj_close',
    'FX_USDJPY',
]

# Window lengths, in rows.
window_sizes = [5, 20]

for col in cols_to_roll:
    source_series = df_train_processed[col]
    for window in window_sizes:
        # Build the rolling view once and derive both statistics from it.
        rolling_view = source_series.rolling(window=window)
        # Rolling mean
        df_train_processed[f'{col}_roll_mean_{window}'] = rolling_view.mean()
        # Rolling standard deviation (volatility)
        df_train_processed[f'{col}_roll_std_{window}'] = rolling_view.std()

# Show the last rows including the new rolling columns.
print("DataFrame with new Rolling Window Features (showing tail):")
display(df_train_processed.tail())

In [None]:


# --- Merging Features with Labels ---

# Inner-join the engineered features with the labels on 'date_id', drop every
# row that still contains a NaN (the first rows of the lag/rolling columns, and
# any row with a missing label), then renumber the rows 0..n-1.
# The index reset matters downstream: the cross-validation code addresses rows
# by position, which only lines up with labels when the index has no gaps.
# NOTE(review): dropna() discards a row if ANY column is NaN - confirm the
# label file does not contain NaNs that would remove most of the data.
final_df = (
    pd.merge(df_train_processed, df_labels, on='date_id', how='inner')
    .dropna()
    .reset_index(drop=True)
)

print(f"Shape of the final, merged DataFrame: {final_df.shape}")
print("Final DataFrame ready for model training (showing head):")
display(final_df.head())



In [None]:
# --- Visualize Rolling Mean ---

# Overlay the raw series of a single asset with its rolling means, using the
# merged and cleaned final_df.
asset_to_plot = 'LME_AH_Close'
window_sizes = [5, 20]  # must match the windows created in the rolling-feature cell

fig, ax = plt.subplots(figsize=(15, 7))
dates = final_df['date_id']

# Raw closing price, slightly transparent so the smoothed lines stand out.
ax.plot(dates, final_df[asset_to_plot], label='Original Price', alpha=0.6)

# One line per rolling-mean window.
for window in window_sizes:
    ax.plot(dates, final_df[f'{asset_to_plot}_roll_mean_{window}'], label=f'{window}-Day Rolling Mean')

ax.set_title(f'Original Price vs. Rolling Means for {asset_to_plot}', fontsize=16)
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()


In [None]:


# --- Visualize Rolling Volatility ---

asset_to_plot = 'JPX_Gold_Standard_Futures_Close'
window_size = 20  # rolling-std window to display (one of the windows created earlier)

volatility_col = f'{asset_to_plot}_roll_std_{window_size}'

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(final_df['date_id'], final_df[volatility_col])
ax.set_title(f'{window_size}-Day Rolling Volatility for {asset_to_plot}', fontsize=16)
ax.set_xlabel('Date')
ax.set_ylabel('Standard Deviation (Volatility)')
plt.show()



In [None]:
# --- Calculate and Visualize Feature Correlation ---

# Engineered features of a single asset: three lags plus rolling mean/std
# for both window sizes.
features_to_correlate = (
    [f'LME_AH_Close_lag{k}' for k in (1, 2, 3)]
    + ['LME_AH_Close_roll_mean_5', 'LME_AH_Close_roll_std_5']
    + ['LME_AH_Close_roll_mean_20', 'LME_AH_Close_roll_std_20']
)

# Target to correlate against.
target_col = 'target_0'

# Full correlation matrix of features + target; keep only the target's column
# and remove its self-correlation entry.
correlation_matrix = final_df[features_to_correlate + [target_col]].corr()
correlations = correlation_matrix[target_col].drop(target_col)

# Horizontal bar chart, sorted ascending.
fig, ax = plt.subplots(figsize=(12, 7))
correlations.sort_values().plot(kind='barh', color='skyblue', ax=ax)
ax.set_title(f'Correlation of Engineered Features with {target_col}', fontsize=16)
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Feature')
ax.grid(axis='x', linestyle='--')
plt.show()

In [None]:


# --- Visualize Feature Distributions ---

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Distribution of Engineered Features', fontsize=16)

# (column, panel title) pairs, drawn row by row across the 2x2 grid:
# a lag feature, a rolling mean, and two rolling-std (volatility) features.
distribution_panels = [
    ('LME_AH_Close_lag1', 'Distribution of LME_AH_Close_lag1'),
    ('LME_AH_Close_roll_mean_20', 'Distribution of LME_AH_Close_roll_mean_20'),
    ('JPX_Gold_Standard_Futures_Close_roll_std_20', 'Distribution of JPX Gold Volatility'),
    ('US_Stock_VTV_adj_close_roll_std_20', 'Distribution of US Stock VTV Volatility'),
]

for panel_ax, (feature_col, panel_title) in zip(axes.flat, distribution_panels):
    sns.histplot(final_df[feature_col], kde=True, ax=panel_ax, bins=40)
    panel_ax.set_title(panel_title)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()



In [None]:
# ==================================================================
# First, re-create final_df with the corrected index logic
# ==================================================================
import os 
# Merging Features with Labels (Corrected)
# Rebuild final_df in one chain:
#   1. inner-join features and labels on 'date_id'
#   2. drop rows containing NaNs (e.g. the leading rows of lag/rolling columns)
#   3. renumber the index as 0, 1, 2, ... so positional and label-based
#      indexing agree in the training loop
final_df = (
    df_train_processed
    .merge(df_labels, on='date_id', how='inner')
    .dropna()
    .reset_index(drop=True)
)

print(f"Shape of the final, merged DataFrame: {final_df.shape}")
print("Final DataFrame ready for model training (showing head):")
display(final_df.head())


# ==================================================================
# Now, run the full training pipeline with the corrected final_df
# ==================================================================
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import spearmanr
import gc

# --- 0. Configuration ---
class CFG:
    """Static configuration for feature selection, cross-validation and LightGBM.

    The number of targets is 424 (target_0 .. target_423) so that training
    agrees with the inference code in this notebook, which loads and predicts
    424 targets. With only 423 listed here, the last target would never get a
    model and - because feature columns are derived by excluding TARGET_COLS -
    it would be treated as an input feature.
    """

    # Number of TimeSeriesSplit folds.
    N_SPLITS = 5
    # Number of highest-importance features kept after feature selection.
    N_TOP_FEATURES = 200
    # How many targets (taken from the front of TARGET_COLS) run through CV.
    N_TARGETS_TO_TRAIN = 424
    # Label column names: target_0 .. target_423.
    TARGET_COLS = [f'target_{i}' for i in range(N_TARGETS_TO_TRAIN)]

    # Keyword arguments passed to every lgb.LGBMRegressor in the notebook.
    LGB_PARAMS = {
        'objective': 'regression_l1',
        'metric': 'mae',
        # Upper bound on boosting rounds; the CV loop adds early stopping.
        'n_estimators': 5000,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'num_leaves': 31,
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42,
        'boosting_type': 'gbdt',
    }

# --- 1. Custom Evaluation Metric ---
def sharpe_ratio_metric(preds, labels):
    """Sharpe-style score: mean of per-row Spearman correlations over their std.

    Args:
        preds: 2-D array-like, one row per day and one column per target.
        labels: 2-D array-like with the same layout as `preds`.

    Returns:
        mean(correlations) / std(correlations) (population std), or 0 when the
        input is empty or the correlations have zero spread.

    Per-row handling:
        * Positions where either the prediction or the label is NaN are ignored
          for that row, so one missing value no longer zeroes the whole day.
        * A row with fewer than two usable pairs, or whose predictions or
          labels are constant, contributes a correlation of 0.
    """
    # Coerce to float so object-dtype input (e.g. a DataFrame that was created
    # empty and filled later) behaves like a numeric matrix.
    preds = np.asarray(preds, dtype=float)
    labels = np.asarray(labels, dtype=float)
    if len(preds) == 0:
        return 0

    daily_correlations = []
    for pred_row, label_row in zip(preds, labels):
        usable = ~(np.isnan(pred_row) | np.isnan(label_row))
        row_preds = pred_row[usable]
        row_labels = label_row[usable]
        if row_preds.size > 1 and np.std(row_preds) > 0 and np.std(row_labels) > 0:
            daily_correlations.append(spearmanr(row_preds, row_labels).correlation)
        else:
            daily_correlations.append(0)

    daily_correlations = np.array(daily_correlations)
    mean_corr = np.nanmean(daily_correlations)
    std_corr = np.nanstd(daily_correlations)

    if std_corr > 0:
        return mean_corr / std_corr
    else:
        return 0

# --- 2. Feature Selection ---
print("--- Starting Feature Selection ---")
# Candidate features: everything except the date key and ANY label column.
# Filtering on the 'target_' prefix (rather than on CFG.TARGET_COLS only)
# guarantees that a label column missing from the configured list can never be
# used as an input feature, which would leak label information into the model.
all_features = [
    col for col in final_df.columns
    if col != 'date_id' and not str(col).startswith('target_')
]
X = final_df[all_features]
y = final_df[CFG.TARGET_COLS]
# Single proxy label for ranking features: the row-wise mean of all targets.
y_proxy = y.mean(axis=1)

# NOTE: this model is fit on every row, including rows later used as
# validation folds, so the selected feature set has seen the validation data.
temp_model = lgb.LGBMRegressor(**CFG.LGB_PARAMS)
temp_model.fit(X, y_proxy)

# Rank features by the fitted model's importances, highest first.
feature_importances = pd.DataFrame({
    'feature': all_features,
    'importance': temp_model.feature_importances_
}).sort_values('importance', ascending=False)

top_features = feature_importances['feature'].head(CFG.N_TOP_FEATURES).tolist()
print(f"Selected {len(top_features)} features.")
X = X[top_features]

print(f"\n--- Starting Full CV Pipeline for {CFG.N_TARGETS_TO_TRAIN} Targets ---")

# Out-of-fold prediction matrix, NaN until a fold writes into it. Created with
# float dtype so it converts to a numeric array for scoring (a frame built
# without data defaults to object dtype).
oof_df = pd.DataFrame(np.nan, index=X.index, columns=CFG.TARGET_COLS, dtype=float)
y_oof = y.copy()
tscv = TimeSeriesSplit(n_splits=CFG.N_SPLITS)
targets_to_run = CFG.TARGET_COLS[:CFG.N_TARGETS_TO_TRAIN]
# The splits depend only on the number of rows, so compute them once instead
# of once per target.
cv_folds = list(tscv.split(X))

for i, target_name in enumerate(targets_to_run):
    print(f"\n--- Training for {target_name} ({i+1}/{len(targets_to_run)}) ---")
    y_single_target = y[target_name]
    target_pos = oof_df.columns.get_loc(target_name)

    for fold, (train_index, val_index) in enumerate(cv_folds):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y_single_target.iloc[train_index], y_single_target.iloc[val_index]

        model = lgb.LGBMRegressor(**CFG.LGB_PARAMS)
        # Stop when the validation metric has not improved for 100 rounds.
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(100, verbose=False)])

        # val_index holds row POSITIONS, so write with .iloc; this stays
        # correct even if the frame's index labels are not 0..n-1.
        oof_df.iloc[val_index, target_pos] = model.predict(X_val)

        # Free fold-level objects before the next fit.
        del X_train, X_val, y_train, y_val, model
        gc.collect()

# ==================================================================
# PART 4: FINAL EVALUATION
# Now that oof_df is filled, we can calculate the score.
# ==================================================================
print("\n--- Calculating Final CV Score ---")
# Rows that never appeared in a validation fold have no predictions at all;
# remove them in place.
oof_df.dropna(how='all', inplace=True)

# Restrict predictions and labels to the same rows and the same target columns.
targets_trained = [col for col in targets_to_run if col in oof_df.columns]
y_oof = y_oof.loc[oof_df.index, targets_trained]
oof_df = oof_df[targets_trained]

oof_preds_np, y_oof_np = oof_df.to_numpy(), y_oof.to_numpy()

final_sharpe_score = sharpe_ratio_metric(oof_preds_np, y_oof_np)

print("\n==========================================================")
print("Final Cross-Validated Sharpe Ratio: {:.4f}".format(final_sharpe_score))
print("==========================================================")


# ==================================================================
# PART 5: TRAIN FINAL MODELS AND SAVE ARTIFACTS FOR SUBMISSION
# This is the final step, done after evaluation is complete.
# ==================================================================
print("\n--- Training final models on all data and saving artifacts ---")

# --- 5a. Save the list of top features ---
# One feature name per line, in importance order.
FEATURES_PATH = '/kaggle/working/top_200_features.txt'
with open(FEATURES_PATH, 'w') as f:
    f.writelines(f"{feature}\n" for feature in top_features)
print(f"Successfully saved feature list to: {FEATURES_PATH}")

# --- 5b. Train and save one final model per target ---
MODELS_DIR = '/kaggle/working/models/'
os.makedirs(MODELS_DIR, exist_ok=True)

n_final_models = len(CFG.TARGET_COLS)
for i, target_name in enumerate(CFG.TARGET_COLS):
    print(f"  -> Training final model for {target_name} ({i+1}/{n_final_models})")
    y_single_target = y[target_name]
    # Fit on every row of the selected features (no validation set here).
    final_model = lgb.LGBMRegressor(**CFG.LGB_PARAMS)
    final_model.fit(X, y_single_target)
    # Persist the underlying booster in LightGBM's text format.
    model_path = os.path.join(MODELS_DIR, f'model_{target_name}.txt')
    final_model.booster_.save_model(model_path)

print(f"\nSuccessfully trained and saved {n_final_models} models to the '{MODELS_DIR}' directory.")
print("\nYour artifacts are now ready for the submission/inference notebook!")


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import gc

# ==================================================================
# PART 1: LOAD ARTIFACTS AND INITIALIZE GLOBAL STATE
# This part runs only once when your notebook starts.
# ==================================================================
print("--- Loading models and features for submission ---")

MODELS_DIR = '/kaggle/input/trained-feature-model/models'
FEATURES_PATH = '/kaggle/input/trained-feature-model/top_200_features.txt'

# Feature names used at training time, one per line.
with open(FEATURES_PATH, 'r') as f:
    top_features = list(map(str.strip, f))

# Load every saved booster that exists on disk, keyed by target name.
# Targets without a model file are simply absent from the dict.
loaded_models = {}
for target_name in (f'target_{i}' for i in range(424)):
    model_path = os.path.join(MODELS_DIR, f'model_{target_name}.txt')
    if not os.path.exists(model_path):
        continue
    loaded_models[target_name] = lgb.Booster(model_file=model_path)
print(f"Loaded {len(loaded_models)} models.")

# Seed the history buffer with the last 60 training rows so lag and rolling
# features can be computed for the first test day.
# This relies on `final_df` already existing in the kernel from the training cells.
history_df = final_df.tail(60).copy()
# Output column order for every prediction frame.
ycols = [f'target_{i}' for i in range(424)]


# ==================================================================
# PART 2: THE PREDICT FUNCTION
# The Kaggle environment will call this function repeatedly.
# ==================================================================

def predict(
    test: pd.DataFrame,
    lag1: pd.DataFrame, 
    lag2: pd.DataFrame,
    lag3: pd.DataFrame,
    lag4: pd.DataFrame,
) -> pd.DataFrame:
    """Predict every target for the newest test row.

    The new row is appended to the global `history_df`, its lag and rolling
    features are recomputed from that buffer, and each loaded model scores the
    resulting feature vector.

    Args:
        test: Feature rows for the current step. Objects exposing
            `to_pandas()` are converted first. Only the last row is scored.
        lag1, lag2, lag3, lag4: Accepted to satisfy the callback signature;
            not used by this implementation.

    Returns:
        A one-row DataFrame with one column per entry of `ycols`. Targets
        without a loaded model get 0.0. An empty `test` yields a row of zeros.

    Side effects:
        Rebinds the global `history_df` to the extended (and filled) buffer.
    """
    global history_df, loaded_models, top_features, ycols

    def _parse_feature(name):
        """Classify a feature name as ('lag'|'mean'|'std'|'raw', base_column, parameter)."""
        if '_lag' in name:
            base, lag_str = name.rsplit('_lag', 1)
            # Only treat it as a lag feature when the suffix really is a number;
            # otherwise a raw column that merely contains '_lag' would crash int().
            if base and lag_str.isdigit():
                return 'lag', base, int(lag_str)
        if '_roll_' in name:
            base, _, spec = name.partition('_roll_')
            agg_type, _, window_str = spec.partition('_')
            if base and agg_type in ('mean', 'std') and window_str.isdigit():
                return agg_type, base, int(window_str)
        return 'raw', name, None

    # --- 1. Data Conversion and Handling Empty Input ---
    if hasattr(test, 'to_pandas'):
        test = test.to_pandas()

    if len(test) == 0:
        return pd.DataFrame(0, index=range(1), columns=ycols)

    # --- 2. Feature Engineering with History Buffer ---
    history_df = pd.concat([history_df, test], ignore_index=True)
    latest_row_idx = len(history_df) - 1

    parsed_features = [_parse_feature(col) for col in top_features]

    # Match the training preprocessing (forward-fill, then 0) on every base
    # column the model depends on BEFORE deriving lag/rolling values, so a
    # missing raw value at inference is handled the same way as in training.
    base_cols = [
        col for col in dict.fromkeys(base for _, base, _ in parsed_features)
        if col in history_df.columns
    ]
    if base_cols:
        history_df[base_cols] = history_df[base_cols].ffill().fillna(0)

    # Derive the engineered features for the newest row only.
    for col, (kind, base, param) in zip(top_features, parsed_features):
        if kind == 'raw' or base not in history_df.columns:
            continue
        if kind == 'lag':
            source_idx = latest_row_idx - param
            if source_idx < 0:
                # Not enough history for this lag; leave it missing (becomes 0 below).
                continue
            value = history_df[base].iloc[source_idx]
        else:
            # The window includes the current row, like rolling() in training.
            window_data = history_df[base].tail(param)
            value = window_data.mean() if kind == 'mean' else window_data.std()
        history_df.loc[latest_row_idx, col] = value

    # Feature vector for the current day, in training column order. reindex
    # tolerates features that are absent from the buffer (they become 0).
    feature_row = history_df.reindex(columns=top_features).iloc[-1]
    feature_vector = (
        pd.to_numeric(feature_row, errors='coerce')
        .fillna(0)
        .to_numpy(dtype=float)
        .reshape(1, -1)
    )

    # --- 3. Prediction with LightGBM Models ---
    preds = {}
    for target_name in ycols:
        model = loaded_models.get(target_name)
        # Explicit None check: do not rely on the truthiness of a model object.
        if model is not None:
            preds[target_name] = model.predict(feature_vector)[0]
        else:
            preds[target_name] = 0.0  # Default if model is missing

    return pd.DataFrame([preds], columns=ycols)

# ==================================================================
# PART 3: API INITIALIZATION AND SERVER RUN
# This part connects our function to the Kaggle environment.
# ==================================================================
# Add the directory containing the API script to the Python path
import sys
sys.path.append('/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/')

import kaggle_evaluation.mitsui_inference_server
# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

# Without the KAGGLE_IS_COMPETITION_RERUN environment variable, drive the
# server through the local gateway against the competition input directory;
# when it is set, start serving instead.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/mitsui-commodity-prediction-challenge/',))
else:
    inference_server.serve()
# Display the local test submission file
# display(pd.read_parquet('/kaggle/working/submission.parquet'))

