In [None]:
# Fetch the tabpfn-extensions sources and install them in editable mode so the
# `tabpfn_extensions` package imported in the next cell is available.
! git clone https://github.com/priorlabs/tabpfn-extensions.git
! pip install -e tabpfn-extensions

In [1]:
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor
import warnings

# Suppress potential warnings for a cleaner output
# NOTE: with no category given this silences every warning for the session.
warnings.filterwarnings("ignore")

# --- 1. Load Data ---
# The three CSVs are read from Kaggle dataset mounts under /kaggle/input.
try:
    train_df = pd.read_csv("/kaggle/input/training/train.csv")
    test_df = pd.read_csv("/kaggle/input/testing/test.csv")
    sample_submission_df = pd.read_csv("/kaggle/input/samplesubmission/sample_solution.csv")
except FileNotFoundError:
    print("Error: Could not find data files at the specified Kaggle paths.")
    # Re-raise rather than calling exit(): the cell fails with a traceback that
    # names the missing path, and the kernel session is left alive.
    raise


# --- THIS IS THE CORRECTED FEATURE ENGINEERING FUNCTION ---
def feature_engineer_v2(df):
    """
    Append engineered blend features to ``df`` in place and return it.

    Two groups of columns are added, in this order:

    * ``Weighted_Property{k}`` for k = 1..10: the sum over the five components
      of ``Component{c}_fraction * Component{c}_Property{k}``.
    * ``{frac_a}_x_{frac_b}``: the product of each unordered pair of the five
      ``Component{c}_fraction`` columns (10 pairs).

    The frame that is passed in is modified directly, so callers that want to
    keep their original frame should pass a copy.
    """
    fraction_cols = [f'Component{c}_fraction' for c in range(1, 6)]

    # Fraction-weighted sum of every property across the components.
    for k in range(1, 11):
        prop = f'Property{k}'
        target = f'Weighted_{prop}'
        df[target] = 0  # running total, accumulated component by component
        for frac in fraction_cols:
            # 'Component1_fraction' -> 'Component1'
            component = frac.split('_')[0]
            df[target] = df[target] + df[frac] * df[f'{component}_{prop}']

    # Pairwise products of the component fractions (each pair once).
    for pos, left in enumerate(fraction_cols):
        for right in fraction_cols[pos + 1:]:
            df[f'{left}_x_{right}'] = df[left] * df[right]

    return df

# Engineer features on copies so the raw train/test frames are left untouched
# (feature_engineer_v2 writes its new columns into the frame it receives).
X_train_featured = feature_engineer_v2(train_df.copy())
X_test_featured = feature_engineer_v2(test_df.copy())


# --- 2. Prepare Data ---
test_ids = X_test_featured['ID']
# Targets: every column whose name contains "Blend"
y_train = X_train_featured.filter(like="Blend")

# Training features = engineered frame minus the targets and, if present, 'ID'
non_feature_cols = list(y_train.columns)
if 'ID' in X_train_featured.columns:
    non_feature_cols.append('ID')
X_train = X_train_featured.drop(columns=non_feature_cols)
X_test = X_test_featured.drop(columns=['ID'])

# Give the test frame exactly the training columns, in the same order
X_test = X_test.loc[:, X_train.columns]

print(f"Number of features after engineering: {X_train.shape[1]}")

# --- 3. Define and Train TabPFN Model ---
print("\nDefining and training TabPFN model on FEATURED data...")
# A single AutoTabPFN regressor serves as the template estimator; the
# multi-output wrapper handles the multi-column target frame.
base_worker_model = AutoTabPFNRegressor(device='cuda')
multi_output_manager = MultiOutputRegressor(estimator=base_worker_model)
multi_output_manager.fit(X=X_train, y=y_train)
print("Training complete.")


# --- 4. Make Predictions & Create Submission File ---
print("Making predictions...")
predictions = multi_output_manager.predict(X_test)

# Target column names are taken from the sample submission (all but its first column).
target_columns = sample_submission_df.columns[1:]
submission_df = pd.DataFrame(predictions, columns=target_columns)
submission_df.insert(loc=0, column='ID', value=test_ids)

output_path = "submission_tabpfn_featured.csv"
submission_df.to_csv(output_path, index=False)

print("\nSubmission file 'submission_tabpfn_featured.csv' has been created successfully.")
print(submission_df.head())

Number of features after engineering: 75

Defining and training TabPFN model on FEATURED data...


tabpfn-v2-regressor.ckpt:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

tabpfn-v2-regressor.ckpt:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

tabpfn-v2-regressor-2noar4o2.ckpt:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

Training complete.
Making predictions...

Submission file 'submission_tabpfn_featured.csv' has been created successfully.
   ID  BlendProperty1  BlendProperty2  BlendProperty3  BlendProperty4  \
0   1        0.169386        0.222103        0.743162        0.631759   
1   2       -0.778638       -0.631997       -1.172039        0.037656   
2   3        1.757005        1.122206        1.063820        1.079432   
3   4       -0.474016        0.355459        0.969324       -0.739682   
4   5        0.175554       -1.198867        1.089620        0.481416   

   BlendProperty5  BlendProperty6  BlendProperty7  BlendProperty8  \
0        0.353870        0.703197        0.716492        0.390561   
1       -0.724650       -0.105775       -1.163194       -1.097612   
2        2.562512        1.868811        1.038993        1.986182   
3        1.898897       -0.451884        0.921535        1.727649   
4        2.369047        0.247823        1.045596       -0.146356   

   BlendProperty9  Blend

In [3]:
import pandas as pd

# --- Configuration ---
# List the filenames of the submission files you want to average.
# The first entry is the mandatory base file; any later entry that is missing
# is skipped with a message and left out of the average.
# MAKE SURE YOU HAVE RUN THE OTHER SCRIPTS TO GENERATE THESE FILES FIRST!
files_to_ensemble = [
    '/kaggle/input/finale/submission_tabpfn_featured.csv',          # From the original simple script
    '/kaggle/input/finale/submission_v3_kfold_lgbm.csv',   # From the K-Fold LightGBM script
    '/kaggle/input/finale/submission_v4_kfold_catboost.csv'# From the K-Fold CatBoost script
]

# --- Load and Combine ---
print("Loading submission files for ensembling...")
# Load the first file; without it there is nothing to accumulate into.
try:
    final_submission = pd.read_csv(files_to_ensemble[0])
except FileNotFoundError:
    print(f"ERROR: Cannot find the base file '{files_to_ensemble[0]}'. Please generate it first.")
    # Fail the cell with a traceback instead of calling exit(), which would
    # take the notebook kernel down with it.
    raise

# Files that actually contributed to the running sum.
loaded_files = [files_to_ensemble[0]]

# Iterate over a slice of the list so that skipping a file never shifts the
# positions of the files still to be visited.
for file in files_to_ensemble[1:]:
    try:
        submission_to_add = pd.read_csv(file)
    except FileNotFoundError:
        print(f"ERROR: Cannot find '{file}'. Skipping it. Please generate it for a full ensemble.")
        continue
    # Add the prediction columns (everything after the first column) together
    final_submission.iloc[:, 1:] += submission_to_add.iloc[:, 1:]
    loaded_files.append(file)

# Keep only the files that were loaded so the divisor below is correct.
files_to_ensemble = loaded_files


# --- Calculate the Average ---
# Divide the summed values by the number of files successfully loaded
num_files = len(files_to_ensemble)
print(f"Averaging the predictions from {num_files} models...")
final_submission.iloc[:, 1:] = final_submission.iloc[:, 1:] / num_files

# --- Save the Final Ensemble ---
final_submission.to_csv('submission_final_ensemble.csv', index=False)

print("\nFinal ensembled submission 'submission_final_ensemble.csv' has been created successfully!")
print("This is the file you should submit to the competition.")
print(final_submission.head())

Loading submission files for ensembling...
Averaging the predictions from 3 models...

Final ensembled submission 'submission_final_ensemble.csv' has been created successfully!
This is the file you should submit to the competition.
   ID  BlendProperty1  BlendProperty2  BlendProperty3  BlendProperty4  \
0   1        0.120042        0.208411        0.642177        0.698759   
1   2       -0.733799       -0.549514       -1.105589        0.071160   
2   3        1.714802        1.108693        1.086873        1.100502   
3   4       -0.511516        0.411247        0.709046       -0.692952   
4   5        0.222256       -1.202253        1.104600        0.459137   

   BlendProperty5  BlendProperty6  BlendProperty7  BlendProperty8  \
0        0.342093        0.775974        0.619755        0.440912   
1       -0.722874       -0.084106       -1.104900       -1.080199   
2        2.343506        1.791394        1.064277        1.825335   
3        1.844998       -0.430841        0.691966    