In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split


# Load the training table. Every column whose name contains "Blend" is a
# regression target; all remaining columns are the model inputs.
train_df = pd.read_csv('/kaggle/input/training/train.csv')
# test_df = pd.read_csv('/kaggle/input/testing/test.csv')
Y = train_df.filter(like="Blend")
X = train_df.drop(columns=Y.columns)

In [34]:
def feature_expansion(X, n_components=5):
    """Append fraction-weighted property columns to a feature table.

    For every component ``i`` in ``1..n_components`` the column
    ``Component{i}_fraction`` is multiplied row-wise into each column whose
    name contains ``Component{i}_Property``. Each product is stored as
    ``weighted_C{i}_P{suffix}``, where ``suffix`` is the text that follows
    ``Component{i}_Property`` in the source column name.

    Parameters
    ----------
    X : pandas.DataFrame
        Table holding the ``Component{i}_fraction`` and
        ``Component{i}_Property*`` columns.
    n_components : int, default 5
        Number of components to expand (components are numbered from 1).

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``X`` followed by the weighted
        columns, grouped by component. ``X`` itself is never assigned to.

    Raises
    ------
    KeyError
        If a ``Component{i}_fraction`` column is missing from ``X``.
    """
    weighted_blocks = []

    for i in range(1, n_components + 1):
        fraction = X[f"Component{i}_fraction"]
        property_prefix = f"Component{i}_Property"
        properties = X.filter(like=property_prefix)

        # Scale every property column of this component by its fraction,
        # one row at a time.
        weighted = properties.mul(fraction, axis=0)

        # Label each product from the real source column rather than from its
        # position, so the name stays correct whatever the column order or
        # the number of properties per component.
        weighted.columns = [
            f"weighted_C{i}_P{str(col).split(property_prefix, 1)[1]}"
            for col in properties.columns
        ]

        weighted_blocks.append(weighted)

    # pd.concat builds a fresh frame, so no defensive copy of X is needed.
    return pd.concat([X] + weighted_blocks, axis=1)

In [39]:
# Baseline: fit one random forest on the engineered features and score it on
# a 20% hold-out split.
X_final = feature_expansion(X)
Y_target = Y

X_train, X_val, y_train, y_val = train_test_split(
    X_final, Y_target, test_size=0.2, random_state=42
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_val: {X_val.shape}")

# Seed the forest so the reported MAPE is reproducible across kernel restarts.
model = RandomForestRegressor(
    n_estimators=200, max_depth=15, random_state=42, n_jobs=-1
)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# scikit-learn returns MAPE as a ratio (1.0 means 100%), not a percentage.
mape = mean_absolute_percentage_error(y_val, y_pred)
print(f"\nOur HONEST MAPE on the validation set is: {mape}")

# NOTE(review): 2.72 is presumably the competition's public-leaderboard
# reference cost, on the same scale as `mape` -- confirm against the rules.
reference_cost = 2.72
leaderboard_score = 100 - (90 * mape) / reference_cost
# The estimated score is floored at 10.
final_score = max(10, leaderboard_score)
print(f"Our estimated Public Leaderboard Score is: {final_score}")

Shape of X_train: (1600, 105)
Shape of X_val: (400, 105)

Our HONEST MAPE on the validation set is: 4.571365959732033
Our estimated Public Leaderboard Score is: 10


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

SHIFT_VALUE = 10  # Constant added to every target to move values away from zero

# --- 1. SHIFT THE TARGETS ---
# NOTE(review): tree splits and leaf means follow a constant offset in the
# target, so shifting here and un-shifting before scoring is expected to leave
# the original-scale MAPE unchanged -- confirm the shift is still wanted.
Y_shifted = Y + SHIFT_VALUE

# Engineered feature matrix (raw columns plus fraction-weighted properties).
X_final = feature_expansion(X)

# --- 2. SPLIT THE DATA ---
# Use the engineered features and the SHIFTED targets
X_train, X_val, y_train_shifted, y_val_shifted = train_test_split(
    X_final, Y_shifted, test_size=0.2, random_state=42
)

# Original-scale ground truth for scoring. It is the same for every depth,
# so compute it once instead of on each loop iteration.
y_val_original = y_val_shifted - SHIFT_VALUE

# --- 3. TRAIN THE MODEL ---
# Find the best depth by looping and checking the MAPE
depths_to_test = [8, 10, 12, 15, 20, 25]
best_score = float('inf')
best_depth = None
best_model = None  # fitted forest for best_depth

for depth in depths_to_test:
    model = RandomForestRegressor(n_estimators=200, max_depth=depth, random_state=42, n_jobs=-1)

    # Train on the shifted data
    model.fit(X_train, y_train_shifted)

    # Predict on the validation set. Predictions will be shifted.
    val_predictions_shifted = model.predict(X_val)

    # --- 4. SHIFT PREDICTIONS BACK! ---
    # Undo the shift so the score is computed on the original target scale
    val_predictions_original = val_predictions_shifted - SHIFT_VALUE

    # --- 5. SCORE ---
    # scikit-learn returns MAPE as a ratio (1.0 means 100%), not a percentage.
    mape_cost = mean_absolute_percentage_error(y_val_original, val_predictions_original)

    print(f"Max Depth: {depth:2d} | Validation MAPE: {mape_cost:.4f}")

    # Strict "<" keeps the shallowest depth when two depths tie.
    if mape_cost < best_score:
        best_score = mape_cost
        best_depth = depth
        # `model` is rebound on every iteration, so keep the winner separately.
        best_model = model

print(f"\nFound best depth: {best_depth} with MAPE: {best_score:.4f}")

# Now calculate the final estimated score using the best_score
# NOTE(review): 2.72 is presumably the competition's public-leaderboard
# reference cost, on the same scale as the MAPE -- confirm against the rules.
reference_cost = 2.72
leaderboard_score = 100 - (90 * best_score) / reference_cost
# The estimated score is floored at 10.
final_score = max(10, leaderboard_score)
print(f"BEST Estimated Public Leaderboard Score is: {final_score:.2f}")

Max Depth:  8 | Validation MAPE: 5.0259
Max Depth: 10 | Validation MAPE: 4.8616
Max Depth: 12 | Validation MAPE: 5.3986
Max Depth: 15 | Validation MAPE: 5.0185
Max Depth: 20 | Validation MAPE: 4.8514
Max Depth: 25 | Validation MAPE: 4.8514

Found best depth: 20 with MAPE: 4.8514
BEST Estimated Public Leaderboard Score is: 10.00
