In [29]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/backpack11/sample_submission.csv
/kaggle/input/backpack11/train.csv
/kaggle/input/backpack11/test.csv
/kaggle/input/backpack11/training_extra.csv


In [30]:
# Importing essential libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization
import warnings  # For controlling warning messages

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna

In [31]:
train=pd.read_csv('/kaggle/input/backpack11/train.csv')
train_extra=pd.read_csv('/kaggle/input/backpack11/training_extra.csv')
test=pd.read_csv('/kaggle/input/backpack11/test.csv')

In [32]:
train = pd.concat([train, train_extra], ignore_index=True)

In [33]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Brand                 object 
 2   Material              object 
 3   Size                  object 
 4   Compartments          float64
 5   Laptop Compartment    object 
 6   Waterproof            object 
 7   Style                 object 
 8   Color                 object 
 9   Weight Capacity (kg)  float64
 10  Price                 float64
dtypes: float64(3), int64(1), object(7)
memory usage: 335.2+ MB


In [34]:
cat_cols=train.select_dtypes(include='object').columns.tolist()
cat_cols

['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [35]:
num_cols=train.select_dtypes(include='float64').columns.tolist()
num_cols

['Compartments', 'Weight Capacity (kg)', 'Price']

In [36]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(train.drop(columns='Price'),train['Price'],test_size=0.2)


In [37]:
# Verify column names
print("Columns in X_train after split:", X_train.columns.tolist())


Columns in X_train after split: ['id', 'Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)']


In [38]:
# Define numerical and categorical columns correctly
num_cols = ['Compartments', 'Weight Capacity (kg)']
cat_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']


In [39]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from lightgbm import LGBMRegressor

In [40]:
# Create pipeline
weight_capacity_pipe = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('weight_capacity_pipe', weight_capacity_pipe, num_cols),
        ('cat_pipeline', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

In [41]:
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import joblib  # For saving the model and pipeline

# Define the best hyperparameters obtained from Optuna
best_params = {
    'iterations': 500,
    'depth': 5,
    'learning_rate': 0.2001545136216897,
    'l2_leaf_reg': 0.009566731431493482,
    'random_seed': 42,
    'loss_function': 'RMSE',
    'verbose': 100  # Show progress every 100 iterations
}

# Ensure preprocessor is defined
assert 'preprocessor' in globals(), "Error: Define 'preprocessor' before using it in the pipeline."

# Number of folds for cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# DataFrame to store out-of-fold predictions
oof_df = pd.DataFrame(columns=['ID', 'Actual', 'OOF_Pred_CatBoost', 'Fold'])

# Convert X_train and y_train to DataFrames if they aren't already
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)

# Initialize array to store out-of-fold predictions
oof_preds = np.zeros(len(X_train))

# K-Fold Cross-Validation Loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train), start=1):
    print(f"\n🔄 Training Fold {fold}/{n_folds}...")

    # Split data into training and validation sets
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Create pipeline with preprocessor and CatBoost model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', CatBoostRegressor(**best_params))
    ])

    # Train the model
    pipeline.fit(X_tr, y_tr)

    # Predict on validation set (OOF predictions)
    y_val_pred = pipeline.predict(X_val)
    oof_preds[val_idx] = y_val_pred  # Store OOF predictions

    # Compute fold RMSE
    fold_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"✅ Fold {fold} RMSE: {fold_rmse:.4f}")

    # Store fold results in DataFrame
    fold_df = pd.DataFrame({
        'ID': X_train.index[val_idx],  # Assuming index represents unique IDs
        'Actual': y_val.values,
        'OOF_Pred_CatBoost': y_val_pred,
        'Fold': fold
    })

    oof_df = pd.concat([oof_df, fold_df], ignore_index=True)

# Compute overall OOF RMSE
oof_rmse = mean_squared_error(y_train, oof_preds, squared=False)
print(f"\n🏆 Overall OOF RMSE: {oof_rmse:.4f}")

# Save OOF predictions to CSV
oof_df.to_csv('oof_predictions_catboost.csv', index=False)
print("📂 OOF predictions saved to 'oof_predictions_catboost.csv'.")

# Display first few rows of OOF predictions
oof_df.reset_index(drop=True, inplace=True)
print(oof_df.head())

# Save the entire pipeline (including preprocessing and model) as a .pkl file
joblib.dump(pipeline, 'catboost_pipeline.pkl')
print("Pipeline has been saved as 'catboost_pipeline.pkl'.")

# Save the entire pipeline (preprocessing + model) as a .pkl file to preserve the model in the pipeline
# We have already saved the full pipeline using joblib, so no need to save the model separately



🔄 Training Fold 1/5...
0:	learn: 38.9311396	total: 127ms	remaining: 1m 3s
100:	learn: 38.8692873	total: 13.7s	remaining: 54.1s
200:	learn: 38.8538851	total: 26.6s	remaining: 39.5s
300:	learn: 38.8420177	total: 39.7s	remaining: 26.2s
400:	learn: 38.8318189	total: 53.4s	remaining: 13.2s
499:	learn: 38.8221645	total: 1m 6s	remaining: 0us
✅ Fold 1 RMSE: 38.8703

🔄 Training Fold 2/5...


  oof_df = pd.concat([oof_df, fold_df], ignore_index=True)


0:	learn: 38.9272638	total: 127ms	remaining: 1m 3s
100:	learn: 38.8663537	total: 13.1s	remaining: 51.8s
200:	learn: 38.8510873	total: 26.1s	remaining: 38.8s
300:	learn: 38.8394677	total: 39.8s	remaining: 26.3s
400:	learn: 38.8291111	total: 52.8s	remaining: 13s
499:	learn: 38.8195257	total: 1m 6s	remaining: 0us
✅ Fold 2 RMSE: 38.8810

🔄 Training Fold 3/5...
0:	learn: 38.9371406	total: 129ms	remaining: 1m 4s
100:	learn: 38.8770961	total: 13.2s	remaining: 52s
200:	learn: 38.8615932	total: 26.5s	remaining: 39.4s
300:	learn: 38.8501206	total: 39.5s	remaining: 26.1s
400:	learn: 38.8398572	total: 53.1s	remaining: 13.1s
499:	learn: 38.8305653	total: 1m 5s	remaining: 0us
✅ Fold 3 RMSE: 38.8435

🔄 Training Fold 4/5...
0:	learn: 38.9227270	total: 126ms	remaining: 1m 3s
100:	learn: 38.8618054	total: 13.6s	remaining: 53.7s
200:	learn: 38.8464491	total: 26.5s	remaining: 39.4s
300:	learn: 38.8347913	total: 39.7s	remaining: 26.3s
400:	learn: 38.8241737	total: 53.4s	remaining: 13.2s
499:	learn: 38.8144

In [42]:
# Make predictions on the test set
predictions = pipeline.predict(test)

# Create the submission DataFrame
submission = pd.DataFrame({
    'id': test['id'],         # Ensure 'id' exists in the test set
    'Price': predictions      # Use predictions on the test set
})

# Save the DataFrame to a CSV file
submission.to_csv('submission111.csv', index=False)

# Save the OOF predictions DataFrame to a CSV file
oof_df.to_csv('oof_predictions.csv', index=False)
print("OOF predictions saved to 'oof_predictions.csv'")
oof_predictions.csv.head()

OOF predictions saved to 'oof_predictions.csv'


NameError: name 'oof_predictions' is not defined