In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/bagpack/sample_submission.csv
/kaggle/input/bagpack/train.csv
/kaggle/input/bagpack/test.csv
/kaggle/input/bagpack/training_extra.csv


In [2]:
# Importing essential libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization
import warnings  # For controlling warning messages

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna


# Suppressing warnings to avoid clutter in output
# NOTE(review): 'ignore' with no category silences every warning for the rest of the
# session, including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')


In [3]:
# Load the competition training set, the extra training rows and the test set.
train, train_extra, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/bagpack/train.csv',
        '/kaggle/input/bagpack/training_extra.csv',
        '/kaggle/input/bagpack/test.csv',
    )
)

In [4]:
# (rows, columns) of the three frames as loaded, before any concatenation.
train.shape, test.shape, train_extra.shape

((300000, 11), (200000, 10), (3694318, 11))

In [5]:
# Stack the extra training rows under the original ones and rebuild a fresh 0..n-1 index.
# NOTE(review): `train` is rebound from itself, so running this cell a second time
# appends `train_extra` again; re-run the loading cell first when iterating.
train = pd.concat([train, train_extra], ignore_index=True)

In [6]:
# Row/column count of the combined training frame.
train.shape

(3994318, 11)

In [7]:
# Peek at five randomly drawn rows.
# NOTE(review): no random_state is passed, so a different sample is shown on every run.
train.sample(5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
2155743,2355743,Jansport,Leather,Medium,6.0,No,No,Backpack,Black,22.144059,30.63479
332675,532675,Nike,Leather,Small,9.0,No,Yes,Backpack,Blue,26.793464,38.08527
2159732,2359732,Puma,Canvas,Large,2.0,No,No,Messenger,Pink,17.955198,47.82741
3755091,3955091,Adidas,,Large,8.0,Yes,No,Backpack,Pink,22.381331,29.34273
1557259,1757259,Nike,Canvas,Medium,4.0,No,Yes,Messenger,Black,24.276908,109.60189


In [8]:
# Column dtypes and memory footprint of the combined training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Brand                 object 
 2   Material              object 
 3   Size                  object 
 4   Compartments          float64
 5   Laptop Compartment    object 
 6   Waterproof            object 
 7   Style                 object 
 8   Color                 object 
 9   Weight Capacity (kg)  float64
 10  Price                 float64
dtypes: float64(3), int64(1), object(7)
memory usage: 335.2+ MB


In [9]:
# Feature: how many fields are missing in each row (computed the same way for train and test).
train['nan_count'] = train.isnull().sum(axis='columns')
test['nan_count'] = test.isnull().sum(axis='columns')

In [10]:
# First ten rows, now including the nan_count column.
train.head(10)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,nan_count
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875,0
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056,0
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732,0
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793,0
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312,0
5,5,Nike,Canvas,Medium,10.0,No,Yes,,Black,7.241812,20.01553,1
6,6,Nike,,Large,3.0,No,No,Backpack,Green,6.828123,84.805,1
7,7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815,0
8,8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652,0
9,9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741,0


In [11]:
# Names of the object-dtype (categorical) columns, as a plain Python list.
object_frame = train.select_dtypes(include='object')
cat_cols = list(object_frame.columns)
cat_cols

['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the combined training data as X_test / y_test.
# The seed is pinned so that the split, and every score computed from it,
# is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='Price'),
    train['Price'],
    test_size=0.2,
    random_state=42,
)

Creating the data preprocessing pipeline
This code builds a data preprocessing pipeline that prepares numerical and categorical features for a machine learning model. It does the following:

Standardizes numerical data (specifically Weight Capacity (kg)) using StandardScaler(), ensuring consistent scaling.
Encodes categorical features (cat_cols) using OneHotEncoder(), converting them into a machine-readable format.
Passes every remaining column (here id, Compartments and nan_count) through unchanged via remainder='passthrough', so they reach the model as-is.

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from lightgbm import LGBMRegressor

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

# Numeric branch: standardise the weight-capacity column.
weight_capacity_pipe = Pipeline(steps=[('scaler', StandardScaler())])

# Column-wise preprocessing:
#   * 'Weight Capacity (kg)' -> StandardScaler
#   * every column in cat_cols -> one-hot encoding
#   * all other columns are passed through untouched
preprocessor = ColumnTransformer(
    transformers=[
        ('weight_capacity_pipe', weight_capacity_pipe, ['Weight Capacity (kg)']),  # Scaling numeric column
        ('cat_pipeline', Pipeline(steps=[  # One-Hot Encoding categorical features
            # handle_unknown='ignore' encodes a category that was not seen during fit
            # as all zeros instead of raising at predict time.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols)
    ],
    remainder='passthrough'
)

# Save the preprocessor pipeline to a .pkl file.
# Note: the object has not been fitted at this point, so the file only stores
# the configuration, not learned scaling statistics or category lists.
joblib.dump(preprocessor, 'preprocessor_pipeline.pkl')
print("📂 Preprocessor pipeline saved to 'preprocessor_pipeline.pkl'.")


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
import joblib  # For saving the model

# LightGBM hyperparameters applied identically in every fold.
best_params = {
    'n_estimators': 1000,
    'learning_rate': 0.030198582603149962,
    'max_depth': 6,
    'num_leaves': 13,
    'min_child_samples': 4,
    'subsample': 0.8666752007530039,
    'colsample_bytree': 0.8227478250119855,
    'reg_alpha': 0.004905763778413513,
    'reg_lambda': 0.0011562120556175835
}

# Numeric branch: standardise the weight-capacity column.
weight_capacity_pipe = Pipeline(steps=[('scaler', StandardScaler())])

# Column-wise preprocessing: scale the weight capacity, one-hot encode the
# categorical columns, pass every other column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('weight_capacity_pipe', weight_capacity_pipe, ['Weight Capacity (kg)']),  # Scaling numeric column
        ('cat_pipeline', Pipeline(steps=[  # One-Hot Encoding categorical features
            # handle_unknown='ignore' keeps prediction from failing on a category
            # that was absent from the rows the encoder was fitted on.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols)
    ],
    remainder='passthrough'
)

# Initialize KFold
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Convert X_train and y_train to pandas objects if they aren't already
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)

# Out-of-fold predictions, aligned positionally with X_train
oof_preds = np.zeros(len(X_train))

# Per-fold result frames; concatenated once after the loop instead of growing
# a DataFrame inside it (avoids repeated copying and object-dtype columns).
fold_frames = []

# K-Fold Cross-Validation Loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train), start=1):
    print(f"\n🔄 Training Fold {fold}/{n_folds}...")

    # Split data into training and validation sets
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Preprocessor + LightGBM; the shared preprocessor is re-fitted on each fold's training rows
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LGBMRegressor(**best_params))
    ])

    # Train the model
    pipeline.fit(X_tr, y_tr)

    # Predict on validation set (OOF predictions)
    y_val_pred = pipeline.predict(X_val)
    oof_preds[val_idx] = y_val_pred  # Store OOF predictions

    # Fold RMSE. np.sqrt(MSE) is used instead of squared=False, which is
    # deprecated and absent from newer scikit-learn releases.
    fold_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"✅ Fold {fold} RMSE: {fold_rmse:.4f}")

    # Record this fold's rows; 'ID' is the row label of X_train, not the 'id' column
    fold_frames.append(pd.DataFrame({
        'ID': X_train.index[val_idx],
        'Actual': y_val.values,
        'OOF_Pred_LGB': y_val_pred,
        'Fold': fold
    }))

# DataFrame of all out-of-fold predictions
oof_df = pd.concat(fold_frames, ignore_index=True)

# Compute overall OOF RMSE
oof_rmse = np.sqrt(mean_squared_error(y_train, oof_preds))
print(f"\n🏆 Overall OOF RMSE: {oof_rmse:.4f}")

# Save OOF predictions to CSV
oof_df.to_csv('oof_predictions_lgbm.csv', index=False)
print("📂 OOF predictions saved to 'oof_predictions_lgbm.csv'.")

# Save the pipeline (model and preprocessor).
# Note: `pipeline` is the one fitted in the last fold only, i.e. trained on
# (n_folds - 1) / n_folds of X_train.
model_filename = 'trained_model_lgbm.pkl'
joblib.dump(pipeline, model_filename)
print(f"📂 Model saved as {model_filename}.")



🔄 Training Fold 1/5...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 590
[LightGBM] [Info] Number of data points in the train set: 2556363, number of used features: 36
[LightGBM] [Info] Start training from score 81.347753
✅ Fold 1 RMSE: 38.8690

🔄 Training Fold 2/5...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076615 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 591
[LightGBM] [Info] Number of data points in the train set: 2556363, number of used features: 36
[LightGBM] [Info] Start training from score 81.346180
✅ Fold 2 RMSE: 38.9015

🔄 Training Fold 3/5...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the over

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
import joblib  # For saving the model

# LightGBM hyperparameters applied identically in every fold.
best_params = {
    'n_estimators': 1000,
    'learning_rate': 0.030198582603149962,
    'max_depth': 6,
    'num_leaves': 13,
    'min_child_samples': 4,
    'subsample': 0.8666752007530039,
    'colsample_bytree': 0.8227478250119855,
    'reg_alpha': 0.004905763778413513,
    'reg_lambda': 0.0011562120556175835
}

# This cell reuses the notebook-level `preprocessor`; fail early with a clear message if it is missing.
assert 'preprocessor' in globals(), "Define 'preprocessor' before using it in the pipeline."

# Initialize KFold
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Convert X_train and y_train to pandas objects if they aren't already
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)

# Out-of-fold predictions, aligned positionally with X_train
oof_preds = np.zeros(len(X_train))

# Per-fold result frames; concatenated once after the loop instead of growing
# a DataFrame inside it (avoids repeated copying and object-dtype columns).
fold_frames = []

# K-Fold Cross-Validation Loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train), start=1):
    print(f"\n🔄 Training Fold {fold}/{n_folds}...")

    # Split data into training and validation sets
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Preprocessor + LightGBM; the shared preprocessor is re-fitted on each fold's training rows
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LGBMRegressor(**best_params))
    ])

    # Train the model
    pipeline.fit(X_tr, y_tr)

    # Predict on validation set (OOF predictions)
    y_val_pred = pipeline.predict(X_val)
    oof_preds[val_idx] = y_val_pred  # Store OOF predictions

    # Fold RMSE. np.sqrt(MSE) is used instead of squared=False, which is
    # deprecated and absent from newer scikit-learn releases.
    fold_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"✅ Fold {fold} RMSE: {fold_rmse:.4f}")

    # Record this fold's rows; 'ID' is the row label of X_train, not the 'id' column
    fold_frames.append(pd.DataFrame({
        'ID': X_train.index[val_idx],
        'Actual': y_val.values,
        'OOF_Pred_LGB': y_val_pred,
        'Fold': fold
    }))

# DataFrame of all out-of-fold predictions
oof_df = pd.concat(fold_frames, ignore_index=True)

# Compute overall OOF RMSE
oof_rmse = np.sqrt(mean_squared_error(y_train, oof_preds))
print(f"\n🏆 Overall OOF RMSE: {oof_rmse:.4f}")

# Save OOF predictions to CSV
oof_df.to_csv('oof_predictions_lgbm.csv', index=False)
print("📂 OOF predictions saved to 'oof_predictions_lgbm.csv'.")

# Save the pipeline (model and preprocessor).
# Note: `pipeline` is the one fitted in the last fold only, i.e. trained on
# (n_folds - 1) / n_folds of X_train.
model_filename = 'trained_model_lgbm.pkl'
joblib.dump(pipeline, model_filename)
print(f"📂 Model saved as {model_filename}.")


In [None]:
# Persist the most recently fitted pipeline (preprocessor + model) to disk.
# `LGBoost` holds the output file name, not a model object.
LGBoost = 'trained_model_lgbm.pkl'
joblib.dump(pipeline, filename=LGBoost)
print(f"📂 Model saved as {LGBoost}.")


In [None]:
# Score the competition test frame with the most recently fitted pipeline.
predictions = pipeline.predict(test)

# Pair every test id with its predicted price (columns: id, Price).
submission = test[['id']].assign(Price=predictions)

# Write the submission file without the index column.
submission.to_csv('submission200.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Carve a further 20% validation slice out of the current training split (fixed seed).
# NOTE(review): X_train / y_train are rebound here, so each re-run of this cell
# shrinks the training split again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Export the validation features first, then the training features, without the index.
for frame, out_path in ((X_val, 'X_val.csv'), (X_train, 'X_train.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
# import optuna
# from lightgbm import LGBMRegressor
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import cross_val_score, KFold
# from sklearn.metrics import mean_squared_error, make_scorer
# import numpy as np

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),
#         'max_depth': trial.suggest_int('max_depth', 3, 7),
#         'num_leaves': trial.suggest_int('num_leaves', 10, 50),
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1.0)
#     }
    
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ('weight_capacity_pipe', Pipeline(steps=[('scaler', StandardScaler())]), ['Weight Capacity (kg)']),
#             ('cat_pipeline', OneHotEncoder(handle_unknown='ignore'), cat_cols)
#         ],
#         remainder='passthrough'
#     )
    
#     pipeline = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('model', LGBMRegressor(**params))
#     ])
    
#     rmse_scorer = make_scorer(mean_squared_error, squared=False)
#     kf = KFold(n_splits=3, shuffle=True, random_state=42)  # Reduced to 3 folds for faster execution
#     scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring=rmse_scorer)
    
#     return np.mean(scores)

# study = optuna.create_study(direction='minimize')  # Minimize RMSE
# study.optimize(objective, n_trials=3, n_jobs=-1)  # Perform 3 trials

# best_params = study.best_params
# print(f"Best Parameters: {best_params}")

# final_model = LGBMRegressor(**best_params)
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', final_model)
# ])

# pipeline.fit(X_train, y_train)

# y_pred = pipeline.predict(X_test)
# test_rmse = mean_squared_error(y_test, y_pred, squared=False)
# print(f'Test Set RMSE: {test_rmse:.4f}')


In [None]:
# import optuna
# import numpy as np
# from xgboost import XGBRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import train_test_split
# from category_encoders import TargetEncoder
# from sklearn.compose import ColumnTransformer
# import pandas as pd

# # Load dataset
# X = train.drop(columns='Price')
# y = train['Price']

# # Identify categorical columns
# cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# # Apply Target Encoding instead of OneHotEncoding
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', TargetEncoder(), cat_cols)
#     ], remainder='passthrough'
# )

# X = preprocessor.fit_transform(X, y)

# # Split data for validation
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# # Objective function for Optuna
# def objective(trial):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 3000, step=100),  # Reduced max estimators
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),  # Adjusted range
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#         'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
#         'gamma': trial.suggest_uniform('gamma', 0, 1),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1),
#         'objective': 'reg:squarederror',
#         'eval_metric': 'rmse',
#         'n_jobs': -1  # Enable parallel computation
#     }

#     # Train the model with early stopping
#     model = XGBRegressor(**param)
#     model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)
    
#     # Predictions and evaluation
#     preds = model.predict(X_valid)
#     rmse = mean_squared_error(y_valid, preds, squared=False)
#     return rmse

# # Create Optuna study
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=5)  # Reduce trials for faster training

# # Get the best hyperparameters
# best_params = study.best_params
# print(f"Best hyperparameters: {best_params}")

# # Train the final model with the best hyperparameters
# best_model = XGBRegressor(**best_params, n_jobs=-1)
# best_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=True)
