<a href="https://www.kaggle.com/code/subarnasaikia/it-s-backpack-time-xgboost?scriptVersionId=220322073" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

In [2]:
# Input files for this competition, as mounted under /kaggle/input.
(
    train_file_path,
    test_file_path,
    sample_submission_file_path,
) = (
    "/kaggle/input/playground-series-s5e2/train.csv",
    "/kaggle/input/playground-series-s5e2/test.csv",
    "/kaggle/input/playground-series-s5e2/sample_submission.csv",
)

In [3]:
# Read the training rows first, then the test rows, into separate frames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [4]:
# Peek at the first five training rows (five is the default for head()).
train_df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [5]:
# Column dtypes and non-null counts for the raw training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [6]:
# Summary statistics for the numeric columns of the training frame.
train_df.describe()

Unnamed: 0,id,Compartments,Weight Capacity (kg),Price
count,300000.0,300000.0,299862.0,300000.0
mean,149999.5,5.44359,18.029994,81.411107
std,86602.684716,2.890766,6.966914,39.03934
min,0.0,1.0,5.0,15.0
25%,74999.75,3.0,12.097867,47.38462
50%,149999.5,5.0,18.068614,80.95612
75%,224999.25,8.0,24.002375,115.01816
max,299999.0,10.0,30.0,150.0


In [7]:
# Summary of the object-dtype (categorical) columns: count, unique, top, freq.
train_df.describe(include=object)

Unnamed: 0,Brand,Material,Size,Laptop Compartment,Waterproof,Style,Color
count,290295,291653,293405,292556,292950,292030,290050
unique,5,4,3,2,2,3,6
top,Adidas,Polyester,Medium,Yes,Yes,Messenger,Pink
freq,60077,79630,101906,148342,148077,100031,51690


In [8]:
# Number of missing values per column in the training frame.
train_df.isna().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [9]:
# Number of missing values per column in the test frame.
test_df.isna().sum()

id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64

### One-hot encoding 
Reference: [One Hot Encoding](https://www.geeksforgeeks.org/ml-one-hot-encoding/)

In [10]:
def oneHotEncdoing(df):
    """One-hot encode every object-dtype column of ``df``.

    Each categorical column is replaced by one indicator column per observed
    category, named ``<column>_<category>``. Missing values do not keep an
    indicator of their own: any ``<column>_nan`` column produced by the
    encoder is dropped, so a row with a missing category has zeros in all of
    that column's indicators.

    A new encoder is fitted on every call, so the indicator columns depend on
    the categories present in the frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns of ``df`` followed by the indicator
        columns, carrying the same index as ``df``.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    print(f"categorical_columns \n{categorical_columns}")

    # Nothing to encode: skip fitting an encoder on an empty column selection.
    if not categorical_columns:
        return df.copy()

    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[categorical_columns])
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        # Reuse the caller's index. With a fresh RangeIndex the concat below
        # would align on labels and inject NaN rows whenever `df` has a
        # non-default index (e.g. after filtering or a train/test split).
        index=df.index,
    )

    # Drop the indicators that only flag a missing value.
    nan_columns = [
        f"{column}_nan"
        for column in categorical_columns
        if f"{column}_nan" in one_hot_df.columns
    ]
    one_hot_df = one_hot_df.drop(columns=nan_columns)

    return pd.concat([df.drop(columns=categorical_columns), one_hot_df], axis=1)

### Missing Values
- Handle the remaining missing values with scikit-learn's `SimpleImputer`.

In [11]:
def imputation(df):
    """Fill missing values in ``df`` with a default ``SimpleImputer``.

    The imputer is created with no arguments, so it uses scikit-learn's
    default strategy (the per-column mean, per the scikit-learn docs). It is
    fitted on ``df`` itself on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
        NOTE(review): assumes all columns are numeric by this point (i.e.
        categoricals were already encoded) -- confirm against callers.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels and the same index as ``df``,
        built from the imputer's array output.
    """
    my_imputer = SimpleImputer()
    imputed_df = pd.DataFrame(
        my_imputer.fit_transform(df),
        columns=df.columns,
        # Keep the caller's row labels instead of resetting to 0..n-1, so the
        # result can still be aligned with other frames derived from `df`.
        index=df.index,
    )
    return imputed_df

In [12]:
# Encode the categorical columns first, then impute whatever is still missing.
new_train_df = imputation(oneHotEncdoing(train_df))
new_train_df.head()

categorical_columns 
['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']


Unnamed: 0,id,Compartments,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Material_Canvas,...,Waterproof_Yes,Style_Backpack,Style_Messenger,Style_Tote,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red
0,0.0,7.0,11.611723,112.15875,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,10.0,27.078537,68.88056,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2.0,2.0,16.64376,39.1732,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3.0,8.0,12.93722,80.60793,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4.0,1.0,17.749338,86.02312,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [13]:
# Confirm that no missing values remain after preprocessing.
new_train_df.isna().sum()

id                        0
Compartments              0
Weight Capacity (kg)      0
Price                     0
Brand_Adidas              0
Brand_Jansport            0
Brand_Nike                0
Brand_Puma                0
Brand_Under Armour        0
Material_Canvas           0
Material_Leather          0
Material_Nylon            0
Material_Polyester        0
Size_Large                0
Size_Medium               0
Size_Small                0
Laptop Compartment_No     0
Laptop Compartment_Yes    0
Waterproof_No             0
Waterproof_Yes            0
Style_Backpack            0
Style_Messenger           0
Style_Tote                0
Color_Black               0
Color_Blue                0
Color_Gray                0
Color_Green               0
Color_Pink                0
Color_Red                 0
dtype: int64

In [14]:
# Column dtypes and non-null counts for the preprocessed training frame.
new_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 29 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      300000 non-null  float64
 1   Compartments            300000 non-null  float64
 2   Weight Capacity (kg)    300000 non-null  float64
 3   Price                   300000 non-null  float64
 4   Brand_Adidas            300000 non-null  float64
 5   Brand_Jansport          300000 non-null  float64
 6   Brand_Nike              300000 non-null  float64
 7   Brand_Puma              300000 non-null  float64
 8   Brand_Under Armour      300000 non-null  float64
 9   Material_Canvas         300000 non-null  float64
 10  Material_Leather        300000 non-null  float64
 11  Material_Nylon          300000 non-null  float64
 12  Material_Polyester      300000 non-null  float64
 13  Size_Large              300000 non-null  float64
 14  Size_Medium         

### Training and Evaluation

In [15]:
# Features are every column except the target; `Price` is the value to predict.
# NOTE(review): `id` remains in X as a feature -- confirm that is intended.
X = new_train_df.drop(columns=['Price'])
y = new_train_df['Price']

# Hold out 20% of the rows for validation, with a fixed seed for the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [16]:
# Validation model: up to 2000 boosting rounds at a low learning rate, with
# early stopping (patience of 90 rounds) monitored on the held-out split.
model = XGBRegressor(learning_rate=0.01, early_stopping_rounds=90, n_estimators=2000)

model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=True,
)

[0]	validation_0-rmse:38.94214
[1]	validation_0-rmse:38.94164
[2]	validation_0-rmse:38.94115
[3]	validation_0-rmse:38.94065
[4]	validation_0-rmse:38.94020
[5]	validation_0-rmse:38.93976
[6]	validation_0-rmse:38.93931
[7]	validation_0-rmse:38.93888
[8]	validation_0-rmse:38.93847
[9]	validation_0-rmse:38.93809
[10]	validation_0-rmse:38.93771
[11]	validation_0-rmse:38.93732
[12]	validation_0-rmse:38.93688
[13]	validation_0-rmse:38.93650
[14]	validation_0-rmse:38.93614
[15]	validation_0-rmse:38.93575
[16]	validation_0-rmse:38.93537
[17]	validation_0-rmse:38.93504
[18]	validation_0-rmse:38.93467
[19]	validation_0-rmse:38.93426
[20]	validation_0-rmse:38.93386
[21]	validation_0-rmse:38.93350
[22]	validation_0-rmse:38.93310
[23]	validation_0-rmse:38.93272
[24]	validation_0-rmse:38.93232
[25]	validation_0-rmse:38.93191
[26]	validation_0-rmse:38.93156
[27]	validation_0-rmse:38.93122
[28]	validation_0-rmse:38.93085
[29]	validation_0-rmse:38.93053
[30]	validation_0-rmse:38.93027
[31]	validation_0-

In [17]:
# Score the model on the held-out validation rows.
pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, pred)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it raises a TypeError on current releases.
rmse = float(np.sqrt(mean_squared_error(y_valid, pred)))
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")

MAE: 33.67809198700806
RMSE: 38.908285617790106


In [18]:
# Apply the same encode-then-impute preprocessing that the training data got.
new_test_df = imputation(oneHotEncdoing(test_df))

# The encoder is refitted on the test frame, so its indicator columns follow
# the categories present there. Align to the training feature matrix so the
# model receives the same columns in the same order: indicators missing from
# the test data become all-zero columns, and columns that were never part of
# the training features are dropped.
new_test_df = new_test_df.reindex(columns=X.columns, fill_value=0.0)

categorical_columns 
['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']


In [19]:
# Submission model: refit on every labelled row with a fixed number of trees
# (no eval_set is passed here, so nothing is monitored during fitting).
# NOTE(review): 284 presumably comes from the early-stopped validation run -- confirm.
Final_model = XGBRegressor(learning_rate=0.01, n_estimators=284)
Final_model.fit(X, y)
final_prediction = Final_model.predict(new_test_df)

In [20]:
# Take the ids from the raw test file rather than from `new_test_df`: the
# imputation step returns every column as float64, which would write the ids
# with a trailing ".0". Row order is unchanged by preprocessing, so the
# predictions line up with `test_df` positionally.
submission = pd.DataFrame({'id': test_df['id'], 'Price': final_prediction})
submission.to_csv('submission.csv', index=False)