<a href="https://www.kaggle.com/code/subarnasaikia/it-s-backpack-time?scriptVersionId=220274048" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [40]:
import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [41]:
# All three competition files sit in the same input directory.
INPUT_DIR = "/kaggle/input/playground-series-s5e2"

train_file_path = f"{INPUT_DIR}/train.csv"
test_file_path = f"{INPUT_DIR}/test.csv"
sample_submission_file_path = f"{INPUT_DIR}/sample_submission.csv"

In [42]:
# Read the training and test CSVs into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [43]:
# Preview the first rows (head() defaults to 5).
train_df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [None]:
# Column dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Summary statistics; with mixed dtypes describe() reports the numeric columns only.
train_df.describe()

In [None]:
# Summary restricted to the object-dtype (categorical text) columns.
train_df.describe(include=object)

In [None]:
# Missing-value count per column of the training frame.
train_df.isna().sum()

In [None]:
# Missing-value count per column of the test frame.
test_df.isna().sum()

### One-hot encoding 
reference: [One Hot Encoding](https://www.geeksforgeeks.org/ml-one-hot-encoding/)

In [44]:
def oneHotEncdoing(df):
    """One-hot encode every object-dtype column of ``df``.

    A new ``OneHotEncoder`` is fitted on ``df`` on every call, so the set of
    generated columns depends on the categories present in this particular
    frame. Missing values are encoded as a category of their own (the
    ``<column>_nan`` indicator columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the non-object columns of ``df`` (original order)
        followed by the indicator columns, carrying the same index as ``df``.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    print(f"categorical_columns \n{categorical_columns}")

    # Nothing to encode: still hand back a new frame, like the normal path does.
    if not categorical_columns:
        return df.copy()

    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[categorical_columns])
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        # concat(axis=1) aligns on index labels; reusing the caller's index
        # keeps each indicator row attached to the row it was computed from
        # even when ``df`` does not have a default 0..n-1 index.
        index=df.index,
    )

    remaining_df = df.drop(columns=categorical_columns)
    df_encoded = pd.concat([remaining_df, one_hot_df], axis=1)

    return df_encoded

### Missing Values
- Handle missing values with scikit-learn's `SimpleImputer` (default strategy: column mean).

In [45]:
def imputation(df):
    """Fill missing values in an all-numeric frame with per-column means.

    A new ``SimpleImputer`` (default ``strategy="mean"``) is fitted on ``df``
    on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels and index as ``df``. For
        non-empty input every column is returned as float, because the
        imputer works on a float array.
    """
    # The imputer cannot be fitted on zero rows/columns; there is nothing to fill.
    if df.empty:
        return df.copy()

    # keep_empty_features=True stops the imputer from dropping all-NaN columns,
    # which would make the output narrower than ``df.columns``.
    my_imputer = SimpleImputer(keep_empty_features=True)
    imputed_df = pd.DataFrame(
        my_imputer.fit_transform(df),
        columns=df.columns,
        # Preserve row labels instead of silently resetting to 0..n-1.
        index=df.index,
    )
    return imputed_df

In [46]:
# Encode the categorical columns, then impute whatever is still missing.
new_train_df = imputation(oneHotEncdoing(train_df))
new_train_df.head()

categorical_columns 
['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']


Unnamed: 0,id,Compartments,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Brand_nan,...,Style_Messenger,Style_Tote,Style_nan,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red,Color_nan
0,0.0,7.0,11.611723,112.15875,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,10.0,27.078537,68.88056,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2.0,2.0,16.64376,39.1732,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3.0,8.0,12.93722,80.60793,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4.0,1.0,17.749338,86.02312,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Missing-value count per column of the preprocessed training frame.
new_train_df.isna().sum()

In [None]:
# Column dtypes and non-null counts of the preprocessed training frame.
new_train_df.info()

### Training and Evaluation

In [47]:
# Target is the `Price` column; every other column is used as a feature.
# NOTE(review): the features still include `id` — confirm it is meant to be a predictor.
X = new_train_df.drop(columns=["Price"])
y = new_train_df["Price"]

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [48]:
# 50-tree random forest with a fixed seed, fitted on the training split only.
model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_train, y_train)
model

In [49]:
# Score the model on the hold-out split.
pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, pred)
# Take the square root explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_valid, pred))
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")

MAE: 34.14490281142333
RMSE: 39.71598565930566


In [50]:
new_test_df = oneHotEncdoing(test_df)
new_test_df = imputation(new_test_df)
# The encoder is re-fitted on the test frame, so its indicator columns are only
# the categories that occur there. Align to the training feature layout:
# indicators absent from the test set become all-zero columns, indicators the
# model never saw are dropped, and the column order matches X.
new_test_df = new_test_df.reindex(columns=X.columns, fill_value=0.0)

categorical_columns 
['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']


In [51]:
# Refit the same configuration on every training row, then predict the test set.
Final_model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
final_prediction = Final_model.predict(new_test_df)

In [52]:
# Take the ids from the raw test frame: imputation() returns every column as
# float, so new_test_df['id'] would be written to the CSV as 0.0, 1.0, ...
# instead of the original integer ids. Row order is unchanged by preprocessing,
# so the predictions line up positionally with test_df.
submission = pd.DataFrame({'id': test_df['id'], 'Price': final_prediction})
submission.to_csv('submission.csv', index=False)