In [None]:
# Setup
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Import Data
# All competition files live in one directory; keep the location in one place.
DATA_DIR = './playground-series-s5e2'

train_extra = pd.read_csv(f'{DATA_DIR}/training_extra.csv')

# Stack the two training files into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own row labels and the combined frame ends up with
# duplicate index labels, which breaks label-based lookups and alignment.
# The 'id' column is dropped so it is not part of the training frame.
train = pd.concat(
    [pd.read_csv(f'{DATA_DIR}/train.csv'), train_extra],
    ignore_index=True,
).drop(columns=['id'])

# The test frame keeps its 'id' column; the submission cell writes it out.
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Done
print('Setup Complete')

Setup Complete


In [3]:
# Split Data for Training and Validation
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split (and therefore the validation score) is the
# same on every Restart & Run All; without it each run shuffles differently.
RANDOM_STATE = 42

xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.drop(columns=['Price']),  # features: everything except the target
    train['Price'],                 # target
    train_size=0.8,
    random_state=RANDOM_STATE,
)

In [None]:
# Baseline: an XGBRegressor wrapped in a single scikit-learn Pipeline so that
# preprocessing and the model are fitted and applied together.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# Columns grouped by the preprocessing they receive
num_cols = ['Weight Capacity (kg)']
cat_cols = [
    'Brand',
    'Material',
    'Size',
    'Compartments',
    'Laptop Compartment',
    'Waterproof',
    'Style',
    'Color',
]

# Numerical columns: fill missing values with the column median
num_trans = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the literal 'Unknown', then
# one-hot encode; categories not seen during fit are ignored at predict time
cat_trans = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its transformer
col_trans = ColumnTransformer(
    transformers=[
        ('num', num_trans, num_cols),
        ('cat', cat_trans, cat_cols),
    ]
)

# Model with library-default hyperparameters, chained after preprocessing
model = XGBRegressor()
pipeline = Pipeline(
    steps=[
        ('preprocessor', col_trans),
        ('model', model),
    ]
)

In [None]:
# Fit the baseline pipeline and measure it on the held-out validation split
from sklearn.metrics import mean_absolute_error

# Pipeline.fit returns the fitted pipeline, so fitting on the training split
# and predicting on the validation split can be chained
preds = pipeline.fit(xtrain, ytrain).predict(xvalid)

# Mean absolute error between the true and predicted validation prices
score = mean_absolute_error(y_true=yvalid, y_pred=preds)
print('MAE:', score)

N: 100 MAE: 33.59494715893701
N: 110 MAE: 33.59467451809225
N: 120 MAE: 33.59461376004741
N: 130 MAE: 33.59526976955723
N: 140 MAE: 33.59527024145471


In [None]:
# Predict prices for the test set and write the submission file
test_prices = pipeline.predict(test)

# Store the predictions on the test frame next to their ids
test['Price'] = test_prices

# Write only the id and Price columns, without the row index
test.to_csv(
    './playground-series-s5e2/submission.csv',
    columns=['id', 'Price'],
    index=False,
)

[81.47658  83.099495 83.242744 ... 81.47061  81.85084  81.57258 ]
