In [2]:
# Import the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


In [3]:
# Read the training and test sets from CSV files in the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
# Drop rows whose target ("SalePrice") is missing.
# The result is rebound instead of using inplace=True, so the frame that was
# read from disk is not mutated behind the back of any other reference to it.
train_data = train_data.dropna(subset=["SalePrice"])

In [5]:
# Features are every column except the target and "Id"; the target is "SalePrice".
X = train_data.drop(columns=["SalePrice", "Id"])
y = train_data["SalePrice"]

In [7]:
# Keep int64/float64 columns, plus object columns with fewer than 10 distinct values.
numerical_cols = [
    cname
    for cname, col_dtype in X.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
categorical_cols = [
    cname
    for cname, col_dtype in X.dtypes.items()
    if col_dtype == "object" and X[cname].nunique() < 10
]

# Restrict the feature frame to the selected columns (numeric first, then categorical).
selected_cols = numerical_cols + categorical_cols
X = X[selected_cols]


In [8]:
# Split into training and validation parts: 80% of rows go to training,
# with a fixed random_state so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

In [14]:
# Per-column-type preprocessing steps.
# Numeric columns: fill missing values with the column mean.
numerical_transform = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' is set on the encoder.
categorical_transform = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [16]:
# Apply the numeric transform to the numeric columns and the categorical
# transform to the categorical columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transform, numerical_cols),
    ('cat', categorical_transform, categorical_cols),
])


In [21]:
# Gradient-boosted regressor used as the final pipeline step.
# It is bound to `model` because the pipeline cell refers to that name; without
# the assignment the notebook only runs if `model` is left over in the kernel.
# early_stopping_rounds is deliberately not set: the pipeline is fitted with
# clf.fit(X_train, y_train) and no eval_set, and early stopping needs a
# validation set to monitor.
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=5,
    n_jobs=-1,
    random_state=0,
)


In [24]:
# End-to-end pipeline: preprocessing followed by the regressor.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)


In [25]:
# Fit the pipeline on the training part, then predict the validation part.
preds = clf.fit(X_train, y_train).predict(X_valid)

# Report the mean absolute error on the validation part.
mae = mean_absolute_error(y_true=y_valid, y_pred=preds)
print(f"Validation MAE: {mae:.2f}")

Validation MAE: 17417.65
