In [1]:
import pandas as pd

In [2]:
# Load the training data; its 'SalePrice' column is separated out as the
# prediction target further down.
df = pd.read_csv('./data/train.csv')

In [3]:
# Load the test set; the fitted model's predictions for these rows are what
# gets written to the submission file.
test_df = pd.read_csv('./data/test.csv')

In [4]:
# Hold out 20% of the rows in `df` as a validation set. The split is seeded
# (random_state=42) so it is reproducible across runs.
from sklearn.model_selection import train_test_split

y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

# NOTE: despite its name, `y_test` holds the validation targets that pair with
# `X_val`; it is unrelated to `test_df`.
X_train, X_val, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Preprocessing + model pipeline: fit on the training split, then report
# error metrics on the validation split.
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Columns removed before modelling: the row identifier plus four features.
# NOTE(review): presumably these four are dropped because they are mostly
# missing -- confirm against the data.
cols_to_drop = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']

# Derive new frames rather than rebinding X_train / X_val, so this cell can be
# re-run without a KeyError on columns that were already dropped. This also
# matches how `test_df_proc` is derived from `test_df`.
X_train_proc = X_train.drop(columns=cols_to_drop)
X_val_proc = X_val.drop(columns=cols_to_drop)
test_df_proc = test_df.drop(columns=cols_to_drop)

# Numerical columns: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Column groups are taken from the training frame's dtypes only, so the
# validation and test frames are transformed with the same column lists.
num_features = X_train_proc.select_dtypes(include=['int64', 'float64']).columns
cat_features = X_train_proc.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

# Model: preprocessing and the regressor are chained in one Pipeline, so the
# imputers / scaler / encoder are fitted on the training split only.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

model.fit(X_train_proc, y_train)

# Evaluation on the held-out validation split.
y_pred = model.predict(X_val_proc)

# `y_test` is the validation target produced by train_test_split alongside
# `X_val` (the name is inherited from the split cell).
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

Mean Squared Error: 832906003.6122842
Mean Absolute Error: 17523.85767123288


In [6]:
# Predict on the test rows (with the unused columns already dropped). `model`
# is the fitted Pipeline, so its preprocessing steps are applied before the
# regressor.
test_pred = model.predict(test_df_proc)
# Bare expression so the notebook displays the predictions.
test_pred

array([128321.83, 153056.  , 179475.85, ..., 152682.47, 118794.  ,
       225820.95])

In [7]:
# Build the submission table: each test row's Id next to its predicted SalePrice.
submission = test_df[['Id']].assign(SalePrice=test_pred)

In [9]:
# Write the submission to disk; index=False keeps the DataFrame index out of
# the CSV so only the 'Id' and 'SalePrice' columns are written.
submission.to_csv('./data/submission.csv', index=False)