In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# CSV locations for the training and test splits on the mounted drive.
file_path_train, file_path_test = (
    "/content/drive/MyDrive/train_h.csv",
    "/content/drive/MyDrive/test_h.csv",
)

In [None]:
# Read both splits into DataFrames in one pass over the two paths.
train, test = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Subset of columns used as model inputs. The order here becomes the column
# order of X and X_test, so it is kept exactly as listed.
important_features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath',
    'YearBuilt', 'YearRemodAdd', '1stFlrSF', 'TotRmsAbvGrd', 'Fireplaces',
    'BsmtQual', 'GarageFinish', 'KitchenQual', 'GarageArea', 'MasVnrArea',
    'ExterQual', 'Neighborhood', 'Foundation', 'BsmtFinSF1', 'LotArea',
    'Exterior1st', 'BsmtFullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'HeatingQC',
    'GarageType', 'GarageYrBlt', 'PavedDrive', 'MSZoning', 'HouseStyle',
]


In [None]:
# Regression target: the SalePrice column of the training frame.
y = train["SalePrice"]

In [None]:
# Restrict both splits to the same feature columns.
X, X_test = (frame[important_features] for frame in (train, test))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Show the dtype of each selected feature column.
X.dtypes

Unnamed: 0,0
OverallQual,int64
GrLivArea,int64
GarageCars,int64
TotalBsmtSF,int64
FullBath,int64
YearBuilt,int64
YearRemodAdd,int64
1stFlrSF,int64
TotRmsAbvGrd,int64
Fireplaces,int64


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from statsmodels.imputation import mice


In [None]:
# Split the features by dtype so each group gets its own preprocessing.
# select_dtypes('number') matches every numeric width (int32, float32, ...),
# not just int64/float64. Every remaining column is treated as categorical,
# whatever its dtype (object, category, string). This matters because
# ColumnTransformer drops columns that are not listed in any transformer by
# default, so a column missed here would silently vanish from the model input.
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()
categorical_cols = [col for col in X_train.columns if col not in numerical_cols]


In [None]:
# Numeric features: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: label missing entries as 'Missing', then one-hot
# encode. Categories not seen during fit are ignored at transform time
# instead of raising an error.
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [None]:
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [None]:
# End-to-end estimator: preprocessing followed by a gradient-boosted regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=4,
        random_state=0,
    )),
])

In [None]:
# Fit on the training split, then predict the held-out validation rows.
# Pipeline.fit returns the fitted pipeline, so the calls can be chained.
preds = model.fit(X_train, y_train).predict(X_valid)


In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the predictions on the validation split.
validation_mae = mean_absolute_error(y_valid, preds)
print("MAE:", validation_mae)


MAE: 16431.728515625


In [None]:
# Score on the training split (X_train/y_train), i.e. the data the model was
# fitted on. This measures fit, not generalisation; the validation MAE above
# is the held-out metric.
model.score(X_train, y_train)

0.9998480081558228

In [None]:
# Predict on the test features and write the Id/SalePrice pairs to CSV
# without the DataFrame index.
test_predictions = model.predict(X_test)
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_predictions})
submission.to_csv('submission.csv', index=False)

In [None]:
# Trigger a browser download of the submission file (Colab-only helper).
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>