In [167]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [187]:
# Load the training and test CSV files from the working directory.
df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")


In [188]:
# Predictor columns fed to the model: the training feature list without the
# 'SalePrice' target.
important_features = [
    'MSZoning', 'LotArea', 'Neighborhood', 'OverallQual', 'YearBuilt',
    'YearRemodAdd', 'ExterQual', 'Foundation', 'BsmtQual', 'TotalBsmtSF',
    'HeatingQC', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'KitchenQual',
    'Fireplaces', 'GarageFinish', 'GarageCars', 'GarageArea', 'SaleCondition'
]

# Select the feature columns and drop incomplete rows in a single chained,
# non-in-place expression. Calling dropna(inplace=True) on the result of a
# column selection mutates a derived frame and can raise
# SettingWithCopyWarning; the chained form simply returns a new DataFrame.
# NOTE: any test row with a missing feature is removed here and therefore
# will not receive a prediction.
test_df = test_df[important_features].dropna()


In [169]:
# Display the raw training frame as this cell's output.
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [175]:


# Columns kept for modelling: twenty predictors followed by the target.
important_features = [
    # Predictors
    'MSZoning', 'LotArea', 'Neighborhood', 'OverallQual', 'YearBuilt',
    'YearRemodAdd', 'ExterQual', 'Foundation', 'BsmtQual', 'TotalBsmtSF',
    'HeatingQC', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'KitchenQual',
    'Fireplaces', 'GarageFinish', 'GarageCars', 'GarageArea', 'SaleCondition',
    # Target
    'SalePrice',
]

# Restrict the frame to those columns and discard every row that has a
# missing value (simple baseline handling; imputation would retain more rows).
df = df.loc[:, important_features].dropna()

# Report the size of the cleaned frame.
print("Cleaned dataset shape:", df.shape)

# Persist the cleaned data, leaving the index out of the file.
df.to_csv("cleaned_house_prices.csv", index=False)


Cleaned dataset shape: (1349, 21)


In [176]:
# Display the frame after column selection and dropna.
df

Unnamed: 0,MSZoning,LotArea,Neighborhood,OverallQual,YearBuilt,YearRemodAdd,ExterQual,Foundation,BsmtQual,TotalBsmtSF,...,GrLivArea,FullBath,TotRmsAbvGrd,KitchenQual,Fireplaces,GarageFinish,GarageCars,GarageArea,SaleCondition,SalePrice
0,RL,8450,CollgCr,7,2003,2003,Gd,PConc,Gd,856,...,1710,2,8,Gd,0,RFn,2,548,Normal,208500
1,RL,9600,Veenker,6,1976,1976,TA,CBlock,Gd,1262,...,1262,2,6,TA,1,RFn,2,460,Normal,181500
2,RL,11250,CollgCr,7,2001,2002,Gd,PConc,Gd,920,...,1786,2,6,Gd,1,RFn,2,608,Normal,223500
3,RL,9550,Crawfor,7,1915,1970,TA,BrkTil,TA,756,...,1717,1,7,Gd,1,Unf,3,642,Abnorml,140000
4,RL,14260,NoRidge,8,2000,2000,Gd,PConc,Gd,1145,...,2198,2,9,Gd,1,RFn,3,836,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,7917,Gilbert,6,1999,2000,TA,PConc,Gd,953,...,1647,2,7,TA,1,RFn,2,460,Normal,175000
1456,RL,13175,NWAmes,6,1978,1988,TA,CBlock,Gd,1542,...,2073,2,7,TA,2,Unf,2,500,Normal,210000
1457,RL,9042,Crawfor,7,1941,2006,Ex,Stone,TA,1152,...,2340,2,9,Gd,2,RFn,1,252,Normal,266500
1458,RL,9717,NAmes,5,1950,1996,TA,CBlock,TA,1078,...,1078,1,5,Gd,0,Unf,1,240,Normal,142125


In [177]:
# (rows, columns) of the cleaned frame.
df.shape

(1349, 21)

In [178]:
# Separate the target column from the predictors.
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

In [179]:
# Partition the feature names by dtype: object-typed columns versus all others.
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)

In [180]:
# One-hot encode the categorical columns; categories not seen during fit are
# ignored rather than raising (handle_unknown='ignore'). All remaining columns
# are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)

In [181]:
# Full estimator: the preprocessing step followed by a 200-tree random forest.
# random_state fixes the forest's randomness so repeated runs give the same model.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=200, random_state=42)),
    ]
)

In [182]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [183]:
# Fit the whole pipeline (preprocessing + regressor) on the training split only.
model.fit(X_train, y_train)

In [184]:
# Score the model on the held-out split.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [185]:
# Report both metrics, one per line.
print(
    f"✅ Root Mean Squared Error: {rmse:.2f}",
    f"✅ R-squared Score: {r2:.3f}",
    sep="\n",
)

✅ Root Mean Squared Error: 25820.41
✅ R-squared Score: 0.841


In [186]:
# Sanity check: predict the price of the first held-out row.
sample = X_test.iloc[[0]]  # list indexer keeps a one-row DataFrame rather than a Series
prediction = model.predict(sample)
print("🔮 Predicted Sale Price for sample:", prediction[0])

🔮 Predicted Sale Price for sample: 226469.07


In [195]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
# joblib is imported in the notebook's top import cell with the other
# dependencies, so a fresh-kernel run shows every requirement up front.
# NOTE: joblib files are pickle-based — only load this file from a trusted source.
joblib.dump(model, 'house_price_model.pkl')

['house_price_model.pkl']