<a href="https://www.kaggle.com/code/shetyerahul/house-prices?scriptVersionId=293256467" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Read the competition's training and test CSV files into DataFrames.
train_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
test_path  = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"

# Both files are parsed the same way, train first and then test.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report (rows, columns) for each frame as a quick sanity check.
print("Train shape:", train.shape)
print("Test shape:", test.shape)


Train shape: (1460, 81)
Test shape: (1459, 80)


In [3]:
# Preview the first rows of the training frame; as the cell's last expression it is displayed.
train.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Split the training frame into features (X) and target (y).
# The target is log1p-transformed, so predictions must be mapped back with expm1.
# NOTE(review): only SalePrice is dropped, so the Id column remains in X as a
# feature -- confirm that is intended.

y = train["SalePrice"].pipe(np.log1p)
X = train.drop(columns="SalePrice")

y.head()


0    12.247699
1    12.109016
2    12.317171
3    11.849405
4    12.429220
Name: SalePrice, dtype: float64

In [5]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

# Show how many columns landed in each group.
print("Numeric columns:", len(num_cols))
print("Categorical columns:", len(cat_cols))


Numeric columns: 37
Categorical columns: 43


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; handle_unknown="ignore" avoids errors on unseen categories.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group (num_cols / cat_cols) through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)


In [7]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNetCV


# Elastic-net regressor that selects its own alpha / l1_ratio from the grids
# below using an internal 5-fold cross-validation.
model = ElasticNetCV(
    alphas=[0.0005, 0.001, 0.005, 0.01],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    max_iter=10000,
    cv=5,
)

# Chain preprocessing and the estimator so both are fitted together.
pipeline = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])



In [8]:
# 5-fold cross-validation of the whole pipeline. The scorer returns *negative*
# RMSE, so the sign is flipped to get the per-fold RMSE values.
# y is log1p(SalePrice), so this RMSE is measured on the log scale.
fold_scores = cross_val_score(
    pipeline, X, y, cv=5, scoring="neg_root_mean_squared_error"
)
rmse = -fold_scores

print("CV RMSE:", rmse.mean())


CV RMSE: 0.13765768250038135


In [9]:
# Fit the preprocessing + model pipeline on all training rows, using the log1p-transformed target y.
pipeline.fit(X, y)


In [10]:
# The pipeline predicts log1p(SalePrice); apply expm1 to map the predictions
# back to the original price scale.
log_preds = pipeline.predict(test)
test_preds = np.expm1(log_preds)

test_preds[:5]


array([116013.14781952, 141015.11077367, 169354.53525127, 193880.54440422,
       198663.19508886])

In [11]:
# Build the two-column submission frame: the test Ids paired with the predicted prices.
submission = test[["Id"]].assign(SalePrice=test_preds)

submission.head()


Unnamed: 0,Id,SalePrice
0,1461,116013.14782
1,1462,141015.110774
2,1463,169354.535251
3,1464,193880.544404
4,1465,198663.195089


In [12]:
# Feature Engineering
# Add a TotalSF column to both X and test (in place): the sum of the TotalBsmtSF,
# 1stFlrSF and 2ndFlrSF columns, with a missing TotalBsmtSF counted as 0.
# NOTE(review): this cell runs after num_cols was computed, the pipeline was
# fitted and the submission frame was built in earlier cells, so TotalSF is not
# used by the model as written. Move this step ahead of the column selection
# (and re-run the later cells) for it to take effect.
for frame in (X, test):
    basement_sf = frame["TotalBsmtSF"].fillna(0)
    frame["TotalSF"] = basement_sf + frame["1stFlrSF"] + frame["2ndFlrSF"]



In [13]:
# Write the submission frame to the working directory without the index column.
# The path is kept in one variable so the confirmation message always names the
# file that was actually written (it previously reported "submission3.csv"
# while writing "submission.csv").
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)
print(f"{submission_path} saved!")


submission3.csv saved!
