In [246]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
pd.pandas.set_option('display.max_columns',None)

In [247]:
# -----------------------------
# Load Dataset
# -----------------------------
dataset = pd.read_csv("/content/AmesHousing_actual.csv")

# -----------------------------
# Split Data FIRST
# -----------------------------
x = dataset.drop("SalePrice", axis=1)
y = dataset["SalePrice"]

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

dataset.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960,1960,Hip,CompShg,BrkFace,Plywood,Stone,112.0,TA,TA,CBlock,TA,Gd,Gd,BLQ,639.0,Unf,0.0,441.0,1080.0,GasA,Fa,Y,SBrkr,1656,0,0,1656,1.0,0.0,1,0,3,1,TA,7,Typ,2,Gd,Attchd,1960.0,Fin,2.0,528.0,TA,TA,P,210,62,0,0,0,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,7,5,1968,1968,Hip,CompShg,BrkFace,BrkFace,,0.0,Gd,TA,CBlock,TA,TA,No,ALQ,1065.0,Unf,0.0,1045.0,2110.0,GasA,Ex,Y,SBrkr,2110,0,0,2110,1.0,0.0,2,1,3,1,Ex,8,Typ,2,TA,Attchd,1968.0,Fin,2.0,522.0,TA,TA,Y,0,0,0,0,0,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [248]:
# we combine back for easier preprocessing
train = pd.concat([x_train, y_train], axis=1)
test = pd.concat([x_test, y_test], axis=1)


In [249]:
dataset.shape

(2930, 82)

In [250]:
# 1️ Handle Categorical Missing Values
# -----------------------------
feature_nan = [
    feature for feature in train.columns
    if train[feature].isnull().sum() > 1 and train[feature].dtype == 'O'
]

for feature in feature_nan:
    train[feature] = train[feature].fillna("Missing")
    test[feature] = test[feature].fillna("Missing")


In [251]:
# 2️ Handle Numerical Missing Values
# -----------------------------
numerical_with_nan = [
    feature for feature in train.columns
    if train[feature].isnull().sum() > 1 and train[feature].dtype != 'O'
]

for feature in numerical_with_nan:
    median_value = train[feature].median()
    train[feature] = train[feature].fillna(median_value)
    test[feature] = test[feature].fillna(median_value)


In [252]:
#3️ Create Time Features
# -----------------------------
for feature in ['Year Built', 'Year Remod/Add', 'Garage Yr Blt']:
    train[feature] = train['Yr Sold'] - train[feature]
    test[feature] = test['Yr Sold'] - test[feature]
    #this difference can make some values negetive, to handle this lets use :
    train[feature] = train[feature].clip(lower=0)
    test[feature] = test[feature].clip(lower=0)

In [253]:
# 4️. Log Transform Skewed Features
# -----------------------------
num_features = ['Lot Frontage','Lot Area','1st Flr SF','Gr Liv Area']

for feature in num_features:
    train[feature] = np.log1p(train[feature])
    test[feature] = np.log1p(test[feature])

# Log transform target ONLY in train
y_train = np.log(train["SalePrice"])
y_test = np.log(test["SalePrice"])

train.drop("SalePrice", axis=1, inplace=True)
test.drop("SalePrice", axis=1, inplace=True)


In [254]:
# 5. Rare Categorical Handling (Fit on Train)
# -----------------------------
categorical_features = [
    feature for feature in train.columns
    if train[feature].dtype == 'O'
]

for feature in categorical_features:
    temp = train.groupby(feature)[feature].count()/len(train)
    rare_labels = temp[temp < 0.01].index

    train[feature] = np.where(train[feature].isin(rare_labels),
                              'Rare_Var', train[feature])

    test[feature] = np.where(test[feature].isin(rare_labels),
                             'Rare_Var', test[feature])


In [255]:
# 6️. Label Encoding Based on Train
# -----------------------------
for feature in categorical_features:
    ordered_labels = train.groupby(feature)[feature].count().sort_values().index
    label_dict = {k: i for i, k in enumerate(ordered_labels, 0)}

    train[feature] = train[feature].map(label_dict)
    test[feature] = test[feature].map(label_dict)


In [256]:
# Final NaN cleanup
train = train.fillna(0)
test = test.fillna(0)


In [257]:
# 7️. Feature Scaling (Fit ONLY on Train)
# -----------------------------
scaler = MinMaxScaler()

scaler.fit(train)

x_train_scaled = scaler.transform(train)
x_test_scaled = scaler.transform(test)

In [258]:
#before model let's check if there is any missing value, because lasso won't be able to handle missing values

print(np.isnan(x_train_scaled).sum())
print(np.isnan(x_test_scaled).sum())


0
0


In [259]:
'''
# 8️. Train Model
# -----------------------------
model = Lasso(alpha=0.001, random_state=42)
model.fit(x_train_scaled, y_train)
# -----------------------------
# 9️. Evaluate Model
# -----------------------------
y_pred = model.predict(x_test_scaled)

print("R2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
'''

'\n# 8️. Train Model\n# -----------------------------\nmodel = Lasso(alpha=0.001, random_state=42)\nmodel.fit(x_train_scaled, y_train)\n# -----------------------------\n# 9️. Evaluate Model\n# -----------------------------\ny_pred = model.predict(x_test_scaled)\n\nprint("R2 Score:", r2_score(y_test, y_pred))\nprint("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))\n'

In [260]:
!pip install xgboost




In [261]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
#Scaling is not required for XGBoost, but it's fine if already done.

In [262]:
# -----------------------------
# Train XGBoost Model
xgb_model = XGBRegressor(
    n_estimator = 1000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(x_train_scaled, y_train)

# Predictions
# -----------------------------
y_pred = xgb_model.predict(x_test_scaled)

# -----------------------------
# Evaluation
# -----------------------------
print("R2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


R2 Score: 0.9287586091981352
RMSE: 0.11481247821704156


Parameters: { "n_estimator" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


We checked for Lasso and XGBoost, XGboost is performing well , so we will go for XGB, so comment out lasso
