In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, train_test_split, cross_val_predict   # âœ… Add this here
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import joblib
import warnings
warnings.filterwarnings("ignore")

In [5]:
train = pd.read_csv("train_cleaned_final.csv")
test = pd.read_csv("test_cleaned_final.csv")

TARGET = "SalePrice"

In [6]:
# 3) FEATURE & TARGET SELECTION
# ---------------------------
FEATURES = [c for c in train.columns if c != TARGET]
X = train[FEATURES]
y = train[TARGET]
X_test = test[FEATURES]

In [7]:
# 4) HANDLE CATEGORICAL FEATURES
# ---------------------------
# LightGBM can handle category dtype directly
for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [8]:
# Example: Total bathrooms, Total SF, Age features
train['TotalBath'] = train['BsmtFullBath'] + 0.5*train['BsmtHalfBath'] + train['FullBath'] + 0.5*train['HalfBath']
test['TotalBath'] = test['BsmtFullBath'] + 0.5*test['BsmtHalfBath'] + test['FullBath'] + 0.5*test['HalfBath']

train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

train['HouseAge'] = train['YrSold'] - train['YearBuilt']
train['RemodAge'] = train['YrSold'] - train['YearRemodAdd']
test['HouseAge'] = test['YrSold'] - test['YearBuilt']
test['RemodAge'] = test['YrSold'] - test['YearRemodAdd']

train['TotalPorch'] = train['OpenPorchSF'] + train['EnclosedPorch'] + train['3SsnPorch'] + train['ScreenPorch'] + train['WoodDeckSF']
test['TotalPorch'] = test['OpenPorchSF'] + test['EnclosedPorch'] + test['3SsnPorch'] + test['ScreenPorch'] + test['WoodDeckSF']


In [9]:
# One-hot encoding
train_encoded = pd.get_dummies(train, drop_first=True)
test_encoded = pd.get_dummies(test, drop_first=True)

# Align columns
X_train = train_encoded.drop('SalePrice', axis=1)
y_train = train_encoded['SalePrice']
X_test = test_encoded.reindex(columns=X_train.columns, fill_value=0)


In [10]:
y_train_log = np.log1p(
    y_train)

In [11]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = lgb.LGBMRegressor(
    objective='regression',
    learning_rate=0.01,
    n_estimators=10000,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Out-of-fold predictions
oof_preds_log = cross_val_predict(model, X_train, y_train_log, cv=kf, n_jobs=-1)

# Convert back from log
oof_preds = np.expm1(oof_preds_log)


In [12]:
rmse = np.sqrt(mean_squared_error(y, oof_preds))  # manually take sqrt
mae = mean_absolute_error(y, oof_preds)
r2 = r2_score(y, oof_preds)
mape = np.mean(np.abs((y_train - oof_preds)/y_train)) * 100
overall_acc = 100 - mape

print("ðŸ“Š CV (OOF) Evaluation on train_cleaned.csv:")
print(f"RÂ²: {r2:.4f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"MAPE: {mape:.2f}%")
print(f"Overall Accuracy (100 - MAPE): {overall_acc:.2f}%")


ðŸ“Š CV (OOF) Evaluation on train_cleaned.csv:
RÂ²: 0.9076
RMSE: 23744.031
MAE: 15162.364
MAPE: 8.75%
Overall Accuracy (100 - MAPE): 91.25%


In [13]:
import lightgbm as lgb
import joblib

# Example training
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)

# âœ… Now save the trained model
joblib.dump(model, r"C:\Users\HP\Desktop\Reality_AI\housing\lightgbm_housing_model.pkl")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4319
[LightGBM] [Info] Number of data points in the train set: 1436, number of used features: 167
[LightGBM] [Info] Start training from score 179696.789694


['C:\\Users\\HP\\Desktop\\Reality_AI\\housing\\lightgbm_housing_model.pkl']

In [14]:
import joblib
joblib.dump(model, 'lightgbm_housing_model.pkl')

['lightgbm_housing_model.pkl']

In [17]:
import pandas as pd

# Load your test dataset
test_path = r"C:\Users\HP\Desktop\Reality_AI\housing\test_cleaned_final.csv"
test_df = pd.read_csv(test_path)

# Show summary information
print("âœ… File Loaded Successfully!")
print("Shape (rows, columns):", test_df.shape)
print("\nColumn Names:\n", test_df.columns.tolist()[:20], "...")  # first 20 columns
print("\nFirst 5 rows:")
display(test_df.head())


âœ… File Loaded Successfully!
Shape (rows, columns): (1459, 80)

Column Names:
 ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt'] ...

First 5 rows:


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [18]:
import pandas as pd
df = pd.read_csv("test_cleaned_final.csv")
print(df.columns.tolist())

['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC'

In [19]:
import pandas as pd

# Load the cleaned test dataset
test = pd.read_csv("test_cleaned_final.csv")

# List of features needed for prediction
features_needed = [
    'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
    'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold',
    'SaleType', 'SaleCondition'
]

# Add your engineered features
for col in ['TotalBath', 'TotalSF', 'HouseAge', 'RemodAge', 'TotalPorch']:
    if col in test.columns:
        features_needed.append(col)

# Create new CSV with only these columns
prediction_csv = test[features_needed]
prediction_csv.to_csv("housing_prediction_input.csv", index=False)

print("âœ… housing_prediction_input.csv created successfully!")


âœ… housing_prediction_input.csv created successfully!
