## House Price Prediction

### The objective of a house price prediction in machine learning project is to accurately forecast property values based on various features, facilitating informed real estate decisions.

#### import required packages

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

#### load/read the data from source

In [24]:
df=pd.read_csv("housedataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Size,Society,Location,total_sqft,Available by,Age(in yrs),Status,Price(In Lakhs)
0,,2,Millennium Pacific,Tathawade,650,Sep 2025,0,New,62.77
1,0.0,3,Bhandari 43 Privet Drive C Building,Balewadi,1465,Aug 2023,0,New,228.0
2,1.0,2,Malpani Cereza,Tathawade,1012,Feb 2023,0,New,45.5
3,2.0,2,Kohinoor Viva City,Dhanori,734,Dec 2026,0,New,63.0
4,3.0,1,Karia Indrayu Enclave II,hingne Khurd,489,Ready To Move,1,Resale,36.0


In [25]:
df.drop ('Unnamed: 0' , axis=1, inplace=True)
df.columns

Index(['Size', 'Society', 'Location', 'total_sqft', 'Available by',
       'Age(in yrs)', 'Status', 'Price(In Lakhs)'],
      dtype='object')

#### Exploratory Data Analysis

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60565 entries, 0 to 60564
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Size             60565 non-null  int64  
 1   Society          60565 non-null  object 
 2   Location         60565 non-null  object 
 3   total_sqft       60565 non-null  int64  
 4   Available by     60565 non-null  object 
 5   Age(in yrs)      60565 non-null  int64  
 6   Status           60565 non-null  object 
 7   Price(In Lakhs)  60565 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 3.7+ MB


In [27]:
df

Unnamed: 0,Size,Society,Location,total_sqft,Available by,Age(in yrs),Status,Price(In Lakhs)
0,2,Millennium Pacific,Tathawade,650,Sep 2025,0,New,62.77
1,3,Bhandari 43 Privet Drive C Building,Balewadi,1465,Aug 2023,0,New,228.00
2,2,Malpani Cereza,Tathawade,1012,Feb 2023,0,New,45.50
3,2,Kohinoor Viva City,Dhanori,734,Dec 2026,0,New,63.00
4,1,Karia Indrayu Enclave II,hingne Khurd,489,Ready To Move,1,Resale,36.00
...,...,...,...,...,...,...,...,...
60560,3,Unique K Pune,Bavdhan,1116,Aug 2027,0,New,131.00
60561,3,Shapoorji Pallonji Sensorium Phase VI,Hinjewadi,1500,Sep 2026,0,New,100.00
60562,2,Godrej Rejuve,Mundhwa,707,Aug 2023,0,New,76.31
60563,2,Dreams Avani,Manjari,887,Ready To Move,6,Resale,41.50


In [28]:
# df['Society'].unique()
# df['Location'].value_counts()
df['Available by'].value_counts()

Available by
Ready To Move    21642
 Dec 2025         3123
 Dec 2026         1937
 Nov 2023         1852
 Nov 2026         1568
                 ...  
 Mar 2020            3
 Apr 2021            3
 Feb 2020            1
 Sep 2020            1
 Apr 2020            1
Name: count, Length: 117, dtype: int64

In [29]:
# df['Siize']=df['Siize'].astype(int)

#### Label Encoding

In [30]:
from sklearn.preprocessing import LabelEncoder
cols=['Society','Location','Available by','Status']
df[cols]=df[cols].apply(LabelEncoder().fit_transform)
encoder = LabelEncoder()
with open('label_encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

In [31]:
df.head()

Unnamed: 0,Size,Society,Location,total_sqft,Available by,Age(in yrs),Status,Price(In Lakhs)
0,2,2470,292,650,111,0,0,62.77
1,3,493,22,1465,12,0,0,228.0
2,2,2277,292,1012,32,0,0,45.5
3,2,1740,72,734,24,0,0,63.0
4,1,1624,338,489,116,1,1,36.0


#### Checking Correlation

In [32]:
df.corr()

Unnamed: 0,Size,Society,Location,total_sqft,Available by,Age(in yrs),Status,Price(In Lakhs)
Size,1.0,-0.029465,-0.076538,0.050987,-0.010025,0.001369,-0.022112,0.617463
Society,-0.029465,1.0,0.032307,-0.008748,-0.020023,0.004852,-0.017978,-0.028734
Location,-0.076538,0.032307,1.0,-0.006636,-0.005714,0.003915,0.000964,-0.138816
total_sqft,0.050987,-0.008748,-0.006636,1.0,-0.001251,-5.7e-05,0.00237,0.170019
Available by,-0.010025,-0.020023,-0.005714,-0.001251,1.0,0.012769,0.751081,0.013207
Age(in yrs),0.001369,0.004852,0.003915,-5.7e-05,0.012769,1.0,0.017001,-0.004024
Status,-0.022112,-0.017978,0.000964,0.00237,0.751081,0.017001,1.0,0.013625
Price(In Lakhs),0.617463,-0.028734,-0.138816,0.170019,0.013207,-0.004024,0.013625,1.0


In [33]:
x=df.drop(['Price(In Lakhs)', 'Age(in yrs)'],axis=1)
y=df['Price(In Lakhs)']
x.columns

Index(['Size', 'Society', 'Location', 'total_sqft', 'Available by', 'Status'], dtype='object')

In [34]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=12345)

In [35]:
import pickle

def save_model(model, name):
    with open(name, 'wb') as file:
        pickle.dump(model, file)

In [36]:
def dectree():
    from sklearn.tree import DecisionTreeRegressor
    model = DecisionTreeRegressor()
    model.fit(x_train,y_train)
    return model

In [37]:
def ranfor():
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor()
    model.fit(x_train,y_train)
    save_model(model, 'rf.pkl')
    return model

In [38]:
def catboo():
    from catboost import CatBoostRegressor
    model = CatBoostRegressor(verbose=False)
    model.fit(x_train,y_train)
    return model

In [39]:
def xgboo():
    from xgboost import XGBRegressor
    model = XGBRegressor()
    model.fit(x_train,y_train)
    # save_model(model, 'xgb.pkl')
    return model

In [40]:
def linear_regression():
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(x_train,y_train)
    return model

In [41]:
def svm():
    from sklearn.svm import SVR
    model = SVR()
    model.fit(x_train,y_train)
    return model

In [42]:
DecisionTree = dectree()
RandomForest = ranfor()
CatBoost = catboo()
XGBoost = xgboo()
LinearRegression = linear_regression()
Svm = svm()

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [43]:
def model_eval(model):
    y_true=y_test
    y_pred=model.predict(x_test)
    from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score
    mae = mean_absolute_error(y_true , y_pred)
    print(mae)
    mse = mean_squared_error(y_true , y_pred)
    print(mse)
    r2 = r2_score(y_pred , y_true)
    print(r2*100)
    
    
    

In [44]:
model_eval(CatBoost)

17.1850692345059
1225.3212827947016
84.59644570207568


In [45]:
model_eval(DecisionTree)

13.047030776790628
1668.597505053313
82.409000205972


In [46]:
model_eval(RandomForest)

13.007431028654857
1021.1546633126287
87.06346599565497


In [47]:
model_eval(XGBoost)

16.473726983477142
1248.2740167181698
85.00206804762924


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [48]:
model_eval(LinearRegression)

39.467902630541595
5151.684715851846
-35.56135960554514


In [49]:
model_eval(Svm)

29.475387048261265
3850.3444887760115
-44.89554835865333
