# House Price Prediction Using ML

## Downloading and Exploring a Kaggle Dataset

In [1]:
import opendatasets as od

In [2]:
# Kaggle page of the housing dataset; handed to od.download() in the next cell.
dataset= "https://www.kaggle.com/datasets/shibumohapatra/house-price"

In [3]:
# Download the dataset next to the notebook. As the output shows, an existing
# copy is reused unless force=True is passed.
od.download(dataset)

Skipping, found downloaded files in ".\house-price" (use force=True to force download)


In [4]:
# Folder the download step reported writing to (".\house-price").
link= "house-price"

In [5]:
import os

In [6]:
# Check which files the download produced.
os.listdir(link)

['.ipynb_checkpoints', '1553768847-housing.csv']

In [78]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 787.7 kB/s eta 0:02:07
   ---------------------------------------- 0.2/99.8 MB 1.7 MB/s eta 0:00:58
   ---------------------------------------- 0.5/99.8 MB 2.8 MB/s eta 0:00:36
   ---------------------------------------- 0.9/99.8 MB 3.9 MB/s eta 0:00:26
    --------------------------------------- 1.3/99.8 MB 4.8 MB/s eta 0:00:21
    --------------------------------------- 1.4/99.8 MB 4.4 MB/s eta 0:00:23
    --------------------------------------- 1.8/99.8 MB 4.9 MB/s eta 0:00:20
    --------------------------------------- 2.1/99.8 MB 5.1 MB/s eta 0:00:20
    --------------------------------------- 2.4/99.8 MB 5.3 MB/s eta 0:00:19
    --------------

## Import Necessary Libraries

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

## Data Loading and Preprocessing

In [10]:
# Build the path from `link` so the download folder is named in one place only.
data= pd.read_csv(os.path.join(link,"1553768847-housing.csv"))

In [11]:
# Preview the raw frame (pandas shortens the display to the first and last rows).
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND,78100
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND,77100
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,INLAND,92300
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND,84700


In [12]:
# Count missing values per column.
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [13]:
# Show every row that has at least one missing value.
data.loc[data.isna().any(axis="columns")]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
290,-122.16,37.77,47,1256,,570,218,4.3750,NEAR BAY,161900
341,-122.17,37.75,38,992,,732,259,1.6196,NEAR BAY,85100
538,-122.28,37.78,29,5154,,3741,1273,2.5762,NEAR BAY,173400
563,-122.24,37.75,45,891,,384,146,4.9489,NEAR BAY,247100
696,-122.10,37.69,41,746,,387,161,3.9063,NEAR BAY,178400
...,...,...,...,...,...,...,...,...,...,...
20267,-119.19,34.20,18,3620,,3171,779,3.3409,NEAR OCEAN,220500
20268,-119.18,34.19,19,2393,,1938,762,1.6953,NEAR OCEAN,167400
20372,-118.88,34.17,15,4260,,1701,669,5.1033,<1H OCEAN,410700
20460,-118.75,34.29,17,5512,,2734,814,6.6073,<1H OCEAN,258100


In [14]:
# Discard the incomplete rows in place; per the null counts above only
# total_bedrooms has missing values (207 rows).
data.dropna(inplace=True)

In [15]:
# Row/column count after removing the incomplete rows.
data.shape

(20433, 10)

In [16]:
# Category frequencies of the only non-numeric column (ISLAND has just 5 rows).
data["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: count, dtype: int64

In [17]:
# Rename the "<1H OCEAN" level so the dummy column created from it has a plain name.
# NOTE(review): presumably needed because XGBoost rejects feature names
# containing "<" - confirm against the installed version.
data["ocean_proximity"]= data["ocean_proximity"].replace({"<1H OCEAN":"lessthan_1H"})

In [18]:
# One-hot encode the category as 0/1 integer columns, one column per level.
ocean_proximity= pd.get_dummies(data["ocean_proximity"],dtype=int)

In [19]:
# Append the indicator columns to the frame side by side (axis=1).
# Re-running this cell would append them a second time.
data= pd.concat([data,ocean_proximity],axis=1)

In [20]:
# The text column is now redundant with its indicator columns. Rebind the
# result instead of mutating with inplace=True.
data= data.drop(columns="ocean_proximity")

In [21]:
# Preview the frame after encoding: the text column is gone, five 0/1 columns added.
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,lessthan_1H
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,1,0,0,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,1,0,0,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,1,0,0,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,1,0,0,0,0


In [22]:
# Final column order; the hand-built sample at the end of the notebook relies on it.
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN',
       'lessthan_1H'],
      dtype='object')

In [23]:
# Summary statistics for every column.
# NOTE(review): the median_house_value maximum of 500001 looks like a capped
# value - confirm against the dataset description.
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,lessthan_1H
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155,0.317917,0.000245,0.111095,0.128615,0.442128
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099,0.465678,0.015641,0.314257,0.334782,0.496652
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,0.0,0.0,0.0,0.0,0.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0,0.0,0.0,0.0,0.0,0.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0,0.0,0.0,0.0,0.0,0.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0,1.0,0.0,0.0,0.0,1.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0,1.0,1.0,1.0,1.0,1.0


## Model Training and Evaluation

In [24]:
# Features are every column except the target; the target is the median house value.
x = data.drop(columns="median_house_value")
y = data.loc[:, "median_house_value"]

In [25]:
# Preview the feature matrix (13 columns, target removed).
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,lessthan_1H
0,-122.23,37.88,41,880,129.0,322,126,8.3252,0,0,1,0,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,0,0,1,0,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,0,0,1,0,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,0,0,1,0,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,1,0,0,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,1,0,0,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,1,0,0,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,1,0,0,0,0


In [26]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [27]:
# Shapes of the full, training and test feature sets, printed on one line.
print(*(frame.shape for frame in (x, x_train, x_test)))

(20433, 13) (16346, 13) (4087, 13)


In [28]:
# Gradient-boosted regressor with the library's default hyperparameters.
model= XGBRegressor()

In [29]:
# Train on the training split only; the test split stays unseen until evaluation.
model.fit(x_train,y_train)

In [30]:
# Predictions on the rows the model was trained on (optimistic by construction).
predict_xtrain= model.predict(x_train)

In [31]:
# Despite the name, this is the R^2 (coefficient of determination) on the
# training split, not a classification accuracy.
accuracy_score= r2_score(y_train,predict_xtrain)

In [32]:
# Training-split R^2.
accuracy_score

0.9385726468921205

In [33]:
# Predictions on the held-out rows.
predict_xtest= model.predict(x_test)

In [34]:
# R^2 on the held-out split; this is the figure that estimates performance on new data.
accuracy_xtest= r2_score(y_test,predict_xtest)

In [35]:
# Test R^2 (~0.83) is clearly below the training R^2 (~0.94): the default model
# fits the training rows better than it generalises.
accuracy_xtest

0.829608240438699

In [36]:
# Predict the median house value for one hand-entered district.
# Values follow the column order of `x`; the last five entries are the one-hot
# ocean_proximity flags (INLAND, ISLAND, NEAR BAY, NEAR OCEAN, lessthan_1H).
input_data= (-121.92,37.53,7,28258,3864,12203,3701,8.4045,0,0,0,0,1)
# Wrap the sample in a one-row frame labelled with the training columns, so the
# model receives the same named schema it was fitted on and a wrong number of
# values fails here with a clear error.
sample= pd.DataFrame([input_data],columns=x.columns)
output= model.predict(sample)
print(output)

[437450.62]
