# House Price Prediction

In [7]:
# Importing libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import fetch_california_housing

In [95]:
data = fetch_california_housing()

In [96]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [97]:
#independent data
df = pd.DataFrame(data = data.data, columns = data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [98]:
# Dependent data
df['price'] = data.target

In [99]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# Exploratory data analysis 

In [13]:
import sweetviz as sv

report = sv.analyze(df)
report.show_html("./report.html")

                                             |                                             | [  0%]   00:00 ->…

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Data Preprocessing

In [24]:
## Feature Engineering

from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent = 'geopiExercises')

In [25]:
geolocator.reverse("37.88"+","+"-122.23")

Location(Ecological Study Area, Vollmer Peak Trail, Contra Costa County, California, 94563, United States, (37.87563745, -122.22856355341203, 0.0))

In [26]:
#function to get the loacation

def location(coord):
    Lattitude = str(coord[0])
    Longitude = str(coord[1])
    location = geolocator.reverse(Lattitude+","+Longitude).raw['address']

    # if value are missing, replace by a empty string
    if location.get('road') is None:
        location['road'] = None
        
    if location.get('county') is None:
        location['county'] = None
        
    loc_update['County'].append(location['county'])
    loc_update['Road'].append(location['road'])
    

In [27]:
"""

import pickle
loc_update = {
     "County" : [],
     "Road" : [],
     "Neighbourhood" : []
 }

for i , coord in enumerate(df.iloc[:,6:-1].values):
    location(coord)

    pickle.dump(loc_update, open('loc_update.pickle','wb'))

    if i%10 == 0:
        print(i)
"""


'\n\nimport pickle\nloc_update = {\n     "County" : [],\n     "Road" : [],\n     "Neighbourhood" : []\n }\n\nfor i , coord in enumerate(df.iloc[:,6:-1].values):\n    location(coord)\n\n    pickle.dump(loc_update, open(\'loc_update.pickle\',\'wb\'))\n\n    if i%10 == 0:\n        print(i)\n'

In [28]:
import pickle

loc_update = pickle.load(open("loc_update.pickle", "rb"))

In [29]:
loc_update

{'County': ['Contra Costa County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Alameda County',
  'Al

In [30]:
loc_update.keys()

dict_keys(['County', 'Road', 'Neighbourhood'])

In [31]:
loc_update['County']

['Contra Costa County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alameda County',
 'Alame

In [45]:
import pandas as pd
import pickle

# Load the pickle file
file_path = 'loc_update.pickle'  
with open(file_path, "rb") as file:
    data = pickle.load(file)

# Finding the maximum length among all lists in the dictionary
max_length = max(len(lst) for lst in data.values() if isinstance(lst, list))

# Padding shorter lists with None values
padded_data = {key: lst + [None] * (max_length - len(lst)) if isinstance(lst, list) else lst
for key, lst in data.items()}

# Converting the padded dictionary to a DataFrame
loc_update = pd.DataFrame(padded_data)


In [46]:
loc_update.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   County         20068 non-null  object
 1   Road           19647 non-null  object
 2   Neighbourhood  0 non-null      object
dtypes: object(3)
memory usage: 483.9+ KB


In [100]:
## add the new feature in the dataframe

for i in loc_update.keys():
    df[i] = loc_update[i]

df = df.sample(axis = 0, frac = 1)

In [48]:
df.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price,County,Road,Neighbourhood
13488,2.3578,31.0,5.290389,1.08589,1227.0,2.509202,34.1,-117.36,0.886,San Bernardino County,North Pine Avenue,
2591,2.4321,26.0,5.10076,1.055133,1353.0,2.572243,40.88,-124.09,0.821,Humboldt County,Eastern Avenue,
10298,8.0287,11.0,8.007212,0.985577,1460.0,3.509615,33.92,-117.85,3.718,Orange County,Handicap Ramp,
15622,4.1607,52.0,4.683363,1.093023,1328.0,2.375671,37.8,-122.41,3.5,,Columbus Avenue,
19804,2.7083,19.0,6.475,1.35,216.0,2.7,40.34,-123.48,0.645,Trinity County,Mule Ridge Road,
5710,3.4148,38.0,3.624352,1.010363,859.0,2.225389,34.21,-118.23,2.348,Los Angeles County,Montrose Avenue,
6020,3.1641,21.0,4.833333,1.106715,2558.0,3.067146,34.05,-117.82,1.173,Los Angeles County,South Campus Drive,
10791,5.2416,37.0,5.349081,0.994751,837.0,2.19685,33.62,-117.92,4.713,Orange County,Cliff Drive,
1578,8.2673,21.0,7.539615,0.957173,1396.0,2.989293,37.87,-122.03,3.587,Contra Costa County,Trotter Way,
15149,4.0806,19.0,5.942308,1.061538,1621.0,3.117308,32.9,-116.9,1.892,San Diego County,Via Vigneto,


In [101]:
#drop lattitude and longitude and neighbourhood column

df.drop(labels = ['Latitude', 'Longitude', 'Neighbourhood'], axis = 1, inplace = True)

In [102]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,price,County,Road
7662,3.9091,44.0,5.005587,1.128492,1115.0,3.114525,1.744,Los Angeles County,East 218th Place
16235,2.625,42.0,3.930818,0.899371,533.0,3.352201,0.654,San Joaquin County,East Hampton Street
14416,3.1838,20.0,3.783465,1.094488,525.0,2.066929,2.458,San Diego County,Haines Street
8322,1.2455,41.0,4.065217,1.086957,1805.0,3.567194,1.008,Los Angeles County,South Normandie Avenue
211,3.9048,52.0,4.99854,1.007299,2273.0,3.318248,1.647,Alameda County,Davis Street


In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20640 entries, 7662 to 9349
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   price       20640 non-null  float64
 7   County      20068 non-null  object 
 8   Road        19647 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.6+ MB


In [105]:
old_df = df

In [106]:
# it contain price column
old_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20640 entries, 7662 to 9349
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   price       20640 non-null  float64
 7   County      20068 non-null  object 
 8   Road        19647 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.6+ MB


In [59]:
# df.drop(labels = ['price'], axis = 1, inplace = True)

In [107]:
df = old_df

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20640 entries, 7662 to 9349
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   price       20640 non-null  float64
 7   County      20068 non-null  object 
 8   Road        19647 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.6+ MB


## Using Classification Algorithem to fill the missing Categorical values 

In [109]:
df['County'].isnull().sum()

572

In [110]:
df['Road'].isnull().sum()

993

In [111]:
df.keys()

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'price', 'County', 'Road'],
      dtype='object')

In [112]:
# MIssing Road data

missing_idx = []

for i in range(df.shape[0]):
    if df['Road'][i] is None:
        missing_idx.append(i)


# Independent Parameter 

missing_road_X_train = np.array([[df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range (df.shape[0]) if i not in missing_idx ])


## Dependent Parameter

missing_road_y_train = np.array([[df['Road'][i]] for i in range(df.shape[0]) if i not in missing_idx])

missing_road_X_test = np.array([[df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range (df.shape[0]) if i not in missing_idx ])


## Model Train 

In [113]:
from sklearn.linear_model import SGDClassifier

model_1 = SGDClassifier()

model_1.fit(missing_road_X_train, missing_road_y_train)

missing_road_y_pred = model_1.predict(missing_road_X_test)

  y = column_or_1d(y, warn=True)


In [114]:
for n, i in enumerate(missing_idx):
    df['Road'][i] = missing_idx[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Road'][i] = missing_idx[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Road'][i] = missing_idx[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Road'][i] = missing_idx[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Road'][i] = missing_idx[n]
A value is trying to be set on a copy of a s

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20640 entries, 7662 to 9349
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   price       20640 non-null  float64
 7   County      20068 non-null  object 
 8   Road        20640 non-null  object 
dtypes: float64(7), object(2)
memory usage: 2.1+ MB


In [117]:
df['Road'] = df['Road'].astype(str)

In [118]:
# label encoding

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Road'] = le.fit_transform(df['Road'])

In [119]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,price,County,Road
7662,3.9091,44.0,5.005587,1.128492,1115.0,3.114525,1.744,Los Angeles County,3292
16235,2.625,42.0,3.930818,0.899371,533.0,3.352201,0.654,San Joaquin County,3482
14416,3.1838,20.0,3.783465,1.094488,525.0,2.066929,2.458,San Diego County,4517
8322,1.2455,41.0,4.065217,1.086957,1805.0,3.567194,1.008,Los Angeles County,8560
211,3.9048,52.0,4.99854,1.007299,2273.0,3.318248,1.647,Alameda County,3034


## Missing County Data 

In [120]:
# MIssing Road data

missing_idx = []

for i in range(df.shape[0]):
    if df['County'][i] is None:
        missing_idx.append(i)


# Independent Parameter 

missing_county_X_train = np.array([[df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range (df.shape[0]) if i not in missing_idx ])


## Dependent Parameter

missing_county_y_train = np.array([[df['County'][i]] for i in range(df.shape[0]) if i not in missing_idx])

missing_county_X_test = np.array([[df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range (df.shape[0]) if i not in missing_idx ])


In [121]:
from sklearn.linear_model import SGDClassifier

model_1 = SGDClassifier()

model_1.fit(missing_county_X_train, missing_county_y_train)

missing_road_y_pred = model_1.predict(missing_county_X_test)

  y = column_or_1d(y, warn=True)


In [122]:
for n, i in enumerate(missing_idx):
    df['County'][i] = missing_idx[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['County'][i] = missing_idx[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['County'][i] = missing_idx[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['County'][i] = missing_idx[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['County'][i] = missing_idx[n]
A value is trying to be set on a cop

In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20640 entries, 7662 to 9349
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   price       20640 non-null  float64
 7   County      20640 non-null  object 
 8   Road        20640 non-null  int32  
dtypes: float64(7), int32(1), object(1)
memory usage: 2.0+ MB


In [124]:
df['County'] = df['County'].astype(str)

In [125]:
# label encoding

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['County'] = le.fit_transform(df['County'])

In [126]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,price,County,Road
7662,3.9091,44.0,5.005587,1.128492,1115.0,3.114525,1.744,592,3292
16235,2.625,42.0,3.930818,0.899371,533.0,3.352201,0.654,612,3482
14416,3.1838,20.0,3.783465,1.094488,525.0,2.066929,2.458,611,4517
8322,1.2455,41.0,4.065217,1.086957,1805.0,3.567194,1.008,592,8560
211,3.9048,52.0,4.99854,1.007299,2273.0,3.318248,1.647,572,3034


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20640 entries, 7662 to 9349
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   price       20640 non-null  float64
 7   County      20640 non-null  int32  
 8   Road        20640 non-null  int32  
dtypes: float64(7), int32(2)
memory usage: 1.9 MB


### Understanding which model is used

In [130]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,price,County,Road
7662,3.9091,44.0,5.005587,1.128492,1115.0,3.114525,4.526,592,3292
16235,2.625,42.0,3.930818,0.899371,533.0,3.352201,3.585,612,3482
14416,3.1838,20.0,3.783465,1.094488,525.0,2.066929,3.521,611,4517
8322,1.2455,41.0,4.065217,1.086957,1805.0,3.567194,3.413,592,8560
211,3.9048,52.0,4.99854,1.007299,2273.0,3.318248,3.422,572,3034


In [156]:
## Split the column

y = df.iloc[:, -3].values

df.drop(labels = ["County", 'Road'], axis = 1, inplace = True)


x = df.iloc[:,:].values

In [157]:
y

array([3.11452514, 3.35220126, 2.06692913, ..., 2.79381443, 1.9872449 ,
       2.40196078])

In [158]:
x

array([[3.90910000e+00, 4.40000000e+01, 5.00558659e+00, 1.12849162e+00,
        1.11500000e+03, 3.11452514e+00],
       [2.62500000e+00, 4.20000000e+01, 3.93081761e+00, 8.99371069e-01,
        5.33000000e+02, 3.35220126e+00],
       [3.18380000e+00, 2.00000000e+01, 3.78346457e+00, 1.09448819e+00,
        5.25000000e+02, 2.06692913e+00],
       ...,
       [5.03800000e+00, 1.30000000e+01, 5.95876289e+00, 9.07216495e-01,
        1.08400000e+03, 2.79381443e+00],
       [3.69760000e+00, 2.50000000e+01, 3.53061224e+00, 1.08418367e+00,
        1.55800000e+03, 1.98724490e+00],
       [4.69440000e+00, 5.20000000e+01, 5.99019608e+00, 1.05882353e+00,
        7.35000000e+02, 2.40196078e+00]])

In [159]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

## Model Training 

In [160]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(X_train, y_train)

In [161]:
## Model Prediction

y_pred = model.predict(X_test)

In [162]:
# Model Accuracy

from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.5037136765372814