In [None]:
# 🏠 Bengaluru House Price Prediction

This project focuses on predicting house prices in Bengaluru using the Bengaluru House Data dataset. Each listing includes features such as area type, availability, location, size (number of bedrooms), total square footage, and the number of bathrooms and balconies, which are used to estimate the listing price.

The goal is to build a regression model that can predict housing prices based on these features. This can be useful for real estate insights, investment planning, and understanding what factors influence home prices.

We'll go through data exploration, preprocessing, model training, and evaluation to build a reliable predictive model.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Raw listings file. NOTE(review): the path looks like a Colab-mounted Google
# Drive location, so it will not resolve on other machines — confirm.
DATA_PATH = '/content/drive/MyDrive/Data set/Bengaluru_House_Data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
# The 'society' column is not used as a feature; discard it.
df = df.drop(columns=['society'])

In [None]:
# (rows, columns) after removing 'society'.
df.shape

(13320, 8)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
total_sqft,0
bath,73
balcony,609
price,0


In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# (rows, columns) once rows with missing values are gone.
df.shape

(12710, 8)

In [None]:
# Distinct area_type labels present in the data.
df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [None]:
# One encoder object, re-fitted for each categorical column below; after each
# fit_transform it only remembers the classes of the most recent column.
le = LabelEncoder()

In [None]:
# Replace the area_type strings with integer codes.
df['area_type'] = le.fit_transform(df['area_type'])

In [None]:
# Check the encoded area_type column.
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,3,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,2,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,0,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,3,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,3,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [None]:
# Grab the availability column; the following cell wraps it in its own frame
# (df1) before normalising and encoding it.
di = df['availability']


In [None]:
df1 = pd.DataFrame(di)

# Values shaped like '19-Dec' (two digits, a dash, three letters) are treated
# as future possession dates and collapsed into one 'Not Ready to Move' label.
# str.match anchors only at the start of the string.
date_pattern = r'\d{2}-[A-Za-z]{3}'
df1['availability'] = np.where(
    df1['availability'].str.match(date_pattern),
    'Not Ready to Move',
    df1['availability'],
)
print("\nDataFrame after replacement:")
df1


DataFrame after replacement:


Unnamed: 0,availability
0,Not Ready to Move
1,Ready To Move
2,Ready To Move
3,Ready To Move
4,Ready To Move
...,...
13314,Ready To Move
13315,Ready To Move
13317,Ready To Move
13318,Not Ready to Move


In [None]:
# Integer-encode the normalised availability labels (re-fits the shared encoder).
df1['availability'] = le.fit_transform(df1['availability'])

In [None]:
# Sanity check: df1 should have one column and as many rows as df.
df1.shape

(12710, 1)

In [None]:
# Remove the original string availability column; the encoded one is added back next.
df = df.drop(columns=['availability'])

In [None]:
# Attach the encoded availability column (pandas aligns the values on the row index).
df['availability'] = df1['availability']

In [None]:
# Display the frame with the encoded availability column appended.
df

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,availability
0,3,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,0
1,2,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00,1
2,0,Uttarahalli,3 BHK,1440,2.0,3.0,62.00,1
3,3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00,1
4,3,Kothanur,2 BHK,1200,2.0,1.0,51.00,1
...,...,...,...,...,...,...,...,...
13314,3,Green Glen Layout,3 BHK,1715,3.0,3.0,112.00,1
13315,0,Whitefield,5 Bedroom,3453,4.0,0.0,231.00,1
13317,0,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00,1
13318,3,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00,0


In [None]:
# Listings per location, most frequent first.
df['location'].value_counts()

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,514
Sarjapur Road,372
Electronic City,300
Kanakpura Road,261
Thanisandra,231
Yelahanka,206
Uttarahalli,186
Hebbal,173
Raja Rajeshwari Nagar,168
Marathahalli,164


In [None]:
# Minimum number of listings a location needs in order to be kept.
val = 199
# Drop every location group that has fewer than `val` rows.
df = df.groupby('location').filter(lambda grp: grp.shape[0] >= val)
df

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,availability
5,3,Whitefield,2 BHK,1170,2.0,1.0,38.0,1
10,3,Whitefield,3 BHK,1800,2.0,2.0,70.0,0
11,2,Whitefield,4 Bedroom,2785,5.0,3.0,295.0,1
22,2,Thanisandra,4 Bedroom,2800,5.0,2.0,380.0,1
26,3,Electronic City,2 BHK,660,1.0,1.0,23.1,1
...,...,...,...,...,...,...,...,...
13290,3,Sarjapur Road,4 BHK,4050,2.0,1.0,450.0,1
13293,3,Sarjapur Road,4 BHK,2425,5.0,1.0,195.0,1
13297,3,Electronic City,2 BHK,1060,2.0,1.0,52.0,1
13299,3,Whitefield,4 BHK,2830 - 2882,5.0,0.0,154.5,0


In [None]:
# Integer-encode the remaining location names (re-fits the shared encoder).
df['location'] = le.fit_transform(df['location'])

In [None]:
# Check the encoded location column.
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,availability
5,3,4,2 BHK,1170,2.0,1.0,38.0,1
10,3,4,3 BHK,1800,2.0,2.0,70.0,0
11,2,4,4 Bedroom,2785,5.0,3.0,295.0,1
22,2,3,4 Bedroom,2800,5.0,2.0,380.0,1
26,3,0,2 BHK,660,1.0,1.0,23.1,1


In [None]:
# Confirm no missing values remain after filtering and encoding.
df.isnull().sum()

Unnamed: 0,0
area_type,0
location,0
size,0
total_sqft,0
bath,0
balcony,0
price,0
availability,0


In [None]:
# 'size' values look like '2 BHK' or '4 Bedroom'; the token before the first
# space is the room count.
df['bhk'] = [int(entry.split(' ')[0]) for entry in df['size']]
df.bhk.unique()

array([2, 3, 4, 1, 5, 6, 7, 8])

In [None]:
# 'size' is now represented by the numeric 'bhk' column.
df = df.drop('size', axis=1)

In [None]:
# Display the frame; total_sqft is still stored as strings here (e.g. '2830 - 2882').
df

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,availability,bhk
5,3,4,1170,2.0,1.0,38.0,1,2
10,3,4,1800,2.0,2.0,70.0,0,3
11,2,4,2785,5.0,3.0,295.0,1,4
22,2,3,2800,5.0,2.0,380.0,1,4
26,3,0,660,1.0,1.0,23.1,1,2
...,...,...,...,...,...,...,...,...
13290,3,2,4050,2.0,1.0,450.0,1,4
13293,3,2,2425,5.0,1.0,195.0,1,4
13297,3,0,1060,2.0,1.0,52.0,1,2
13299,3,4,2830 - 2882,5.0,0.0,154.5,0,4


In [None]:
# Inspect the raw total_sqft strings; some are ranges such as '2100 - 2850'.
df['total_sqft'].unique()

array(['1170', '1800', '2785', '2800', '660', '1610', '1025',
       '2100 - 2850', '700', '1254', '1330.74', '1459', '2010', '1600',
       '5700', '1005', '1326', '1665', '1000', '1296', '1116', '1530',
       '2497', '1436', '1427', '880', '950', '3050', '1735', '2050',
       '4200', '1500', '1060', '1152', '1350', '770', '2144', '1070',
       '2250', '1327', '1225', '1075', '1282', '589', '1787', '984',
       '1100', '1200', '910', '1108', '1035', '1017', '1863', '525',
       '1550', '1280', '1128', '1173', '1910', '3252', '11890', '1670',
       '630', '1346', '1130', '1040', '1195', '1599', '1150', '1135',
       '1768', '4144', '2100', '2230', '2500', '1249', '4104', '1720',
       '1210', '1120 - 1145', '1702', '35000', '1140', '1125', '1640',
       '1094', '805', '1155', '2280', '1185', '1650', '940', '1320',
       '1705', '1447', '1114', '1180', '1339', '1691', '1342', '1360',
       '1278', '1897', '2238', '2225', '1590', '1000 - 1285', '1495',
       '1115', '1113', '

In [None]:
def convert_range_to_average(sqft):
    """Convert a ``total_sqft`` entry into a single float.

    Parameters
    ----------
    sqft : str or number
        A plain number (``'1170'``) or a range written as ``'low - high'``
        (``'2100 - 2850'``). Already-numeric values are accepted as well, so
        the conversion can safely be applied more than once.

    Returns
    -------
    float or None
        The number itself, the mean of the two range endpoints, or ``None``
        when the entry cannot be parsed (for example ``'34.46Sq. Meter'``, a
        malformed range, ``None`` or NaN).
    """
    if isinstance(sqft, str):
        # A range contributes its first two ' - '-separated endpoints;
        # a plain number yields a single-element list.
        parts = sqft.split(' - ')[:2]
    else:
        parts = [sqft]

    try:
        numbers = [float(part) for part in parts]
    except (TypeError, ValueError):
        # Non-numeric text or an unsupported type: report as missing.
        return None

    result = sum(numbers) / len(numbers)
    # NaN is the only float that is not equal to itself; treat it as missing.
    return None if result != result else result


In [None]:
# Turn every total_sqft string into a float; unparseable entries become missing.
df['total_sqft'] = df['total_sqft'].map(convert_range_to_average)

In [None]:
# Display the frame with total_sqft converted to floats.
df

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,availability,bhk
5,3,4,1170.0,2.0,1.0,38.0,1,2
10,3,4,1800.0,2.0,2.0,70.0,0,3
11,2,4,2785.0,5.0,3.0,295.0,1,4
22,2,3,2800.0,5.0,2.0,380.0,1,4
26,3,0,660.0,1.0,1.0,23.1,1,2
...,...,...,...,...,...,...,...,...
13290,3,2,4050.0,2.0,1.0,450.0,1,4
13293,3,2,2425.0,5.0,1.0,195.0,1,4
13297,3,0,1060.0,2.0,1.0,52.0,1,2
13299,3,4,2856.0,5.0,0.0,154.5,0,4


In [None]:
# Remove rows whose total_sqft could not be parsed (they are missing after conversion).
df = df.dropna()

In [None]:
# Confirm the modelling frame has no missing values.
df.isnull().sum()

Unnamed: 0,0
area_type,0
location,0
total_sqft,0
bath,0
balcony,0
price,0
availability,0
bhk,0


In [None]:
# Linear regression: 'price' is the target, every other column is a feature.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# One-hot encode the integer-coded categorical columns so their codes are not
# treated as ordered quantities.
X = pd.get_dummies(
    X,
    columns=['area_type', 'location'],
)

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split and the
# metrics below are reproducible, and so that Linear Regression is scored on
# the same rows as the Random Forest further down (which also uses random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit an ordinary least-squares baseline on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred)

3630.1281953229404

In [None]:
# R² of the Linear Regression predictions on the test split.
r2_score(y_test, y_pred)

0.6215901584161925

In [None]:
# Mean absolute error of the Linear Regression predictions on the test split.
mean_absolute_error(y_test, y_pred)

32.17052589943044

In [None]:
# Random forest: rebuild features and target from the cleaned frame.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# One-hot encode the integer-coded categorical columns.
X = pd.get_dummies(
    X,
    columns=['area_type', 'location'],
)

In [None]:
# Seeded 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seeded forest of 100 trees, fitted on the training split.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

In [None]:
# Random Forest predictions for the held-out rows (overwrites the earlier y_pred).
y_pred = rf.predict(X_test)

In [None]:
# Mean squared error of the Random Forest predictions on the test split.
mean_squared_error(y_test, y_pred)

2329.9618317151226

In [None]:
# R² of the Random Forest predictions on the test split.
r2_score(y_test, y_pred)

0.7630075935292948

In [None]:
## ✅ Conclusion

In this project, we explored the Bengaluru House Data dataset and built regression models to predict house prices. After preprocessing and feature encoding, we trained a **Linear Regression** baseline and a **Random Forest Regressor** and evaluated them using metrics like MSE and the R² score.

**Key Takeaways:**
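- In the recorded run, the Random Forest reached an R² of about 0.76 (MSE ≈ 2330) versus about 0.62 (MSE ≈ 3630) for Linear Regression; note that the two models were scored on different random test splits.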
- The Random Forest model performed well, handling non-linear relationships and interactions between features effectively.
- This project demonstrates how ensemble methods can improve predictive performance over simpler models like Linear Regression.

**Next Steps:**
- Tune hyperparameters using GridSearchCV or RandomizedSearchCV to improve accuracy.
- Compare performance with other models (e.g., Gradient Boosting, XGBoost).
- Deploy the model via a simple web app or REST API for real-time predictions.

Thanks for checking out the project!
