In [14]:
import pandas as pd

# Load the housing dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — point DATA_PATH at
# your own copy of housing.csv (ideally a path relative to the notebook).
DATA_PATH = r"C:\Users\Roshan\Downloads\housing.csv"
data = pd.read_csv(DATA_PATH)

# Quick look: first rows, column dtypes / non-null counts, and summary statistics.
print(data.head())
data.info()  # info() prints its own report and returns None, so wrapping it in print() adds a stray "None"
print(data.describe())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [29]:
# Draw a single row to use as a spot-check example; the fixed seed keeps the
# pick reproducible.
# NOTE(review): the recorded output shows one-hot `ocean_proximity_*` columns,
# so this cell was last executed after the encoding step — confirm the
# intended cell order.
sample = data.sample(n=1, random_state=42)

print("Sample row:\n", sample)

Sample row:
        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
20046    -119.01     36.06                25.0       1505.0           435.0   

       population  households  median_income  median_house_value  \
20046      1392.0       359.0         1.6812             47700.0   

       ocean_proximity_INLAND  ocean_proximity_ISLAND  \
20046                    True                   False   

       ocean_proximity_NEAR BAY  ocean_proximity_NEAR OCEAN  
20046                     False                       False  


In [15]:
# Missing-value count for every column
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [18]:
# Impute the gaps in `total_bedrooms` with that column's median
bedrooms = data['total_bedrooms']
data['total_bedrooms'] = bedrooms.fillna(bedrooms.median())

In [20]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column so the remaining flags are not redundant.
data = pd.get_dummies(data=data, drop_first=True)

In [21]:
# Separate the value to predict from the predictor columns
y = data['median_house_value']                 # target
X = data.drop(columns=['median_house_value'])  # features

In [27]:
# Re-check the per-column missing-value counts
data.isna().sum()

longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
median_house_value            0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
dtype: int64

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predictions for the held-out rows. This assignment was missing, so the
# metrics below only worked when `y_pred` happened to linger in the kernel.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # square root puts the error back in target units
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R2 Score:", r2)

MAE: 31639.373406007748
RMSE: 49038.20933848149
R2 Score: 0.8164888965922015


In [24]:
import joblib

# Persist the fitted model to disk; the cell's output is the list of files joblib wrote.
joblib.dump(value=model, filename="house_price_model.pkl")

['house_price_model.pkl']

In [None]:
# Example input: the values of the dataset's first row, supplied with every
# feature column in X (eight numeric features plus the four one-hot
# `ocean_proximity_*` flags). A bare 8-value array does not match those
# 12 columns, so the model cannot predict from it. A DataFrame also keeps
# the column names that X carries.
test_input = pd.DataFrame({
    "longitude": [-122.23],
    "latitude": [37.88],
    "housing_median_age": [41.0],
    "total_rooms": [880.0],
    "total_bedrooms": [129.0],
    "population": [322.0],
    "households": [126.0],
    "median_income": [8.3252],
    # one-hot encoded ocean_proximity (this row is NEAR BAY)
    "ocean_proximity_INLAND": [False],
    "ocean_proximity_ISLAND": [False],
    "ocean_proximity_NEAR BAY": [True],
    "ocean_proximity_NEAR OCEAN": [False],
})

In [None]:
# Run the example row through the model and report the single predicted value
prediction = model.predict(test_input)
predicted_price = prediction[0]
print("Predicted House Price:", predicted_price)

In [31]:
import pandas as pd

# Hand-typed copy of the sampled row (index 20046 in the recorded output),
# listing the feature columns in the same order as the training frame.
sample_data = {
    "longitude": [-119.01],
    "latitude": [36.06],
    "housing_median_age": [25.0],
    "total_rooms": [1505.0],
    "total_bedrooms": [435.0],
    "population": [1392.0],
    "households": [359.0],
    "median_income": [1.6812],
    # one-hot encoded categorical features
    "ocean_proximity_INLAND": [True],
    "ocean_proximity_ISLAND": [False],
    "ocean_proximity_NEAR BAY": [False],
    "ocean_proximity_NEAR OCEAN": [False]
}

# Convert into a one-row DataFrame. It gets its own name on purpose: binding it
# to `X_test` would overwrite the held-out split from train_test_split and
# break any later re-run of the evaluation cell.
sample_features = pd.DataFrame(sample_data)

# Predict
prediction = model.predict(sample_features)

# Compare with original
print(" Predicted Price:", prediction[0])
print(" Original Price: 47700.0")

 Predicted Price: 51521.0
 Original Price: 47700.0


In [33]:
# Take the sampled row's features from the encoded feature frame `X` rather
# than from `sample` itself: `sample` is drawn from `data` in a cell that sits
# before imputation and one-hot encoding, so in top-to-bottom order it still
# holds the raw `ocean_proximity` string, which the model cannot use.
# Selecting by index label gives the model-ready version of that row.
X_sample = X.loc[sample.index]

# Keep it as a DataFrame so it carries the same column names as X.
test_input = X_sample

In [34]:
# Score the sampled row with the trained model and show it next to the true value
predicted_value = model.predict(test_input)
actual_value = sample["median_house_value"].iloc[0]

print("🔮 Predicted House Price:", predicted_value[0])
print("🏠 Original (Actual) House Price:", actual_value)

🔮 Predicted House Price: 51521.0
🏠 Original (Actual) House Price: 47700.0


