<a href="https://colab.research.google.com/github/smi-techie/ML-projects/blob/main/HousePricePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

print("--- House Price Prediction Project: Tools Ready! ---\n")

# Seed NumPy's global generator so the synthetic dataset is reproducible.
np.random.seed(42)

num_houses = 1500  # size of the synthetic sample

# Draw every feature up front. The draws happen in a fixed order because
# each call consumes the seeded global random stream.
square_feet = np.random.uniform(800, 3500, num_houses)
bedroom_counts = np.random.randint(2, 6, num_houses)  # integers 2..5
# Whole bathrooms (rounded uniform draw) plus an optional half bath.
whole_baths = np.random.uniform(1.0, 4.0, num_houses).round(0)
half_baths = 0.5 * np.random.randint(0, 2, num_houses)
build_years = np.random.randint(1950, 2024, num_houses)  # 1950..2023
# Neighborhood quality score (amenities, schools, etc.); drawn from 3 to 10.
neighborhood_scores = np.random.uniform(3, 10, num_houses)
miles_to_center = np.random.uniform(1, 20, num_houses)
crime_rates = np.random.uniform(0.5, 10, num_houses)

house_attributes = {
    'SquareFeet': square_feet,
    'NumBedrooms': bedroom_counts,
    'NumBathrooms': whole_baths + half_baths,
    'YearBuilt': build_years,
    'NeighborhoodScore': neighborhood_scores,
    'DistanceToCityCenter_Miles': miles_to_center,
    'CrimeRate_Per1000': crime_rates,
}

house_df = pd.DataFrame(house_attributes)

# Hypothetical linear pricing model plus Gaussian noise:
#   + size, bedrooms, bathrooms and neighborhood score raise the price
#   - age, distance to the city center and crime rate lower it
size_term = house_df['SquareFeet'] * 150
bedroom_term = house_df['NumBedrooms'] * 15000
bathroom_term = house_df['NumBathrooms'] * 10000
age_term = (2024 - house_df['YearBuilt']) * -500  # older houses are cheaper
neighborhood_term = house_df['NeighborhoodScore'] * 20000
distance_term = house_df['DistanceToCityCenter_Miles'] * -2000
crime_term = house_df['CrimeRate_Per1000'] * -5000
price_noise = np.random.normal(0, 25000, num_houses)

raw_price = (
    size_term + bedroom_term + bathroom_term + age_term
    + neighborhood_term + distance_term + crime_term + price_noise
)

# Bound the raw prices to [$100,000, $1,500,000], then min-max rescale so
# the cheapest house sits at $100,000 and the most expensive at $1,500,000.
bounded_price = raw_price.clip(lower=100000, upper=1500000)
lowest_price = bounded_price.min()
price_span = bounded_price.max() - lowest_price
house_df['House_Price'] = (bounded_price - lowest_price) / price_span * 1400000 + 100000


# Compute the summaries first, then report them in the same order.
first_rows = house_df.head()
print("--- Synthetic House Price Dataset Created! ---")
print("First 5 rows of the dataset:\n", first_rows)
print("\nDataset Info:")
house_df.info()  # prints its report directly

summary_stats = house_df.describe()
pairwise_corr = house_df.corr()
print("\n--- Data Exploration (Descriptive Statistics & Correlation) ---")
print("Descriptive Statistics:\n", summary_stats)
print("\nCorrelation Matrix:\n", pairwise_corr)

print("\n--- Data Exploration Complete! ---\n")

# Target (y) is the price column; features (X) are all remaining columns.
y = house_df['House_Price']
X = house_df.drop(columns=['House_Price'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("--- Data Prepared for ML Model! ---\n")

print("--- Training House Price Predictor (Random Forest Regressor) ---")
# Fit a 100-tree random forest on the scaled training features.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split.
y_pred_rf = rf_model.predict(X_test_scaled)

mse_rf = mean_squared_error(y_test, y_pred_rf)
# MSE is in squared dollars, so it must not carry a "$" prefix. RMSE is in
# the same units as the price and is the value that reads as a dollar amount.
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Prediction Performance (Mean Squared Error - MSE): {mse_rf:,.2f} (squared dollars)")
print(f"Prediction Performance (Root Mean Squared Error - RMSE): ${rmse_rf:,.2f}")
print(f"Prediction Performance (R-squared - R2): {r2_rf:.2f}")

print("\nMost important house attributes for price prediction:")
# Pair each importance with its column name and list the largest first.
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importances)

print("\n--- Model Trained and Evaluated! ---\n")

print("--- Predicting Price for a New House Sample ---")

# Attributes of one hypothetical house; keys match the training feature
# columns and are listed in the same order.
new_house_data = {
    'SquareFeet': [2200],
    'NumBedrooms': [4],
    'NumBathrooms': [2.5],
    'YearBuilt': [2010],
    'NeighborhoodScore': [8.5],
    'DistanceToCityCenter_Miles': [5],
    'CrimeRate_Per1000': [2.0]
}
new_house_df = pd.DataFrame(new_house_data)

# Scale the new data using the SAME SCALER fitted on training data
new_house_scaled = scaler.transform(new_house_df)

# Predict price using the Random Forest model
predicted_price = rf_model.predict(new_house_scaled)

# Report the attributes exactly as entered. Taking a row with
# `new_house_df.iloc[0]` would upcast the mixed int/float values to float
# and print 2200 as 2200.0 and 4 as 4.0.
new_house_attributes = {name: values[0] for name, values in new_house_data.items()}

print("\n--- Prediction for a New House ---")
print("New House Attributes:\n", new_house_attributes)
print(f"\nPredicted House Price: ${predicted_price[0]:,.2f}")

print("\n--- Project Complete! You've built a House Price Predictor! ---")




--- House Price Prediction Project: Tools Ready! ---

--- Synthetic House Price Dataset Created! ---
First 5 rows of the dataset:
     SquareFeet  NumBedrooms  NumBathrooms  YearBuilt  NeighborhoodScore  \
0  1811.258321            5           1.5       1966           3.701085   
1  3366.928627            3           4.5       1977           4.902023   
2  2776.383643            3           3.0       1990           8.711691   
3  2416.377907            5           3.5       2015           5.648209   
4  1221.250329            2           4.0       1974           3.618779   

   DistanceToCityCenter_Miles  CrimeRate_Per1000   House_Price  
0                    9.176012           2.953511  5.961982e+05  
1                    6.219396           5.067469  1.168590e+06  
2                   17.867279           9.924152  1.037859e+06  
3                    1.771129           7.023179  9.815133e+05  
4                   18.360822           8.492889  2.964255e+05  

Dataset Info:
<class 'panda