In [191]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [192]:
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,TITLE,LOCATION,PRICE,LAND AREA,BUILDUP AREA,ROAD ACCESS,FACING,FLOOR,BEDROOM,BATHROOM,BUILT YEAR,PARKING,AMENITIES
0,House for Sale,"Imadol, Lalitpur",Rs. 2.9 Cr,4.0 aana,,12 Feet,West,3.0,5.0,4.0,2076 B.S,1 CaRs. & 2 Bikes,"['Earthquake Resistant', 'Marbel', 'Parquet', ..."
1,House for Sale,"Satdobato, Lalitpur",Rs. 4.75 Cr,3.0 aana,,10 Feet,West,4.5,5.0,6.0,2076 B.S,2 CaRs. & 2 Bikes,"['Earthquake Resistant', 'Parquet', 'Drinking ..."
2,4 BHK House for Sale,"Imadol, Lalitpur",Rs. 1.99 Cr,2.3 aana,,10 Feet,West,2.5,4.0,4.0,2060 B.S,1 CaRs. & 3 Bikes,"['Earthquake Resistant', 'Marbel', 'Parquet', ..."
3,Bungalow House for Sale,"Bhaisepati, Lalitpur",Rs. 4 Cr,7.0 aana,,12 Feet,North-West,2.5,4.0,3.0,2059 B.S,4 CaRs. & 4 Bikes,"['Earthquake Resistant', 'Marbel', 'Parquet', ..."
4,House for Rent,"Maharajgunj, Kathmandu",Rs. 12000000,6.0 aana,,20 Feet,South,2.0,4.0,4.0,2071 B.S,4 CaRs. & 5 Bikes,"['Earthquake Resistant', 'Parquet', 'Parking',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3413,Padma Colony Phase III,"Sitapiala, Kathmandu","Rs. 26,000,000",4.5 aana,,16 Feet,North,2.5,4.0,4.0,2076 B.S,1 Car & 3 Bikes,"['Garage', 'Parking']"
3414,Bhatbhateni Apartment,"Bhatbhateni, Kathmandu","Rs. 30,000,000",4.5 aana,1700 Sq.ft,16 Feet,South,3.0,3.0,3.0,2076 B.S,1 Car & 3 Bikes,"['Garage', 'Parking']"
3415,स्यूचाटार,"Syuchatar, Kathmandu",Rs. 16000000,4.7 aana,1700 Sq.ft,16 Feet,South,2.5,4.0,2.0,2072 B.S,1 Car & 3 Bikes,['Garage']
3416,Sano Bharayang Colony,"Sano Bharayang, Kathmandu",Rs. 3.5 Cr,6.3 aana,3505 Sq.ft,16 Feet,North-West,2.5,5.0,3.0,2077 B.S,2 CaRs. & 7 Bikes,"['Lawn', 'Garage', 'Air Condition', 'Backyard'..."


In [193]:
df = df.drop(columns=["AMENITIES", "BUILDUP AREA"], errors="ignore")
# Remove"House for Rent" from title
df = df[~df["TITLE"].str.contains("House for Rent", case=False, na=False)]
df["LOCATION"] = df["LOCATION"].apply(lambda x: str(x).split()[-1] if isinstance(x, str) else x)


In [194]:

def convert_price(price):
    price = str(price).replace("Rs.", "").replace(",", "").strip()
    if "Cr" in price:
        return float(price.replace("Cr", "").strip()) * 10_000_000
    elif "Lakh" in price:
        return float(price.replace("Lakh", "").strip()) * 100_000
    else:
        return float(price) if price.isdigit() else np.nan

def clean_land_area(area):
    area = str(area).strip().lower()
    match = re.search(r"(\d+\.\d+|\d+)", area)
    number = float(match.group(1)) if match else np.nan
    
    if "aana" in area:
        return number * 342.25  #aana to square feet
    elif "kattha" in area:
        return number * 3388.98  # kattha to square feet
    elif "sq. mtr" in area:
        return number * 10.7639  # square meters to square feet
    elif "sq. ft" in area:
        return number
    return np.nan

def convert_road_access(road):
    match = re.search(r"([\d.]+)\s*Feet", str(road))
    return float(match.group(1)) if match else np.nan

def extract_parking(parking):
    cars = re.search(r"(\d+)\s*CaRs?", str(parking))
    bikes = re.search(r"(\d+)\s*Bikes?", str(parking))
    return int(cars.group(1)) if cars else 0, int(bikes.group(1)) if bikes else 0


In [195]:
# Apply transformations
df["PRICE"] = df["PRICE"].apply(convert_price)
df["LAND AREA"] = df["LAND AREA"].apply(clean_land_area)
df["ROAD ACCESS"] = df["ROAD ACCESS"].apply(convert_road_access)
df["CARS"], df["BIKES"] = zip(*df["PARKING"].apply(extract_parking))

# Drop unnecessary columns
df = df.drop(columns=["TITLE", "PARKING", "BUILT YEAR"], errors="ignore")

# one-hot encoding
df = pd.get_dummies(df, columns=["LOCATION", "FACING"], drop_first=True)

df = df.fillna(df.median())

In [196]:
X = df.drop(columns=["PRICE"])
y = df["PRICE"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [197]:
X

Unnamed: 0,LAND AREA,ROAD ACCESS,FLOOR,BEDROOM,BATHROOM,CARS,BIKES,LOCATION_Banke,LOCATION_Bhaktapur,LOCATION_Chitwan,...,FACING_South-east,FACING_WEST,FACING_WEST / NORTH,FACING_WEST-NORTH,FACING_WEST-SOUTH,FACING_West,FACING_West-South,FACING_south,FACING_south west,FACING_west
0,1369.000,12.0,3.0,5.0,4.0,1,2,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,1026.750,10.0,4.5,5.0,6.0,2,2,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,787.175,10.0,2.5,4.0,4.0,1,3,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,2395.750,12.0,2.5,4.0,3.0,4,4,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,1095.200,13.0,2.5,4.0,3.0,4,5,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3413,1540.125,16.0,2.5,4.0,4.0,0,3,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3414,1540.125,16.0,3.0,3.0,3.0,0,3,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3415,1608.575,16.0,2.5,4.0,2.0,0,3,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3416,2156.175,16.0,2.5,5.0,3.0,2,7,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [198]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

In [199]:
# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


Mean Absolute Error (MAE): 12332280.469481787
Root Mean Squared Error (RMSE): 21994546.19696886
R² Score: 0.2598785248397527


In [200]:
predictions_df = pd.DataFrame({"Actual Price": y_test, "Predicted Price": y_pred})
# Display the DataFrame
print(predictions_df.head())

      Actual Price  Predicted Price
412     45000000.0       51824000.0
2352    21500000.0       24607500.0
3292    18500000.0       21559000.0
2754    32000000.0       40317000.0
339     46500000.0       34935000.0


In [201]:
import joblib

joblib.dump(rf_model, "house_price_model.pkl")
print("Model saved successfully!")


Model saved successfully!


In [202]:
test_data = {
    "LAND AREA": 1000,  # Example: 1500 sq. ft
    "ROAD ACCESS": 12,  # Example: 12 feet road access
    "FLOOR": 2,  # Example: 2.5 floors
    "BEDROOM": 3,  # Example: 4 bedrooms
    "BATHROOM": 3,  # Example: 3 bathrooms
    "CARS": 0,  # Example: Parking for 2 cars
    "BIKES": 2,  # Example: Parking for 3 bikes

    # One-hot encoded location (Only one location is True, others are False)
    "LOCATION_Banke": False,
    "LOCATION_Bhaktapur": False,
    "LOCATION_Chitwan": True,
    "LOCATION_Kathmandu": False,  # Example: Property is in Kathmandu
    "LOCATION_Lalitpur": False,

    # One-hot encoded facing direction (Only one is True)
    "FACING_North": False,
    "FACING_East": True,  # Example: Property faces East
    "FACING_South": False,
    "FACING_West": False,
    "FACING_South-East": False,
}

print(test_data)


{'LAND AREA': 1000, 'ROAD ACCESS': 12, 'FLOOR': 2, 'BEDROOM': 3, 'BATHROOM': 3, 'CARS': 0, 'BIKES': 2, 'LOCATION_Banke': False, 'LOCATION_Bhaktapur': False, 'LOCATION_Chitwan': True, 'LOCATION_Kathmandu': False, 'LOCATION_Lalitpur': False, 'FACING_North': False, 'FACING_East': True, 'FACING_South': False, 'FACING_West': False, 'FACING_South-East': False}


In [203]:
# Convert dummy_data to DataFrame
dummy_df = pd.DataFrame([test_data])

# Ensure columns match the training data
dummy_df = dummy_df.reindex(columns=X_train.columns, fill_value=0)

# Make prediction
y_pred = rf_model.predict(dummy_df)

print(f"Predicted Price: Rs. {y_pred[0]:,.2f}")

Predicted Price: Rs. 26,614,500.00


In [2]:
import pandas as pd
import random

# Define possible values for each categorical feature
house_types = ["traditional", "modern", "apartment"]
foundation_types = ["normal", "pile", "RCC"]
material_qualities = ["low", "standard", "premium"]
locations = ["Kathmandu", "Pokhara", "Terai", "Lalitpur", "Bhaktapur"]
roof_types = ["flat", "sloped", "metal", "RCC"]

# Generate synthetic dataset
num_samples = 500  # Number of records

data = []
for _ in range(num_samples):
    house_type = random.choice(house_types)
    total_area = random.randint(500, 5000)  # in sq. ft.
    floors = random.randint(1, 5)
    foundation_type = random.choice(foundation_types)
    material_quality = random.choice(material_qualities)
    location = random.choice(locations)
    bedrooms = random.randint(1, 6)
    bathrooms = random.randint(1, 4)
    roof_type = random.choice(roof_types)
    parking = random.choice(["yes", "no"])
    
    # Additional Features (randomly assigned)
    additional_features = random.choice(["none", "basement", "solar panels", "garden", "swimming pool"])
    
    # Costs (rough estimations based on area, quality, and location)
    base_material_cost = {"low": 1200, "standard": 1500, "premium": 2000}  # per sq. ft.
    base_labor_cost = {"low": 300, "standard": 400, "premium": 600}  # per sq. ft.
    
    material_cost = base_material_cost[material_quality] * total_area
    labor_cost = base_labor_cost[material_quality] * total_area
    
    # Additional feature cost adjustment
    extra_cost = 0
    if additional_features == "basement":
        extra_cost += 500000  # Rough extra cost for a basement
    elif additional_features == "solar panels":
        extra_cost += 200000
    elif additional_features == "garden":
        extra_cost += 150000
    elif additional_features == "swimming pool":
        extra_cost += 800000
    
    # Total estimated cost
    total_cost = material_cost + labor_cost + extra_cost

    # Append data
    data.append([house_type, total_area, floors, foundation_type, material_quality, location, bedrooms, bathrooms, roof_type, parking, additional_features, labor_cost, material_cost, total_cost])

# Create DataFrame
columns = ["House Type", "Total Area (sq. ft.)", "Floors", "Foundation Type", "Material Quality", "Location", "Bedrooms", "Bathrooms", "Roof Type", "Parking", "Additional Features", "Labor Cost", "Material Cost", "Total Estimated Cost"]
df = pd.DataFrame(data, columns=columns)

# Save dataset
file_path = r"C:\Users\subin\Desktop\construction_price_estimation_nepal.csv"
df.to_csv(file_path, index=False)
print(f"Dataset saved at {file_path}")


Dataset saved at C:\Users\subin\Desktop\construction_price_estimation_nepal.csv
