In [41]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [42]:
# Read the raw CSV and report how many missing values each column has.
# Only the last expression of a cell is rendered, so the former mid-cell
# `df.head()` produced no output and has been dropped.
df = pd.read_csv("construction_price.csv")
df.isnull().sum()

House Type              0
Total Area (sq. ft.)    0
Floors                  0
Foundation Type         0
Material Quality        0
Location                0
Bedrooms                0
Bathrooms               0
Roof Type               0
Parking                 0
Additional Features     0
Labor Cost              0
Material Cost           0
Total Estimated Cost    0
dtype: int64

In [None]:

# Input columns, grouped by how they are prepared for the models.
categorical_cols = ["House Type", "Foundation Type", "Material Quality", "Location",
                    "Roof Type", "Parking", "Additional Features"]
numerical_cols = ["Total Area (sq. ft.)", "Floors", "Bedrooms", "Bathrooms"]

# One-hot encode the categorical columns. `handle_unknown='ignore'` makes a
# category unseen during fit encode as all zeros at transform time instead of
# raising; `sparse_output=False` returns a dense array.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_data = encoder.fit_transform(df[categorical_cols])

# Wrap the encoded array in a DataFrame that carries df's own index, so the
# column-wise concat below pairs rows by label even if df is ever filtered or
# re-indexed before this cell (a fresh RangeIndex would misalign and add NaNs).
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Final feature matrix: numeric columns followed by the one-hot columns.
final_df = pd.concat([df[numerical_cols], encoded_df], axis=1)


In [44]:
final_df

Unnamed: 0,Total Area (sq. ft.),Floors,Bedrooms,Bathrooms,House Type_apartment,House Type_modern,House Type_traditional,Foundation Type_RCC,Foundation Type_normal,Foundation Type_pile,...,Roof Type_flat,Roof Type_metal,Roof Type_sloped,Parking_no,Parking_yes,Additional Features_basement,Additional Features_garden,Additional Features_none,Additional Features_solar panels,Additional Features_swimming pool
0,611,1,1,4,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,3108,1,5,2,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1672,1,3,2,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,3167,3,6,4,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,1845,3,1,1,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2381,3,3,3,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
496,2902,3,1,3,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
497,4861,1,5,3,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
498,4751,3,3,4,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [45]:

# Define target variables
labor_target = df["Labor Cost"]
material_target = df["Material Cost"]

# Split the features and both targets in a single call: every array is cut
# with the same shuffled indices, so the rows of X and of each target are
# guaranteed to stay aligned (no reliance on repeating the same seed twice).
(
    X_train, X_test,
    y_train_labor, y_test_labor,
    y_train_material, y_test_material,
) = train_test_split(
    final_df, labor_target, material_target, test_size=0.2, random_state=42
)

# Display the transformed dataset shape
final_df.shape


(500, 29)

In [None]:

# --- Labor cost: fit, predict on the held-out rows, score ---
labor_model = RandomForestRegressor(n_estimators=100, random_state=42)
labor_model.fit(X_train, y_train_labor)
y_pred_labor = labor_model.predict(X_test)
mae_labor, r2_labor = (
    mean_absolute_error(y_test_labor, y_pred_labor),
    r2_score(y_test_labor, y_pred_labor),
)

# --- Material cost: same model settings, same features, different target ---
material_model = RandomForestRegressor(n_estimators=100, random_state=42)
material_model.fit(X_train, y_train_material)
y_pred_material = material_model.predict(X_test)
mae_material, r2_material = (
    mean_absolute_error(y_test_material, y_pred_material),
    r2_score(y_test_material, y_pred_material),
)

# Report test-set metrics for both models.
print(f"Labor Cost - Mean Absolute Error: {mae_labor}")
print(f"Labor Cost - R-squared Score: {r2_labor}")

print(f"Material Cost - Mean Absolute Error: {mae_material}")
print(f"Material Cost - R-squared Score: {r2_material}")




Labor Cost - Mean Absolute Error: 11819.49
Labor Cost - R-squared Score: 0.9994567004654643
Material Cost - Mean Absolute Error: 46867.54
Material Cost - R-squared Score: 0.9992356263427061


In [47]:
# Persist both fitted regressors. The fitted encoder is saved alongside them:
# the models were trained on its one-hot columns, so anything that loads the
# models later needs the same encoder to build matching inputs.
# (`joblib` is imported with the other dependencies at the top of the notebook.)
joblib.dump(labor_model, "LaborCostModel.pkl")
joblib.dump(material_model, "MaterialCostModel.pkl")
joblib.dump(encoder, "CategoricalEncoder.pkl")
print("Models saved successfully!")

Models saved successfully!


In [None]:
# One hand-written example row, using the same raw column names as the CSV.
dummy_data = {
    "House Type": ["apartment"],
    "Total Area (sq. ft.)": [5000],
    "Floors": [12],
    "Foundation Type": ["RCC"],
    "Material Quality": ["premium"],
    "Location": ["Kathmandu"],
    "Bedrooms": [16],
    "Bathrooms": [20],
    "Roof Type": ["flat"],
    "Parking": ["yes"],
    "Additional Features": ["solar panels"]
}
dummy_df = pd.DataFrame(dummy_data)

# Encode the categorical fields with the already-fitted encoder (transform
# only, no refit) and label the resulting columns.
dummy_encoded = encoder.transform(dummy_df[categorical_cols])
dummy_encoded_df = pd.DataFrame(
    dummy_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Numeric columns next to the one-hot columns, then forced into the column
# order of the training matrix; any column missing here is filled with 0.
dummy_final_df = pd.concat(
    [dummy_df[numerical_cols].reset_index(drop=True), dummy_encoded_df],
    axis=1,
).reindex(columns=final_df.columns, fill_value=0)

# Each model returns a one-element array for the single input row.
predicted_labor_cost = labor_model.predict(dummy_final_df)
predicted_material_cost = material_model.predict(dummy_final_df)
predicted_total_cost = predicted_labor_cost[0] + predicted_material_cost[0]

print(f"Predicted Labor Cost: {predicted_labor_cost[0]}")
print(f"Predicted Material Cost: {predicted_material_cost[0]}")

print(f"Predicted Total Estimated Cost: {predicted_total_cost}")


Predicted Labor Cost: 2960760.0
Predicted Material Cost: 9861820.0
Predicted Total Estimated Cost: 12822580.0
