In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
df = pd.read_csv('construction_price.csv')

In [68]:
df

Unnamed: 0,House Type,Total Area (sq. ft.),Floors,Foundation Type,Material Quality,Location,Bedrooms,Bathrooms,Roof Type,Parking,...,Labor Cost,Material Cost,Total Estimated Cost,House Type_encoded,Foundation Type_encoded,Material Quality_encoded,Location_encoded,Roof Type_encoded,Parking_encoded,Additional Features_encoded
0,traditional,611,1,pile,premium,Pokhara,1,4,metal,yes,...,366600,1222000,1588600,2,2,1,3,2,1,2
1,apartment,3108,1,RCC,standard,Kathmandu,5,2,sloped,no,...,1243200,4662000,6105200,0,0,2,1,3,0,3
2,apartment,1672,1,RCC,standard,Terai,3,2,sloped,yes,...,668800,2508000,3176800,0,0,2,4,3,1,2
3,traditional,3167,3,RCC,standard,Bhaktapur,6,4,RCC,yes,...,1266800,4750500,6517300,2,0,2,0,0,1,0
4,apartment,1845,3,RCC,premium,Terai,1,1,flat,no,...,1107000,3690000,5297000,0,0,1,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,modern,2381,3,pile,standard,Bhaktapur,3,3,RCC,yes,...,952400,3571500,4673900,1,2,2,0,0,1,1
496,apartment,2902,3,normal,standard,Terai,1,3,flat,no,...,1160800,4353000,6313800,0,1,2,4,1,0,4
497,modern,4861,1,pile,low,Kathmandu,5,3,flat,yes,...,1458300,5833200,7291500,1,2,0,1,1,1,2
498,modern,4751,3,pile,standard,Terai,3,4,RCC,no,...,1900400,7126500,9226900,1,2,2,4,0,0,3


In [None]:
categorical_columns = [
    'House Type', 'Foundation Type', 'Material Quality', 
    'Location', 'Roof Type', 'Parking', 'Additional Features'
]
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [62]:
df

Unnamed: 0,House Type,Total Area (sq. ft.),Floors,Foundation Type,Material Quality,Location,Bedrooms,Bathrooms,Roof Type,Parking,...,Labor Cost,Material Cost,Total Estimated Cost,House Type_encoded,Foundation Type_encoded,Material Quality_encoded,Location_encoded,Roof Type_encoded,Parking_encoded,Additional Features_encoded
0,traditional,611,1,pile,premium,Pokhara,1,4,metal,yes,...,366600,1222000,1588600,2,2,1,3,2,1,2
1,apartment,3108,1,RCC,standard,Kathmandu,5,2,sloped,no,...,1243200,4662000,6105200,0,0,2,1,3,0,3
2,apartment,1672,1,RCC,standard,Terai,3,2,sloped,yes,...,668800,2508000,3176800,0,0,2,4,3,1,2
3,traditional,3167,3,RCC,standard,Bhaktapur,6,4,RCC,yes,...,1266800,4750500,6517300,2,0,2,0,0,1,0
4,apartment,1845,3,RCC,premium,Terai,1,1,flat,no,...,1107000,3690000,5297000,0,0,1,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,modern,2381,3,pile,standard,Bhaktapur,3,3,RCC,yes,...,952400,3571500,4673900,1,2,2,0,0,1,1
496,apartment,2902,3,normal,standard,Terai,1,3,flat,no,...,1160800,4353000,6313800,0,1,2,4,1,0,4
497,modern,4861,1,pile,low,Kathmandu,5,3,flat,yes,...,1458300,5833200,7291500,1,2,0,1,1,1,2
498,modern,4751,3,pile,standard,Terai,3,4,RCC,no,...,1900400,7126500,9226900,1,2,2,4,0,0,3


In [63]:
# Prepare features and target
feature_columns = [
    'Total Area (sq. ft.)', 'Floors', 'Bedrooms', 'Bathrooms',
    'House Type_encoded', 'Foundation Type_encoded', 
    'Material Quality_encoded', 'Location_encoded', 
    'Roof Type_encoded', 'Parking_encoded', 
    'Additional Features_encoded'
]

X = df[feature_columns]
y = df['Total Estimated Cost']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [64]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [65]:
rf_model = RandomForestRegressor(
    n_estimators=100, 
    random_state=42,
    max_depth=10,     
    min_samples_split=5,
    min_samples_leaf=2
)
rf_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_scaled)


In [66]:

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Performance Metrics:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.4f}")



Model Performance Metrics:
Mean Absolute Error: 232747.90
Mean Squared Error: 81474603460.81
Root Mean Squared Error: 285437.56
R-squared Score: 0.9916


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('construction_price.csv')

# Separate features
numeric_features = [
    'Total Area (sq. ft.)', 'Floors', 'Bedrooms', 'Bathrooms', 
    'Labor Cost', 'Material Cost'
]

categorical_features = [
    'House Type', 'Foundation Type', 'Material Quality', 
    'Location', 'Roof Type', 'Parking', 'Additional Features'
]

# Prepare features and target
X = df[numeric_features + categorical_features]
y = df['Total Estimated Cost']

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)
    ])

# Create a pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        max_depth=15,
        min_samples_split=10,
        min_samples_leaf=4,
        max_features='sqrt'
    ))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the pipeline
rf_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = rf_pipeline.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Performance Metrics:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.4f}")

# Feature Importance Visualization
# Extract feature names after one-hot encoding
feature_names = (
    numeric_features + 
    list(rf_pipeline.named_steps['preprocessor']
         .named_transformers_['cat']
         .get_feature_names_out(categorical_features))
)

# Get feature importances
feature_importances = rf_pipeline.named_steps['regressor'].feature_importances_

# Create a DataFrame of feature importances
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

# Visualize top 15 features
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=importance_df.head(15))
plt.title('Top 15 Features - One-Hot Encoding')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.tight_layout()
plt.savefig('feature_importance_onehot.png')
plt.close()

# Print top 15 features
print("\nTop 15 Most Important Features:")
print(importance_df.head(15))

# Save the model and preprocessor
import joblib
joblib.dump(rf_pipeline, 'construction_price_predictor_onehot.joblib')

# Optional: Prediction function
def predict_price(new_data):
    """
    Predict construction price for new data
    
    :param new_data: DataFrame or dict with same features as training data
    :return: Predicted price
    """
    return rf_pipeline.predict(pd.DataFrame([new_data]))[0]

# Example usage
# sample_construction = {
#     'Total Area (sq. ft.)': 2000,
#     'Floors': 2,
#     'Bedrooms': 3,
#     'Bathrooms': 2,
#     'Labor Cost': 500000,
#     'Material Cost': 1500000,
#     'House Type': 'Residential',
#     'Foundation Type': 'Concrete',
#     'Material Quality': 'High',
#     'Location': 'Urban',
#     'Roof Type': 'Tile',
#     'Parking': 'Yes',
#     'Additional Features': 'Garage'
# }
# predicted_price = predict_price(sample_construction)
# print(f"Predicted Construction Price: {predicted_price:.2f}")

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('construction_price.csv')

# Separate numeric and categorical features
numeric_features = [
    'Total Area (sq. ft.)', 'Floors', 'Bedrooms', 'Bathrooms', 
    'Labor Cost', 'Material Cost'
]

categorical_features = [
    'House Type', 'Foundation Type', 'Material Quality', 
    'Location', 'Roof Type', 'Parking', 'Additional Features'
]

# Apply One-Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_categorical = encoder.fit_transform(df[categorical_features])


# Convert encoded categorical features into DataFrame
encoded_feature_names = encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_categorical, columns=encoded_feature_names)

# Combine numeric and encoded categorical features
X = pd.concat([df[numeric_features].reset_index(drop=True), encoded_df], axis=1)
y = df['Total Estimated Cost']

# Split the data **after** encoding
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numeric features
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train the model
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt'
)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Performance Metrics:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.4f}")

# Feature Importance Visualization
feature_names = list(X.columns)
feature_importances = rf_model.feature_importances_

# Create a DataFrame of feature importances
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

# Visualize top 15 features
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=importance_df.head(15))
plt.title('Top 15 Features - One-Hot Encoding Applied Before Split')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.tight_layout()
plt.savefig('feature_importance_onehot.png')
plt.close()

# Print top 15 features
print("\nTop 15 Most Important Features:")
print(importance_df.head(15))

# Save the model and preprocessing objects
import joblib
joblib.dump(rf_model, 'construction_price_predictor_onehot.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(encoder, 'encoder.joblib')

# Optional: Prediction function
def predict_price(new_data):
    """
    Predict construction price for new data
    
    :param new_data: Dictionary with feature values
    :return: Predicted price
    """
    # Convert input data to DataFrame
    new_df = pd.DataFrame([new_data])

    # One-hot encode categorical features
    new_encoded = encoder.transform(new_df[categorical_features])
    new_encoded_df = pd.DataFrame(new_encoded, columns=encoded_feature_names)

    # Combine numeric and encoded categorical features
    new_X = pd.concat([new_df[numeric_features].reset_index(drop=True), new_encoded_df], axis=1)

    # Standardize numeric features
    new_X[numeric_features] = scaler.transform(new_X[numeric_features])

    return rf_model.predict(new_X)[0]

# Example usage
sample_construction = {
    'Total Area (sq. ft.)': 2000,
    'Floors': 2,
    'Bedrooms': 3,
    'Bathrooms': 2,
    'Labor Cost': 500000,
    'Material Cost': 1500000,
    'House Type': 'Residential',
    'Foundation Type': 'Concrete',
    'Material Quality': 'High',
    'Location': 'Urban',
    'Roof Type': 'Tile',
    'Parking': 'Yes',
    'Additional Features': 'Garage'
}
predicted_price = predict_price(sample_construction)
print(f"Predicted Construction Price: {predicted_price:.2f}")


Model Performance Metrics:
Mean Absolute Error: 434302.93
Mean Squared Error: 350153522682.35
Root Mean Squared Error: 591737.71
R-squared Score: 0.9637

Top 15 Most Important Features:
                      feature  importance
5               Material Cost    0.338777
4                  Labor Cost    0.271658
0        Total Area (sq. ft.)    0.239937
13   Material Quality_premium    0.051821
12       Material Quality_low    0.030422
14  Material Quality_standard    0.006237
1                      Floors    0.005932
2                    Bedrooms    0.004806
3                   Bathrooms    0.003721
19             Location_Terai    0.003385
23           Roof Type_sloped    0.003314
16         Location_Kathmandu    0.003208
9         Foundation Type_RCC    0.002663
8      House Type_traditional    0.002621
25                Parking_yes    0.002588
Predicted Construction Price: 3105964.55
