In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [5]:
# Load the dataset
df = pd.read_csv('Civil_Engineering_Regression_Dataset.csv')
df

Unnamed: 0,Project_ID,Building_Height,Material_Quality_Index,Labor_Cost,Concrete_Strength,Foundation_Depth,Weather_Index,Construction_Cost
0,1,21.854305,9,70.213332,45.326394,8.804790,4,2400.287931
1,2,47.782144,9,142.413614,47.900505,6.727632,6,3705.461312
2,3,37.939727,3,110.539985,22.112484,8.208544,8,2653.631004
3,4,31.939632,6,250.784939,26.267562,7.094515,4,2534.099466
4,5,12.020839,7,167.575159,40.134306,6.160303,6,1741.179333
...,...,...,...,...,...,...,...,...
95,96,27.220802,6,97.727758,36.181323,7.345759,4,2422.719616
96,97,28.522977,4,117.118714,40.518913,3.238517,4,2432.162280
97,98,24.239346,8,171.319969,38.475535,3.972273,8,2281.105598
98,99,6.143861,9,143.171717,48.316748,4.910067,4,1583.189537


In [7]:
# Assuming 'target' is the target variable and the rest are predictor variables
X = df.drop(columns=['Building_Height'])  # Predictor variables
y = df['Building_Height']  # Target variable

In [9]:
df

Unnamed: 0,Project_ID,Building_Height,Material_Quality_Index,Labor_Cost,Concrete_Strength,Foundation_Depth,Weather_Index,Construction_Cost
0,1,21.854305,9,70.213332,45.326394,8.804790,4,2400.287931
1,2,47.782144,9,142.413614,47.900505,6.727632,6,3705.461312
2,3,37.939727,3,110.539985,22.112484,8.208544,8,2653.631004
3,4,31.939632,6,250.784939,26.267562,7.094515,4,2534.099466
4,5,12.020839,7,167.575159,40.134306,6.160303,6,1741.179333
...,...,...,...,...,...,...,...,...
95,96,27.220802,6,97.727758,36.181323,7.345759,4,2422.719616
96,97,28.522977,4,117.118714,40.518913,3.238517,4,2432.162280
97,98,24.239346,8,171.319969,38.475535,3.972273,8,2281.105598
98,99,6.143861,9,143.171717,48.316748,4.910067,4,1583.189537


In [11]:
# Add a constant (intercept) to the predictors for the multiple regression
X_with_intercept = sm.add_constant(X)

In [13]:
# Fit Simple Linear Regression: using one predictor (replace 'predictor1' with an actual column name)
X_simple = df[['Material_Quality_Index']]  # Replace 'predictor1' with an actual column name for the simple regression model
X_simple_with_intercept = sm.add_constant(X_simple)


In [15]:
# Fit Simple Linear Regression Model
simple_model = sm.OLS(y, X_simple_with_intercept).fit()
simple_r_squared = simple_model.rsquared
print(f"Simple Linear Regression R-squared: {simple_r_squared}")


Simple Linear Regression R-squared: 3.6933911613523662e-06


In [17]:
# Fit Multiple Linear Regression Model
multiple_model = sm.OLS(y, X_with_intercept).fit()
multiple_r_squared = multiple_model.rsquared
adjusted_r_squared = multiple_model.rsquared_adj
print(f"Multiple Linear Regression R-squared: {multiple_r_squared}")
print(f"Multiple Linear Regression Adjusted R-squared: {adjusted_r_squared}")


Multiple Linear Regression R-squared: 0.999774276562958
Multiple Linear Regression Adjusted R-squared: 0.9997571019536179


In [19]:
# Check for multicollinearity using Variance Inflation Factor (VIF)
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X_with_intercept.values, i+1) for i in range(X.shape[1])]

print("\nVariance Inflation Factor (VIF) for each feature:")
print(vif_data)



Variance Inflation Factor (VIF) for each feature:
                  Feature       VIF
0              Project_ID  1.091909
1  Material_Quality_Index  1.144248
2              Labor_Cost  1.086349
3       Concrete_Strength  1.110593
4        Foundation_Depth  1.074112
5           Weather_Index  1.054597
6       Construction_Cost  1.178828


In [21]:
# Conclusion and Interpretation of the model:
# Key Takeaways:
# - The R-squared value will show how much of the variance in the target variable (cost) is explained by the predictor variables.
# - The Adjusted R-squared is important as it considers the number of predictors, preventing overfitting.
# - The VIF values will help detect multicollinearity issues, with values > 10 indicating collinearity between predictors.

# Model Improvement:
# - Consider adding more variables like 'location', 'project complexity', 'time of year', and 'supply chain factors' for better predictions.
# - Address multicollinearity by removing or combining highly correlated variables.

# The construction industry can leverage regression models to better estimate costs and optimize resource allocation, leading to cost-effective project planning.
