In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
# --- Load the dataset and pick out the modelling columns ---
df = pd.read_csv(r"C:\Users\madha\Downloads\Civil_Engineering_Regression_Dataset.csv")
# Header names are stripped of surrounding whitespace so the exact-name
# lookups below are not defeated by stray spaces in the CSV header.
df.columns = df.columns.str.strip()
print("Dataset Columns:", df.columns.tolist())

cost_column = "Construction_Cost"
independent_vars = [
    "Building_Height",
    "Material_Quality_Index",
    "Labor_Cost",
    "Concrete_Strength",
    "Foundation_Depth",
]

# Fail fast on the first missing column: the target is checked first, then
# the predictors in list order, and the error lists what the file contains.
for var in (cost_column, *independent_vars):
    if var not in df.columns:
        raise KeyError(f"Column '{var}' not found in dataset. Available columns: {df.columns.tolist()}")

# Predictor matrix and target vector used by the models further down.
X = df[independent_vars]
y = dependent_var = df[cost_column]
# --- Multiple linear regression: fit, report the equation, score on the hold-out set ---
# 80/20 split with a fixed seed so the reported numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
intercept = model.intercept_
coefficients = model.coef_

# Render the fitted equation; coefficients are paired with independent_vars,
# the column order X was built with.
equation_terms = [f"{intercept:.2f}"]
for var, coef in zip(independent_vars, coefficients):
    equation_terms.append(f"({coef:.2f} * {var})")
equation = f"Regression Equation: {cost_column} = " + " + ".join(equation_terms)
print(equation)

# A raw coefficient is the change in cost per ONE UNIT of its predictor, so
# magnitudes are not comparable across predictors measured on different
# scales. Multiplying by each predictor's training-set standard deviation
# gives the change in cost per one standard deviation, which is comparable.
feature_spread = np.asarray(X_train.std(ddof=0), dtype=float)
standardized_effects = np.abs(coefficients * feature_spread)
impactful_var = independent_vars[int(np.argmax(standardized_effects))]
print(f"The variable with the highest impact on {cost_column} is {impactful_var}")

# Hold-out evaluation of the multiple regression model.
y_pred = model.predict(X_test)
r2_multiple = r2_score(y_test, y_pred)
mse_multiple = mean_squared_error(y_test, y_pred)
print(f"Multiple Linear Regression - R-squared: {r2_multiple:.2f}")
print(f"Multiple Linear Regression - Mean Squared Error: {mse_multiple:.2f}")
# --- Baseline: simple linear regression on building height alone ---
# Uses the same test_size and random_state as the multiple regression split.
X_simple = df.loc[:, ["Building_Height"]]
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_simple,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
simple_model = LinearRegression().fit(X_train_s, y_train_s)
y_pred_s = simple_model.predict(X_test_s)
r2_simple = r2_score(y_test_s, y_pred_s)
print(f"Simple Linear Regression - R-squared: {r2_simple:.2f}")
# --- Adjusted R-squared for the multiple regression model ---
# r2_multiple was computed on the hold-out set (y_test vs. y_pred), so the
# sample size in the adjustment must be the number of hold-out rows; using
# the training row count would under-penalise the number of predictors.
n = X_test.shape[0]
k = X_test.shape[1]
residual_dof = n - k - 1
# Without positive residual degrees of freedom the adjustment is undefined
# (division by zero or a sign flip), so report NaN instead.
if residual_dof > 0:
    adj_r2 = 1 - ((1 - r2_multiple) * (n - 1) / residual_dof)
else:
    adj_r2 = float("nan")
print(f"Adjusted R-squared: {adj_r2:.2f}")
# --- Multicollinearity check: variance inflation factor per design-matrix column ---
# A constant column is added first, so the table has one row for the
# constant followed by one row per predictor.
X_with_const = sm.add_constant(X)
design_matrix = X_with_const.values
vif_data = pd.DataFrame(
    {
        "Feature": X_with_const.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)
print("Variance Inflation Factor (VIF) values:")
print(vif_data)
# --- Printed interpretation of the results ---
print("\nModel Interpretation & Conclusion:")

# Conclusion 1 is derived from the scores computed above rather than being
# asserted unconditionally, so the report cannot contradict its own numbers.
if r2_multiple > r2_simple:
    print("1. The multiple regression model provides a better fit compared to the simple regression model based on R-squared values.")
else:
    print("1. The multiple regression model does not provide a better fit than the simple regression model based on R-squared values.")

# The remaining points are general observations that do not depend on the fitted values.
general_conclusions = (
    "2. Adjusted R-squared accounts for the number of predictors and provides a more accurate measure of model performance.",
    "3. High VIF values indicate potential multicollinearity, which can distort coefficient estimates.",
    "4. Construction companies can use regression analysis to estimate costs by identifying key cost drivers and making data-driven budget decisions.",
    "5. Limitations include data quality, missing variables, and assumptions of linear regression.",
    "6. Additional variables such as site conditions, labor productivity, and market trends could improve the model.",
    "7. Regression analysis in civil engineering contributes to cost-effective planning by predicting expenses accurately and optimizing resource allocation.",
    "8. Data science plays a crucial role in optimizing construction project costs through predictive modeling and informed decision-making.",
)
for conclusion in general_conclusions:
    print(conclusion)
