In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Read the dataset from an absolute local Windows path.
csv_path = r"C:\Users\madha\Downloads\Civil_Engineering_Regression_Dataset.csv"
df = pd.read_csv(csv_path)

# Strip surrounding whitespace from the header names before any lookups.
df.columns = df.columns.str.strip()
available_columns = df.columns.tolist()
print("Dataset Columns:", available_columns)

# The target column must exist; fail early and list what is available.
cost_column = "Construction_Cost"
if cost_column not in available_columns:
    raise KeyError(f"Column '{cost_column}' not found in dataset. Available columns: {available_columns}")

# Predictor columns used by the regression; each one has to be present in df.
independent_vars = [
    "Building_Height",
    "Material_Quality_Index",
    "Labor_Cost",
    "Concrete_Strength",
    "Foundation_Depth",
]
for var in independent_vars:
    if var in df.columns:
        continue
    # Stop at the first missing predictor and report the available columns.
    raise KeyError(f"Column '{var}' not found in dataset. Available columns: {df.columns.tolist()}")

# Response variable: the cost column of df.
dependent_var = df[cost_column]

# Assemble the feature matrix and target, then hold out 20% of the rows for testing.
X, y = df[independent_vars], dependent_var
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the multiple linear regression on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Render the fitted model as a readable equation: the intercept followed by
# one "(coefficient * feature)" term per predictor, all at two decimals.
intercept, coefficients = model.intercept_, model.coef_
terms = [f"({coef:.2f} * {var})" for var, coef in zip(independent_vars, coefficients)]
equation = " + ".join([f"Regression Equation: {cost_column} = {intercept:.2f}", *terms])
print(equation)

# Identify most impactful variable.
# Raw coefficients are expressed per unit of each feature, so their magnitudes
# are not comparable across features measured on different scales. Scale each
# coefficient by its feature's training-set standard deviation so the ranking
# reflects the change in the target for a one-standard-deviation change in
# that feature.
feature_std = X_train[independent_vars].std(ddof=0).to_numpy()
standardized_effects = np.abs(coefficients) * feature_std
impactful_var = independent_vars[int(np.argmax(standardized_effects))]
print(f"The variable with the highest impact on {cost_column} is {impactful_var}")

# Score the multiple regression on the held-out test split.
y_pred = model.predict(X_test)
r2_multiple, mse_multiple = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
# Report both metrics with two decimals under a common prefix.
for metric_label, metric_value in (
    ("R-squared", r2_multiple),
    ("Mean Squared Error", mse_multiple),
):
    print(f"Multiple Linear Regression - {metric_label}: {metric_value:.2f}")

# Baseline for comparison: regress the target on Building_Height alone,
# splitting with the same test_size and random_state as the multiple model.
X_simple = df[["Building_Height"]]
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_simple,
    y,
    test_size=0.2,
    random_state=42,
)
simple_model = LinearRegression().fit(X_train_s, y_train_s)
y_pred_s = simple_model.predict(X_test_s)
r2_simple = r2_score(y_test_s, y_pred_s)
print(f"Simple Linear Regression - R-squared: {r2_simple:.2f}")

# Adjusted R-squared calculation.
# r2_multiple is computed from the test-set predictions, so the sample size in
# the adjustment must be the number of test rows; using the training-set size
# would understate the penalty for the number of predictors.
n = X_test.shape[0]  # observations behind r2_multiple
k = X_test.shape[1]  # number of predictors
residual_dof = n - k - 1
if residual_dof > 0:
    adj_r2 = 1 - ((1 - r2_multiple) * (n - 1) / residual_dof)
else:
    # Too few observations for the adjustment to be defined; report NaN
    # instead of dividing by zero or by a negative count.
    adj_r2 = float("nan")
print(f"Adjusted R-squared: {adj_r2:.2f}")

# Multicollinearity check: compute a variance inflation factor for every
# column of the design matrix. add_constant prepends an intercept column, so
# the resulting table also contains a "const" row alongside the predictors.
X_with_const = sm.add_constant(X)
design_matrix = X_with_const.values
vif_data = pd.DataFrame(
    {
        "Feature": X_with_const.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)
print("Variance Inflation Factor (VIF) values:")
print(vif_data)


Dataset Columns: ['Project_ID', 'Building_Height', 'Material_Quality_Index', 'Labor_Cost', 'Concrete_Strength', 'Foundation_Depth', 'Weather_Index', 'Construction_Cost']
Regression Equation: Construction_Cost = -9.64 + (49.81 * Building_Height) + (10.33 * Material_Quality_Index) + (0.53 * Labor_Cost) + (20.20 * Concrete_Strength) + (30.14 * Foundation_Depth)
The variable with the highest impact on Construction_Cost is Building_Height
Multiple Linear Regression - R-squared: 1.00
Multiple Linear Regression - Mean Squared Error: 113.50
Simple Linear Regression - R-squared: 0.93
Adjusted R-squared: 1.00
Variance Inflation Factor (VIF) values:
                  Feature        VIF
0                   const  36.217244
1         Building_Height   1.047164
2  Material_Quality_Index   1.048067
3              Labor_Cost   1.054086
4       Concrete_Strength   1.019701
5        Foundation_Depth   1.040594
