In [16]:
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Load the dataset from data.csv (the path is relative to the working directory).
data = pd.read_csv('data.csv')

In [30]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [31]:
# Start again from a fresh read of data.csv and discard the two columns that
# must not be used as features; errors="ignore" makes the drop a no-op for
# any of them that is not present in the file.
data = pd.read_csv("data.csv").drop(
    columns=["Unnamed: 0", "water_cement_ratio"],
    errors="ignore",
)

# Target is the compressive strength; every remaining column is a feature.
y = data["concrete_compressive_strength"]
X = data.drop(columns=["concrete_compressive_strength"])


In [20]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
def _scaled(estimator):
    """Return a pipeline that standardizes the features and then fits ``estimator``."""
    return Pipeline([("scaler", StandardScaler()), ("model", estimator)])


def _unscaled(estimator):
    """Return a single-step pipeline that fits ``estimator`` with no scaling step."""
    return Pipeline([("model", estimator)])


# Candidate regressors keyed by display name. The estimator step is always
# called "model", which is what the "model__<param>" keys in `param_grid`
# refer to.
models = {
    "Linear Regression": _scaled(LinearRegression()),
    "Ridge Regression": _scaled(Ridge()),
    "Lasso Regression": _scaled(Lasso(max_iter=50000)),
    "KNN Regression": _scaled(KNeighborsRegressor()),
    "Decision Tree": _unscaled(DecisionTreeRegressor(random_state=42)),
    "Random Forest": _unscaled(RandomForestRegressor(random_state=42)),
}

In [22]:
# Value lists shared by the two tree-based grids below.
_TREE_MAX_DEPTHS = [None, 5, 10, 20]
_TREE_MIN_SAMPLES_SPLITS = [2, 5, 10]

# Grid-search candidates keyed by the same names used in `models`.
# Each hyperparameter is addressed as "model__<param>", i.e. through the
# pipeline step named "model". Models without an entry here are not tuned.
param_grid = {
    "Ridge Regression": {
        "model__alpha": [0.1, 1, 10, 50, 100],
    },
    "Lasso Regression": {
        "model__alpha": [0.001, 0.01, 0.1, 1, 10],
    },
    "KNN Regression": {
        "model__n_neighbors": [3, 5, 7, 10, 15],
        "model__weights": ["uniform", "distance"],
        "model__p": [1, 2],
    },
    "Decision Tree": {
        "model__max_depth": list(_TREE_MAX_DEPTHS),
        "model__min_samples_split": list(_TREE_MIN_SAMPLES_SPLITS),
    },
    "Random Forest": {
        "model__n_estimators": [50, 100, 200],
        "model__max_depth": list(_TREE_MAX_DEPTHS),
        "model__min_samples_split": list(_TREE_MIN_SAMPLES_SPLITS),
    },
}

In [23]:
# Fit every candidate in `models` and score it on the held-out test split.
# A model with an entry in `param_grid` is tuned by 5-fold cross-validated
# grid search (R² scoring) on the training split and its refit best estimator
# is evaluated; a model without an entry is fit once with its defaults.
results = {}

for name, pipeline in models.items():
    if name in param_grid:
        grid = GridSearchCV(pipeline, param_grid[name], cv=5, scoring="r2", n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        pipeline.fit(X_train, y_train)
        best_model = pipeline
        best_params = "N/A"  # nothing was tuned for this model

    y_pred = best_model.predict(X_test)

    results[name] = {
        "Best Params": best_params,
        "R²": r2_score(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
    }

# One row per model, one column per metric. Building the frame from row
# records (instead of transposing a dict-of-dicts frame) lets pandas infer a
# dtype per column, so "R²" and "RMSE" stay numeric rather than being
# collapsed to object dtype by the transpose of a mixed-type frame.
results_df = pd.DataFrame(list(results.values()), index=list(results.keys()))


In [33]:
# Show the comparison table; 6 matches the number of entries in `models`.
print("Model Comparison:")
results_df.head(6)

Model Comparison:


Unnamed: 0,Best Params,R²,RMSE
Linear Regression,,0.627553,9.796476
Ridge Regression,{'model__alpha': 1},0.627576,9.796178
Lasso Regression,{'model__alpha': 0.001},0.627557,9.796431
KNN Regression,"{'model__n_neighbors': 5, 'model__p': 2, 'mode...",0.77648,7.5892
Decision Tree,"{'model__max_depth': None, 'model__min_samples...",0.83475,6.525414
Random Forest,"{'model__max_depth': 20, 'model__min_samples_s...",0.880989,5.537731


In [34]:
# Gradient-boosted regressor with fixed hyperparameters (no grid search),
# fit on the training split and scored on the held-out test split with the
# same two metrics as the comparison table: R² and RMSE.
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,  # fixed seed so repeated runs give the same model
)

gbr.fit(X_train, y_train)

# NOTE: y_pred, r2 and rmse are reassigned by the XGBoost cell further down,
# so after that cell runs they no longer describe this model.
y_pred = gbr.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Gradient Boosting R²:", r2)
print("RMSE:", rmse)

Gradient Boosting R²: 0.9055202195137996
RMSE: 4.934091245184691


In [35]:
# XGBoost regressor configured with the same tree count, learning rate, depth
# and seed as the Gradient Boosting cell, fit on the training split and scored
# on the held-out test split (R² and RMSE).
xgbr = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    objective='reg:squarederror',
    random_state=42,  # fixed seed so repeated runs give the same model
)

xgbr.fit(X_train, y_train)

# NOTE: this overwrites the y_pred, r2 and rmse values left by earlier cells.
y_pred = xgbr.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("XGBoost R²:", r2)
print("RMSE:", rmse)


XGBoost R²: 0.9067467985462473
RMSE: 4.901958316839912


In [36]:
# Remove the "Unnamed: 0" column when it is present; otherwise leave `data`
# untouched. (`label in frame` tests membership in the frame's columns.)
if "Unnamed: 0" in data:
    data = data.drop("Unnamed: 0", axis=1)


In [37]:
# Preview the first rows of `data` again after the conditional column drop in the previous cell.
data.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [39]:
# Persist the fitted gradient-boosting model to model.pkl in the working
# directory; joblib.dump returns the list of file names it wrote.
# NOTE(review): `gbr` is the model saved here, although the XGBoost run above
# reported a marginally higher test R² — confirm this is the intended choice.
joblib.dump(gbr, "model.pkl")

['model.pkl']