In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the cleaned dataset from disk and preview its first rows
DATA_PATH = 'cleaned_concrete_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,cement,flyash,ggbs,mk,water,water_tcm,sp,vma,nca_20_down,nca_10_down,rca_20_down,rca_10_down,sand,age,cs
0,550.0,0.0,0.0,0.0,165.0,0.3,3.3,0.28,0.0,0.0,247.0,337.0,607.0,7.0,49.71
1,385.0,165.0,0.0,0.0,148.0,0.26,6.05,1.1,0.0,0.0,270.0,432.0,729.0,7.0,18.53
2,275.0,275.0,0.0,0.0,146.0,0.26,5.5,1.1,0.0,0.0,276.0,441.0,745.0,7.0,15.09
3,165.0,385.0,0.0,0.0,172.0,0.3,4.4,1.1,0.0,0.0,280.0,448.0,757.0,7.0,6.34
4,385.0,0.0,165.0,0.0,155.0,0.28,6.6,1.38,0.0,0.0,361.0,417.0,815.0,7.0,42.83


## Pre-processing

In [None]:
# 'cs' is the column being predicted; every other column is used as a feature
TARGET_COL = 'cs'
y = df[TARGET_COL]                 # target
X = df.drop(columns=[TARGET_COL])  # independent features

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform
# to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# Candidate regressors, all with default hyper-parameters.
# Every estimator whose fitting is randomized is seeded with 42 so that
# re-running the comparison reproduces the same scores. The decision tree is
# included: scikit-learn permutes features at each split, so an unseeded tree
# can pick different splits from one run to the next.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}



In [None]:
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_validate

# Metrics evaluated together on every fold. scikit-learn's error scorers are
# negated ("higher is better"), so MAE and RMSE are flipped back to positive
# values when the summary row is built.
cv_scoring = {
    "r2": "r2",
    "mae": "neg_mean_absolute_error",
    "rmse": "neg_root_mean_squared_error",
}

results = []

for name, model in models.items():
    # The scaler sits inside the pipeline so it is re-fit on each training fold
    pipe = make_pipeline(StandardScaler(), model)

    # A single cross_validate call fits each fold once and scores all three
    # metrics on that same fit, instead of refitting the pipeline per metric.
    cv_scores = cross_validate(pipe, X_train, y_train, scoring=cv_scoring, cv=5)
    r2 = cv_scores["test_r2"]
    mae = cv_scores["test_mae"]
    rmse = cv_scores["test_rmse"]

    results.append({
        "Model": name,
        "R² (CV Mean)": np.mean(r2),
        "MAE (CV Mean)": -np.mean(mae),
        "RMSE (CV Mean)": -np.mean(rmse)
    })

# Rank the models from best to worst mean cross-validated R²
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="R² (CV Mean)", ascending=False).reset_index(drop=True)

results_df


Unnamed: 0,Model,R² (CV Mean),MAE (CV Mean),RMSE (CV Mean)
0,CatBoost,0.910656,3.834935,5.250614
1,Gradient Boosting,0.90686,3.892319,5.431273
2,XGBoost,0.894146,4.215979,5.772405
3,Random Forest,0.882419,4.477738,6.106525
4,Ridge,0.830725,5.644938,7.315047
5,AdaBoost,0.82434,5.821255,7.491565
6,Lasso,0.823646,5.986184,7.513029
7,Linear Regression,0.814886,5.805213,7.628876
8,Decision Tree,0.802195,5.700033,7.699929
9,ElasticNet,0.770128,6.971217,8.566633


In [None]:
# Candidate hyper-parameter values for the randomized searches.
# Each grid maps an estimator keyword argument to the list of values to sample from.

# CatBoostRegressor
catboost_params = dict(
    iterations=[200, 500, 800],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[4, 6, 8, 10],
    l2_leaf_reg=[1, 3, 5, 7, 9],
    border_count=[32, 64, 128],
)

# GradientBoostingRegressor
gbr_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    subsample=[0.7, 0.8, 1.0],
)

# XGBRegressor
xgb_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# RandomForestRegressor
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

def tune_model(model, param_grid, name, scaled=True, n_iter=20, cv=5):
    """Run a randomized hyper-parameter search and return the best estimator.

    The search is fitted on the notebook-level training data: ``X_train_scaled``
    when ``scaled`` is true, otherwise ``X_train``, always against ``y_train``.
    Candidates are ranked by cross-validated R².

    Parameters
    ----------
    model : estimator
        Unfitted estimator to tune.
    param_grid : dict
        Maps parameter names to the candidate values to sample from.
    name : str
        Label used in the progress and result messages.
    scaled : bool, default True
        Whether to fit on the standardized training features.
    n_iter : int, default 20
        Number of parameter settings sampled by the search.
    cv : int, default 5
        Cross-validation setting passed through to ``RandomizedSearchCV``.

    Returns
    -------
    estimator
        ``search.best_estimator_`` from the fitted ``RandomizedSearchCV``.
    """
    print(f"Tuning {name}...")

    X_ = X_train_scaled if scaled else X_train

    search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        scoring='r2',
        random_state=42,  # fixed seed so the same candidates are sampled on every run
        n_jobs=-1,        # use all available cores
        verbose=1
    )
    search.fit(X_, y_train)

    print(f"Best R² Score for {name}: {search.best_score_:.4f}")
    print(f"Best Params for {name}: {search.best_params_}")
    return search.best_estimator_


In [None]:
# Tune each candidate model with its own search grid.
# CatBoost is tuned on the unscaled training features (it doesn't need scaled
# data); the other three use tune_model's scaled features.
best_cat = tune_model(
    model=CatBoostRegressor(verbose=0, random_state=42),
    param_grid=catboost_params,
    name="CatBoost",
    scaled=False,
)
best_gbr = tune_model(
    model=GradientBoostingRegressor(random_state=42),
    param_grid=gbr_params,
    name="Gradient Boosting",
)
best_xgb = tune_model(
    model=XGBRegressor(random_state=42),
    param_grid=xgb_params,
    name="XGBoost",
)
best_rf = tune_model(
    model=RandomForestRegressor(random_state=42),
    param_grid=rf_params,
    name="Random Forest",
)


Tuning CatBoost...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best R² Score for CatBoost: 0.9131
Best Params for CatBoost: {'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 500, 'depth': 4, 'border_count': 128}
Tuning Gradient Boosting...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best R² Score for Gradient Boosting: 0.9193
Best Params for Gradient Boosting: {'subsample': 0.7, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05}
Tuning XGBoost...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best R² Score for XGBoost: 0.9157
Best Params for XGBoost: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Tuning Random Forest...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best R² Score for Random Forest: 0.8833
Best Params for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10}


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, name, scaled=True):
    """Score a fitted model on the held-out test split and print a short report.

    Predictions are made on the notebook-level ``X_test_scaled`` when ``scaled``
    is true, otherwise on ``X_test``, and compared against ``y_test``.

    Returns a dict with the keys ``"name"``, ``"R2"``, ``"MAE"`` and ``"RMSE"``.
    """
    features = X_test_scaled if scaled else X_test
    predictions = model.predict(features)

    # RMSE is derived from the MSE so both reported numbers stay consistent
    mse_value = mean_squared_error(y_test, predictions)
    scores = {
        "R2": r2_score(y_test, predictions),
        "MAE": mean_absolute_error(y_test, predictions),
        "RMSE": np.sqrt(mse_value),
    }

    print(f"\nEvaluation: {name}")
    print(f"R²: {scores['R2']:.4f}")
    print(f"MAE: {scores['MAE']:.2f}")
    print(f"MSE: {mse_value:.2f}")
    print(f"RMSE: {scores['RMSE']:.2f}")

    return {"name": name, **scores}

In [None]:
# Score every tuned model on the test split and collect the metrics in one table.
# Each entry is (fitted model, display name, whether to predict on scaled features).
evaluation_plan = [
    (best_cat, "CatBoost", False),
    (best_gbr, "Gradient Boosting", True),
    (best_xgb, "XGBoost", True),
    (best_rf, "Random Forest", True),
]

results = [
    evaluate_model(fitted, label, scaled=use_scaled)
    for fitted, label, use_scaled in evaluation_plan
]

pd.DataFrame(results)



Evaluation: CatBoost
R²: 0.9395
MAE: 3.53
MSE: 20.00
RMSE: 4.47

Evaluation: Gradient Boosting
R²: 0.9350
MAE: 3.66
MSE: 21.48
RMSE: 4.63

Evaluation: XGBoost
R²: 0.9227
MAE: 3.85
MSE: 25.55
RMSE: 5.05

Evaluation: Random Forest
R²: 0.9327
MAE: 3.82
MSE: 22.23
RMSE: 4.72


Unnamed: 0,name,R2,MAE,RMSE
0,CatBoost,0.939459,3.531234,4.472388
1,Gradient Boosting,0.935,3.660609,4.634193
2,XGBoost,0.922671,3.854613,5.054602
3,Random Forest,0.932707,3.824595,4.715205


In [None]:
import joblib

# Persist the tuned CatBoost model; the scaler is saved as well because it is
# useful if we try scaled models later.
joblib.dump(best_cat, "final_model_catboost.pkl")
joblib.dump(scaler, "scaler.pkl")

# Export the test-set predictions next to the true values
y_pred = best_cat.predict(X_test)
predictions_table = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred
})
predictions_table.to_csv("catboost_predictions.csv", index=False)
