In [None]:
import kagglehub

# Ask kagglehub for the latest version of the dataset, requesting only the
# single spreadsheet this notebook reads. The returned value is kept in
# `path` for the loading step.
path = kagglehub.dataset_download(
    "sinamhd9/concrete-comprehensive-strength",
    path="Concrete_Data.xls",
)

print("Path to dataset files:", path)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Load the spreadsheet referenced by `path` into a DataFrame.
df = pd.read_excel(path)

# Quick exploratory pass: first rows, then a labelled series of summaries.
print(df.head())
print("Column names:")
print(df.columns)
print("Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("Basic statistics:")
print(df.describe())
print("Missing values:")
print(df.isnull().sum())
print("Correlation matrix:")
print(df.corr())


In [19]:
"""
Column names:
Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)',
       'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

We try to predict concrete compressive strength, so we pop it off and use it as our target variable y.
cross_val_score is scored with neg_mean_absolute_error; after flipping the sign, the score is a mean absolute error in the units of y.
A score of 8.4 means the predictions are off by 8.4 MPa on average.
"""

# Work on a copy so `df` keeps every column; the column at position 8 is
# removed from the copy and becomes the target y.
X = df.copy()
y = X.pop(X.columns[8])

# Train and score the baseline model with cv=5. The scorer reports negated
# MAE, so the mean over the folds is negated to get a positive error.
baseline = RandomForestRegressor(criterion="absolute_error", random_state=0)
baseline_score = -cross_val_score(
    baseline,
    X,
    y,
    cv=5,
    scoring="neg_mean_absolute_error",
).mean()

print(f"MAE Baseline Score: {baseline_score:.4}")

MAE Baseline Score: 8.355


In [None]:
"""
Here we add synthetic ratio features so the model (a random forest) can learn more easily.
We get a score of 8, an improvement over the baseline score of 8.4.
The reason is that the new ratio features expose important information to the model.
"""

# Start again from a fresh copy of `df`; the column at position 8 is the target.
X = df.copy()
y = X.pop(X.columns[8])

# Column labels of the ingredients used in the ratios, looked up by position.
cement = df.columns[0]
water = df.columns[3]
coarse_agg = df.columns[5]
fine_agg = df.columns[6]
print("\n".join([df.columns[8], fine_agg, coarse_agg, cement, water]))

# Create synthetic features: fine/coarse aggregate, total aggregate/cement,
# and water/cement.
X["FCRatio"] = X[fine_agg].div(X[coarse_agg])
X["AggCmtRatio"] = X[coarse_agg].add(X[fine_agg]).div(X[cement])
X["WtrCmtRatio"] = X[water].div(X[cement])

# Train and score the model on the dataset with the additional ratio
# features. The scorer reports negated MAE, so the fold mean is negated.
model = RandomForestRegressor(criterion="absolute_error", random_state=0)
score = -cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring="neg_mean_absolute_error",
).mean()

print(f"MAE Score with Ratio Features: {score:.4}")