# concrete Strength prediction

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("/Users/rahatrihan/Downloads/ConcreteStrengthData.csv")

# Display dataset information
print("Dataset Overview:\n", df.head())
print("\nMissing Values:\n", df.isnull().sum())

# Handle missing values
df.dropna(inplace=True)

# Explore the target variable
df["strength"].hist(bins=30, edgecolor='black')
plt.title("Distribution of Concrete Strength")
plt.xlabel("Strength")
plt.ylabel("Frequency")
plt.show()

# Standardizing numerical features
scaler = StandardScaler()
X = df.drop(columns=["strength"])
y = df["strength"]
X_scaled = scaler.fit_transform(X)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model Selection
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Model Training & Evaluation
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2": r2_score(y_test, y_pred)
    }

# Feature Importance (for tree-based models)
feature_importance = pd.DataFrame({
    "Feature": df.drop(columns=["strength"]).columns,
    "Importance": models["Random Forest"].feature_importances_
}).sort_values(by="Importance", ascending=False)

# Hyperparameter Tuning for Gradient Boosting
param_grid = {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]}
gb_grid = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=5, scoring='r2')
gb_grid.fit(X_train, y_train)
best_gb = gb_grid.best_estimator_
y_pred_gb = best_gb.predict(X_test)
results["Tuned Gradient Boosting"] = {
    "MAE": mean_absolute_error(y_test, y_pred_gb),
    "MSE": mean_squared_error(y_test, y_pred_gb),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_gb)),
    "R2": r2_score(y_test, y_pred_gb)
}

# Comparative Analysis
for model, metrics in results.items():
    print(f"\n{model} Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

# Plot Feature Importance
plt.figure(figsize=(8,5))
sns.barplot(x=feature_importance["Importance"], y=feature_importance["Feature"])
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

print("\nConclusion: The best-performing model is determined based on the highest R² and lowest RMSE.")


Dataset Overview:
    CementComponent   BlastFurnaceSlag  FlyAshComponent  WaterComponent  \
0             540.0               0.0              0.0           162.0   
1             540.0               0.0              0.0           162.0   
2             332.5             142.5              0.0           228.0   
3             332.5             142.5              0.0           228.0   
4             198.6             132.4              0.0           192.0   

   SuperplasticizerComponent  CoarseAggregateComponent  \
0                        2.5                    1040.0   
1                        2.5                    1055.0   
2                        0.0                     932.0   
3                        0.0                     932.0   
4                        0.0                     978.4   

   FineAggregateComponent  AgeInDays  Strength  
0                   676.0         28     79.99  
1                   676.0         28     61.89  
2                   594.0        270    

KeyError: 'strength'