In [14]:
# ============================================================
# 🌍 UNIVERSAL AUTOML SYSTEM v4 — By Sandesh Singh
# Handles Classification + Regression + Auto Target Detection
# ============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import (
    RandomForestRegressor, RandomForestClassifier,
    GradientBoostingRegressor, GradientBoostingClassifier
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVR, SVC
import warnings
warnings.filterwarnings("ignore")

# ==============================================
# 1️⃣ Load Dataset
# ==============================================
df = pd.read_csv("concrete_data.csv")  # ⬅️ Change only this filename

print("✅ Data Loaded Successfully!")
print(f"Shape: {df.shape}")
print("Columns:", list(df.columns), "\n")

# ==============================================
# 2️⃣ Auto Target Detection (smart heuristic)
# ==============================================
# Column names (compared case-insensitively) that conventionally hold the target.
possible_targets = [
    'target', 'output', 'label', 'y', 'class', 'survived',
    'strength', 'price', 'score', 'result'
]

# Pick the first column, in file order, whose lower-cased name is one of the
# keywords above; `next` yields None when no column matches.
target_col = next(
    (candidate for candidate in df.columns if candidate.lower() in possible_targets),
    None,
)

# No keyword hit: fall back to the last column of the dataset.
if target_col is None:
    target_col = df.columns[-1]

print(f"🎯 Detected Target Column: {target_col}\n")

# ==============================================
# 3️⃣ Separate Features (X) and Target (y)
# ==============================================
def _is_id_like(column_name):
    """Return True when *column_name* looks like a row-identifier column.

    A name qualifies when one of its separator-delimited tokens is ``id``,
    ``uuid`` or ``guid`` (e.g. "id", "customer_id", "User ID"), or when it
    ends in a camelCase ``Id``/``ID`` suffix (e.g. "PassengerId",
    "customerID"). A plain substring test is deliberately avoided: it would
    also match ordinary features such as "Width", "Humidity" or "Paid".
    """
    text = str(column_name).strip()
    normalized = text.lower()
    # Treat spaces, dashes and dots like underscores so all spellings tokenise alike.
    for separator in (" ", "-", "."):
        normalized = normalized.replace(separator, "_")
    if any(token in ("id", "uuid", "guid") for token in normalized.split("_")):
        return True
    # camelCase suffix: the character before "Id"/"ID" must be lower-case,
    # which keeps all-caps words such as "ACID" from matching.
    return len(text) > 2 and text[-2:] in ("Id", "ID") and text[-3].islower()


X = df.drop(columns=[target_col])
y = df[target_col]

# Drop useless / text-heavy columns
drop_text_cols = ['name', 'ticket', 'cabin', 'remarks', 'description', 'comments']
text_cols = [c for c in X.columns if str(c).lower() in drop_text_cols]
X = X.drop(columns=text_cols)

# Drop ID-like columns (whole-token match, not substring)
id_like_cols = [c for c in X.columns if _is_id_like(c)]
if id_like_cols:
    # Report the removal so a dropped feature never goes unnoticed.
    print(f"🧹 Dropping ID-like columns: {id_like_cols}")
X = X.drop(columns=id_like_cols)

# Drop constant columns (a single distinct non-null value carries no signal)
X = X.loc[:, X.nunique() > 1]

# ==============================================
# 4️⃣ Handle NaN in target
# ==============================================
# Index labels of every row whose target value is missing.
target_missing = y.isna().to_numpy()
nan_target_idx = y.index[target_missing]

if nan_target_idx.size:
    print(f"⚠️ Dropping {len(nan_target_idx)} rows with NaN in target.")
    # Remove the same labels from both frames so X and y stay aligned.
    X, y = X.drop(index=nan_target_idx), y.drop(index=nan_target_idx)

# ==============================================
# 5️⃣ Detect Data Types
# ==============================================
# Numeric features: every integer/float width (not just int64/float64) plus
# bool flags, which the median imputer + scaler handle as 0/1 values.
num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
# Categorical features: string/object and pandas categorical columns.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Columns in neither list (e.g. datetimes) are not routed to any pipeline and
# would be discarded by the ColumnTransformer; surface them instead of
# dropping them silently.
unhandled_cols = [c for c in X.columns if c not in num_cols and c not in cat_cols]
if unhandled_cols:
    print("⚠️ Ignored columns with unsupported dtypes:", unhandled_cols)

print("📊 Numeric Columns:", num_cols)
print("🔤 Categorical Columns:", cat_cols, "\n")

# ==============================================
# 6️⃣ Problem Type Detection
# ==============================================
# Heuristic:
#   * a non-numeric target (string labels, categoricals) can only be
#     classification — regressors cannot be fit on it;
#   * a numeric target of any width with at most 15 distinct values is
#     treated as class labels;
#   * everything else is regression.
if not pd.api.types.is_numeric_dtype(y) or y.nunique() <= 15:
    problem_type = "classification"
else:
    problem_type = "regression"

print(f"🧠 Detected Problem Type: {problem_type.upper()}\n")

# ==============================================
# 7️⃣ Build Preprocessing Pipeline
# ==============================================
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)

# ==============================================
# 8️⃣ Define Models
# ==============================================
# Candidate estimators, keyed by the display name used in the results table.
# Anything other than "regression" gets the classifier line-up.
if problem_type != "regression":
    models = {
        "Logistic Regression": LogisticRegression(max_iter=500),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(
            n_estimators=150, random_state=42
        ),
        "Gradient Boosting": GradientBoostingClassifier(
            n_estimators=150, random_state=42
        ),
        "Support Vector Machine": SVC(kernel='rbf', C=1),
    }
else:
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(
            n_estimators=150, random_state=42
        ),
        "Gradient Boosting": GradientBoostingRegressor(
            n_estimators=150, random_state=42
        ),
        "Support Vector Regressor": SVR(),
    }

# ==============================================
# 9️⃣ Split Data
# ==============================================
# Hold out 20% of the rows for evaluation. For classification the split is
# stratified on the target so both partitions keep the class proportions and
# rare classes are not lost from either side.
stratify_labels = y if problem_type == "classification" else None
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=stratify_labels
    )
except ValueError:
    # Stratification is impossible for some targets (e.g. a class with a
    # single sample); fall back to the plain random split rather than abort.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

# ==============================================
# 🔟 Train + Evaluate Models
# ==============================================
# One dict of metrics per model that trained and predicted without error.
results = []

for name, model in models.items():
    try:
        # Chain the shared preprocessing with this candidate estimator.
        pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)

        if problem_type == "regression":
            row = {
                "Model": name,
                "R2": r2_score(y_test, preds),
                "MAE": mean_absolute_error(y_test, preds),
                "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
            }
        else:
            # Class-weighted averages; undefined ratios count as 0.
            weighted = {"average": "weighted", "zero_division": 0}
            row = {
                "Model": name,
                "Accuracy": accuracy_score(y_test, preds),
                "Precision": precision_score(y_test, preds, **weighted),
                "Recall": recall_score(y_test, preds, **weighted),
                "F1": f1_score(y_test, preds, **weighted),
            }
    except Exception as e:
        # Best-effort comparison: one failing model must not stop the rest.
        print(f"⚠️ Skipping {name} due to error: {e}")
    else:
        results.append(row)

# ==============================================
# 11️⃣ Display Results Safely
# ==============================================
if not results:
    print("\n❌ All models failed — dataset may contain non-numeric text columns.")
    print("💡 Tip: Check your dataset for names, free-text, or IDs.")
    # Do not claim success when nothing could be trained.
    print("\n⚠️ Execution finished, but no model could be trained.")
else:
    results_df = pd.DataFrame(results)
    # Ranking metric; every row recorded by the training loop carries it, so
    # no "missing column" fallback is needed.
    metric = "R2" if problem_type == "regression" else "Accuracy"

    results_df = results_df.sort_values(by=metric, ascending=False).reset_index(drop=True)
    print("\n🏆 Model Comparison Results:\n")
    print(results_df.round(4))

    # After the descending sort the best model is the first row.
    best = results_df.iloc[0]
    print("\n🔥 Best Model Found Automatically:")
    for col in results_df.columns:
        print(f"{col}: {best[col]}")

    print("\n✅ Execution Completed Successfully — AutoML finished.")


✅ Data Loaded Successfully!
Shape: (1030, 9)
Columns: ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer', 'Coarse Aggregate', 'Fine Aggregate', 'Age', 'Strength'] 

🎯 Detected Target Column: Strength

📊 Numeric Columns: ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer', 'Coarse Aggregate', 'Fine Aggregate', 'Age']
🔤 Categorical Columns: [] 

🧠 Detected Problem Type: REGRESSION


🏆 Model Comparison Results:

                      Model      R2     MAE    RMSE
0         Gradient Boosting  0.8988  3.7383  5.1069
1             Random Forest  0.8814  3.7891  5.5279
2             Decision Tree  0.8348  4.2938  6.5254
3  Support Vector Regressor  0.6548  7.5149  9.4318
4         Linear Regression  0.6276  7.7456  9.7965

🔥 Best Model Found Automatically:
Model: Gradient Boosting
R2: 0.8987881608434356
MAE: 3.738323381586785
RMSE: 5.106853472213489

✅ Execution Completed Successfully — AutoML finished.
