In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris


In [None]:
# Wrap the raw iris measurements in a DataFrame with snake_case column names.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])

# Seed NumPy's global RNG so the same cells are blanked on every run, then draw
# one uniform number per cell; a cell whose draw is below 0.1 becomes NaN.
np.random.seed(42)
mask = np.random.random(data.shape) < 0.1
data = data.mask(mask)

# Persist the frame (all four columns, NaNs included) without the index column.
data.to_csv('data.csv', index=False)

In [None]:
# Per-column mean of the two length measurements (pandas skips NaN by default).
data.loc[:, ['sepal_length', 'petal_length']].mean()

Unnamed: 0,0
sepal_length,5.866929
petal_length,3.737879


In [None]:
# Median of the sepal_width column (pandas skips NaN by default).
data.loc[:, 'sepal_width'].median()

3.0

In [None]:
def generate_sample_data(output_file='output.csv', missing_rate=0.1, target_col='petal_width', random_seed=42):
    """Write the iris measurements to CSV with random gaps in the feature columns.

    Args:
        output_file: Path of the CSV file to write (no index column).
        missing_rate: Each feature cell is blanked when its uniform draw in
            [0, 1) falls below this value.
        target_col: Column that is never blanked; every other column is
            treated as a feature.
        random_seed: Seed applied to NumPy's global RNG before drawing.
    """
    np.random.seed(random_seed)

    column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    frame = pd.DataFrame(data=load_iris().data, columns=column_names)

    # Only the non-target columns receive missing values.
    predictors = [name for name in column_names if name != target_col]
    n_rows, n_predictors = frame[predictors].shape
    blank_cell = np.random.rand(n_rows, n_predictors) < missing_rate
    frame.loc[:, predictors] = frame.loc[:, predictors].where(~blank_cell)

    frame.to_csv(output_file, index=False)
    print(f"✅ Dataset with missing values saved to '{output_file}'")


if __name__ == "__main__":
    generate_sample_data()

✅ Dataset with missing values saved to 'output.csv'


In [None]:
# UI-style configuration for the regression run. It is serialised to
# "algoparams_from_ui.json" at the end of this cell.
algoparams_from_ui = {
    # What to predict.
    "target": {
        "prediction_type": "Regression",
        "target": "petal_width",
        "type": "regression",
        "partitioning": True
    },
    # Per-feature settings; each entry carries a selection flag and imputation details.
    "feature_handling": {
        "sepal_length": {
            "feature_name": "sepal_length",
            "is_selected": True,
            "feature_variable_type": "numerical",
            "feature_details": {
                "numerical_handling": "Keep as regular numerical feature",
                "rescaling": "Standardize (mean=0, std=1)",
                "make_derived_feats": False,
                "missing_values": "Impute",
                "impute_with": "Average of values",
                "impute_value": 5.86
            }
        },
        "sepal_width": {
            "feature_name": "sepal_width",
            "is_selected": True,
            "feature_variable_type": "numerical",
            "feature_details": {
                "numerical_handling": "Keep as regular numerical feature",
                "rescaling": "Standardize (mean=0, std=1)",
                "make_derived_feats": False,
                "missing_values": "Impute",
                "impute_with": "Constant",
                "impute_value": 3
            }
        },
        "petal_length": {
            "feature_name": "petal_length",
            "is_selected": True,
            "feature_variable_type": "numerical",
            "feature_details": {
                "numerical_handling": "Keep as regular numerical feature",
                "rescaling": "Standardize (mean=0, std=1)",
                "make_derived_feats": False,
                "missing_values": "Impute",
                "impute_with": "Average of values",
                "impute_value": 3.73
            }
        }
    },
    # "feature_reduction_method" names the active method; the matching entry
    # below is flagged as selected so the flags agree with that choice.
    "feature_reduction": {
        "feature_reduction_method": "Correlation with target",
        "No Reduction": {
            "is_selected": False,
            "num_of_features_to_keep": 5
        },
        "Correlation with target": {
            "is_selected": True,
            "num_of_features_to_keep": 2,
            "threshold": 0.5
        },
        "Tree-based": {
            "is_selected": False,
            "num_of_features_to_keep": 2,
            "depth_of_trees": 10,
            "num_of_trees": 100
        },
        "Principal Component Analysis": {
            "is_selected": False,
            "num_of_features_to_keep": 2
        }
    },
    # Candidate models and the hyperparameter grid to search for each.
    "models": [
        {
            "model_name": "LinearRegression",
            "is_selected": True,
            "hyperparameters": {
                "fit_intercept": [True, False]
            }
        },
        {
            "model_name": "RandomForestRegressor",
            "is_selected": True,
            # NOTE: this grid has 4*4*3*3*4*1*3*2 = 3456 combinations per CV fold.
            "hyperparameters": {
                "n_estimators": [50, 100, 150, 200],
                "max_depth": [None, 10, 20, 30],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "max_features": ["sqrt", "log2", 0.5, None],
                "bootstrap": [True],
                "max_samples": [None, 0.5, 0.75],  # Controls sample size per tree
                "warm_start": [True, False]  # Allows adding more trees
            }
        },
        {
            "model_name": "DecisionTreeRegressor",
            "is_selected": True,
            "hyperparameters": {
                "criterion": ["squared_error", "absolute_error"],
                "splitter": ["best", "random"],
                "max_depth": [None, 5, 10, 20],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "max_features": [None, "sqrt", "log2", 0.5]
            }
        }
    ],
    # Search settings.
    "hyperparameters": {
        "search_method": "Grid Search",
        "Grid Search": {
            "is_selected": True,
            "shuffle_grid": True,
            "random_state": 0,
            "max_iterations": 10,
            "max_search_time": 0,
            "cross_validation_strategy": "Time-based K-fold(with overlap)",
            "Time-based K-fold(with overlap)": {
                "is_selected": True,
                "num_of_folds": 5,
                "split_ratio": 0,
                "stratified": False
            }
        }
    }
}

# Serialise the configuration next to the notebook (path is relative to the cwd).
with open("algoparams_from_ui.json", "w") as file:
    json.dump(algoparams_from_ui, file, indent=4)



In [None]:
# === Custom Feature Reduction (Correlation with Target) ===
class CorrelationThresholdSelector(BaseEstimator, TransformerMixin):
    """Keep the columns whose absolute Pearson correlation with the target is highest.

    Parameters
    ----------
    threshold : float, default=0.5
        Used only when ``num_features`` is falsy: keep columns whose absolute
        correlation with ``y`` is strictly greater than this value.
    num_features : int or None, default=None
        When truthy, keep the ``num_features`` most correlated columns and
        ignore ``threshold``.

    Attributes
    ----------
    selected_indices_ : ndarray of int or None
        Positional indices of the kept columns; ``None`` until ``fit`` has run.
    """

    def __init__(self, threshold=0.5, num_features=None):
        self.threshold = threshold
        self.num_features = num_features
        self.selected_indices_ = None

    def fit(self, X, y):
        """Compute per-column |corr(X[:, i], y)| and record which columns to keep.

        If the chosen rule selects no column at all, every column is kept.
        Returns ``self``.
        """
        X_np = X.to_numpy() if isinstance(X, pd.DataFrame) else X
        # A zero-variance column makes np.corrcoef divide by zero and yield NaN
        # (with a RuntimeWarning); silence the warning and handle the NaN below.
        with np.errstate(divide='ignore', invalid='ignore'):
            correlations = np.array([
                np.abs(np.corrcoef(X_np[:, i], y)[0, 1]) for i in range(X_np.shape[1])
            ])
        # np.argsort places NaN last, so an undefined correlation would otherwise
        # be ranked as the strongest feature by the top-k slice. Treat it as 0.
        correlations = np.where(np.isnan(correlations), 0.0, correlations)
        if self.num_features:
            # Stable sort keeps tie-breaking deterministic (ties are likelier
            # now that undefined correlations collapse to 0).
            self.selected_indices_ = np.argsort(correlations, kind='stable')[-self.num_features:]
        else:
            self.selected_indices_ = np.where(correlations > self.threshold)[0]
        if len(self.selected_indices_) == 0:
            # Never hand an empty matrix to the next pipeline step.
            self.selected_indices_ = np.arange(X_np.shape[1])
        return self

    def transform(self, X):
        """Return only the columns chosen by ``fit`` (DataFrame in, DataFrame out).

        Raises
        ------
        NotFittedError
            If called before ``fit``. Indexing with the initial ``None`` would
            otherwise silently add an axis to a NumPy array.
        """
        if self.selected_indices_ is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "CorrelationThresholdSelector is not fitted yet; call 'fit' before 'transform'."
            )
        return X.iloc[:, self.selected_indices_] if isinstance(X, pd.DataFrame) else X[:, self.selected_indices_]

# === Passthrough ===
class PassthroughTransformer(BaseEstimator, TransformerMixin):
    """No-op transformer: ``fit`` learns nothing and ``transform`` returns its input."""

    def fit(self, X, y=None):
        """Return ``self`` without inspecting ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as received (the same object, not a copy)."""
        return X

# === Config Load ===
def load_config(file_path='/content/algoparams_from_ui.json'):
    """Load the UI configuration JSON and return it as a dict.

    Args:
        file_path: Path of the JSON file. If it does not exist but a file with
            the same base name exists in the current working directory, that
            file is loaded instead. The notebook writes the config relative to
            the working directory, which is only ``/content`` on Colab.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If neither ``file_path`` nor the same-named file in
            the working directory exists.
    """
    import os

    if not os.path.exists(file_path):
        local_copy = os.path.basename(file_path)
        if os.path.exists(local_copy):
            file_path = local_copy
    with open(file_path, 'r') as f:
        return json.load(f)

# === Data Load ===
def load_data(config, file_path='output.csv'):
    """Read the CSV and split it into selected feature columns and the target.

    Args:
        config: Parsed UI config; reads ``config['target']['target']`` and the
            ``is_selected`` flag of every ``config['feature_handling']`` entry.
        file_path: CSV file to read.

    Returns:
        ``(X, y)`` where ``X`` holds the selected feature columns and ``y`` the
        target column. Rows whose target is missing are dropped from both.
    """
    raw = pd.read_csv(file_path)
    target_name = config['target']['target']

    chosen = []
    for column, spec in config['feature_handling'].items():
        if spec['is_selected']:
            chosen.append(column)

    # Missing features are kept (they are imputed later); a missing target is not.
    labelled = raw.dropna(subset=[target_name])
    return labelled[chosen], labelled[target_name]

# === Imputer Builder ===
def create_imputer(config):
    """Build a ColumnTransformer holding one SimpleImputer per selected feature.

    Args:
        config: Parsed UI config; reads ``is_selected`` and
            ``feature_details['impute_with']`` / ``['impute_value']`` of every
            ``config['feature_handling']`` entry.

    Returns:
        A ``ColumnTransformer`` with ``remainder='passthrough'``.

    Raises:
        ValueError: If ``impute_with`` is not ``'Average of values'``,
            ``'Median of values'`` or ``'Constant'``.
    """
    steps = []
    for column, spec in config['feature_handling'].items():
        if spec['is_selected']:
            strategy_label = spec['feature_details']['impute_with']
            # Read unconditionally, although only the 'Constant' strategy uses it.
            constant = spec['feature_details']['impute_value']

            if strategy_label == 'Constant':
                column_imputer = SimpleImputer(strategy='constant', fill_value=constant)
            elif strategy_label == 'Median of values':
                column_imputer = SimpleImputer(strategy='median')
            elif strategy_label == 'Average of values':
                column_imputer = SimpleImputer(strategy='mean')
            else:
                raise ValueError(f"Invalid impute method: {strategy_label}")

            steps.append((f'imp_{column}', column_imputer, [column]))
    return ColumnTransformer(steps, remainder='passthrough')

# === Feature Reducer Builder ===
def create_feature_reducer(config):
    """Build the feature-reduction step named by the config.

    Args:
        config: Parsed UI config; reads
            ``config['feature_reduction']['feature_reduction_method']`` and the
            option dict stored under that method's name (empty if absent).

    Returns:
        ``PassthroughTransformer`` for 'No Reduction', ``PCA`` for 'Principal
        Component Analysis', ``CorrelationThresholdSelector`` for 'Correlation
        with target', or ``SelectFromModel`` around a random-forest regressor
        for 'Tree-based'.

    Raises:
        ValueError: If the method name is none of the four above.
    """
    method = config['feature_reduction']['feature_reduction_method']
    opts = config['feature_reduction'].get(method, {})
    if method == 'No Reduction':
        return PassthroughTransformer()
    elif method == 'Principal Component Analysis':
        return PCA(n_components=opts.get('num_of_features_to_keep', 2))
    elif method == 'Correlation with target':
        return CorrelationThresholdSelector(
            threshold=opts.get('threshold', 0.5),
            num_features=opts.get('num_of_features_to_keep')
        )
    elif method == 'Tree-based':
        return SelectFromModel(
            RandomForestRegressor(
                n_estimators=opts.get('num_of_trees', 100),
                max_depth=opts.get('depth_of_trees', 10),
                random_state=0
            ),
            # With the default importance threshold SelectFromModel may keep
            # fewer than max_features columns; -inf makes max_features the only
            # criterion, so exactly the top-k features are kept.
            threshold=-np.inf,
            max_features=opts.get('num_of_features_to_keep', 2)
        )
    else:
        raise ValueError(f"Unknown feature reduction method: {method}")

# === Model Class Resolver ===
def get_model_class(name, task):
    """Map a model name from the config to its estimator class.

    Args:
        name: One of 'LinearRegression', 'RandomForestRegressor',
            'DecisionTreeRegressor'.
        task: Prediction type; only 'Regression' is supported.

    Returns:
        The estimator class (not an instance).

    Raises:
        ValueError: If ``task`` is not 'Regression'.
        KeyError: If ``name`` is not a known regressor.
    """
    if task != 'Regression':
        raise ValueError(f"Unsupported prediction_type: {task}")

    regressors = {
        'LinearRegression': LinearRegression,
        'RandomForestRegressor': RandomForestRegressor,
        'DecisionTreeRegressor': DecisionTreeRegressor,
    }
    return regressors[name]

# === Main Pipeline Executor ===
def main():
    """Train and evaluate every selected model described by the UI config.

    Loads the config and data, holds out 20% of the rows as a test set, and for
    each selected model grid-searches its hyperparameters inside an
    impute -> reduce -> model pipeline. It then prints the best parameters and
    the MSE, MAE and R2 on the held-out rows. A failure while fitting or
    scoring one model is printed and the loop continues with the next model.
    """
    # One seed for the split and for every estimator that accepts random_state,
    # so repeated runs print the same metrics.
    seed = 42

    config = load_config()
    X, y = load_data(config)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    imputer = create_imputer(config)
    reducer = create_feature_reducer(config)
    prediction_type = config['target']['prediction_type']

    for model_conf in config['models']:
        if not model_conf['is_selected']: continue
        name = model_conf['model_name']
        # Copy so the deletion below never alters the loaded config.
        hyperparams = dict(model_conf.get('hyperparameters', {}))

        # Handle bootstrap-max_samples conflict
        if name == "RandomForestRegressor" and 'bootstrap' in hyperparams and 'max_samples' in hyperparams:
            if False in hyperparams['bootstrap']:
                print(f"⚠️ Removing 'max_samples' because bootstrap=False is included for {name}")
                del hyperparams['max_samples']

        model_class = get_model_class(name, prediction_type)
        model = model_class()
        # Forests and trees are stochastic; seed them unless the grid itself
        # searches over random_state (grid values override this default).
        if 'random_state' in model.get_params():
            model.set_params(random_state=seed)

        # GridSearchCV clones the pipeline per candidate, so sharing the
        # imputer/reducer instances across models is safe.
        pipeline = Pipeline([
            ('imputer', imputer),
            ('reducer', reducer),
            ('model', model)
        ])

        grid_params = {f"model__{k}": v for k, v in hyperparams.items()}
        # NOTE(review): the config names a time-based K-fold strategy, but only
        # its fold count is used here as an integer `cv` — confirm intended.
        folds = config['hyperparameters']['Grid Search']['Time-based K-fold(with overlap)']['num_of_folds']

        search = GridSearchCV(pipeline, grid_params, scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)

        try:
            search.fit(X_train, y_train)
            preds = search.predict(X_test)

            print(f"\n✅ Model: {name}")
            print(f"Best Params: {search.best_params_}")
            print(f"MSE: {mean_squared_error(y_test, preds):.4f}")
            print(f"MAE: {mean_absolute_error(y_test, preds):.4f}")
            print(f"R2 Score: {r2_score(y_test, preds):.4f}")
            print("-" * 60)
        except Exception as e:
            # Deliberate best-effort: report the failure and try the next model.
            print(f"❌ Error with {name}: {e}")

if __name__ == "__main__":
    main()


✅ Model: LinearRegression
Best Params: {'model__fit_intercept': True}
MSE: 0.1507
MAE: 0.2766
R2 Score: 0.7629
------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
# Second UI-style configuration: same features, models and search settings,
# but with Principal Component Analysis as the feature-reduction method.
# The dict is bound to the same name that the json.dump call below uses.
algoparams_from_ui_ = {
    # What to predict.
    "target": {
        "prediction_type": "Regression",
        "target": "petal_width",
        "type": "regression",
        "partitioning": True
    },
    # Per-feature settings; each entry carries a selection flag and imputation details.
    "feature_handling": {
        "sepal_length": {
            "feature_name": "sepal_length",
            "is_selected": True,
            "feature_variable_type": "numerical",
            "feature_details": {
                "numerical_handling": "Keep as regular numerical feature",
                "rescaling": "Standardize (mean=0, std=1)",
                "make_derived_feats": False,
                "missing_values": "Impute",
                "impute_with": "Average of values",
                "impute_value": 5.86
            }
        },
        "sepal_width": {
            "feature_name": "sepal_width",
            "is_selected": True,
            "feature_variable_type": "numerical",
            "feature_details": {
                "numerical_handling": "Keep as regular numerical feature",
                "rescaling": "Standardize (mean=0, std=1)",
                "make_derived_feats": False,
                "missing_values": "Impute",
                "impute_with": "Constant",
                "impute_value": 3
            }
        },
        "petal_length": {
            "feature_name": "petal_length",
            "is_selected": True,
            "feature_variable_type": "numerical",
            "feature_details": {
                "numerical_handling": "Keep as regular numerical feature",
                "rescaling": "Standardize (mean=0, std=1)",
                "make_derived_feats": False,
                "missing_values": "Impute",
                "impute_with": "Average of values",
                "impute_value": 3.73
            }
        }
    },
    # "feature_reduction_method" names the active method.
    "feature_reduction": {
        "feature_reduction_method": "Principal Component Analysis",
        "No Reduction": {
            "is_selected": False,
            "num_of_features_to_keep": 5
        },
        "Correlation with target": {
            "is_selected": False,
            "num_of_features_to_keep": 2,
            "threshold": 0.5
        },
        "Tree-based": {
            "is_selected": False,
            "num_of_features_to_keep": 2,
            "depth_of_trees": 10,
            "num_of_trees": 100
        },
        "Principal Component Analysis": {
            "is_selected": True,
            "num_of_features_to_keep": 2
        }
    },
    # Candidate models and the hyperparameter grid to search for each.
    "models": [
        {
            "model_name": "LinearRegression",
            "is_selected": True,
            "hyperparameters": {
                "fit_intercept": [True, False]
            }
        },
        {
            "model_name": "RandomForestRegressor",
            "is_selected": True,
            "hyperparameters": {
                "n_estimators": [50, 100, 150, 200],
                "max_depth": [None, 10, 20, 30],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "max_features": ["sqrt", "log2", 0.5, None],
                "bootstrap": [True],
                "max_samples": [None, 0.5, 0.75],
                "warm_start": [True, False]
            }
        },
        {
            "model_name": "DecisionTreeRegressor",
            "is_selected": True,
            "hyperparameters": {
                "criterion": ["squared_error", "absolute_error"],
                "splitter": ["best", "random"],
                "max_depth": [None, 5, 10, 20],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "max_features": [None, "sqrt", "log2", 0.5]
            }
        }
    ],
    # Search settings.
    "hyperparameters": {
        "search_method": "Grid Search",
        "Grid Search": {
            "is_selected": True,
            "shuffle_grid": True,
            "random_state": 0,
            "max_iterations": 10,
            "max_search_time": 0,
            "cross_validation_strategy": "Time-based K-fold(with overlap)",
            "Time-based K-fold(with overlap)": {
                "is_selected": True,
                "num_of_folds": 5,
                "split_ratio": 0,
                "stratified": False
            }
        }
    }
}


# Serialise the configuration next to the notebook (path is relative to the cwd).
with open("algoparams_from_ui_.json", "w") as file:
    json.dump(algoparams_from_ui_, file, indent=4)

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

def generate_sample_data(output_file='output.csv', missing_rate=0.1, target_col='petal_width', random_seed=42):
    """Write the iris measurements to CSV with random gaps in the feature columns.

    Args:
        output_file: Path of the CSV file to write (no index column).
        missing_rate: Each feature cell is blanked when its uniform draw in
            [0, 1) falls below this value.
        target_col: Column that is never blanked; every other column is
            treated as a feature.
        random_seed: Seed applied to NumPy's global RNG before drawing.
    """
    np.random.seed(random_seed)

    column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    frame = pd.DataFrame(data=load_iris().data, columns=column_names)

    # Only the non-target columns receive missing values.
    predictors = [name for name in column_names if name != target_col]
    n_rows, n_predictors = frame[predictors].shape
    blank_cell = np.random.rand(n_rows, n_predictors) < missing_rate
    frame.loc[:, predictors] = frame.loc[:, predictors].where(~blank_cell)

    frame.to_csv(output_file, index=False)
    print(f"✅ Dataset with missing values saved to '{output_file}'")


if __name__ == "__main__":
    generate_sample_data()


✅ Dataset with missing values saved to 'output.csv'


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, TransformerMixin

class CorrelationThresholdSelector(BaseEstimator, TransformerMixin):
    """Keep the columns whose absolute Pearson correlation with the target is highest.

    Parameters
    ----------
    threshold : float, default=0.5
        Used only when ``num_features`` is falsy: keep columns whose absolute
        correlation with ``y`` is strictly greater than this value.
    num_features : int or None, default=None
        When truthy, keep the ``num_features`` most correlated columns and
        ignore ``threshold``.

    Attributes
    ----------
    selected_indices_ : ndarray of int or None
        Positional indices of the kept columns; ``None`` until ``fit`` has run.
    """

    def __init__(self, threshold=0.5, num_features=None):
        self.threshold = threshold
        self.num_features = num_features
        self.selected_indices_ = None

    def fit(self, X, y):
        """Compute per-column |corr(X[:, i], y)| and record which columns to keep.

        If the chosen rule selects no column at all, every column is kept.
        Returns ``self``.
        """
        X_np = X.to_numpy() if isinstance(X, pd.DataFrame) else X
        # A zero-variance column makes np.corrcoef divide by zero and yield NaN
        # (with a RuntimeWarning); silence the warning and handle the NaN below.
        with np.errstate(divide='ignore', invalid='ignore'):
            correlations = np.array([
                np.abs(np.corrcoef(X_np[:, i], y)[0, 1]) for i in range(X_np.shape[1])
            ])
        # np.argsort places NaN last, so an undefined correlation would otherwise
        # be ranked as the strongest feature by the top-k slice. Treat it as 0.
        correlations = np.where(np.isnan(correlations), 0.0, correlations)
        if self.num_features:
            # Stable sort keeps tie-breaking deterministic (ties are likelier
            # now that undefined correlations collapse to 0).
            self.selected_indices_ = np.argsort(correlations, kind='stable')[-self.num_features:]
        else:
            self.selected_indices_ = np.where(correlations > self.threshold)[0]
        if len(self.selected_indices_) == 0:
            # Never hand an empty matrix to the next pipeline step.
            self.selected_indices_ = np.arange(X_np.shape[1])
        return self

    def transform(self, X):
        """Return only the columns chosen by ``fit`` (DataFrame in, DataFrame out).

        Raises
        ------
        NotFittedError
            If called before ``fit``. Indexing with the initial ``None`` would
            otherwise silently add an axis to a NumPy array.
        """
        if self.selected_indices_ is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "CorrelationThresholdSelector is not fitted yet; call 'fit' before 'transform'."
            )
        return X.iloc[:, self.selected_indices_] if isinstance(X, pd.DataFrame) else X[:, self.selected_indices_]

class PassthroughTransformer(BaseEstimator, TransformerMixin):
    """No-op transformer: ``fit`` learns nothing and ``transform`` returns its input."""

    def fit(self, X, y=None):
        """Return ``self`` without inspecting ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as received (the same object, not a copy)."""
        return X

def load_config(file_path='/content/algoparams_from_ui_.json'):
    """Load the UI configuration JSON and return it as a dict.

    Args:
        file_path: Path of the JSON file. If it does not exist but a file with
            the same base name exists in the current working directory, that
            file is loaded instead. The notebook writes the config relative to
            the working directory, which is only ``/content`` on Colab.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If neither ``file_path`` nor the same-named file in
            the working directory exists.
    """
    import os

    if not os.path.exists(file_path):
        local_copy = os.path.basename(file_path)
        if os.path.exists(local_copy):
            file_path = local_copy
    with open(file_path, 'r') as f:
        return json.load(f)

def load_data(config, file_path='output.csv'):
    """Read the CSV and split it into selected feature columns and the target.

    Args:
        config: Parsed UI config; reads ``config['target']['target']`` and the
            ``is_selected`` flag of every ``config['feature_handling']`` entry.
        file_path: CSV file to read.

    Returns:
        ``(X, y)`` where ``X`` holds the selected feature columns and ``y`` the
        target column. Rows whose target is missing are dropped from both.
    """
    raw = pd.read_csv(file_path)
    target_name = config['target']['target']

    chosen = []
    for column, spec in config['feature_handling'].items():
        if spec['is_selected']:
            chosen.append(column)

    # Missing features are kept (they are imputed later); a missing target is not.
    labelled = raw.dropna(subset=[target_name])
    return labelled[chosen], labelled[target_name]

def create_imputer(config):
    """Build a ColumnTransformer holding one SimpleImputer per selected feature.

    Args:
        config: Parsed UI config; reads ``is_selected`` and
            ``feature_details['impute_with']`` / ``['impute_value']`` of every
            ``config['feature_handling']`` entry.

    Returns:
        A ``ColumnTransformer`` with ``remainder='passthrough'``.

    Raises:
        ValueError: If ``impute_with`` is not ``'Average of values'``,
            ``'Median of values'`` or ``'Constant'``.
    """
    steps = []
    for column, spec in config['feature_handling'].items():
        if spec['is_selected']:
            strategy_label = spec['feature_details']['impute_with']
            # Read unconditionally, although only the 'Constant' strategy uses it.
            constant = spec['feature_details']['impute_value']

            if strategy_label == 'Constant':
                column_imputer = SimpleImputer(strategy='constant', fill_value=constant)
            elif strategy_label == 'Median of values':
                column_imputer = SimpleImputer(strategy='median')
            elif strategy_label == 'Average of values':
                column_imputer = SimpleImputer(strategy='mean')
            else:
                raise ValueError(f"Invalid impute method: {strategy_label}")

            steps.append((f'imp_{column}', column_imputer, [column]))
    return ColumnTransformer(steps, remainder='passthrough')

def create_feature_reducer(config):
    """Build the feature-reduction step named by the config.

    Args:
        config: Parsed UI config; reads
            ``config['feature_reduction']['feature_reduction_method']`` and the
            option dict stored under that method's name (empty if absent).

    Returns:
        ``PassthroughTransformer`` for 'No Reduction', ``PCA`` for 'Principal
        Component Analysis', ``CorrelationThresholdSelector`` for 'Correlation
        with target', or ``SelectFromModel`` around a random-forest regressor
        for 'Tree-based'.

    Raises:
        ValueError: If the method name is none of the four above.
    """
    method = config['feature_reduction']['feature_reduction_method']
    opts = config['feature_reduction'].get(method, {})
    if method == 'No Reduction':
        return PassthroughTransformer()
    elif method == 'Principal Component Analysis':
        return PCA(n_components=opts.get('num_of_features_to_keep', 2))
    elif method == 'Correlation with target':
        return CorrelationThresholdSelector(
            threshold=opts.get('threshold', 0.5),
            num_features=opts.get('num_of_features_to_keep')
        )
    elif method == 'Tree-based':
        return SelectFromModel(
            RandomForestRegressor(
                n_estimators=opts.get('num_of_trees', 100),
                max_depth=opts.get('depth_of_trees', 10),
                random_state=0
            ),
            # With the default importance threshold SelectFromModel may keep
            # fewer than max_features columns; -inf makes max_features the only
            # criterion, so exactly the top-k features are kept.
            threshold=-np.inf,
            max_features=opts.get('num_of_features_to_keep', 2)
        )
    else:
        raise ValueError(f"Unknown feature reduction method: {method}")

def get_model_class(name, task):
    """Map a model name from the config to its estimator class.

    Args:
        name: One of 'LinearRegression', 'RandomForestRegressor',
            'DecisionTreeRegressor'.
        task: Prediction type; only 'Regression' is supported.

    Returns:
        The estimator class (not an instance).

    Raises:
        ValueError: If ``task`` is not 'Regression'.
        KeyError: If ``name`` is not a known regressor.
    """
    if task != 'Regression':
        raise ValueError(f"Unsupported prediction_type: {task}")

    regressors = {
        'LinearRegression': LinearRegression,
        'RandomForestRegressor': RandomForestRegressor,
        'DecisionTreeRegressor': DecisionTreeRegressor,
    }
    return regressors[name]

def main():
    """Train and evaluate every selected model described by the UI config.

    Loads the config and data, holds out 20% of the rows as a test set, and for
    each selected model grid-searches its hyperparameters inside an
    impute -> reduce -> model pipeline. It then prints the best parameters and
    the MSE, MAE and R2 on the held-out rows. A failure while fitting or
    scoring one model is printed and the loop continues with the next model.
    """
    # One seed for the split and for every estimator that accepts random_state,
    # so repeated runs print the same metrics.
    seed = 42

    config = load_config()
    X, y = load_data(config)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    imputer = create_imputer(config)
    reducer = create_feature_reducer(config)
    prediction_type = config['target']['prediction_type']

    for model_conf in config['models']:
        if not model_conf['is_selected']: continue
        name = model_conf['model_name']
        # Copy so the deletion below never alters the loaded config.
        hyperparams = dict(model_conf.get('hyperparameters', {}))

        # Dropping max_samples avoids combining it with bootstrap=False in the grid.
        if name == "RandomForestRegressor" and 'bootstrap' in hyperparams and 'max_samples' in hyperparams:
            if False in hyperparams['bootstrap']:
                print(f"⚠️ Removing 'max_samples' because bootstrap=False is included for {name}")
                del hyperparams['max_samples']

        model_class = get_model_class(name, prediction_type)
        model = model_class()
        # Forests and trees are stochastic; seed them unless the grid itself
        # searches over random_state (grid values override this default).
        if 'random_state' in model.get_params():
            model.set_params(random_state=seed)

        # GridSearchCV clones the pipeline per candidate, so sharing the
        # imputer/reducer instances across models is safe.
        pipeline = Pipeline([
            ('imputer', imputer),
            ('reducer', reducer),
            ('model', model)
        ])

        grid_params = {f"model__{k}": v for k, v in hyperparams.items()}
        # NOTE(review): the config names a time-based K-fold strategy, but only
        # its fold count is used here as an integer `cv` — confirm intended.
        folds = config['hyperparameters']['Grid Search']['Time-based K-fold(with overlap)']['num_of_folds']

        search = GridSearchCV(pipeline, grid_params, scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)

        try:
            search.fit(X_train, y_train)
            preds = search.predict(X_test)

            print(f"\n✅ Model: {name}")
            print(f"Best Params: {search.best_params_}")
            print(f"MSE: {mean_squared_error(y_test, preds):.4f}")
            print(f"MAE: {mean_absolute_error(y_test, preds):.4f}")
            print(f"R2 Score: {r2_score(y_test, preds):.4f}")
            print("-" * 60)
        except Exception as e:
            # Deliberate best-effort: report the failure and try the next model.
            print(f"❌ Error with {name}: {e}")

if __name__ == "__main__":
    main()



✅ Model: LinearRegression
Best Params: {'model__fit_intercept': True}
MSE: 0.1535
MAE: 0.2736
R2 Score: 0.7586
------------------------------------------------------------

✅ Model: RandomForestRegressor
Best Params: {'model__bootstrap': True, 'model__max_depth': None, 'model__max_features': None, 'model__max_samples': 0.5, 'model__min_samples_leaf': 4, 'model__min_samples_split': 5, 'model__n_estimators': 50, 'model__warm_start': True}
MSE: 0.1111
MAE: 0.2357
R2 Score: 0.8252
------------------------------------------------------------

✅ Model: DecisionTreeRegressor
Best Params: {'model__criterion': 'squared_error', 'model__max_depth': 20, 'model__max_features': 'log2', 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__splitter': 'best'}
MSE: 0.2292
MAE: 0.3061
R2 Score: 0.6395
------------------------------------------------------------
