In [1]:
import json
import pandas as pd
import numpy as np  # Import numpy module and use 'np' as an alias
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score


In [4]:
import numpy as np



In [5]:
def read_json(json_data):
    """
    Pull the analysis settings out of the parsed JSON configuration.

    Parameters
    ----------
    json_data : dict
        Parsed configuration. Must contain ``design_state_data`` with a
        ``target`` mapping (keys ``target`` and ``type``) and a
        ``feature_handling`` entry.

    Returns
    -------
    tuple
        ``(target, regression_type, features)`` where ``target`` is the value
        stored under ``target.target``, ``regression_type`` the value stored
        under ``target.type`` and ``features`` the ``feature_handling`` entry.

    Raises
    ------
    KeyError
        If any of the keys listed above is missing.
    """
    design_state = json_data['design_state_data']
    target_section = design_state['target']

    target_name = target_section['target']
    problem_type = target_section['type']
    feature_config = design_state['feature_handling']

    return target_name, problem_type, feature_config


In [6]:
def load_data(csv_file_path):
    """
    Load data from a CSV file and return it as a pandas DataFrame.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location of the CSV file to read. It is passed straight to
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # Honour the caller's path: the previous version ignored the argument and
    # always read the hard-coded "iris.csv" from the working directory.
    return pd.read_csv(csv_file_path)

In [7]:
def apply_missing_imputation(df, feature_handling):
    """
    Apply missing imputation to the DataFrame based on the feature handling configuration.

    Only features flagged ``is_selected`` whose ``feature_details`` request
    ``missing_values == 'Impute'`` are touched. ``impute_with ==
    'Average of values'`` fills with the column mean; any other value fills
    with the constant stored under ``impute_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute. Columns are overwritten in place.
    feature_handling : dict
        Mapping of column name to that column's handling configuration.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.

    Raises
    ------
    KeyError
        If a feature that must be imputed is not a column of ``df``, or if a
        constant imputation is requested without an ``impute_value``.
    """
    for feature_name, feature_config in feature_handling.items():
        if not feature_config.get('is_selected'):
            continue

        # Not every feature carries imputation settings (the 'missing_values'
        # key may be absent); treat those as "nothing to impute" instead of
        # failing with KeyError.
        details = feature_config.get('feature_details') or {}
        if details.get('missing_values') != 'Impute':
            continue

        if details.get('impute_with') == 'Average of values':
            imputer = SimpleImputer(strategy='mean')
        else:
            imputer = SimpleImputer(strategy='constant', fill_value=details['impute_value'])

        # fit_transform returns a 2-D (n_rows, 1) array; flatten it so a plain
        # 1-D column is assigned back.
        df[feature_name] = imputer.fit_transform(df[[feature_name]]).ravel()
    return df


In [8]:
def perform_feature_reduction(df, feature_reduction_method, reduction_params, target=None):
    """
    Perform feature reduction based on the selected method.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to reduce.
    feature_reduction_method : str
        One of ``'No Reduction'``, ``'Corr with Target'``, ``'Tree-based'`` or
        ``'PCA'``.
    reduction_params : dict or None
        Method options. For PCA, ``pca_components`` gives the number of
        components (default 2).
    target : str, optional
        Name of the label column. When given and present in ``df`` it is kept
        out of the PCA projection and re-attached to the result unchanged.
        With the default ``None`` every column is projected.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself for ``'No Reduction'`` and for the two methods that are
        not implemented yet (``'Corr with Target'``, ``'Tree-based'``). For
        ``'PCA'``, a new frame with columns ``PC1..PCn`` (plus ``target`` when
        requested) that keeps the row index of ``df``.

    Raises
    ------
    ValueError
        If ``feature_reduction_method`` is not one of the names above.
    """
    if feature_reduction_method in ('No Reduction', 'Corr with Target', 'Tree-based'):
        # 'Corr with Target' and 'Tree-based' are placeholders: no reduction is
        # implemented for them, so the data passes through untouched.
        return df

    if feature_reduction_method == 'PCA':
        params = reduction_params or {}
        keep_target = target is not None and target in df.columns
        features = df.drop(columns=[target]) if keep_target else df

        pca = PCA(n_components=params.get('pca_components', 2))
        components = pca.fit_transform(features)

        # Wrap the ndarray so this branch returns the same type as the others;
        # callers index the result by column name.
        component_names = ['PC{}'.format(i + 1) for i in range(components.shape[1])]
        reduced = pd.DataFrame(
            components,
            columns=component_names,
            index=getattr(df, 'index', None),
        )
        if keep_target:
            reduced[target] = df[target]
        return reduced

    raise ValueError("Invalid feature reduction method specified in the JSON.")

def build_models(json_data):
    """
    Return the candidate estimators for the configured prediction type.

    The type is read from ``design_state_data.target.prediction_type``.

    Returns
    -------
    list of (str, estimator)
        For ``'Regression'``: a random forest regressor and a linear
        regression. For ``'Classification'``: a random forest classifier, a
        logistic regression and an SVC. All are created with default settings.

    Raises
    ------
    ValueError
        If the prediction type is neither ``'Regression'`` nor
        ``'Classification'``.
    """
    kind = json_data['design_state_data']['target']['prediction_type']

    # Reject unknown types up front so the happy paths below stay flat.
    if kind not in ('Regression', 'Classification'):
        raise ValueError("Invalid prediction_type specified in the JSON.")

    if kind == 'Regression':
        return [
            ('RandomForestRegressor', RandomForestRegressor()),
            ('LinearRegression', LinearRegression()),
        ]

    return [
        ('RandomForestClassifier', RandomForestClassifier()),
        ('LogisticRegression', LogisticRegression()),
        ('SVC', SVC()),
    ]

In [9]:
def run_model_fit_predict(model, X_train, X_test, y_train, y_test):
    """
    Train ``model`` on the training split and predict the test split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and labels handed to ``fit``.
    X_test
        Features handed to ``predict``.
    y_test
        Accepted but not used by this function.

    Returns
    -------
    Whatever ``model.predict(X_test)`` returns.
    """
    # Fit first; the return value of fit() is deliberately not relied upon.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

In [10]:
def calculate_model_metrics(y_true, y_pred, regression_type):
    """
    Calculate model evaluation metrics based on regression_type.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Values predicted by the model.
    regression_type : str
        ``'Regression'`` or ``'Classification'``.

    Returns
    -------
    dict
        ``{'MSE': ...}`` for regression, or
        ``{'Accuracy': ..., 'F1 Score': ...}`` for classification. The F1
        score uses scikit-learn's default binary averaging when at most two
        distinct labels occur, and support-weighted averaging otherwise.

    Raises
    ------
    ValueError
        If ``regression_type`` is not one of the two supported values.
    """
    metrics = {}
    if regression_type == 'Regression':
        # For Regression, use Mean Squared Error
        metrics['MSE'] = mean_squared_error(y_true, y_pred)
    elif regression_type == 'Classification':
        # For Classification, use Accuracy and F1 Score
        metrics['Accuracy'] = accuracy_score(y_true, y_pred)

        # f1_score defaults to average='binary', which raises on targets with
        # more than two classes. Keep the default for binary problems and
        # switch to a weighted average for multiclass ones.
        labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
        average = 'binary' if len(labels) <= 2 else 'weighted'
        metrics['F1 Score'] = f1_score(y_true, y_pred, average=average)
    else:
        raise ValueError("Invalid regression_type specified in the JSON.")

    return metrics

In [36]:
def parse_and_run_pipeline(json_data, csv_file_path):
    """
    Parse the JSON and execute the machine learning pipeline.

    The steps visible here are: read the configuration, load the CSV, impute
    missing values, optionally reduce features, build the candidate models,
    split the data 80/20 (random_state=42), wrap every model in a scikit-learn
    ``Pipeline`` and fit each pipeline through ``GridSearchCV`` (5-fold).

    Parameters
    ----------
    json_data : dict
        Parsed configuration. This function reads
        ``design_state_data.feature_reduction`` directly; the remaining keys
        are consumed by ``read_json`` and ``build_models``.
    csv_file_path : str
        Path forwarded to ``load_data``.

    Notes
    -----
    NOTE(review): the code shown ends right after the predictions are
    computed inside the last loop; nothing is returned or stored in the
    portion reviewed here. Confirm whether the function continues.
    """
    # Step 1: Read the JSON data
    # (regression_type is not used anywhere in the code shown below.)
    target, regression_type, feature_handling = read_json(json_data)

    # Step 2: Load the data from CSV
    data = load_data(csv_file_path)

    # Step 3: Apply missing imputation
    # Runs on the full dataset, i.e. before the train/test split of Step 7.
    data = apply_missing_imputation(data, feature_handling)

    # Step 4: Feature Reduction
    # 'reduction_params' is optional in the JSON and defaults to an empty dict.
    feature_reduction_method = json_data['design_state_data']['feature_reduction']['feature_reduction_method']
    reduction_params = json_data['design_state_data']['feature_reduction'].get('reduction_params', {})
    if feature_reduction_method != 'No Reduction':
        # NOTE(review): the whole frame, target column included, is handed to
        # the reduction step, and the same method is applied again inside each
        # pipeline in Step 8. For 'PCA' the target may therefore no longer be
        # a column when Step 6 drops it -- confirm this is intended.
        data = perform_feature_reduction(data, feature_reduction_method, reduction_params)

    # Step 5: Build models based on prediction type
    models = build_models(json_data)

    # Step 6: Prepare data for model training and evaluation
    X = data.drop(target, axis=1)
    y = data[target]

    # Step 7: Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 8: Create pipelines for each model with GridSearchCV for hyperparameter tuning
    pipelines = []
    for name, model in models:
        pipeline_steps = []

        # Feature Handling
        # One imputer step, named '<feature>_imputer', is appended per selected
        # feature configured with missing_values == 'Impute'.
        # NOTE(review): a Pipeline step transforms the entire X it receives,
        # not only the column it is named after -- verify this is intended
        # (per-column preprocessing normally goes through a ColumnTransformer).
        for feature_name, feature_details in feature_handling.items():
            if feature_details['is_selected']:
                if feature_details['feature_details']['missing_values'] == 'Impute':
                    if feature_details['feature_details']['impute_with'] == 'Average of values':
                        imputer = SimpleImputer(strategy='mean')
                    else:
                        imputer = SimpleImputer(strategy='constant', fill_value=feature_details['feature_details']['impute_value'])
                    pipeline_steps.append((feature_name + '_imputer', imputer))

        # Feature Reduction
        # 'PCA' adds a PCA step (default 2 components); 'Tree-based' adds a
        # SelectKBest step (default k=10). Other methods add no step.
        if feature_reduction_method == 'PCA':
            n_components = reduction_params.get('pca_components', 2)
            pipeline_steps.append(('pca', PCA(n_components=n_components)))
        elif feature_reduction_method == 'Tree-based':
            k_best = reduction_params.get('tree_k_best', 10)
            pipeline_steps.append(('k_best', SelectKBest(k=k_best)))

        # Model Building
        # The estimator is always the final step and reuses the model's name.
        pipeline_steps.append((name, model))

        pipeline = Pipeline(pipeline_steps)
        pipelines.append((name, pipeline))

    # Step 9: Run the fit and predict on each model using GridSearchCV for hyperparameter tuning
    # With an empty param_grid only the pipeline's current settings are
    # cross-validated; no hyperparameters are actually searched.
    for name, pipeline in pipelines:
        grid_search = GridSearchCV(pipeline, param_grid={}, cv=5)  # Add param_grid for hyperparameter tuning
        grid_search.fit(X_train, y_train)
        # y_pred is reassigned on every iteration in the code shown here.
        y_pred = grid_search.predict(X_test)