In [2]:
import json

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Input file locations. NOTE(review): the '/content/drive/MyDrive' prefix looks like a
# Colab Google Drive mount, so these absolute paths presumably only resolve there — confirm.
json_path = '/content/drive/MyDrive/Assigment-pro/json_data.json'  # JSON configuration file (loaded into `config` in the next cell)
dataset_path = '/content/drive/MyDrive/Assigment-pro/iris.csv'  # Path to the iris.csv dataset (read with pandas below)

In [4]:
# Read the JSON configuration that the remaining cells consult.
with open(json_path, 'r') as config_file:
    config = json.load(config_file)

# The 'target' section names the column to predict and the prediction type.
target_section = config['design_state_data']['target']
target_column = target_section['target']
prediction_type = target_section['prediction_type']
# The 'feature_handling' section is kept exactly as loaded.
features = config['design_state_data']['feature_handling']

print(f"Target Column: {target_column}")
print(f"Prediction Type: {prediction_type}")

Target Column: petal_width
Prediction Type: Regression


In [5]:
data = pd.read_csv(dataset_path)
# Split the frame: the configured target column becomes y,
# every remaining column is treated as a feature.
y = data[target_column]  # Target
X = data.drop(columns=[target_column])  # Features

In [6]:
# Fit the encoder on the target, then replace y with its integer codes.
# NOTE(review): the recorded config output reports target 'petal_width' with
# prediction type 'Regression'; label-encoding maps every distinct target value
# to a class index, i.e. it treats the target as categorical — confirm this is intended.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [7]:
# Identify numeric and categorical (object-dtype) feature columns
numeric_columns = X.select_dtypes(include=['number']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Numeric columns: fill missing values with the column mean.
numeric_imputer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. Imputation alone would hand raw strings to the estimator,
# which the random forest cannot consume; handle_unknown='ignore' keeps
# prediction from failing on a category that was not seen during fit.
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_transformer = Pipeline([
    ('imputer', categorical_imputer),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_imputer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [8]:
# Read the feature-generation section and its three interaction lists;
# a key that is absent yields an empty list.
# NOTE(review): none of the cells visible here apply these interactions to X.
feature_generation = config['design_state_data']['feature_generation']
(linear_interactions,
 polynomial_interactions,
 explicit_pairwise_interactions) = (
    feature_generation.get(interaction_key, [])
    for interaction_key in (
        'linear_interactions',
        'polynomial_interactions',
        'explicit_pairwise_interactions',
    )
)

print(f"Linear Interactions: {linear_interactions}")
print(f"Polynomial Interactions: {polynomial_interactions}")
print(f"Explicit Pairwise Interactions: {explicit_pairwise_interactions}")

Linear Interactions: [['petal_length', 'sepal_width']]
Polynomial Interactions: ['petal_length/sepal_width', 'petal_width/species']
Explicit Pairwise Interactions: ['sepal_width/sepal_length', 'petal_width/sepal_length']


In [9]:
feature_reduction = config['design_state_data']['feature_reduction']
reduction_method = feature_reduction['feature_reduction_method']

# Only the "Tree-based" method is recognised; for any other value
# num_features_to_keep is left undefined. This cell only reports the
# setting — no reduction is carried out here.
uses_tree_based_reduction = reduction_method == "Tree-based"
if uses_tree_based_reduction:
    num_features_to_keep = int(feature_reduction['num_of_features_to_keep'])
    print(f"Performing tree-based feature reduction, keeping {num_features_to_keep} features.")


Performing tree-based feature reduction, keeping 4 features.


In [12]:
# Name of the first algorithm flagged as selected in the config, or None
# when no entry is flagged.
selected_model_name = next(
    (
        algorithm_name
        for algorithm_name, algorithm_settings
        in config['design_state_data']['algorithms'].items()
        if algorithm_settings['is_selected']
    ),
    None,
)

print(f"Selected Model: {selected_model_name}")

# A grid search is prepared only for the one model name handled below;
# otherwise grid_search stays None.
grid_search = None

if selected_model_name == 'RandomForestClassifier':
    model = RandomForestClassifier()

    # Candidate hyperparameter values explored by the search.
    param_grid = dict(
        n_estimators=[10, 20],
        max_depth=[20, 25],
        min_samples_split=[2, 5],
    )

    # 5-fold cross-validated search over the grid, using all available cores.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )

Selected Model: RandomForestRegressor


In [23]:
#Build the pipeline
model = RandomForestClassifier()

grid_search = None


if selected_model_name == 'RandomForestClassifier':

    param_grid = {
        'n_estimators': [10, 20],
        'max_depth': [20, 25],
        'min_samples_split': [2, 5]
    }

    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', grid_search if grid_search is not None else model)
])

pipeline.fit(X, y)

In [24]:
# Predict using the trained pipeline
predictions = pipeline.predict(X)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y, predictions)
print(f"Accuracy of the model: {accuracy}")

Accuracy of the model: 0.9733333333333334
