In [1]:
import json

with open("C:\\Users\\samre\\Downloads\\ml_assignment\\algoparams_from_ui.json") as f:
    config = json.load(f)

print(config['design_state_data'])


{'session_info': {'project_id': '1', 'experiment_id': 'kkkk-11', 'dataset': 'iris_modified.csv', 'session_name': 'test', 'session_description': 'test'}, 'target': {'prediction_type': 'Regression', 'target': 'petal_width', 'type': 'regression', 'partitioning': True}, 'train': {'policy': 'Split the dataset', 'time_variable': 'sepal_length', 'sampling_method': 'No sampling(whole data)', 'split': 'Randomly', 'k_fold': False, 'train_ratio': 0, 'random_seed': 0}, 'metrics': {'optomize_model_hyperparameters_for': 'AUC', 'optimize_threshold_for': 'F1 Score', 'compute_lift_at': 0, 'cost_matrix_gain_for_true_prediction_true_result': 1, 'cost_matrix_gain_for_true_prediction_false_result': 0, 'cost_matrix_gain_for_false_prediction_true_result': 0, 'cost_matrix_gain_for_false_prediction_false_result': 0}, 'feature_handling': {'sepal_length': {'feature_name': 'sepal_length', 'is_selected': True, 'feature_variable_type': 'numerical', 'feature_details': {'numerical_handling': 'Keep as regular numerica

In [2]:
import pandas as pd
import numpy as np
import json
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.filterwarnings("ignore")


In [3]:
def load_data(dataset_path, target_column):
    df = pd.read_csv("C:\\Users\\samre\\Downloads\\DS_Assignment - internship\\Screening Test - DS\\iris.csv")
    df = df.drop_duplicates()
    df = df.dropna(subset=[target_column])  # drop rows where target is missing
    return df


In [4]:
def build_preprocessor(feature_handling_config, df):
    num_impute = []
    num_custom_values = {}
    cat_features = []

    for col, info in feature_handling_config.items():
        if not info['is_selected']:
            continue
        details = info['feature_details']
        if info['feature_variable_type'] == 'numerical':
            if details['impute_with'] == 'Average of values':
                num_impute.append(col)
            elif details['impute_with'] == 'custom':
                num_custom_values[col] = details['impute_value']
        elif info['feature_variable_type'] == 'text':
            cat_features.append(col)

    # Create imputers
    transformers = []

    if num_impute:
        transformers.append(('num_mean_imputer', SimpleImputer(strategy='mean'), num_impute))
    
    if cat_features:
        transformers.append(('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features))
    
    # Custom imputation is done manually outside of ColumnTransformer
    return ColumnTransformer(transformers), num_custom_values


In [5]:
def apply_custom_imputation(df, custom_values):
    for col, val in custom_values.items():
        df[col] = df[col].fillna(val)
    return df


In [6]:
def apply_feature_reduction(X, y, method='tree_based'):
    model = RandomForestRegressor(n_estimators=10, random_state=42)
    model.fit(X, y)
    selector = SelectFromModel(model, prefit=True, threshold="median")
    return selector.transform(X), selector


In [7]:
def split_data(X, y, strategy='random', test_size=0.2):
    return train_test_split(X, y, test_size=test_size, random_state=42)


In [8]:
def train_model(X_train, X_test, y_train, y_test, algo_config):
    model_results = {}
    for model_name, model_info in algo_config.items():
        if not model_info.get('is_selected', False):
            continue
        if model_name == 'RandomForestRegressor':
            param_grid = {
                'n_estimators': [model_info['min_trees'], model_info['max_trees']],
                'max_depth': [model_info['min_depth'], model_info['max_depth']],
                'min_samples_leaf': [model_info['min_samples_per_leaf_min_value'], model_info['min_samples_per_leaf_max_value']]
            }
            base_model = RandomForestRegressor(random_state=42)
            grid = GridSearchCV(base_model, param_grid, cv=3, scoring='r2')
            grid.fit(X_train, y_train)
            preds = grid.predict(X_test)

            model_results[model_name] = {
                'best_params': grid.best_params_,
                'r2': r2_score(y_test, preds),
                'rmse': mean_squared_error(y_test, preds, squared=False)
            }
    return model_results


In [9]:
def save_results(results, filename='result.json'):
    with open(filename, 'w') as f:
        json.dump(results, f, indent=4)


In [11]:
def main():
    config_path = 'algoparams_from_ui.json'
    config = load_config(config_path)

    dataset_path = config['session_info']['dataset']
    target = config['target']['target']
    df = load_data(dataset_path, target)

    X = df.drop(columns=[target])
    y = df[target]

    # Feature Handling
    preprocessor, custom_values = build_preprocessor(config['feature_handling'], X)
    X = apply_custom_imputation(X, custom_values)

    # Apply ColumnTransformer
    X = preprocessor.fit_transform(X)

    # Optional: Feature Reduction
    X, selector = apply_feature_reduction(X, y)

    # Split
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Model Training
    results = train_model(X_train, X_test, y_train, y_test, config['algorithms'])

    # Save Output
    save_results(results)

    print("Done. Results saved to result.json")


# FINAL CODE

In [1]:


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load dataset
df = pd.read_csv("C:\\Users\\samre\\Downloads\\DS_Assignment - internship\\Screening Test - DS\\iris.csv")

# -------------------------
# Step 1: Handle Missing Values (based on your config)
impute_strategies = {
    'sepal_length': 'median',
    'sepal_width': 'mean',
    'petal_length': 'most_frequent',
    'petal_width': 'mean'
}

for col, strategy in impute_strategies.items():
    imputer = SimpleImputer(strategy=strategy)
    df[col] = imputer.fit_transform(df[[col]])

# -------------------------
# Step 2: Feature Engineering
df['linear_interaction'] = df['petal_length'] * df['sepal_width']
df['polynomial_interaction'] = df['petal_length'] / (df['sepal_width'] + 1e-6)  # to avoid divide by zero
df['explicit_interaction'] = df['sepal_length'] + df['petal_width']

# -------------------------
# Step 3: Define Features and Target
features = ['sepal_length', 'sepal_width', 'petal_length', 
            'linear_interaction', 'polynomial_interaction', 'explicit_interaction']
target = 'petal_width'

X = df[features]
y = df[target]

# -------------------------
# Step 4: Feature Reduction (Tree-based - Top 4 by importance)
# Temporary model to get feature importances
temp_model = RandomForestRegressor(random_state=0)
temp_model.fit(X, y)
importances = temp_model.feature_importances_

# Get top 4 important features
top_indices = np.argsort(importances)[::-1][:4]
X = X.iloc[:, top_indices]

# -------------------------
# Step 5: Train-Test Split (70-30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=0, shuffle=True
)

# -------------------------
# Step 6: Model Training with Grid Search + TimeSeriesSplit
param_grid = {
    'n_estimators': [10, 15, 20],
    'max_depth': [20, 22, 25],
    'min_samples_leaf': [5, 7, 10]
}

tscv = TimeSeriesSplit(n_splits=6)
model = RandomForestRegressor(random_state=0)
grid = GridSearchCV(model, param_grid, cv=tscv)
grid.fit(X_train, y_train)

# -------------------------
# Step 7: Evaluation
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("\n✅ Best Model Params:", grid.best_params_)
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))



✅ Best Model Params: {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 15}
R² Score: 0.9293920913583217
Mean Squared Error (MSE): 0.03608848663908004
Mean Absolute Error (MAE): 0.14173496757418327
