# In [None]:
import itertools
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from itertools import combinations

# Load the input table; the path is relative to the working directory.
data_path = 'mergedData_annotated.num.csv'
df = pd.read_csv(data_path)

# Candidate predictor columns of `df`; every non-empty subset of this list
# is evaluated further down.
features = [
    "seedNumber_1",
    "seedEbest_1",
    "seedNumber_3",
    "Pu1_1",
    "Pu2_1",
    "pumin1_4u",
    "pumin5_8u",
    "pumin1_4d",
    "pumin5_8d",
    "E_diff_12",
    "E_3",
    "E_1",
    "E_hybrid_1",
]

# Response column of `df` (assumed to be named 'Y').
target = 'Y'


# Seed shared by every estimator that has a stochastic component, so that
# re-running the script yields the same scores and differences between
# feature subsets are not confounded by seed-to-seed noise.
RANDOM_STATE = 42

# Regressors to compare, keyed by the label written to the "Model" column
# of the results table.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE),
    "SVR": SVR(),
    "GradientBoosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
}

# Scorer name passed to cross_val_score.
scoring = 'r2'

# Accumulates one dict per (feature subset, model) evaluation.
results = []

# A list of n features has 2**n - 1 non-empty subsets, so the count is
# computed directly instead of generating every combination just to count it.
total_combinations = 2 ** len(features) - 1
print(f"Total feature combinations to test: {total_combinations}")

# Wall-clock start; used at the end to report how long the sweep took.
start_time = time.time()

# The response column is the same for every feature subset: select it once.
y = df[target]

# Exhaustive search: every non-empty subset of `features` (smallest subsets
# first), each one scored with every estimator in `models`.
for i in range(1, len(features) + 1):
    for feature_subset in combinations(features, i):
        # Column selection needs a list; a tuple would be looked up as a
        # single column key.
        feature_list = list(feature_subset)
        X = df[feature_list]
        # Human-readable subset label, shared by all models for this subset.
        feature_label = ', '.join(feature_list)

        for model_name, model in models.items():
            # 5-fold cross-validation; one score per fold.
            scores = cross_val_score(model, X, y, cv=5, scoring=scoring)
            mean_score = np.mean(scores)
            std_score = np.std(scores)

            results.append({
                "Features": feature_label,
                "Model": model_name,
                "Mean_R2": mean_score,
                "Std_R2": std_score,
            })

# Persist one row per (feature subset, model) pair.
results_df = pd.DataFrame(results)
output_path = 'model_comparison_results_cv.csv'
results_df.to_csv(output_path, index=False)

# This is the measured elapsed time (sweep plus CSV write), not an estimate.
total_time = time.time() - start_time
print(f"Total runtime: {total_time:.2f} seconds")
print(f"Results saved to {output_path}")
