# Feature Engineering

In [1]:
import os
from pathlib import Path

import pandas as pd

# Folder that holds the cleaned datasets. The default is the original local
# location; set the TFM_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get("TFM_DATA_DIR", r"C:\Users\K6502\OneDrive\Desktop\TFM\datos\clean"))

# Encoded dataset used by every cell below (features plus the target columns).
df = pd.read_csv(DATA_DIR / "dataset_encoded.csv")

**Univariate filter**

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression


# Columns used as regression targets; everything else is a candidate feature.
target_columns = ['synergy_zip', 'synergy_loewe', 'synergy_hsa', 'synergy_bliss']

# Predictors: every column of df that is not one of the targets.
X = df.drop(target_columns, axis=1)

# Maps each target name to a table of per-feature scores, best first.
univariate_filter_results = {}

# Score all features against one target at a time. k='all' keeps every
# feature, so the selector is used here only to obtain the scores.
for target in target_columns:
    y = df[target]
    selector = SelectKBest(f_regression, k='all').fit(X, y)

    scores_df = pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
    scores_df = scores_df.sort_values('Score', ascending=False, ignore_index=True)

    univariate_filter_results[target] = scores_df

# Show the 10 highest-scoring features for each target.
for target, ranking in univariate_filter_results.items():
    print(f"\nTop features for {target}:")
    print(ranking.head(10))


**Multivariate filter**

In [None]:
# Feature columns are all columns of df whose name is not listed as a target;
# the original column order is kept.
is_feature = ~df.columns.isin(target_columns)
feature_columns = df.columns[is_feature].tolist()

Option 2: Mutual-information pre-filter followed by Sequential Feature Selector (sklearn)

In [14]:
from functools import partial

from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector

# Re-declared here so this cell can run without the earlier filter cells.
target_columns = ['synergy_zip', 'synergy_loewe', 'synergy_hsa', 'synergy_bliss']
feature_columns = [col for col in df.columns if col not in target_columns]

# Tunable settings for the two-step selection.
RANDOM_STATE = 42           # shared seed so repeated runs select the same features
N_PREFILTERED_FEATURES = 30  # features kept by the mutual-information filter
N_FINAL_FEATURES = 20        # features kept by the sequential (wrapper) search

X = df[feature_columns]

# Maps each target name to the list of feature names finally selected for it.
results = {}

for target in target_columns:
    y = df[target]

    # Step 1: univariate mutual-information filter.
    # mutual_info_regression is randomised; without a fixed random_state the
    # pre-filtered set (and everything downstream) can change between runs.
    mi_score = partial(mutual_info_regression, random_state=RANDOM_STATE)
    selector = SelectKBest(mi_score, k=N_PREFILTERED_FEATURES)
    selector.fit(X, y)
    X_selected = X.loc[:, selector.get_support()]

    # Step 2: multivariate, model-based forward selection.
    # The search refits the forest for every candidate feature at every step
    # (with cross-validation), which is very slow single-threaded; n_jobs=-1
    # trains the trees on all available cores. Parallelism is set on the
    # forest only, to avoid nesting it inside a parallel selector.
    model = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)
    sfs = SequentialFeatureSelector(
        model,
        n_features_to_select=N_FINAL_FEATURES,
        direction='forward',
    )
    sfs.fit(X_selected, y)

    selected_features = X_selected.columns[sfs.get_support()].tolist()
    results[target] = selected_features

# Report the final feature list for each target.
for target, features in results.items():
    print(f"\nSelected features for {target}:")
    print(features)

KeyboardInterrupt: 