In [2]:
import joblib
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
from sklearn.base import clone
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Read the feature table and the target table from their CSV files
features = pd.read_csv('yes.csv')
targets = pd.read_csv('yoo.csv')

# Remove the 'Sample Name:' column from the targets, then replace every
# missing target value with the mean of its own column
targets = targets.drop(columns='Sample Name:')
targets = targets.fillna(targets.mean())

# Preprocess data function
def preprocess_data(features, targets):
    """Smooth and standardize the features, standardize the targets, rank the features.

    Args:
        features: Object whose ``.values`` is the 2-D feature matrix
            (presumably samples x spectral bands -- confirm against the CSV).
        targets: Object whose ``.values`` is the target matrix with the same
            number of rows.

    Returns:
        A 5-tuple ``(X_scaled, y_scaled, indices, X_scaler, y_scaler)``:
        the smoothed and standardized features, the standardized targets,
        the feature column indices ordered from most to least important
        according to a random forest, and the two fitted scalers.
    """
    raw_X, raw_y = features.values, targets.values

    # Savitzky-Golay smoothing (11-point window, quadratic fit) applied
    # along axis 1, i.e. across the columns of each row
    smoothed = savgol_filter(raw_X, window_length=11, polyorder=2, axis=1)

    # Independent scalers for the features and the targets
    # NOTE(review): both scalers (and the forest below) are fitted on every
    # row passed in, not only on a training subset.
    X_scaler = StandardScaler()
    y_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(smoothed)
    y_scaled = y_scaler.fit_transform(raw_y)

    # Rank feature columns by the importance a random forest assigns to them
    # when fitted against all target columns at once
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X_scaled, y_scaled)

    # argsort is ascending, so reverse it to list the most important first
    ascending = np.argsort(forest.feature_importances_)
    indices = ascending[::-1]

    return X_scaled, y_scaled, indices, X_scaler, y_scaler

# Run preprocessing once; every model below shares these arrays, the
# feature ranking, and the fitted scalers
(X_scaled, y_scaled, indices, X_scaler, y_scaler) = preprocess_data(
    features, targets
)

# Function to train and evaluate models for each target component
def train_and_evaluate(X, y, top_indices, model_name, model):
    """Fit ``model`` on a fixed 80/20 split and report its held-out error.

    Args:
        X: 2-D feature array; only the columns in ``top_indices`` are used.
        y: Target values, one entry per row of ``X``.
        top_indices: Column indices of ``X`` to keep.
        model_name: Label used in the printed report line.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        ``(model, mse)`` -- the same estimator object after fitting, and its
        mean squared error on the held-out 20% of the rows.
    """
    # Keep only the chosen columns, then hold out 20% of rows (seeded split)
    chosen_columns = X[:, top_indices]
    X_train, X_test, y_train, y_test = train_test_split(
        chosen_columns, y, test_size=0.2, random_state=42
    )

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Both metrics are computed on the held-out rows only
    test_mse = mean_squared_error(y_test, predictions)
    test_r2 = r2_score(y_test, predictions)
    print(f'{model_name}: MSE={test_mse}, R2={test_r2}')

    return model, test_mse

# Number of top features to select (for demonstration, top 10)
top_k = 10

# Candidate estimators. These instances are used only as unfitted templates:
# each (target, model) fit below works on its own clone.
models = {
    'PLSR': PLSRegression(n_components=10),
    'RF': RandomForestRegressor(n_estimators=100, random_state=42),
    'SVM': SVR(),
    'ANN': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
}

# The feature ranking is shared by all target components, so the selection
# does not change inside the loops.
selected_indices = indices[:top_k]

# Train each model and find the best one for each target component
best_models = []
best_scores = []
best_model_names = []
best_indices_list = []

for i in range(y_scaled.shape[1]):
    best_model = None
    best_score = float('inf')
    best_model_name = ""
    best_indices = None

    for model_name, model in models.items():
        # Fit a fresh clone rather than the shared template. Refitting the
        # same instance for every component would leave all entries of
        # best_models pointing at estimators fitted to the last component.
        trained_model, mse = train_and_evaluate(
            X_scaled, y_scaled[:, i], selected_indices, model_name, clone(model)
        )

        # Lower held-out MSE (in standardized target units) wins
        if mse < best_score:
            best_score = mse
            best_model = trained_model
            best_model_name = model_name
            best_indices = selected_indices

    best_models.append(best_model)
    best_scores.append(best_score)
    best_model_names.append(best_model_name)
    best_indices_list.append(best_indices)

# Save all models and preprocessing details in a single dictionary.
# Position i of 'models', 'model_names' and 'indices' belongs to target
# component i. The models were trained on standardized features/targets,
# so the fitted scalers are stored alongside them.
model_data = {
    'models': best_models,
    'model_names': best_model_names,
    'preprocessing_details': {
        'X_scaler': X_scaler,
        'y_scaler': y_scaler,
        'indices': best_indices_list
    }
}

joblib.dump(model_data, 'model_data.pkl')

for i, model_name in enumerate(best_model_names):
    print(f'Best model for component {i}: {model_name} with MSE={best_scores[i]}')


PLSR: MSE=18.166896686605675, R2=-27.735965560515407
RF: MSE=0.8837170402053014, R2=-0.3978426183985131
SVM: MSE=0.5669678575619073, R2=0.10318371321886699




ANN: MSE=1.8490852727831288, R2=-1.9248391529813937
PLSR: MSE=27.69409895226914, R2=-33.95833009394202
RF: MSE=1.1128579848232854, R2=-0.40476340639145136
SVM: MSE=0.8448189942393163, R2=-0.06641712088741802




ANN: MSE=1.701942803746532, R2=-1.148366640739019
PLSR: MSE=946.8075450974433, R2=-1699.1052101498171
RF: MSE=1.6416883018306063, R2=-1.9478460008437848
SVM: MSE=2.1471739580586506, R2=-2.855505432012399




ANN: MSE=2.3895455881695806, R2=-3.2907124318696805
PLSR: MSE=42583.65695576856, R2=-68229.90695980853
RF: MSE=0.9925738481726073, R2=-0.5903804118032678
SVM: MSE=0.6271674251873427, R2=-0.004897307918582783




ANN: MSE=2.5957486024629244, R2=-3.159113942930369
PLSR: MSE=4.032813882548513, R2=-3.3813714289393824
RF: MSE=1.0395665348005851, R2=-0.1294166422523988
SVM: MSE=0.803523142352585, R2=0.1270280650367065




ANN: MSE=2.0039211260026932, R2=-1.1771207457180775
PLSR: MSE=21.525029896271374, R2=-42.56550070547944
RF: MSE=0.7123471707740421, R2=-0.4417522911909437
SVM: MSE=0.3848702876710573, R2=0.2210432752079029




ANN: MSE=1.4754849045007075, R2=-1.9863019451176065
PLSR: MSE=6.4421965346137275, R2=-8.650462773403873
RF: MSE=1.4449010016328427, R2=-1.1644734451349352
SVM: MSE=0.7595970760002673, R2=-0.1378825941339501




ANN: MSE=2.5472988450725724, R2=-2.815874375304433
PLSR: MSE=3.6327383325503355, R2=-2.1119095597028945
RF: MSE=0.8957381302743017, R2=0.23268460444436012
SVM: MSE=1.2322832260828216, R2=-0.05560973581504758




ANN: MSE=2.6558328968060905, R2=-1.2750638840376172
PLSR: MSE=727.5172846603548, R2=-452.21279809634865
RF: MSE=2.344507915269452, R2=-0.46053023734599985
SVM: MSE=2.2407278383118707, R2=-0.39587959597112454
ANN: MSE=2.194040991424435, R2=-0.366795645722398
PLSR: MSE=3802.8020196903103, R2=-3708.6563379916006
RF: MSE=1.3056242478256554, R2=-0.27364433933281473
SVM: MSE=1.4174373917155811, R2=-0.3827187365152924
ANN: MSE=1.2473350097087808, R2=-0.21678283550028676
PLSR: MSE=958.332566812923, R2=-445.57324889917805
RF: MSE=2.6895600313933747, R2=-0.25330767514569885
SVM: MSE=2.622781994922141, R2=-0.22218978795832944




ANN: MSE=3.400193510241781, R2=-0.5844556632405336
PLSR: MSE=20.576601963465038, R2=-99.25901216526286
RF: MSE=0.6537921472128855, R2=-2.1855869573292668
SVM: MSE=0.4228458989779782, R2=-1.0603067603162977




ANN: MSE=4.925912966239672, R2=-23.001395803065023
PLSR: MSE=3.08585311137881, R2=-1.6806929113901057
RF: MSE=1.1701539056676324, R2=-0.016517367139709638
SVM: MSE=1.1325024592416777, R2=0.016190594611763087
ANN: MSE=2.5408863671914057, R2=-1.207278126123878
Best model for component 0: SVM with MSE=0.5669678575619073
Best model for component 1: SVM with MSE=0.8448189942393163
Best model for component 2: RF with MSE=1.6416883018306063
Best model for component 3: SVM with MSE=0.6271674251873427
Best model for component 4: SVM with MSE=0.803523142352585
Best model for component 5: SVM with MSE=0.3848702876710573
Best model for component 6: SVM with MSE=0.7595970760002673
Best model for component 7: RF with MSE=0.8957381302743017
Best model for component 8: ANN with MSE=2.194040991424435
Best model for component 9: ANN with MSE=1.2473350097087808
Best model for component 10: SVM with MSE=2.622781994922141
Best model for component 11: SVM with MSE=0.4228458989779782
Best model for component

