## First Notebook

### Importation of the libraries

In [None]:
import time
import os
from pathlib import Path
import numpy as np
import pinard as pn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython.display import display

from pinard import utils
from pinard import preprocessing as pp
from pinard.model_selection import train_test_split_idx

from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from scipy.stats import ks_2samp
from scipy.spatial.distance import pdist, squareform



# Keep FutureWarning messages out of the cell outputs: register an 'ignore'
# filter for that warning category.
from warnings import simplefilter
simplefilter('ignore', FutureWarning)

### Importation of the datasets

In [21]:
# Kind of task the dataset belongs to: 'Classification' or 'Regression'.
mode = 'Regression'

# Name of the dataset folder to load.
#   Regression datasets:     'BeerOriginalExtract', 'Digest_0.8', 'YamProtein'
#   Classification datasets: 'CoffeeSpecies', 'Malaria2024', 'mDigest_custom3',
#                            'WhiskyConcentration', 'YamMould'
data_source = 'BeerOriginalExtract'

# Build the absolute folder path Data/<mode>/<data_source> and turn any
# backslash into a forward slash before it is spliced into the file names.
file_name = Path("Data/%s/%s" % (mode, data_source))
full_path = str(file_name.resolve()).replace("\\", "/")

# Read the four ';'-separated CSV files of the dataset, in this order
# (presumably calibration/validation spectra X and targets Y -- TODO confirm).
Xcal, Xval, Ycal, Yval = (
    pd.read_csv(csv_file, sep=';')
    for csv_file in (
        f'{full_path}/Xcal.csv',
        f'{full_path}/Xval.csv',
        f'{full_path}/Ycal.csv',
        f'{full_path}/Yval.csv',
    )
)

# Preview the first five rows of Ycal.
display(Ycal.head(5))

Unnamed: 0,V1
0,4.23
1,6.02
2,6.49
3,8.92
4,8.979999


### Function to plot appropriate spectra

In [22]:
def plot_spectra(X, names, title='Visualization of the spectra'):
    """
    Plot one or several sets of spectra on a single figure, one colour per set.

    Parameters:
        X (DataFrame, ndarray or sequence of those): The dataset(s) to draw.
            A single DataFrame/ndarray is treated as a one-element sequence.
            Each dataset must be 2D; it is plotted transposed, so every row
            of a dataset is drawn as one curve.
        names (sequence of str): Legend label of each dataset, in the same
            order as the datasets in X.
        title (str): Title of the figure.

    Raises:
        ValueError: If X contains no dataset.
    """
    # Normalise the input so the loop below always iterates over datasets.
    if isinstance(X, pd.DataFrame):
        X = [X.values]
    elif isinstance(X, np.ndarray):
        X = [X]
    else:
        X = list(X)

    # Without this guard the tick computation below would fail on an
    # unbound `dataset` with a confusing NameError.
    if not X:
        raise ValueError("plot_spectra requires at least one dataset.")

    plt.figure(figsize=(10, 5))
    colors = sns.color_palette(palette="Paired")
    for i, dataset in enumerate(X):
        # Cycle through the palette so extra datasets do not overflow it.
        color = colors[i % len(colors)]
        plt.plot(dataset.T, color=color)
        # Empty proxy line: gives one legend entry per dataset instead of
        # one per individual curve.
        plt.plot([], [], color=color, label=names[i])

    plt.legend(loc='upper left', fontsize='small')
    plt.title(title)
    plt.xlabel('Wavelength')
    plt.ylabel('Absorbance')

    # About ten ticks along the x axis, based on the last dataset drawn.
    # The step is clamped to 1: with fewer than 10 columns the integer
    # division yields 0, which np.arange cannot use as a step.
    n_points = dataset.shape[1]
    tick_step = max(1, n_points // 10)
    plt.xticks(np.arange(0, n_points, tick_step), rotation=45)
    plt.show(block=False)

### Test of pipelines with Pinard

In [None]:
# Three-step pipeline: min-max scaling, then Savitzky-Golay preprocessing,
# then a PLS regression with 10 components (scale=False).
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('preprocessing', pp.SavitzkyGolay()),
    ('pls', PLSRegression(n_components=10, scale=False)),
])

# Fit and apply only the step at index 1 (the Savitzky-Golay transformer) to
# Xcal; the scaler at index 0 is NOT applied here.
Xcal_transformed = pipeline[1].fit_transform(Xcal)



### Visualize the spectra after preprocessing

In [None]:
# Overlay the raw spectra and their preprocessed version on one figure.
plot_spectra(
    X=[Xcal, Xcal_transformed],
    names=['Before transformation', 'After transformation'],
    title='Visualization of the spectra with or without preprocessing',
)
plt.show()

### Test on diverse preprocessing methods

In [None]:
# Candidate preprocessing transformers, in comparison order:
#   - Savitzky-Golay with its default settings, then with deriv = 1..6
#   - Gaussian with (order, sigma) = (1, 2), (2, 1), (3, 1), (4, 1), (5, 1)
# NOTE(review): if pp.SavitzkyGolay wraps SciPy's savgol_filter, a `deriv`
# larger than the filter's polynomial order yields all-zero output -- check
# the default polyorder against deriv = 4..6. TODO confirm.
preprocessing_methods = [
    pp.SavitzkyGolay(),
    *(pp.SavitzkyGolay(deriv=deriv) for deriv in range(1, 7)),
    *(
        pp.Gaussian(order=order, sigma=sigma)
        for order, sigma in [(1, 2), (2, 1), (3, 1), (4, 1), (5, 1)]
    ),
]

# Fit every transformer on the raw Xcal values and keep its output; the
# results are stored in the same order as `preprocessing_methods`.
transformed_spectra = [
    method.fit_transform(Xcal.values) for method in preprocessing_methods
]

### Dissimilarity with the Kolmogorov-Smirnov test

In [None]:
def ks_tests_apparies(X1, X2):
    """
    Run a two-sample Kolmogorov-Smirnov test on every aligned pair of X1 and X2.

    The k-th element of X1 is compared with the k-th element of X2 using
    scipy.stats.ks_2samp.

    Parameters :
        X1 (sequence of arrays): First collection of samples.
        X2 (sequence of arrays): Second collection of samples; must have the
            same length as X1.

    Returns :
        results (list of dicts): One dictionary per pair, in input order,
            with the keys "paire" (index of the pair), "ks_stat" (KS
            statistic) and "p_value".

    Raises :
        ValueError: If X1 and X2 do not have the same length.
    """
    if len(X1) != len(X2):
        raise ValueError("The two lists must have the same length.")

    def _as_record(position, sample_a, sample_b):
        # One KS test, packaged in the dictionary layout callers expect.
        statistic, p_value = ks_2samp(sample_a, sample_b)
        return {"paire": position, "ks_stat": statistic, "p_value": p_value}

    return [
        _as_record(position, sample_a, sample_b)
        for position, (sample_a, sample_b) in enumerate(zip(X1, X2))
    ]


def dissim_KS(X1, X2):
    """
    Summarise how different two collections of samples are with a single number.

    A Kolmogorov-Smirnov test is run on each aligned pair of X1 and X2 (see
    ks_tests_apparies) and the KS statistics are averaged.

    Parameters:
        X1 (sequence of arrays): First collection of samples.
        X2 (sequence of arrays): Second collection of samples, same length as X1.

    Returns:
        dissimilarity (float): Mean of the per-pair KS statistics.
    """
    per_pair = ks_tests_apparies(X1, X2)
    ks_values = [entry["ks_stat"] for entry in per_pair]
    return np.mean(ks_values)


### Compute the KS dissimilarities for several preprocessing methods

In [None]:
def compare_preprocessings(preprocessing_list, Xcal):
    """
    Build the matrix of pairwise KS dissimilarities between preprocessed datasets.

    Each entry of `preprocessing_list` is either
      * a transformer (any object exposing `fit_transform`), which is fitted
        on and applied to `Xcal`, or
      * an already transformed dataset (array-like), which is used as is.

    Every pair of datasets is then scored with `dissim_KS`. Datasets are
    handed over transposed, so for a 2D (rows x columns) dataset one KS test
    is run per column; 1D vectors are unaffected by the transposition.

    Parameters:
        preprocessing_list (list): Transformers and/or array-likes. All the
            resulting datasets must share the same shape (1D or 2D).
        Xcal (array-like or DataFrame): Raw data given to the transformers.
            Only read when at least one entry is a transformer; may be None
            otherwise.

    Returns:
        distance_matrix (ndarray): Symmetric (n, n) matrix with a zero
            diagonal, where entry (i, j) is the dissimilarity between the
            i-th and j-th datasets. An empty list gives a (0, 0) matrix.

    Raises:
        ValueError: If a transformer is given without Xcal, if the datasets
            do not all have the same shape, or if they are not 1D or 2D.
    """
    datasets = []
    for entry in preprocessing_list:
        if hasattr(entry, "fit_transform"):
            if Xcal is None:
                raise ValueError(
                    "Xcal is required when preprocessing_list contains transformers."
                )
            entry = entry.fit_transform(np.asarray(Xcal))
        datasets.append(np.asarray(entry, dtype=float))

    if len({data.shape for data in datasets}) > 1:
        raise ValueError("All preprocessed datasets must have the same shape.")
    if datasets and datasets[0].ndim not in (1, 2):
        raise ValueError("Each preprocessed dataset must be 1D or 2D.")

    # scipy's pdist only accepts a 2D matrix of vectors, so a stack of 2D
    # datasets cannot go through it; the pairs are evaluated explicitly.
    n_items = len(datasets)
    distance_matrix = np.zeros((n_items, n_items))
    for i in range(n_items):
        for j in range(i + 1, n_items):
            distance = dissim_KS(datasets[i].T, datasets[j].T)
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance

    return distance_matrix

