# Load dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from kennard_stone import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from scipy.signal import savgol_filter
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'pandas'

# Prepare data

In [2]:
# Read the spectra (x_block) and the reference measurements (y_block).
x_block = pd.read_csv('df-scan-all.csv', index_col=0, decimal='.')
y_block = pd.read_csv('df-CHN.csv', sep=';', decimal=',', index_col=1)

# Keep only samples with C below 50; this drops the two samples above that
# threshold: 32-E4-Qu-Mar (50.68) and 2-E1-Fa-Feb (63.61).
keep_mask = y_block["C"] < 50
y = y_block["C"].loc[keep_mask]
# Convert reflectance to absorbance, A = log10(1 / R), for the retained samples.
x = np.log10(1 / x_block.loc[y.index])

# Kennard-Stone train/test split (30 % of the samples go to the test set).
xx_train, xx_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Savitzky-Golay first derivative (window_length=7, polyorder=1, deriv=1),
# applied with identical settings to the calibration and the test spectra.
x_train, x_test = (
    savgol_filter(x=spectra, window_length=7, polyorder=1, deriv=1)
    for spectra in (xx_train, xx_test)
)

NameError: name 'pd' is not defined

In [None]:
# Overlay the preprocessed spectra on one axes:
# calibration set in blue, test set in green.
fig, ax = plt.subplots()
shared_opts = dict(figsize=(17, 3), legend=False, ax=ax)
pd.DataFrame(x_train).T.plot(color='blue', **shared_opts)  # train
pd.DataFrame(x_test).T.plot(color='green', label='test', **shared_opts)  # test
# No horizontal padding: the curves span the full width of the axes.
plt.margins(x=0)

# Build a predictive model and evaluate its performance

In [None]:
# Calibrate a PLS regression with 9 components on the preprocessed training
# spectra (scale=False is passed through to PLSRegression unchanged).
M = PLSRegression(scale=False, n_components=9)
M.fit(x_train, y_train)

# Predict the calibration (yc) and test (yt) sets with the fitted model.
yc, yt = (M.predict(spectra) for spectra in (x_train, x_test))

# compute performance metrics
def metrics(meas, pred):
    """Summarise prediction quality as a one-row table.

    Parameters
    ----------
    meas : array-like
        Reference (measured) values.
    pred : array-like
        Predicted values for the same samples, in the same order. Both inputs
        are flattened to 1-D, so an (n, 1) prediction column is accepted.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'R²', 'RMSE', 'RPD' and 'RPIQ',
        rounded to two decimals.
    """
    # Flatten both vectors so a column-shaped prediction cannot broadcast
    # against a 1-D reference vector.
    meas = np.ravel(meas)
    pred = np.ravel(pred)

    r2 = r2_score(meas, pred)
    rmse = np.sqrt(mean_squared_error(meas, pred))
    iqr = np.percentile(meas, 75) - np.percentile(meas, 25)

    perf = pd.DataFrame(
        {
            'R²': [r2],
            'RMSE': [rmse],
            # RPD = SD / RMSE and RPIQ = IQR / RMSE: the spread of the
            # reference values goes in the numerator, so higher is better.
            'RPD': [np.std(meas, ddof=1) / rmse],
            'RPIQ': [iqr / rmse],
        },
    )
    return perf.round(2)

# Stack the calibration and validation metrics into one table,
# one row per data set.
cal_row = metrics(y_train, yc)
val_row = metrics(y_test, yt)
perf = pd.concat([cal_row, val_row], axis=0).set_axis(['Cal', 'Val'], axis=0)

# Display the model performance table.
perf

# Plot results

In [None]:
# One-row performance tables for the calibration and validation sets.
Cal = metrics(y_train, yc)
Cal.index = ['Cal']
Val = metrics(y_test, yt)
Val.index = ['Val']


def _legend_label(perf_row):
    """Format a one-row metrics table as 'Cal: R²=..., RMSE=..., ...'.

    Passing the DataFrame itself as a label would put its multi-line repr
    in the legend; a single formatted line is used instead.
    """
    set_name = perf_row.index[0]
    stats = ', '.join(f'{col}={val}' for col, val in perf_row.iloc[0].items())
    return f'{set_name}: {stats}'


# Measured vs. predicted, with a regression line per data set.
plt.figure(figsize = (12,4))
sns.regplot(x= y_train, y= yc, label= _legend_label(Cal))
sns.regplot(x= y_test, y= yt, label= _legend_label(Val))
# Draw the 1:1 line over the observed range of the measured values
# instead of a hard-coded [40, 50] interval.
lo = min(np.min(y_train), np.min(y_test))
hi = max(np.max(y_train), np.max(y_test))
plt.plot([lo, hi], [lo, hi], color= 'black', label= '1:1 line')
plt.xlabel('measured')
plt.ylabel('predicted')
plt.legend()
plt.margins(0)


# NOTE(review): residplot regresses y on x and plots the residuals of that
# linear fit, not (predicted - measured); confirm this is the intended "error".
plt.figure(figsize = (12,4))
sns.residplot(x= y_train, y= yc, label= "Cal", lowess=True)
sns.residplot(x= y_test, y= yt, label= 'Val', lowess=True)
plt.xlabel('measured')
plt.ylabel('error')
plt.legend()
plt.margins(0)

# Make predictions for all spectra (including those used for model construction)

In [None]:
# Convert every reflectance spectrum to absorbance, A = log10(1 / R).
predictor = np.log10(1 / x_block)
# Savitzky-Golay first derivative with the same settings as for calibration
# (window_length=7, polyorder=1, deriv=1).
preprocessed = savgol_filter(predictor, 7, 1, deriv=1)

# Predict C for all spectra and store the values, rounded to two decimals,
# under the sample index of x_block.
predicted_c = M.predict(preprocessed)
results = pd.DataFrame(predicted_c, columns=["C"], index=x_block.index).round(2)


# Distribution of the results

In [None]:
# Histogram of the predicted C values for all spectra.
plt.figure(figsize=(12, 4))
sns.histplot(data=results)

In [None]:
# Summary statistics of the predictions, rounded to two decimals and
# transposed so each statistic is a column.
results.describe().round(2).transpose()