In [1]:
# Partial least squares methods

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FastICA
from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import shapiro, anderson, norm

from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import umap as U 


In [2]:
df = pd.read_excel(r'C:\Users\uqkmuroi\Desktop\Van3_results.xlsx', header = 0)

# as there are many missing columns, and some columns with missing values, we remake columns by specifying their excel equivalent
#I chose to remove any columns with no values, and columns with many missing values
columns_prot = df.iloc[:,
    list(range(183,202)) + # GB to GT
    [206] + # GY
    list(range(209, 214)) + # HB to HF
    list(range(216, 225)) + # HI to HQ
    [226] + # HS
    list(range(228, 237))] # HU to IC

# we also want the UQ strain column as it helps group the rows by experiments (since we're tryin to extract one value from each experiment)
column_strain = df['UQ Strain']

# we combine the two column sets from above
grouped = pd.concat([column_strain, columns_prot], axis=1)

# group by uq strain
grouped = grouped.groupby('UQ Strain')

# this extacts the max value (which is equivalent to the only value per experiment)
group = grouped.max()

# remove the last row as it is Nan 
# group = group.iloc[:-1]

# actually since the last row is the reference, I will change them to 1.0 
group.iloc[-1] = 1.0
print (group)
print(group.shape)

               SAH1      MET6      SAM2      SAM1       ADOI       SER3  \
UQ Strain                                                                 
VAN_011    0.476754  0.726056  8.390562  3.466260   0.826203   0.332212   
VAN_022    0.501503  0.658658  0.833097  1.069050   0.899388   2.937712   
VAN_024    0.489304  0.776491  0.848666  1.006409   1.223536   0.015000   
VAN_034    0.518499  0.560981  0.806098  0.900420   0.705531  38.200194   
VAN_035    0.563034  0.680219  0.961508  0.946449   0.747693  30.411039   
...             ...       ...       ...       ...        ...        ...   
VAN_391    4.271964  3.200832  3.767812  2.142580   1.235859   1.388874   
VAN_392    0.715643  1.176335  1.154362  1.119098   1.147058  55.464613   
VAN_393    0.731537  1.676646  1.641193  1.302600  22.974646   2.837416   
VAN_396    0.879216  0.952300  8.880651  4.468680   0.990007   1.930996   
VAN_REF7   1.000000  1.000000  1.000000  1.000000   1.000000   1.000000   

                SER1    

In [3]:
nan_count = group.isna().sum()

print (nan_count)

SAH1     0
MET6     0
SAM2     0
SAM1     0
ADOI     0
SER3     1
SER1     0
SER2     1
SHM2     0
GCV1     0
GCV2     0
GCV3     0
LPD1     0
MET13    0
FDH1     2
ADE3     0
MIS1     0
SNZ1     0
MET7     0
CBF1     1
CYS4     0
CYS3     0
HOM3     0
HOM2     0
HOM6     0
MET3     0
MET14    0
MET16    2
MET10    0
MET5     0
MET2     0
MET17    0
CHO2     2
OPI3     0
GLY1     0
FUM1     0
ICL1     0
MAE1     0
ALD6     0
ZWF1     0
PYC1     0
AGX1     0
SOL3     0
PCK1     3
dtype: int64


In [4]:
# calculate column-wise means
column_means = group.mean()

# replace NaN values
group_filled = group.fillna(column_means)

print(group_filled)
print(group_filled.shape)

               SAH1      MET6      SAM2      SAM1       ADOI       SER3  \
UQ Strain                                                                 
VAN_011    0.476754  0.726056  8.390562  3.466260   0.826203   0.332212   
VAN_022    0.501503  0.658658  0.833097  1.069050   0.899388   2.937712   
VAN_024    0.489304  0.776491  0.848666  1.006409   1.223536   0.015000   
VAN_034    0.518499  0.560981  0.806098  0.900420   0.705531  38.200194   
VAN_035    0.563034  0.680219  0.961508  0.946449   0.747693  30.411039   
...             ...       ...       ...       ...        ...        ...   
VAN_391    4.271964  3.200832  3.767812  2.142580   1.235859   1.388874   
VAN_392    0.715643  1.176335  1.154362  1.119098   1.147058  55.464613   
VAN_393    0.731537  1.676646  1.641193  1.302600  22.974646   2.837416   
VAN_396    0.879216  0.952300  8.880651  4.468680   0.990007   1.930996   
VAN_REF7   1.000000  1.000000  1.000000  1.000000   1.000000   1.000000   

                SER1    

In [5]:
#OPTIONAL use this block to log tranform data without affecting how the rest of the codw works

group_filled= np.log10(group_filled)

print(group_filled.shape)

(64, 44)


In [6]:
X_scaler= StandardScaler()
X_scaled = X_scaler.fit_transform(group_filled)

In [7]:
grouped = df.groupby('UQ Strain')
y = grouped['Vanillate DAD_G 274nm_tot'].max()
print(y)

UQ Strain
VAN_011     4952.447068
VAN_022     1164.833526
VAN_024     2909.079446
VAN_034     3843.687074
VAN_035     4110.693695
               ...     
VAN_391     6482.053633
VAN_392     3669.222194
VAN_393     3693.414898
VAN_396     2958.085578
VAN_REF7    3192.759655
Name: Vanillate DAD_G 274nm_tot, Length: 64, dtype: float64


In [11]:
n_splits = 5 #the number of k-fold splits (usually 5 or 10)
n_components = 2 # latent variables in PLS

#define the kfold
kf = KFold(n_splits = n_splits, shuffle = True, random_state=42)


#You can pick the model here: Since Decision Tree performed quite well, we can try that
model = PLSRegression(n_components = n_components)
#model = GaussianProcessRegressor()

#create list to store results
train_scores = []
test_scores = []
r2_train=[]
r2_test=[]

#perform cross val
for train_index, test_index in kf.split(X_scaled):
    #split the data  into training and testing sets
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]


    # turn off the line above and turn on below to change to the 6th time-point (instaed of all 6 Van tot/ time value)
    #y_train, y_test = y_last_scaled[train_index], y_last_scaled[test_index]

    #train the model
    model.fit(X_train, y_train)

    #make predictions on the test set
    y_pred = model.predict(X_test)

    #evaluate model
    test_score = mean_squared_error(y_test, y_pred)
    r2_test_score = r2_score(y_test, y_pred)

    #store results
    test_scores.append(test_score)
    r2_test.append(r2_test_score)

    #store training scores
    train_score = mean_squared_error(y_train, model.predict(X_train))
    train_scores.append(train_score)
    r2_train_score = r2_score(y_train, model.predict(X_train))
    r2_train.append(r2_train_score)

#print results (average across all folds)
print(f"Average test score (MSE): {np.mean(test_scores)}")
print(f"test scores (MSE): {test_scores}")
print(f"Average training score (MSE): {np.mean(train_scores)}")
print(f"Average r2 tests: {np.mean(r2_test)}")
print(f"test r2 scores {r2_test}")
print(f"Average r2 training score {np.mean(r2_train)}")


#Check that the error above is appropriate.

'''
# After making predictions
y_pred_original = y_scaler.inverse_transform(y_pred)
y_test_original = y_scaler.inverse_transform(y_test)

# Calculate MSE on the original scale
#test_score_original = mean_squared_error(y_test_original, y_pred_original)
'''
'''
test_score_original =y_scaler.inverse_transform(test_scores)
print(f"Test MSE on original scale: {test_score_original}")
'''

Average test score (MSE): 2546592.015716029
test scores (MSE): [2226506.790503396, 3054575.7263669195, 3419710.307354897, 2477346.0929353884, 1554821.1614195427]
Average training score (MSE): 1058668.7549915817
Average r2 tests: -0.16864465096227851
test r2 scores [0.03278834843493594, -0.6432030197060015, -0.2743706878311849, -0.21057717191501002, 0.25213927620586796]
Average r2 training score 0.5382703233605748


  y_train, y_test = y[train_index], y[test_index]
  y_train, y_test = y[train_index], y[test_index]
  y_train, y_test = y[train_index], y[test_index]
  y_train, y_test = y[train_index], y[test_index]
  y_train, y_test = y[train_index], y[test_index]


'\ntest_score_original =y_scaler.inverse_transform(test_scores)\nprint(f"Test MSE on original scale: {test_score_original}")\n'

In [23]:
print(y_train.dtype)
print(X_train.dtype)
print(y_test.dtype)
print(y_pred.dtype)

float64
float64
float64
float64
