In [76]:
from functions import *

In [77]:
import matplotlib.pyplot as plt
import glob
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import geopandas as gpd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

## Load data

In [26]:
# Folder locations of the preprocessed inputs, all derived from one common root.
preprocessed_dir = '../data/preprocessed'

lst = f'{preprocessed_dir}/lst'
wt = f'{preprocessed_dir}/wt'
wt_interpolated = f'{preprocessed_dir}/wt_interpolated'
masked = f'{lst}/masked'  # sub-folder of the lst directory
ndvi = f'{preprocessed_dir}/ndvi'

In [29]:
# Read the input and target tables; the first CSV column becomes the row index.
lst_csv_path = f'{lst}/leboiron_lst.csv'
wt_csv_path = f'{wt}/leboiron_wt.csv'

lst_df = pd.read_csv(lst_csv_path, index_col=0)
targets_df = pd.read_csv(wt_csv_path, index_col=0)

# Alternative target table (interpolated), currently disabled:
#targets_df = pd.read_csv(f'{wt_interpolated}/leboiron_wt_interpolated.csv',index_col=0)

In [30]:
# Toggle: when True, the NDVI table is appended column-wise to the LST table.
use_ndvi = True

if not use_ndvi:
    X = lst_df
else:
    ndvi_df = pd.read_csv(f'{ndvi}/leboiron_ndvi.csv', index_col=0)
    df_combined = pd.concat([lst_df, ndvi_df], axis=1)
    X = df_combined

# split_data_df is presumably provided by `from functions import *`; it returns
# the six train / validation / test pieces unpacked below.
(train_input, train_target,
 validation_input, validation_target,
 test_input, test_target) = split_data_df(X, targets_df)

y = targets_df

## Linear Regression model

In [31]:
# Initialize the model
# Linear regression with scikit-learn's default settings.
model = LinearRegression()

# Fit on the training split. As the last expression of the cell, the fitted
# estimator is also what the notebook displays.
model.fit(X=train_input, y=train_target)

### Validate and test

In [32]:
# Predict the targets of the validation split with the fitted model.
validation_prediction = model.predict(X=validation_input)

In [33]:
# Score the validation predictions and print every metric that evaluate_model returns.
results = evaluate_model(validation_target, validation_prediction)
for metric_name in results:
    print(metric_name, ':', results[metric_name])

MAE : 1.9649642118824169
MSE : 11.725541000996524
RMSE : 3.1965031882024233
R² : -26.507661909122927
MAPE (%) : 41.39929836572837
MSE sample-wise : 10.217632632188257


In [34]:
# Predict the targets of the held-out test split with the fitted model.
test_prediction = model.predict(X=test_input)

In [35]:
# Score the test predictions; `results` is reused by later cells for its metric names.
results = evaluate_model(test_target, test_prediction)
for metric_name, metric_value in results.items():
    print(metric_name, ':', metric_value)

MAE : 36.552524163004925
MSE : 0.9262366198125097
RMSE : 93.25547433650509
R² : -4138.199884861886
MAPE (%) : 1260.3321181535175
MSE sample-wise : 8696.583493726559


### Image-wise error

In [37]:
# Image-wise evaluation: score each test sample on its own, then average every
# metric over the samples. (The unused counter `c` was removed.)
per_image_metrics = {metric: [] for metric in results.keys()}

# assumes test_target[i] / test_prediction[i] select the i-th sample
# (array-like positional indexing) — TODO confirm against split_data_df
for i in range(test_target.shape[0]):
    # evaluate_model returns all metrics for this sample, not only the MSE
    res = evaluate_model(test_target[i], test_prediction[i])

    for metric, value in res.items():
        per_image_metrics[metric].append(value)

# Keep the raw per-sample lists separate from the averaged result, so that
# `mean_results` only ever holds one kind of value (a scalar per metric).
mean_results = {metric: np.mean(values) for metric, values in per_image_metrics.items()}

print(mean_results)

{'MAE': np.float64(36.552524163004925), 'MSE': np.float64(26636.296948254734), 'RMSE': np.float64(54.22580976001678), 'R²': np.float64(-890.7988755055293), 'MAPE (%)': np.float64(1260.3321181535177), 'MSE sample-wise': np.float64(8696.583493726559)}


### K-fold cross-validation

In [234]:
# Standardize every feature column: fit the scaler on X, then transform that same X.
# NOTE(review): the statistics are computed from every row of X, so any hold-out
# split taken from the scaled data afterwards has already influenced the scaling.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [235]:
# Put the scaled values back into a DataFrame using X's column labels.
# No index is passed, so the rows get a default positional index.
df_scaled = pd.DataFrame(data=X_scaled, columns=X.columns)

In [237]:
# Fresh, unfitted regressor for the cross-validation cell below.
model = LinearRegression()

# Five shuffled folds; the fixed random_state keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# K-fold cross-validation with image-wise metrics.
#
# Fix: the original loop drew its folds from `df_scaled`, which had been
# standardized with statistics computed over ALL rows, so every test fold leaked
# into the preprocessing. Here the scaler is fitted on the training rows of each
# fold only and then applied to that fold's test rows.
#
# Fix: the per-image metrics no longer overwrite `results`, the dict this cell
# (and later cells) read the metric names from.
mean_results_kfold = {k: [] for k in results.keys()}  # one list of fold means per metric

for train_index, test_index in kf.split(X):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fold-local standardization: statistics come from the training rows only.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_index])
    X_test = fold_scaler.transform(X.iloc[test_index])

    # Re-fit the model on this fold and predict its held-out rows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Per-image metrics of this fold (one entry per held-out row).
    mean_results = {k: [] for k in mean_results_kfold}
    for i in range(y_test.shape[0]):
        image_results = evaluate_model(y_test.iloc[i], y_pred[i])
        for k, v in image_results.items():
            mean_results[k].append(v)

    # Average each metric over the fold's images and record the fold mean.
    for key in mean_results:
        mean_results_kfold[key].append(np.mean(mean_results[key]))

In [239]:
# Print the list of fold means for each metric (the label text is emitted as-is).
for key, fold_means in mean_results_kfold.items():
    print('Mitja de', key,'amb ', fold_means)

# Collapse the fold means into a single average per metric.
final_mean_results = {}
for key, fold_means in mean_results_kfold.items():
    final_mean_results[key] = np.mean(fold_means)


Mitja de MAE amb  [0.8612343182104351, 0.7398277904991541, 0.9051862189083529, 0.895350218677993, 1.8012057555778933]
Mitja de MSE amb  [8.398667425901058, 4.568961193558539, 6.646677108418028, 7.580710163148032, 26.48985264696239]
Mitja de RMSE amb  [1.3085373208367461, 1.1102420370148605, 1.3726944183114747, 1.340271140161635, 2.6904679049631053]
Mitja de R² amb  [-0.6719216788108401, 0.886939443652775, 0.8588310660372435, 0.812971846690092, -0.5566526233571706]
Mitja de MAPE (%) amb  [76.27709869209333, 21.440777614430434, 25.226074670516628, 27.35165484606005, 79.85624985387128]
Mitja de MSE sample-wise amb  [2.4003138959702404, 1.45158427078861, 2.312551932808536, 2.335678798308707, 8.10536525947038]


In [240]:
# Display the per-metric averages over the cross-validation folds.
final_mean_results

{'MAE': 1.0405608603747658,
 'MSE': 10.73697370759761,
 'RMSE': 1.5644425642575643,
 'R²': 0.26603361084241994,
 'MAPE (%)': 46.03037113539434,
 'MSE sample-wise': 3.321098831469295}

### Save model results

In [57]:
# Name under which this run is passed to save_excel for both workbooks.
model_name = "LR_7"

# Description of the run; batch size and epochs are left empty here.
details = {
    'Input': 'RGB filtered lst+ndvi scaled',
    'Output': 'Water stations',
    'Resolution': 256,
    'Batch size': '',
    'Epochs': '',
}

# Hand the cross-validated metrics to save_excel for the results workbook ...
file_path = "../results/model_results_img_wise.xlsx"
save_excel(file_path, model_name, final_mean_results, excel = 'Results')

# ... and the run description for the details workbook.
file_path = "../results/model_details.xlsx"
save_excel(file_path, model_name, details, excel = 'Details')

In [112]:
# Reload the image-wise results workbook and display it.
results_xlsx = "../results/model_results_img_wise.xlsx"
df = pd.read_excel(results_xlsx)
df

Unnamed: 0,Model Name,MAE,MSE,RMSE,R²,MAPE (%),MSE sample-wise
0,LR_1,1.859391,28.733955,2.802144,0.61733,43.532711,8.83452
1,LR_2,1.136107,10.20849,1.708394,0.518126,46.942153,3.052657
2,LR_3,3.693705,537.045698,5.519793,-23.941264,147.748381,177.914441
3,Physics-based LR,2.598333,104.117442,3.922001,-7.539237,126.056267,31.426612
4,LR_4,4.335644,902.522433,6.463918,-33.118811,177.342522,289.22575
5,LR_5,1.040561,10.736974,1.564443,0.266034,46.030371,3.321099
6,Physics-based LR_2,48.705002,264911.944944,72.290527,-10707.338357,2222.274927,90773.254598
7,LR_6,36.552524,26636.296948,54.22581,-890.798876,1260.332118,8696.583494
8,LR_7,1.040561,10.736974,1.564443,0.266034,46.030371,3.321099


In [111]:
# Reload the model-details workbook and display it.
details_xlsx = "../results/model_details.xlsx"
df_d = pd.read_excel(details_xlsx)
df_d

Unnamed: 0,Model Name,Input,Output,Resolution,Batch size,Epochs
0,LR_1,RGB filtered lst,Water stations,256,,
1,LR_2,RGB filtered lst+ndvi,Water stations,256,,
2,LR_3,RGB filtered lst+ndvi,Water stations,256,,
3,Physics-based LR,RGB filtered lst scaled,Water stations,256,,
4,LR_4,RGB filtered lst scaled,Water stations,256,,
5,LR_5,RGB filtered lst+ndvi scaled,Water stations,256,,
6,Physics-based LR_2,RGB filtered lst,Water stations,256,,
7,LR_6,RGB filtered lst+ndvi,Water stations,256,,
8,LR_7,RGB filtered lst+ndvi scaled,Water stations,256,,


## Physics regression

In [None]:
# Physics section uses the LST table only (NDVI switched off).
# NOTE(review): X and the six splits assigned here are reassigned by later cells
# of this section; only `y` is carried through unchanged.
use_ndvi = False

if not use_ndvi:
    X = lst_df
else:
    ndvi_df = pd.read_csv(f'{ndvi}/leboiron_ndvi.csv', index_col=0)
    df_combined = pd.concat([lst_df, ndvi_df], axis=1)
    X = df_combined

(train_input, train_target,
 validation_input, validation_target,
 test_input, test_target) = split_data_df(X, targets_df)

y = targets_df

In [243]:
# Element-wise square of every LST column; each column name gets a '^2' suffix
# so the squared features stay distinguishable from the originals.
T_a_squared_df = lst_df.pow(2)
T_a_squared_df.columns = [name + '^2' for name in lst_df.columns]

In [244]:
# Feature matrix of the physics model: original LST columns followed by their squares.
X = pd.concat([lst_df, T_a_squared_df], axis='columns')

### Scale data

In [210]:
# Standardize the linear and squared features: fit on X, then transform that same X.
# NOTE(review): the scaler sees every row of X, i.e. it is fitted before the
# train / validation / test split made further down.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [213]:
# Put the scaled values back into a DataFrame labelled with X's own columns.
# X_scaled was produced from X (LST columns plus their squares), so the labels
# must come from X. `df_combined` is the LST+NDVI frame from the earlier
# section: it carries the wrong names, may have a different width, and does not
# exist on a fresh kernel when use_ndvi is False.
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)

### Split data and train test model

In [214]:
# Split the scaled physics features and the targets into train / validation / test parts.
(train_input, train_target,
 validation_input, validation_target,
 test_input, test_target) = split_data_df(df_scaled, targets_df)

In [215]:
# Initialize the model
# Linear regression (default settings) on the scaled LST and LST^2 features.
model = LinearRegression()

# Fit on the training split; the fitted estimator is the cell's displayed value.
model.fit(X=train_input, y=train_target)

In [216]:
# Predict the validation targets with the physics-feature model.
validation_prediction = model.predict(X=validation_input)

In [217]:
# Score the validation predictions and print each metric returned by evaluate_model.
results = evaluate_model(validation_target, validation_prediction)
for metric_name in results:
    print(metric_name, ':', results[metric_name])

MAE : 1.675405803812477
MSE : 8.508514486988604
RMSE : 2.564793738676436
R² : -0.3608143324054614
MAPE (%) : 46.51545151564784
MSE sample-wise : 6.5781669219538506


### K-fold cross-validation

In [245]:
# Fresh, unfitted regressor for the cross-validation cell below.
model = LinearRegression()

# Five shuffled folds; the fixed random_state keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# K-fold cross-validation of the physics-feature model with image-wise metrics.
#
# Fix: the original loop drew its folds from `df_scaled`, which had been
# standardized with statistics computed over ALL rows, so every test fold leaked
# into the preprocessing. Here the scaler is fitted on the training rows of each
# fold only and then applied to that fold's test rows.
#
# Fix: the per-image metrics no longer overwrite `results`, the dict this cell
# reads the metric names from.
mean_results_kfold = {k: [] for k in results.keys()}  # one list of fold means per metric

for train_index, test_index in kf.split(X):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fold-local standardization: statistics come from the training rows only.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_index])
    X_test = fold_scaler.transform(X.iloc[test_index])

    # Re-fit the model on this fold and predict its held-out rows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Per-image metrics of this fold (one entry per held-out row).
    mean_results = {k: [] for k in mean_results_kfold}
    for i in range(y_test.shape[0]):
        image_results = evaluate_model(y_test.iloc[i], y_pred[i])
        for k, v in image_results.items():
            mean_results[k].append(v)

    # Average each metric over the fold's images and record the fold mean.
    for key in mean_results:
        mean_results_kfold[key].append(np.mean(mean_results[key]))


In [248]:
# Print the list of fold means for each metric (the label text is emitted as-is).
for key in mean_results_kfold:
    print('Mitja de', key,'amb ', mean_results_kfold[key])

# Collapse the fold means into a single average per metric and display it.
final_mean_results = {k: np.mean(v) for k, v in mean_results_kfold.items()}
# Fix: the last line used to read `final_mean_result` (missing "s"), an
# undefined name that raises NameError.
final_mean_results

Mitja de MAE amb  [13.607991196356908, 6.43292693695243, 171.69906894834412, 5.157575596243587, 46.62744908395774]
Mitja de MSE amb  [5349.534443668887, 591.1517189668024, 1248617.9104077455, 771.3704381805035, 69229.75771047996]
Mitja de RMSE amb  [20.18882268127341, 9.566487077821154, 254.64846169320418, 7.864618159817377, 69.184247523581]
Mitja de R² amb  [-2214.794923109918, -7.042911376304664, -43936.684807034544, -32.24217145677094, -7345.9269697434875]
Mitja de MAPE (%) amb  [2106.620990891825, 153.74982728533436, 5856.622648600487, 200.76180297047858, 2793.619366476714]
Mitja de MSE sample-wise amb  [1885.5444965988015, 201.77008036060016, 428572.32633132255, 211.22749598401174, 22995.404587170855]


{'MAE': 48.705002352370954,
 'MSE': 264911.9449438083,
 'RMSE': 72.29052742713944,
 'R²': -10707.338356544204,
 'MAPE (%)': 2222.2749272449682,
 'MSE sample-wise': 90773.25459828737}

### Save model results

In [249]:
# Name under which this run is passed to save_excel for both workbooks.
model_name = "Physics-based LR_3"
details = {'Input': 'RGB filtered lst scaled', 'Output': 'Water stations', 'Resolution': 256, 'Batch size':'', 'Epochs':''}

# Fix: use the same relative ../results/ location as every other cell. The
# original wrote to a hard-coded Google Drive path, while the next cell reads
# ../results/model_results_img_wise.xlsx back, so outside that one Colab mount
# the saved run would not show up in the reloaded table.
file_path = "../results/model_results_img_wise.xlsx"
save_excel(file_path, model_name, final_mean_results, excel = 'Results')

file_path = "../results/model_details.xlsx"
save_excel(file_path, model_name, details, excel = 'Details')


In [None]:
# Reload the image-wise results workbook and display it.
results_xlsx = "../results/model_results_img_wise.xlsx"
df = pd.read_excel(results_xlsx)
df