# Exercise 3: Downscaling near-future predictions of climatic variables

#### Load all libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier
from scipy.stats import linregress

#### Data inputs

In [None]:
# Load the station table. A raw string keeps the Windows separators literal;
# the previous spelling ended in "\\\S", which depends on the invalid escape
# sequence "\S". The folder spelling ("statitical") is reproduced from the
# original path on purpose -- it has to match the directory on disk.
train_df = pd.read_excel(
    r"C:\Users\arumu002\OneDrive - Wageningen University & Research"
    r"\Project I\Rutger\statitical downscaling"
    r"\Station_data_NL_1987_2023.xlsx"
)

# Discard every row that has at least one missing value.
df1 = train_df.dropna()

# Rows from these years are kept out of model fitting (df2) and used later as
# a separate evaluation set; all remaining rows (df) feed the train/test split.
values_to_drop = [2021, 2022, 2023]
is_holdout_year = df1['YEAR'].isin(values_to_drop)
df = df1[~is_holdout_year]
df2 = df1[is_holdout_year]

# Target is tmax. tmin and tas are removed from the predictors together with
# it -- presumably because they are directly related temperature variables
# (TODO confirm with the data owner). Every other column stays a predictor.
data_X = df.drop(["tmax", "tmin", "tas"], axis=1)
data_y = df['tmax']

# Fixed seed so the 80/20 split (and every metric derived from it) is
# reproducible between runs; 10 matches the seed the model itself uses.
X_train, X_test, y_train, y_test = train_test_split(
    data_X, data_y, test_size=0.2, random_state=10
)

# Same predictor/target layout for the held-out years.
X_test1 = df2.drop(["tmax", "tmin", "tas"], axis=1)
y_test1 = df2['tmax']

#### Develop and validate the ML model

In [None]:
# Gradient-boosted regression trees predicting tmax from the station
# predictors. All hyperparameters are spelled out explicitly so the
# configuration does not depend on library defaults.
new = GradientBoostingRegressor(
    # --- boosting schedule ---
    loss='squared_error',
    learning_rate=0.2,
    n_estimators=1000,
    subsample=1.0,  # float fraction of samples per stage; 1.0 = use all rows
    alpha=0.9,  # only consulted by the 'huber' and 'quantile' losses
    init=None,
    # --- shape of the individual trees ---
    criterion='friedman_mse',
    max_depth=8,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_samples_leaf=100,
    min_samples_split=1000,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    # --- early stopping (disabled because n_iter_no_change is None) ---
    n_iter_no_change=None,
    validation_fraction=0.1,
    tol=0.0001,
    # --- bookkeeping ---
    random_state=10,
    verbose=0,
    warm_start=False,
)

new.fit(X_train, y_train)

# Evaluate on the held-out rows (X_test1 / y_test1), which the model never
# saw during fitting.
y_pred = new.predict(X_test1)
mse = mean_squared_error(y_test1, y_pred)
print(f'The mean squared error (MSE) on test set: {mse:.4f}')

# 3-fold cross-validation. This runs on the TRAINING data only (the
# estimator is cloned and refitted per fold), so the message says so; the
# earlier label claimed "test set", which was incorrect.
r2_scores = cross_val_score(new, X_train, y_train, cv=3, scoring='r2')
k_fold = r2_scores.mean()
print(f'The k-fold validation (R2) on training set: {k_fold:.4f}')

# Squared Pearson correlation between observed and predicted values on the
# held-out rows. Note this is the r^2 of a fitted regression line, not the
# coefficient of determination that scoring='r2' reports above.
slope, intercept, r_value, p_value, std_err = linregress(y_test1, y_pred)
r_squared = r_value**2
print(f'The  validation (R2) on test set: {r_squared:.4f}')

#### Plot the prediction

In [None]:
# Two-panel summary figure: observed-vs-predicted scatter on the left,
# the most important predictors of the fitted model on the right.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# ---- Left panel: observed vs predicted on the held-out rows ----
axs[0].scatter(y_test1, y_pred)

# 1:1 reference line spanning the observed range. Series.min()/max() are
# evaluated once each instead of calling the Python builtins twice apiece.
lo, hi = y_test1.min(), y_test1.max()
axs[0].plot([lo, hi], [lo, hi], linestyle='--', color='gray', label='1:1 line')

# Squared Pearson correlation of observed vs predicted, recomputed here so
# the annotation always reflects the current y_pred.
slope, intercept, r_value, p_value, std_err = linregress(y_test1, y_pred)
r_squared = r_value**2

axs[0].set_xlabel('Observed')
axs[0].set_ylabel('Predicted')
axs[0].set_title('Observed Vs Predicted')
axs[0].annotate(f'R2 = {r_squared:.2f}', xy=(0.1, 0.85), xycoords='axes fraction', fontsize=18)
axs[0].annotate(f'MSE = {mse:.2f}', xy=(0.1, 0.75), xycoords='axes fraction', fontsize=18)
# The reference line carries a label, so actually draw the legend; it is
# placed bottom-right to stay clear of the annotations in the top-left.
axs[0].legend(loc='lower right')

# ---- Right panel: largest feature importances ----
feature_importance = new.feature_importances_
sorted_idx = np.argsort(feature_importance)  # ascending importance
pos = np.arange(sorted_idx.shape[0]) + 0.5   # bar centres, one per feature
top_n = 10
# Tail of the ascending order = the top_n most important features (or all
# of them when the model has fewer than top_n predictors).
top_feature_indices = sorted_idx[-top_n:]
top_feature_importance = feature_importance[top_feature_indices]
top_feature_names = np.array(X_train.columns)[top_feature_indices]

axs[1].barh(pos[-top_n:], top_feature_importance, align="center")
axs[1].set_yticks(pos[-top_n:], top_feature_names)
# Title reports the number of bars actually drawn rather than a fixed "10".
axs[1].set_title(f"Top {len(top_feature_indices)} Feature Importance")
axs[1].set_xlabel("Importance")

plt.show()