In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

# Load dataset
file_path = "pine_soil_slope_ndvi.csv"  # Update with actual file path
df = pd.read_csv(file_path)

# Drop non-relevant columns
drop_columns = ["env", "TestId", "latitude", "longitude", "date_initial", "date_final", "Feature"]
df = df.drop(columns=drop_columns)

# Fill missing values
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())
df["Specie"] = df["Specie"].fillna(df["Specie"].mode()[0])

# Encode the "Specie" column using Label Encoding
label_encoder = LabelEncoder()
df["Specie"] = label_encoder.fit_transform(df["Specie"])

# Compute correlation matrix
corr_matrix = df.corr()

# Select features with a strong correlation to the target variable (threshold = 0.2)
corr_threshold = 0.2
target_corr = corr_matrix["Productivity (y)"].abs().sort_values(ascending=False)
selected_features = target_corr[target_corr > corr_threshold].index.tolist()

# Remove multicollinearity: Drop features highly correlated (> 0.85) with each other
corr_drop_threshold = 0.85
drop_features = set()

for feature in selected_features:
    if feature in drop_features:
        continue
    correlated_features = corr_matrix[feature][corr_matrix[feature].abs() > corr_drop_threshold].index.tolist()
    correlated_features.remove(feature)  # Keep one and drop others
    drop_features.update(correlated_features)

# Final selected features
selected_features = [f for f in selected_features if f not in drop_features]

# Define target and reduced feature set
y = df["Productivity (y)"]
X = df[selected_features].drop(columns=["Productivity (y)"])

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Predict and evaluate
y_pred = mlp.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Selected Features: {selected_features}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


Selected Features: ['Productivity (y)', 'bio_11', 'bio_13', 'bio_7', 'bio_19', 'bio_18', 'Specie', 'wv0033', 'bio_17', 'bio_8', 'wv0010']
Root Mean Squared Error (RMSE): 1.1923
R² Score: 0.8821




In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

# Load dataset
file_path = "pine_soil_slope_ndvi.csv"  # Update with actual file path
df = pd.read_csv(file_path)

# Drop non-relevant columns
drop_columns = ["env", "TestId", "latitude", "longitude", "date_initial", "date_final", "Feature"]
df = df.drop(columns=drop_columns)

# Separate numeric and categorical columns
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
cat_cols = ["Specie"]

# Fill missing values
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())  # Fill numeric missing values with mean
df["Specie"] = df["Specie"].fillna(df["Specie"].mode()[0])  # Fill missing species with most frequent value

# Encode the "Specie" column using Label Encoding
label_encoder = LabelEncoder()
df["Specie"] = label_encoder.fit_transform(df["Specie"])

# Define target and features
y = df["Productivity (y)"]
X = df.drop(columns=["Productivity (y)"])

# Standardize features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA to retain 95% variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

mlp = MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Predict and evaluate
y_pred = mlp.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


Root Mean Squared Error (RMSE): 1.1891
R² Score: 0.8828
