In [1]:
import pandas as pd

In [2]:
# Read the pre-split training and test tables (paths are relative to this notebook).
train_csv_path = '../../data/train_data.csv'
test_csv_path = '../../data/test_data.csv'

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [3]:
# Row counts of the two splits, shown as (train, test).
tuple(map(len, (train_data, test_data)))

(124013, 53149)

In [4]:
# Leakage check: count rows whose values appear in both splits.
#
# merge() without `on=` joins on every column the two frames share. That
# includes "Unnamed: 0", which is presumably the row index written by an
# earlier to_csv (TODO confirm). A per-row index value makes every row unique,
# so the check would report 0 no matter how much real duplication exists.
# The column is therefore left out of the comparison.
INDEX_ARTIFACT_COLS = ["Unnamed: 0"]

overlapping_rows = train_data.drop(columns=INDEX_ARTIFACT_COLS, errors="ignore").merge(
    test_data.drop(columns=INDEX_ARTIFACT_COLS, errors="ignore"),
    how='inner',
)
print(f"Number of overlapping rows: {len(overlapping_rows)}")


Number of overlapping rows: 0


In [5]:
# TestId values that occur in both the training and the test split.
# NOTE(review): a non-zero count means the same TestId contributes rows to both
# splits. If rows sharing a TestId are correlated, the test metrics below may
# be optimistic — confirm whether the split should be grouped by TestId.
train_ids = set(train_data["TestId"])
test_ids = set(test_data["TestId"])
common_ids = train_ids & test_ids
print(f"Number of overlapping TestId: {len(common_ids)}")


Number of overlapping TestId: 307


In [6]:
# Columns removed from both splits before modelling. Kept in one list so the
# train and test frames cannot drift apart.
COLUMNS_TO_DROP = [
    "latitude", "longitude", "env", "TestId",
    "date_initial", "date_final", "Feature", "Unnamed: 0",
]

# Rebind instead of mutating in place, and tolerate already-missing columns,
# so re-running this cell is a no-op rather than a KeyError.
# Rows with any remaining missing value are then discarded.
train_data = train_data.drop(columns=COLUMNS_TO_DROP, errors="ignore").dropna()
test_data = test_data.drop(columns=COLUMNS_TO_DROP, errors="ignore").dropna()

# Row counts after cleaning, shown as (train, test).
len(train_data), len(test_data)

(123981, 53130)

In [7]:
# Separate the regression target from the predictor columns for each split.
target_col = "Productivity (y)"

y_train = train_data[target_col]
X_train = train_data.drop(columns=[target_col])

y_test = test_data[target_col]
X_test = test_data.drop(columns=[target_col])

In [8]:
# One-hot encode the "Specie" column.
#
# The training frame defines the feature layout: drop_first=True removes the
# first category so it acts as the implicit baseline.
X_train = pd.get_dummies(X_train, columns=["Specie"], drop_first=True)

# Encode the test frame WITHOUT drop_first, then align it to the training
# columns. Encoding each split independently with drop_first=True can drop a
# different baseline category, or yield a different set or order of dummy
# columns, whenever a species is absent from one split. After the reindex:
#   * the training baseline column is removed from the test frame,
#   * species seen only in training become all-zero columns,
#   * species seen only in test are dropped (those rows fall back to the
#     baseline, all dummies 0).
X_test = pd.get_dummies(X_test, columns=["Specie"]).reindex(
    columns=X_train.columns, fill_value=0
)

In [9]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Fit a 100-tree random forest with a fixed seed on the encoded training
# features (fit() returns the estimator itself, so the chained form is equivalent).
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = rf.predict(X_test)

# Error metrics on the test predictions; RMSE is derived from MSE.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Report the metrics, one per line.
print(
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)


Mean Absolute Error (MAE): 0.8800
Mean Squared Error (MSE): 1.4308
Root Mean Squared Error (RMSE): 1.1961
R² Score: 0.8818


In [10]:
# Rank the training columns from most to least important according to the
# fitted forest. Only the names are printed, in descending order of importance.
feature_importances = rf.feature_importances_
sorted_idx = np.flip(np.argsort(feature_importances))

print("Feature Importance:")

ranked_features = X_train.columns[sorted_idx]
for column_name in ranked_features:
    print(column_name)

Feature Importance:
bio_1
wv0010
Specie_PMAX
Specie_PGRN
bio_6
bio_4
bio_12
clay
elev
bio_2
wv0033
bio_18
silt
Specie_POOC
bio_17
bio_15
wv1500
bio_13
bio_5
ocs
NDVI_5DayAvg_final
cfvo
Specie_PPAT
bio_16
Specie_PTEH
ocd
NDVI_15DayAvg_final
Specie_PTEL
bio_9
bio_3
soc
bio_8
bio_14
sand
bio_7
slope
bdod
cec
bio_11
NDVI_15DayAvg_initial
phh2o
NDVI_5DayAvg_initial
nitrogen
Specie_PGRS
Specie_PTAE
bio_10
bio_19
Specie_PELL
Specie_PPATxPTEL
Specie_PPATxPTEH
Specie_PELLxPCAR
Specie_PCARxPTEL
Specie_PPATxPGRS
Specie_PPATxPELL
Specie_PCARxPOOC
Specie_PPATxPOOC
age (years)


In [11]:
# Rebuild the feature matrices from the cleaned frames with both the target
# and the "Specie" column removed, so the forest sees no species information.
excluded_cols = ["Productivity (y)", "Specie"]
X_train = train_data.drop(columns=excluded_cols)
X_test = test_data.drop(columns=excluded_cols)

# Same forest configuration as before; y_train / y_test are reused from the
# earlier feature/target split cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = rf.predict(X_test)

# Error metrics on the test predictions; RMSE is derived from MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report the metrics, one per line.
report_lines = [
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R² Score: {r2:.4f}",
]
print("\n".join(report_lines))

Mean Absolute Error (MAE): 1.1693
Mean Squared Error (MSE): 2.4905
Root Mean Squared Error (RMSE): 1.5781
R² Score: 0.7942


In [13]:
# Importance ranking for the forest that was refit without "Specie":
# column names are listed from most to least important.
feature_importances = rf.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]

print("Feature Importance:")

for ranked_name in X_train.columns[sorted_idx]:
    print(ranked_name)

Feature Importance:
bio_1
wv0010
bio_18
wv0033
bio_2
bio_6
bio_12
bio_4
elev
clay
bio_15
bio_9
bio_14
bio_11
cec
ocd
cfvo
bio_16
silt
bdod
bio_3
bio_17
wv1500
soc
bio_10
NDVI_5DayAvg_initial
slope
bio_19
phh2o
bio_8
NDVI_5DayAvg_final
nitrogen
bio_13
bio_5
NDVI_15DayAvg_final
NDVI_15DayAvg_initial
sand
bio_7
ocs
age (years)


In [None]:
# Scratch placeholder: this cell held the bare expression `Hello`, an undefined
# name that raises NameError on Restart & Run All. It is kept as a comment so
# the notebook runs top to bottom.