Experiment with 70:30 data split where 30 is unused.

In [26]:
import pandas as pd
train_data = pd.read_csv('../../data/tr_data.csv')
test_data = pd.read_csv('../../data/te_data.csv')
len(train_data), len(test_data)

(86809, 37204)

In [27]:
# Drop rows with empty values
train_data.drop(["latitude", "longitude", "env", "TestId", "date_initial", "date_final", "Feature", "Unnamed: 0"], axis=1, inplace=True)
test_data.drop(["latitude", "longitude", "env", "TestId", "date_initial", "date_final", "Feature", "Unnamed: 0"], axis=1, inplace=True)

train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

len(train_data), len(test_data)


(86786, 37195)

In [28]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
num_cols = train_data.select_dtypes(include=["number"]).columns.tolist()
train_data[num_cols] = train_data[num_cols].fillna(train_data[num_cols].mean())
train_data["Specie"] = train_data["Specie"].fillna(train_data["Specie"].mode()[0])

# Encode the "Specie" column using Label Encoding
label_encoder = LabelEncoder()
train_data["Specie"] = label_encoder.fit_transform(train_data["Specie"])

In [29]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
num_cols = test_data.select_dtypes(include=["number"]).columns.tolist()
test_data[num_cols] = test_data[num_cols].fillna(test_data[num_cols].mean())
test_data["Specie"] = test_data["Specie"].fillna(test_data["Specie"].mode()[0])

# Encode the "Specie" column using Label Encoding
label_encoder = LabelEncoder()
test_data["Specie"] = label_encoder.fit_transform(test_data["Specie"])

Computing Correlation Matrix

In [30]:
# Compute correlation matrix
corr_matrix = train_data.corr()

# Select features with a strong correlation to the target variable (threshold = 0.2)
corr_threshold = 0.2
target_corr = corr_matrix["Productivity (y)"].abs().sort_values(ascending=False)
selected_features = target_corr[target_corr > corr_threshold].index.tolist()

# Remove multicollinearity: Drop features highly correlated (> 0.85) with each other
corr_drop_threshold = 0.85
drop_features = set()

for feature in selected_features:
    if feature in drop_features:
        continue
    correlated_features = corr_matrix[feature][corr_matrix[feature].abs() > corr_drop_threshold].index.tolist()
    correlated_features.remove(feature)  # Keep one and drop others
    drop_features.update(correlated_features)

# Final selected features
selected_features = [f for f in selected_features if f not in drop_features]

Run the below Cell if you want results for fulll data

In [21]:
X_train = train_data.drop(columns=["Productivity (y)"])
y_train = train_data["Productivity (y)"]

X_test = test_data.drop(columns=["Productivity (y)"])
y_test = test_data["Productivity (y)"]

Run the below cell for correlation analysis

In [31]:
X_train = train_data[selected_features].drop(columns=["Productivity (y)"])
y_train = train_data["Productivity (y)"]

X_test = test_data[selected_features].drop(columns=["Productivity (y)"])
y_test = test_data["Productivity (y)"]

In [32]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
xgb_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred = xgb_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test,y_pred)
r2 = r2_score(y_test, y_pred)

# print(f"Selected Features: {selected_features}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MSE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error (MSE): 1.3897
Mean Absolute Error (MSE): 0.8604
R² Score: 0.8838


MLP result

In [33]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Predict and evaluate
y_pred = mlp.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test,y_pred)
r2 = r2_score(y_test, y_pred)

# print(f"Selected Features: {selected_features}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MSE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error (MSE): 1.6438
Mean Absolute Error (MSE): 0.9487
R² Score: 0.8625


Random forest result 

In [34]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Initialize and train Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on test set
y_pred = rf.predict(X_test)

# Regression Analysis
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


Mean Absolute Error (MAE): 0.8603
Mean Squared Error (MSE): 1.3911
Root Mean Squared Error (RMSE): 1.1794
R² Score: 0.8836
