<a href="https://colab.research.google.com/github/sahilbawa310/EDA/blob/main/eda_mea.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb

# Load the training data.
# (A stray token after this call previously made the whole cell a SyntaxError.)
train = pd.read_csv('./train.csv')
# Load the test data; it is not used for fitting in this cell.
test = pd.read_csv('./test.csv')

# Feature columns fed to every model in this cell.
# NOTE(review): the K-fold cell's output further down lists an extra 'seeds'
# column in the CSV that is not used here — confirm the omission is intended.
features = ['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
            'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
            'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange',
            'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass']

# Target variable is 'yield' based on the column names
X = train[features]
y = train['yield']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def _fit_and_score(pipeline, X_fit, y_fit, X_eval, y_eval):
    """Fit ``pipeline`` and score it on a hold-out set.

    Parameters
    ----------
    pipeline : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``X_fit`` / ``y_fit``.
    X_fit, y_fit : features and target used for fitting.
    X_eval, y_eval : hold-out features and target used only for scoring.

    Returns
    -------
    tuple
        ``(predictions, mae)``: the predictions for ``X_eval`` and their
        mean absolute error against ``y_eval``.
    """
    pipeline.fit(X_fit, y_fit)
    predictions = pipeline.predict(X_eval)
    return predictions, mean_absolute_error(y_eval, predictions)


# Model 1: Gradient Boosting with preprocessing pipeline
gb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        min_samples_split=5,
        random_state=42,
        verbose=0))
])
gb_pred, gb_mae = _fit_and_score(gb_pipeline, X_train, y_train, X_val, y_val)

# Model 2: XGBoost
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', xgb.XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42))
])
xgb_pred, xgb_mae = _fit_and_score(xgb_pipeline, X_train, y_train, X_val, y_val)

# Model 3: LightGBM
lgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        num_leaves=31,
        subsample=0.8,
        # LightGBM only applies `subsample` when bagging is switched on via a
        # non-zero `subsample_freq`; with the default of 0 the 0.8 above is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
        # Silence the per-fit "[LightGBM] [Info]" log lines.
        verbose=-1))
])
lgb_pred, lgb_mae = _fit_and_score(lgb_pipeline, X_train, y_train, X_val, y_val)

# Blend the three validation predictions with weights proportional to 1 / MAE,
# normalised so that the weights sum to one.
inverse_mae = 1 / np.array([gb_mae, xgb_mae, lgb_mae])
weights = inverse_mae / inverse_mae.sum()

w_gb, w_xgb, w_lgb = weights
ensemble_pred = gb_pred * w_gb + xgb_pred * w_xgb + lgb_pred * w_lgb
ensemble_mae = mean_absolute_error(y_val, ensemble_pred)

# Report each model's validation MAE next to the blended score.
print(f"Gradient Boosting MAE: {gb_mae:.2f}")
print(f"XGBoost MAE: {xgb_mae:.2f}")
print(f"LightGBM MAE: {lgb_mae:.2f}")
print(f"Ensemble MAE: {ensemble_mae:.2f}")

# Feature importances as reported by the fitted XGBoost step of its pipeline,
# listed from most to least important.
xgb_model = xgb_pipeline.named_steps['model']
xgb_importance = xgb_model.feature_importances_
importance_table = pd.DataFrame({'Feature': features, 'Importance': xgb_importance})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Refit each base model on all labelled rows (X, y) — no hold-out this time.
# Unlike the validation pipelines above, these estimators are fitted on the raw
# feature values without a StandardScaler step.
final_gb_model = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=5,
    random_state=42)
final_gb_model.fit(X, y)

final_xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42)
final_xgb_model.fit(X, y)

final_lgb_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    num_leaves=31,
    subsample=0.8,
    # LightGBM only applies `subsample` when bagging is switched on via a
    # non-zero `subsample_freq`; with the default of 0 the 0.8 above is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    # Silence the per-fit "[LightGBM] [Info]" log lines.
    verbose=-1)
final_lgb_model.fit(X, y)

# The test set is already loaded at the top of this cell; uncomment the lines below to predict on it with the final ensemble:
# test = pd.read_csv('test.csv')
# X_test = test[features]
# final_predictions = (
#     final_gb_model.predict(X_test) * weights[0] +
#     final_xgb_model.predict(X_test) * weights[1] +
#     final_lgb_model.predict(X_test) * weights[2]
# )

# Create the submission file from the ensemble predictions (uncomment together with the prediction block above)
# submission = pd.DataFrame({
#     'id': test['id'],
#     'yield': final_predictions
# })
# submission.to_csv('submission.csv', index=False)
# print("Submission file created with final predictions")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 597
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 15
[LightGBM] [Info] Start training from score 5992.148656




Gradient Boosting MAE: 257.83
XGBoost MAE: 261.42
LightGBM MAE: 260.37
Ensemble MAE: 258.18

Feature Importance:
                 Feature  Importance
13              fruitset    0.509849
14             fruitmass    0.203568
7   AverageOfUpperTRange    0.180105
5       MaxOfUpperTRange    0.034748
6       MinOfUpperTRange    0.014537
11           RainingDays    0.008698
12    AverageRainingDays    0.007511
10  AverageOfLowerTRange    0.006914
4                  osmia    0.005888
8       MaxOfLowerTRange    0.005721
3                andrena    0.005576
1               honeybee    0.005404
2                bumbles    0.004823
0              clonesize    0.004243
9       MinOfLowerTRange    0.002415
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 595
[LightGBM] [Info] Number of data poi

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission_df = pd.read_csv('sample_submission.csv')

# --- EDA ---
# (The EDA code was executed successfully and visualizations were generated as mentioned above)

# Identify target variable: the only non-'id' column of the sample submission
target_column = [col for col in sample_submission_df.columns if col != 'id'][0]

# --- Model Implementation Prep ---
# Separate features and target. 'id' and 'Row#' are treated as identifiers,
# not predictors (the K-fold cell below drops the same two columns);
# errors='ignore' keeps this working if a file lacks one of them.
identifier_columns = ['id', 'Row#']
X = train_df.drop(columns=[target_column, *identifier_columns], errors='ignore')
y = train_df[target_column]
X_test = test_df.drop(columns=identifier_columns, errors='ignore')

# Keep only the features present in BOTH frames, in the training column order.
# (Intersecting two sets gave an arbitrary order that could change between runs.)
common_cols = [col for col in X.columns if col in X_test.columns]
X = X[common_cols].copy()
X_test = X_test[common_cols].copy()

# Label-encode every column that is object-typed in either frame.
# One encoder per column is fitted on the train+test values and then applied to
# BOTH frames, so a category always maps to the same integer. Previously train
# was encoded with a train-only fit and test with a combined fit, which could
# assign different codes to the same category.
object_in_train = set(X.select_dtypes(include='object').columns)
object_in_test = set(X_test.select_dtypes(include='object').columns)
object_cols = [col for col in common_cols
               if col in object_in_train or col in object_in_test]

for col in object_cols:
    # Cast to str so missing values and mixed types are encoded consistently.
    train_values = X[col].astype(str)
    test_values = X_test[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], axis=0))
    X[col] = le.transform(train_values)
    X_test[col] = le.transform(test_values)

# Hold out 20% of the training rows; they drive early stopping and the
# validation score reported below.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# LightGBM hyper-parameters (objective and metric are both MAE).
lgb_params = dict(
    objective='mae',
    metric='mae',
    n_estimators=1000,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=0.1,
    num_leaves=31,
    verbose=-1,   # keep training quiet
    n_jobs=-1,    # use all available cores
    seed=42,
    boosting_type='gbdt',
)

model = lgb.LGBMRegressor(**lgb_params)

# Fit with early stopping: training halts once the validation MAE has not
# improved for 100 rounds.
print("\n--- Training LightGBM Model ---")
early_stop = lgb.early_stopping(100, verbose=False)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    callbacks=[early_stop],
)

# Score the fitted model on the held-out rows.
val_preds = model.predict(X_val)
val_mae = mean_absolute_error(y_val, val_preds)
print(f"\nValidation MAE: {val_mae:.4f}")

# Predict the target for every test row.
test_preds = model.predict(X_test)

# Pair each test id with its prediction and write the result to disk.
submission_df = test_df[['id']].assign(**{target_column: test_preds})
submission_df.to_csv('submission.csv', index=False)

print(f"\nPredictions saved to submission.csv with MAE optimization. Achieved MAE on validation set: {val_mae:.4f}")


--- Training LightGBM Model ---

Validation MAE: 245.3104

Predictions saved to submission.csv with MAE optimization. Achieved MAE on validation set: 245.3104


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission_df = pd.read_csv('sample_submission.csv')

# Identify target variable: the only non-'id' column of the sample submission
target_column = [col for col in sample_submission_df.columns if col != 'id'][0]
print(f"Identified Target Variable: {target_column}")

# Prepare features and target
# Exclude 'id' and 'Row#' as they are typically identifiers and not predictive features
X = train_df.drop(columns=['id', 'Row#', target_column])
y = train_df[target_column]
X_test = test_df.drop(columns=['id', 'Row#'])

# Align the test features to the training features.
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
missing_in_train = set(test_cols) - set(train_cols)

# reindex() does the whole alignment in one step:
#   * training columns absent from the test set are added, filled with 0;
#   * test-only columns are dropped (the model is never fitted on them);
#   * the column order is made identical to the training frame.
# The previous approach added test-only columns to X *after* `train_cols` was
# captured, which left X and X_test with different feature sets.
X_test = X_test.reindex(columns=train_cols, fill_value=0)

print(f"Features for training: {X.columns.tolist()}")
print(f"Features for testing: {X_test.columns.tolist()}")
print(f"Number of training samples: {len(X)}")
print(f"Number of test samples: {len(X_test)}")

# --- GradientBoostingRegressor Model with K-Fold Cross-Validation ---

# Hyper-parameters shared by every fold's model.
gbr_params = dict(
    n_estimators=1000,        # number of boosting stages
    learning_rate=0.05,       # shrinkage applied to each tree's contribution
    max_depth=5,              # depth limit of the individual trees
    subsample=0.8,            # fraction of rows used to fit each base learner
    loss='absolute_error',    # optimise MAE directly
    random_state=42,          # reproducibility
    verbose=0,                # no training log
)

# Five shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Accumulators: out-of-fold predictions for the local MAE, the fold-averaged
# test predictions, and the fitted model of each fold.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])
models = []

print("\n--- Training GradientBoostingRegressor with 5-Fold Cross-Validation ---")
for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}/{kf.n_splits}")
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # Fit a fresh model on this fold's training rows and keep it.
    model = GradientBoostingRegressor(**gbr_params).fit(X_train, y_train)
    models.append(model)

    # Fill in the held-out rows and add this fold's share of the test average.
    oof_preds[val_index] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits

# MAE of the out-of-fold predictions over the full training set.
overall_oof_mae = mean_absolute_error(y, oof_preds)
print(f"\nOverall K-Fold Cross-Validation MAE: {overall_oof_mae:.4f}")

# Pair each test id with its fold-averaged prediction and write the result to disk.
submission_df = test_df[['id']].assign(**{target_column: test_preds})
submission_df.to_csv('submission_gbr_cv.csv', index=False)

print(f"\nNew submission file 'submission_gbr_cv.csv' created.")
print(f"Predictions generated using GradientBoostingRegressor with 5-Fold Cross-Validation.")

Identified Target Variable: yield
Features for training: ['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange', 'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange', 'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds']
Features for testing: ['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange', 'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange', 'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds']
Number of training samples: 15000
Number of test samples: 10000

--- Training GradientBoostingRegressor with 5-Fold Cross-Validation ---
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5

Overall K-Fold Cross-Validation MAE: 240.8939

New submission file 'submission_gbr_cv.csv' created.
Predictions generated using GradientBoostingRegressor with 5-Fold Cross-Validation.
