In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [5]:

# Load the dataset
train_path = "train.csv"
test_path = "test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Assume 'output_electricity_generation' is the target variable
target = 'output_electricity_generation'
features = [col for col in train_df.columns if col != target]

# Identify categorical columns
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()

# Label encode categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Fill NaN *before* casting: astype(str) turns NaN into the literal string
    # "nan", after which fillna("Unknown") has nothing left to fill.
    train_values = train_df[col].fillna("Unknown").astype(str)
    # Always reserve an "Unknown" class so that test-only categories can be
    # mapped onto it instead of making le.transform raise ValueError.
    le.fit(pd.concat([train_values, pd.Series(["Unknown"])], ignore_index=True))
    train_df[col] = le.transform(train_values)
    if col in test_df.columns:
        test_values = test_df[col].fillna("Unknown").astype(str)
        test_values = test_values.where(test_values.isin(le.classes_), "Unknown")
        test_df[col] = le.transform(test_values)
    label_encoders[col] = le

# Handle missing values (fill with median).
# numeric_only=True keeps this working if a non-numeric, non-object column is
# present; the test set is filled with the *training* medians so both frames
# are imputed with the same statistics.
train_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians.drop(labels=[target], errors="ignore"))

# Split into X (features) and y (target)
X = train_df[features]
y = train_df[target]

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data: the scaler is fitted on the training split only and then
# applied unchanged to the validation split and the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(test_df[features])

In [6]:

# Ensemble candidates, all with 100 estimators and a fixed seed.
# Bagging is represented by a random forest.
bagging_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Boosting candidates, keyed by the label used in the results table.
boosting_models = dict([
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42, n_estimators=100)),
    ("AdaBoost", AdaBoostRegressor(random_state=42, n_estimators=100)),
    ("XGBoost", XGBRegressor(random_state=42, n_estimators=100, eval_metric='rmse')),
])

# Train and evaluate models
def evaluate_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training data and return its validation RMSE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place by this call.
    X_train, y_train : training features and targets passed to ``fit``.
    X_valid, y_valid : validation features passed to ``predict`` and the
        targets the predictions are compared against.

    Returns
    -------
    float
        Root mean squared error on the validation data. With several target
        columns, the RMSE is computed per column and then averaged.

    Raises
    ------
    ValueError
        If the validation targets are empty or their shape does not match
        the shape of the predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    # RMSE is computed directly with NumPy: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated and no longer accepted by
    # recent scikit-learn releases.
    y_true = np.asarray(y_valid, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        raise ValueError("y_valid is empty; cannot compute RMSE")
    # Normalise both to (n_samples, n_outputs) so (n,) and (n, 1) compare
    # element-wise instead of silently broadcasting to (n, n).
    y_true = y_true.reshape(y_true.shape[0], -1)
    y_hat = y_hat.reshape(y_hat.shape[0], -1) if y_hat.size else y_hat.reshape(0, 1)
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y_valid has shape {y_true.shape} but predictions have shape {y_hat.shape}"
        )
    per_output_rmse = np.sqrt(np.mean((y_true - y_hat) ** 2, axis=0))
    return float(per_output_rmse.mean())

# Score the bagging model first, then each boosting model, collecting the
# validation RMSE of every candidate under its display name.
results = {}
for name, model in [("Bagging", bagging_model), *boosting_models.items()]:
    results[name] = evaluate_model(model, X_train, y_train, X_valid, y_valid)



In [7]:
# Stacking Regressor: random-forest and XGBoost base learners whose
# predictions are combined by a gradient-boosting final estimator.
stacking_model = StackingRegressor(
    estimators=[
        ("rf", RandomForestRegressor(random_state=42, n_estimators=50)),
        ("xgb", XGBRegressor(random_state=42, n_estimators=50)),
    ],
    final_estimator=GradientBoostingRegressor(random_state=42, n_estimators=50),
)
results["Stacking"] = evaluate_model(
    model=stacking_model,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

# Voting Regressor over a random forest, XGBoost and gradient boosting,
# each with 50 estimators.
voting_model = VotingRegressor(
    estimators=[
        ("rf", RandomForestRegressor(random_state=42, n_estimators=50)),
        ("xgb", XGBRegressor(random_state=42, n_estimators=50)),
        ("gb", GradientBoostingRegressor(random_state=42, n_estimators=50)),
    ]
)
results["Voting"] = evaluate_model(
    model=voting_model,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

# Display results (validation RMSE per model)
results



{'Bagging': 2.958676875889716,
 'Gradient Boosting': 8.875292755054842,
 'AdaBoost': 30.451161722865624,
 'XGBoost': 5.437956725688426,
 'Stacking': 3.2087971116152634,
 'Voting': 4.896883687613865}

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [10]:

# Load the dataset
train_path = "train.csv"
test_path = "test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Assume 'output_electricity_generation' is the target variable
target = 'output_electricity_generation'
features = [col for col in train_df.columns if col != target]

# Identify categorical columns
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()

# Label encode categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Fill NaN *before* casting: astype(str) turns NaN into the literal string
    # "nan", after which fillna("Unknown") has nothing left to fill.
    train_values = train_df[col].fillna("Unknown").astype(str)
    # Always reserve an "Unknown" class so that test-only categories can be
    # mapped onto it instead of making le.transform raise ValueError.
    le.fit(pd.concat([train_values, pd.Series(["Unknown"])], ignore_index=True))
    train_df[col] = le.transform(train_values)
    if col in test_df.columns:
        test_values = test_df[col].fillna("Unknown").astype(str)
        test_values = test_values.where(test_values.isin(le.classes_), "Unknown")
        test_df[col] = le.transform(test_values)
    label_encoders[col] = le

# Handle missing values (fill with median).
# numeric_only=True keeps this working if a non-numeric, non-object column is
# present; the test set is filled with the *training* medians so both frames
# are imputed with the same statistics.
train_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians.drop(labels=[target], errors="ignore"))

# Split into X (features) and y (target)
X = train_df[features]
y = train_df[target]

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data: the scaler is fitted on the training split only and then
# applied unchanged to the validation split and the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(test_df[features])

In [13]:

def _grid_search(estimator, param_grid, X, y):
    """Run a 3-fold grid search over ``param_grid`` and return the fitted search.

    Candidates are ranked by negative RMSE. ``n_jobs=-1`` lets the independent
    candidate/fold fits run in parallel instead of one after another.
    """
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X, y)
    return search


# Hyperparameter tuning for Random Forest
rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]}
rf_grid = _grid_search(RandomForestRegressor(random_state=42), rf_params, X_train, y_train)
best_rf = rf_grid.best_estimator_

# Hyperparameter tuning for XGBoost
xgb_params = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6]}
xgb_grid = _grid_search(XGBRegressor(eval_metric='rmse', random_state=42), xgb_params, X_train, y_train)
best_xgb = xgb_grid.best_estimator_

# Hyperparameter tuning for Gradient Boosting
gb_params = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6]}
gb_grid = _grid_search(GradientBoostingRegressor(random_state=42), gb_params, X_train, y_train)
best_gb = gb_grid.best_estimator_

# Hyperparameter tuning for AdaBoost
ada_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}
ada_grid = _grid_search(AdaBoostRegressor(random_state=42), ada_params, X_train, y_train)
best_ada = ada_grid.best_estimator_

# Define ensemble models with optimized hyperparameters
bagging_model = best_rf
boosting_models = {"Gradient Boosting": best_gb, "AdaBoost": best_ada, "XGBoost": best_xgb}

KeyboardInterrupt: 

In [None]:
# Train and evaluate models
def evaluate_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training data and return its validation RMSE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place by this call.
    X_train, y_train : training features and targets passed to ``fit``.
    X_valid, y_valid : validation features passed to ``predict`` and the
        targets the predictions are compared against.

    Returns
    -------
    float
        Root mean squared error on the validation data. With several target
        columns, the RMSE is computed per column and then averaged.

    Raises
    ------
    ValueError
        If the validation targets are empty or their shape does not match
        the shape of the predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    # RMSE is computed directly with NumPy: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated and no longer accepted by
    # recent scikit-learn releases.
    y_true = np.asarray(y_valid, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        raise ValueError("y_valid is empty; cannot compute RMSE")
    # Normalise both to (n_samples, n_outputs) so (n,) and (n, 1) compare
    # element-wise instead of silently broadcasting to (n, n).
    y_true = y_true.reshape(y_true.shape[0], -1)
    y_hat = y_hat.reshape(y_hat.shape[0], -1) if y_hat.size else y_hat.reshape(0, 1)
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y_valid has shape {y_true.shape} but predictions have shape {y_hat.shape}"
        )
    per_output_rmse = np.sqrt(np.mean((y_true - y_hat) ** 2, axis=0))
    return float(per_output_rmse.mean())

# Score the bagging model first, then each boosting model, collecting the
# validation RMSE of every candidate under its display name.
results = {}
for name, model in [("Bagging", bagging_model), *boosting_models.items()]:
    results[name] = evaluate_model(model, X_train, y_train, X_valid, y_valid)

# Stacking Regressor: best_rf and best_xgb are the base learners and
# best_gb is the final estimator that combines their predictions.
stacking_model = StackingRegressor(
    estimators=[
        ("rf", best_rf),
        ("xgb", best_xgb),
    ],
    final_estimator=best_gb,
)
results["Stacking"] = evaluate_model(
    model=stacking_model,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

# Voting Regressor over best_rf, best_xgb and best_gb.
voting_model = VotingRegressor(
    estimators=[
        ("rf", best_rf),
        ("xgb", best_xgb),
        ("gb", best_gb),
    ]
)
results["Voting"] = evaluate_model(
    model=voting_model,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

# Display results (validation RMSE per model)
results