In [None]:
import pandas as pd

# Load the raw train/test splits from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Display basic information about the training data.
print('Train Data info:')
train_df.info()
print('\nTrain Data head:')
# A bare `train_df.head()` in the middle of a cell is evaluated and discarded,
# so print it to make the preview actually appear under its heading.
print(train_df.head())

print('\nTrain Data Description')
print(train_df.describe())

# Display basic information about the test data.
print('Test Data info:')
# Was `test_df.info` (attribute access, no call), which printed nothing.
test_df.info()
print('\nTest Data head:')
print(test_df.head())

print('\nTest Data Description')
print(test_df.describe())

In [None]:
# Parse the 'date' column into pandas datetimes on both splits.
for split_df in (train_df, test_df):
    split_df["date"] = pd.to_datetime(split_df["date"])

In [None]:
# Derive the same calendar and temperature-range columns on both splits.
for split_df in (train_df, test_df):
    date_parts = split_df["date"].dt

    # Time-based features
    split_df["year"] = date_parts.year
    split_df["month"] = date_parts.month
    split_df["day"] = date_parts.day
    split_df["day_of_week"] = date_parts.dayofweek
    split_df["day_of_year"] = date_parts.dayofyear
    split_df["week_of_year"] = date_parts.isocalendar().week.astype(int)
    split_df["quarter"] = date_parts.quarter
    # dayofweek values 5 and 6 are flagged as weekend
    split_df["is_weekend"] = date_parts.dayofweek.isin([5, 6]).astype(int)

    # Daily temperature spread
    split_df["temperature_range"] = split_df["temperature_2m_max"] - split_df["temperature_2m_min"]

# One-hot encode 'cluster_id' (each split is encoded independently, so the
# resulting "cluster_*" columns may differ between the two frames).
one_hot_options = {"columns": ["cluster_id"], "prefix": "cluster"}
train_df = pd.get_dummies(train_df, **one_hot_options)
test_df = pd.get_dummies(test_df, **one_hot_options)

# Persist the processed frames next to the notebook.
for split_df, out_path in ((train_df, "train_processed.csv"), (test_df, "test_processed.csv")):
    split_df.to_csv(out_path, index=False)

print("Feature engineering complete. Processed files saved.")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Collect the one-hot cluster columns present in each split.
train_cols = [col for col in train_df.columns if col.startswith("cluster_")]
test_cols = [col for col in test_df.columns if col.startswith("cluster_")]

# get_dummies ran on each split separately, so a cluster seen in only one split
# is missing from the other; add it as an all-zero column. Iterating in sorted
# order keeps the resulting column order stable between runs.
missing_in_test = set(train_cols) - set(test_cols)
for c in sorted(missing_in_test):
    test_df[c] = 0

missing_in_train = set(test_cols) - set(train_cols)
for c in sorted(missing_in_train):
    train_df[c] = 0

# Define features (X) and target (y).
# Every column except the identifier, the date and the target is a feature.
# The one-hot "cluster_*" columns already pass this filter, so they must not be
# appended a second time: doing so listed each of them twice and produced
# duplicated columns in X and X_test.
features = [
    col for col in train_df.columns
    if col not in ["ID", "date", "electricity_consumption"] and not col.startswith("cluster_id")
]

X = train_df[features]
y = train_df["electricity_consumption"]

# Split data for training and validation.
# NOTE(review): this is a random split of dated rows; a time-ordered hold-out
# may give a more honest estimate - confirm what the evaluation requires.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Make predictions on the validation set
val_preds = model.predict(X_val)

# Evaluate the model using RMSE
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Validation RMSE: {rmse}")

# Prepare test data for prediction (same columns, same order as X)
X_test = test_df[features]

# Make predictions on the test set
test_preds = model.predict(X_test)

submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "electricity_consumption": test_preds
})

# Ensure predictions are non-negative
submission_df["electricity_consumption"] = submission_df["electricity_consumption"].apply(lambda x: max(0, x))

submission_df.to_csv("submission.csv", index=False)

print("Model training and submission file generation complete.")

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Union of the one-hot cluster columns seen in either split.
all_cluster_cols = sorted(
    {col for col in train_df.columns if col.startswith("cluster_")}
    | {col for col in test_df.columns if col.startswith("cluster_")}
)

# Add missing cluster columns to train_df and test_df with 0
for col in all_cluster_cols:
    if col not in train_df.columns:
        train_df[col] = 0
    if col not in test_df.columns:
        test_df[col] = 0

base_features = [col for col in train_df.columns if col not in ["ID", "date", "electricity_consumption"] and not col.startswith("cluster_id")]

features = sorted(set(base_features).union(all_cluster_cols))

# Keep only columns available in both splits. The result is sorted because the
# iteration order of a set of strings changes between kernel restarts; with
# feature_fraction < 1 an unstable column order makes the fit irreproducible
# even though the seeds are fixed.
common_features = sorted(set(features) & set(train_df.columns) & set(test_df.columns))

X = train_df[common_features]
y = train_df["electricity_consumption"]

X_test = test_df[common_features]

# LightGBM Model Training with Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions for the training rows and fold-averaged test predictions.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

# Hyperparameters shared by every fold; only the seed differs per fold.
# NOTE(review): 'regression_l1' optimises absolute error while the model is
# scored with RMSE - confirm this mismatch is intentional.
base_lgb_params = {
    'objective': 'regression_l1',
    'metric': 'rmse',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
}

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    lgb_params = {**base_lgb_params, 'seed': 42 + fold}

    model = lgb.LGBMRegressor(**lgb_params)
    # Stop once the validation RMSE has not improved for 100 rounds.
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(100, verbose=False)])

    oof_preds[val_index] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits

# Evaluate OOF predictions
rmse_oof = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Overall OOF RMSE: {rmse_oof}")

submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "electricity_consumption": test_preds
})

# Ensure predictions are non-negative
submission_df["electricity_consumption"] = submission_df["electricity_consumption"].apply(lambda x: max(0, x))

submission_df.to_csv("submission.csv", index=False)

print("Model optimization and submission file generation complete.")


# Task
Optimize the RMSE of the LightGBM model to be between 10 and 20, ensuring the model does not overfit, and the predictions are accurate for actual values, specifically for examples like "cluster_1_2022-01-01,0", "cluster_2_2022-01-01,0", and "cluster_3_2022-01-01,0". Use the formula $\text{RMSE} = \sqrt{ \frac{1}{n} \sum_{i=1}^{n} \left( \hat{y}_i - y_i \right)^2 }$.

## Feature engineering refinement

### Subtask:
Explore creating more advanced features or transforming existing ones to capture complex relationships in the data.


**Reasoning**:
I will create new features based on the date and existing numerical columns to capture more complex relationships in the data. This includes creating interaction terms, polynomial features, and lag features.



**Reasoning**:
The previous command failed because 'cluster_id' was one-hot encoded and removed from the dataframe. I need to sort by the individual cluster columns instead of the original 'cluster_id' column to create lag features.



In [None]:
# Interaction and polynomial temperature features, built identically on both splits.
for split_df in (train_df, test_df):
    split_df["temp_daylight_interaction"] = split_df["temperature_2m_max"] * split_df["daylight_duration"]
    split_df["temp_sunshine_interaction"] = split_df["temperature_2m_max"] * split_df["sunshine_duration"]
    split_df["temperature_2m_max_sq"] = split_df["temperature_2m_max"]**2
    split_df["temperature_2m_min_sq"] = split_df["temperature_2m_min"]**2

# 'cluster_id' was one-hot encoded earlier, so the one-hot columns stand in for
# it when ordering rows and grouping for the lag features.
cluster_cols = [col for col in train_df.columns if col.startswith("cluster_")]
train_df = train_df.sort_values(by=cluster_cols + ["date"])
test_df = test_df.sort_values(by=cluster_cols + ["date"])

# Lag of the target (training data only - the test split has no target column).
# NOTE(review): this column name starts with a space and contains Devanagari
# text instead of "electricity". Other cells in this notebook reference this
# exact string, so it is kept unchanged here; lookups of
# "electricity_consumption_lag_1" elsewhere never match it. Rename everywhere
# in one pass if this lag is meant to be used.
train_df[" बिजली_consumption_lag_1"] = train_df.groupby(cluster_cols)["electricity_consumption"].shift(1)

# Temperature lags are created on BOTH splits. Previously they existed only on
# test_df, so they could never survive a train/test column intersection.
for split_df in (train_df, test_df):
    split_df["temperature_2m_max_lag_1"] = split_df.groupby(cluster_cols)["temperature_2m_max"].shift(1)
    split_df["temperature_2m_min_lag_1"] = split_df.groupby(cluster_cols)["temperature_2m_min"].shift(1)

# Fill NaNs created by shifting: backward fill first, then forward fill for
# anything left at the tail. `fillna(method=...)` is deprecated in pandas, so
# the dedicated bfill()/ffill() methods are used instead.
train_df = train_df.bfill().ffill()
test_df = test_df.bfill().ffill()

# Display head of modified dataframes
print('Train Data head after feature engineering:')
display(train_df.head())

print('\nTest Data head after feature engineering:')
display(test_df.head())

**Reasoning**:
I have successfully created new features. Now I need to evaluate these features by visualizing their relationship with the target variable and checking for potential multicollinearity. I will start by visualizing the relationship between some of the new features and the target variable.



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter four engineered features against electricity consumption on a 2x2 grid.
panel_specs = [
    ("temp_daylight_interaction", "Electricity Consumption vs. Temperature-Daylight Interaction"),
    ("temp_sunshine_interaction", "Electricity Consumption vs. Temperature-Sunshine Interaction"),
    ("temperature_2m_max_sq", "Electricity Consumption vs. Max Temperature Squared"),
    (" बिजली_consumption_lag_1", "Electricity Consumption vs. Lag 1 Consumption"),
]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1]
for panel_ax, (feature_name, panel_title) in zip(axes.flat, panel_specs):
    sns.scatterplot(ax=panel_ax, x=feature_name, y="electricity_consumption", data=train_df, alpha=0.5)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

**Reasoning**:
I have visualized some of the new features. Now I will check for multicollinearity among the features to avoid issues in the model. I will calculate and visualize the correlation matrix.



In [None]:
# Correlation heatmap over the current model features plus the engineered columns,
# used as a visual check for multicollinearity.
engineered_cols = [
    "temp_daylight_interaction",
    "temp_sunshine_interaction",
    "temperature_2m_max_sq",
    "temperature_2m_min_sq",
    " बिजली_consumption_lag_1",
]
correlation_matrix = train_df[features + engineered_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False)
plt.title("Correlation Matrix of Features")
plt.show()

**Reasoning**:
Define the parameter space for RandomizedSearchCV and perform the search to find the best hyperparameters for the LightGBM model.



## Model hyperparameter tuning

### Subtask:
Optimize the hyperparameters of the LightGBM model using techniques like GridSearchCV or RandomizedSearchCV to find the best configuration for lower RMSE and better generalization.

**Reasoning**:
Define the parameter space for RandomizedSearchCV and perform the search to find the best hyperparameters for the LightGBM model.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st

# Define the parameter space for RandomizedSearchCV
param_distributions = {
    'n_estimators': st.randint(100, 1500),        # wider range
    'learning_rate': st.loguniform(0.0001, 0.1),  # sampled on a log scale
    'num_leaves': st.randint(10, 100),            # wider range
    # -1 (no depth limit) plus ten sampled depths in [3, 20). The draw is seeded
    # so the candidate list is the same on every run, and the values are
    # converted to plain Python ints.
    'max_depth': [-1] + [int(depth) for depth in st.randint(3, 20).rvs(size=10, random_state=42)],
    'min_child_samples': st.randint(1, 100),      # very wide range
    'subsample': st.uniform(0.4, 0.6),            # range 0.4 to 1.0
    # LightGBM only performs bagging when the bagging frequency is non-zero;
    # without this entry the sampled `subsample` values have no effect.
    'subsample_freq': [1],
    'colsample_bytree': st.uniform(0.4, 0.6),     # range 0.4 to 1.0
    'lambda_l1': st.loguniform(0.00001, 0.1),     # sampled on a log scale
    'lambda_l2': st.loguniform(0.00001, 0.1),     # sampled on a log scale
}

# Re-define features to include the new engineered features
features = sorted(set(common_features).union([
    "temp_daylight_interaction",
    "temp_sunshine_interaction",
    "temperature_2m_max_sq",
    "temperature_2m_min_sq",
    "electricity_consumption_lag_1",
]))

# Keep only features present in BOTH splits, in a stable (sorted) order: a
# column missing from test_df cannot be used at prediction time, and an unordered
# set makes column sub-sampling irreproducible between kernel restarts.
common_features_with_engineered = sorted(set(features) & set(train_df.columns) & set(test_df.columns))

X = train_df[common_features_with_engineered]
y = train_df["electricity_consumption"]

# Initialize LightGBM regressor
lgbm = lgb.LGBMRegressor(random_state=42)

# Initialize RandomizedSearchCV.
# n_iter=1000 with 5-fold CV means 5000 model fits; lower n_iter for a quick run.
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_distributions,
    n_iter=1000, # Number of parameter settings that are sampled
    scoring='neg_root_mean_squared_error',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit RandomizedSearchCV
random_search.fit(X, y)

# Print the best hyperparameters
print("Best hyperparameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# Retrieve the best model (refit on all of X by RandomizedSearchCV)
best_lgbm_model = random_search.best_estimator_

## Cross-Validation Strategy Review

### Subtask:
Evaluate the current KFold cross-validation setup and use the best model found during hyperparameter tuning to get a more reliable estimate of its performance.

**Reasoning**:
Use the best model from RandomizedSearchCV and evaluate its performance using KFold cross-validation to get a robust estimate of the RMSE.

In [None]:
from sklearn.base import clone

# Re-define features to include the new engineered features
features = sorted(set(common_features).union([
    "temp_daylight_interaction",
    "temp_sunshine_interaction",
    "temperature_2m_max_sq",
    "temperature_2m_min_sq",
    "electricity_consumption_lag_1",
]))

# Keep only features present in BOTH splits (sorted for a stable column order).
# Checking against train_df alone lets a train-only column through, which then
# raises a KeyError when the same list is used to index test_df.
common_features_with_engineered = sorted(set(features) & set(train_df.columns) & set(test_df.columns))

X = train_df[common_features_with_engineered]
y = train_df["electricity_consumption"]

# The test matrix uses exactly the same columns, in the same order
X_test = test_df[common_features_with_engineered]

# LightGBM Model Training with Cross-Validation using the best model
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit a fresh, unfitted copy carrying the tuned hyperparameters. Refitting
    # best_lgbm_model itself would overwrite random_search.best_estimator_ on
    # every fold.
    model = clone(best_lgbm_model)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(20, verbose=False)])

    oof_preds[val_index] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits

# Evaluate OOF predictions
rmse_oof = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Overall OOF RMSE with optimized hyperparameters: {rmse_oof}")

# Prepare submission file with predictions from the optimized model
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "electricity_consumption": test_preds
})

# Ensure predictions are non-negative
submission_df["electricity_consumption"] = submission_df["electricity_consumption"].apply(lambda x: max(0, x))

submission_df.to_csv("submission_optimized.csv", index=False)

print("Model evaluation with optimized hyperparameters and submission file generation complete.")

# Task
Optimize the RMSE of the LightGBM model to be between 10 and 20 and prevent overfitting, ensuring accurate predictions for specific examples like 'cluster_1_2022-01-01,0', 'cluster_2_2022-01-01,0', and 'cluster_3_2022-01-01,0', by continuing the hyperparameter tuning with n_iter=10, implementing regularization techniques, exploring ensemble methods, analyzing feature importance, performing prediction analysis, and iteratively refining the model before final training and prediction.

## Regularization techniques

### Subtask:
Implement additional regularization techniques within the LightGBM model or explore other models that inherently handle overfitting better.


**Reasoning**:
I will implement additional regularization techniques by adding `lambda_l1`, `lambda_l2`, and `min_child_samples` to the `lgb_params` and retrain the model with KFold cross-validation to evaluate the impact on RMSE.



**Reasoning**:
I have added regularization and re-trained the model. The next step is to experiment with different values for the regularization parameters to see their impact on the RMSE. I will try a different set of values for lambda_l1, lambda_l2, and min_child_samples and re-run the training and evaluation.



**Reasoning**:
I have experimented with two sets of regularization parameters, and the RMSE has not significantly improved. I will now try a third set of values for `lambda_l1`, `lambda_l2`, and `min_child_samples` to further explore the impact of regularization.



**Reasoning**:
I will implement additional regularization techniques by adding `lambda_l1`, `lambda_l2`, and `min_child_samples` to the `lgb_params` and retrain the model with KFold cross-validation to evaluate the impact on RMSE.

## Ensemble Methods

### Subtask:
Experiment with combining multiple models (e.g., stacking, bagging, boosting) to leverage their individual strengths and potentially improve overall performance and reduce variance.

**Reasoning**:
I will start by implementing a simple ensemble of the LightGBM model trained with different random states in the cross-validation process. This can help reduce variance and potentially improve the overall RMSE.

In [None]:
# Re-define features to include the new engineered features
features = sorted(set(common_features).union([
    "temp_daylight_interaction",
    "temp_sunshine_interaction",
    "temperature_2m_max_sq",
    "temperature_2m_min_sq",
    "electricity_consumption_lag_1",
]))

# Keep only features present in BOTH splits (sorted for a stable column order);
# a train-only column would raise a KeyError when indexing test_df below.
common_features_with_engineered = sorted(set(features) & set(train_df.columns) & set(test_df.columns))

X = train_df[common_features_with_engineered]
y = train_df["electricity_consumption"]

# The test matrix uses exactly the same columns, in the same order
X_test = test_df[common_features_with_engineered]

# LightGBM Model Training with Cross-Validation and regularization
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
models = []  # one fitted model per fold, kept for later inspection

# Hand-set hyperparameters with L1/L2 regularization and a minimum leaf size.
# These are fixed values, not the ones found by the randomized search.
# NOTE(review): 'regression_l1' optimises absolute error while the model is
# scored with RMSE - confirm this mismatch is intentional.
base_lgb_params = {
    'objective': 'regression_l1',
    'metric': 'rmse',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'min_child_samples': 20,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
}

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Vary the seed for each fold
    lgb_params = {**base_lgb_params, 'seed': 42 + fold}

    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(20, verbose=False)])

    oof_preds[val_index] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits
    models.append(model)

# Evaluate OOF predictions
rmse_oof = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Overall OOF RMSE with ensembling (averaged folds): {rmse_oof}")

# Prepare submission file with predictions from the ensembled model
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "electricity_consumption": test_preds
})

# Ensure predictions are non-negative
submission_df["electricity_consumption"] = submission_df["electricity_consumption"].apply(lambda x: max(0, x))

submission_df.to_csv("submission_ensembled.csv", index=False)

print("Ensemble model training and submission file generation complete.")

## Analyze Feature Importance

### Subtask:
Examine the importance of different features in the trained model to identify potentially irrelevant or noisy features that could be removed or transformed.

**Reasoning**:
Analyze the feature importance from the trained LightGBM model to understand which features contribute most to the predictions and identify potentially less important features.

# `from matplotlib.pyplot import plt` raises ImportError: pyplot has no `plt`
# attribute. The module itself is what gets aliased as `plt`.
import matplotlib.pyplot as plt

# Get feature importances from the trained models.
# Since we used cross-validation, build one small frame per fold and concatenate
# them once, instead of growing a DataFrame inside the loop.
fold_frames = [
    pd.DataFrame({
        'feature': fold_model.feature_name_,
        'importance': fold_model.feature_importances_,
        'fold': fold_number,
    })
    for fold_number, fold_model in enumerate(models, start=1)
]
feature_importances = pd.concat(fold_frames, axis=0)

# Calculate the average feature importance across all folds
average_feature_importance = feature_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)

# Display the top 20 most important features
top_features = average_feature_importance.head(20)
print("Top 20 Most Important Features:")
print(top_features)

# Visualize feature importances
plt.figure(figsize=(10, 8))
sns.barplot(x=top_features.values, y=top_features.index)
plt.title('Top 20 Feature Importances (Averaged across folds)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

## Prediction Analysis

### Subtask:
Analyze the predictions for specific examples like 'cluster_1_2022-01-01', 'cluster_2_2022-01-01', and 'cluster_3_2022-01-01' to understand where the model is making errors and inform further improvements.

**Reasoning**:
Filter the test set to include the specified examples and display their actual values and the model's predictions to understand the prediction accuracy for these specific cases.

In [None]:
# IDs of the rows whose predictions we want to inspect
specific_examples_ids = ["cluster_1_2022-01-01", "cluster_2_2022-01-01", "cluster_3_2022-01-01"]

# Attach the predictions in submission_df to the test rows via their shared "ID"
test_with_preds = test_df.merge(submission_df, on="ID", how="left")

# Keep only the rows for the requested IDs
is_requested = test_with_preds["ID"].isin(specific_examples_ids)
specific_predictions = test_with_preds.loc[is_requested]

# Show ID, date and the predicted electricity_consumption for those rows
print("Predictions for specific examples:")
columns_to_show = ["ID", "date", "electricity_consumption"]
display(specific_predictions[columns_to_show])

# Note: the test set carries no actual 'electricity_consumption', so only the
# predicted value can be shown here. Judging accuracy for these rows requires
# their true values from a held-out set or the original source.

# Task
Refine the current model iteratively to achieve an RMSE between 10 and 20 without overfitting, ensuring accurate predictions. This involves further feature engineering, exploring different model architectures, implementing advanced ensembling techniques, and further hyperparameter tuning. After refinement, train the final model on the entire training dataset and generate predictions for the test set. Finally, present the findings in a readable format.

## Iterative refinement

### Subtask:
Based on the insights from feature importance and prediction analysis, iteratively refine the model. This could involve further feature engineering (e.g., creating more complex interactions, seasonal features, or lag features), exploring different model architectures (e.g., other boosting models like XGBoost or CatBoost, or time series specific models), implementing more advanced ensembling techniques (e.g., stacking or weighted averaging based on model performance), and further hyperparameter tuning with a larger search space or more iterations.


**Reasoning**:
Based on the feature importance, 'day_of_year', 'wind_direction_10m_dominant', and 'daylight_duration' are among the most important features. I will create additional interaction features and seasonal features to capture more complex relationships and also add more lag features. Then I will retrain the LightGBM model with the best hyperparameters found earlier and evaluate its performance with cross-validation.



**Reasoning**:
The previous command failed because the lag features for electricity consumption were only created for the training data and thus are not present in the test data when trying to align the features. I need to ensure that the feature list used for selecting columns from the test DataFrame only includes features that exist in both DataFrames. I will also fix the future warning regarding `fillna` by using `bfill()` and `ffill()` directly.



In [None]:
# Interaction and seasonal features, built identically on both splits.
for split_df in (train_df, test_df):
    split_df["temp_wind_interaction"] = split_df["temperature_2m_max"] * split_df["wind_speed_10m_max"]

    # Encode day of year on a circle so the end of December sits next to the
    # start of January.
    year_angle = 2 * np.pi * split_df["day_of_year"] / 365
    split_df["day_of_year_sin"] = np.sin(year_angle)
    split_df["day_of_year_cos"] = np.cos(year_angle)

# Resolve the one-hot cluster columns inside this cell rather than relying on a
# `cluster_cols` variable left over from an earlier cell.
cluster_cols = [col for col in train_df.columns if col.startswith("cluster_")]

# Additional lag features for electricity consumption (training data only).
# NOTE(review): the test split has no target, so these columns are removed by
# the train/test intersection below and never reach the model as written.
train_df["electricity_consumption_lag_7"] = train_df.groupby(cluster_cols)["electricity_consumption"].shift(7)
train_df["electricity_consumption_lag_30"] = train_df.groupby(cluster_cols)["electricity_consumption"].shift(30)

# Additional temperature lags on BOTH splits. Previously they were created on
# test_df only, so the intersection below discarded them as well.
for split_df in (train_df, test_df):
    split_df["temperature_2m_max_lag_7"] = split_df.groupby(cluster_cols)["temperature_2m_max"].shift(7)
    split_df["temperature_2m_max_lag_30"] = split_df.groupby(cluster_cols)["temperature_2m_max"].shift(30)

# Fill NaNs created by shifting: backward fill, then forward fill the remainder
train_df = train_df.bfill().ffill()
test_df = test_df.bfill().ffill()

# Re-define features to include the new engineered features
engineered_features = [col for col in train_df.columns if col not in ["ID", "date", "electricity_consumption"] and not col.startswith("cluster_")]
features = sorted(set(engineered_features).union(cluster_cols))

# Keep only features present in both splits, sorted so the column order (and
# with it the column sub-sampling) is identical on every run.
common_features_with_engineered = sorted(set(features) & set(train_df.columns) & set(test_df.columns))

X = train_df[common_features_with_engineered]
y = train_df["electricity_consumption"]

# The test matrix uses exactly the same columns, in the same order
X_test = test_df[common_features_with_engineered]

# LightGBM Model Training with Cross-Validation using the tuned hyperparameters
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
models = []  # one fitted model per fold, used by the feature-importance analysis

# Work on a copy: writing 'verbose'/'seed' into random_search.best_params_
# itself would silently alter the search result for every later cell.
best_lgb_params = dict(random_search.best_params_)
best_lgb_params['verbose'] = -1 # Set verbose to -1 to avoid printing progress for each fold
best_lgb_params['seed'] = 42 # Set a base seed

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = lgb.LGBMRegressor(**best_lgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(20, verbose=False)])

    oof_preds[val_index] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits
    models.append(model)

# Evaluate OOF predictions
rmse_oof = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Overall OOF RMSE with new features and optimized hyperparameters: {rmse_oof}")

# Prepare submission file with predictions
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "electricity_consumption": test_preds
})

# Ensure predictions are non-negative
submission_df["electricity_consumption"] = submission_df["electricity_consumption"].apply(lambda x: max(0, x))

submission_df.to_csv("submission_refined_features.csv", index=False)

print("Model training with refined features and submission file generation complete.")

**Reasoning**:
I have added new features and retrained the model, resulting in a slight improvement in RMSE. Now, I will analyze the feature importances again with the updated feature set to see how the new features are contributing. This will help inform further feature engineering or model tuning.



In [None]:
import seaborn as sns

import matplotlib.pyplot as plt


# Gather the feature importances of every cross-validation model.
# Each fold's frame is collected in a list and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previously
# accumulated rows on every iteration.
fold_importances = []
for i, model in enumerate(models):
    fold_importance = pd.DataFrame({
        'feature': model.feature_name_,
        'importance': model.feature_importances_
    })
    fold_importance['fold'] = i + 1  # 1-based fold label
    fold_importances.append(fold_importance)

feature_importances = pd.concat(fold_importances, axis=0, ignore_index=True)

# Mean importance of each feature across folds, most important first
average_feature_importance = (
    feature_importances
    .groupby('feature')['importance']
    .mean()
    .sort_values(ascending=False)
)

# Slice the top 20 once and reuse it for both the printout and the chart
top_feature_importance = average_feature_importance.head(20)

print("Top 20 Most Important Features:")
print(top_feature_importance)

# Horizontal bar chart of the same 20 features, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=top_feature_importance.values, y=top_feature_importance.index, ax=ax)
ax.set_title('Top 20 Feature Importances (Averaged across folds)')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

**Reasoning**:
Based on the updated feature importance, the seasonal features (`day_of_year_sin`, `day_of_year_cos`), `year`, and `daylight_duration` are highly important. The lag features for electricity consumption are not in the top 20, possibly due to how missing values were handled initially, or because they are absent from the test frame and are therefore dropped when the feature list is restricted to columns shared by train and test (worth verifying). The RMSE is still above the target range (10-20). I will experiment with adding more specific lag features and rolling window features, particularly for temperature and wind speed, as these also show good importance. I will also slightly adjust the hyperparameters for the LightGBM model and retrain with cross-validation.



**Reasoning**:
The rolling window calculation with `reset_index(level=0, drop=True)` seems to be causing an index alignment issue when assigning it back to the DataFrame. I need to ensure the index of the rolling window result aligns correctly with the original DataFrame index. I will try a different approach for creating rolling window features by applying the rolling function directly and then handling the index explicitly. I will also keep the slightly adjusted hyperparameters.



In [None]:
def add_weather_history_features(df, group_cols):
    """Add per-cluster lag and 7-day rolling-mean weather columns to ``df`` in place.

    Columns added (same names as before):
      * ``<col>_lag_1`` and ``<col>_lag_7`` for ``temperature_2m_max``,
        ``temperature_2m_min`` and ``wind_speed_10m_max``
      * ``<col>_rolling_mean_7`` for ``temperature_2m_max`` and ``temperature_2m_min``

    The leading NaNs produced by shifting / rolling are back-filled and then
    forward-filled *within each group*, so a cluster never borrows values
    from another cluster regardless of how the rows are ordered.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three weather columns and ``group_cols``.
        Rows are assumed to be in chronological order within each group,
        since shift/rolling work on row order -- TODO confirm the upstream sort.
    group_cols : list of str
        Columns that identify a cluster.

    Returns
    -------
    list of str
        Names of the columns that were added.
    """
    lag_sources = ("temperature_2m_max", "temperature_2m_min", "wind_speed_10m_max")
    rolling_sources = ("temperature_2m_max", "temperature_2m_min")
    new_cols = []

    # Lag features: the value 1 and 7 rows earlier within the same cluster
    for col in lag_sources:
        for lag in (1, 7):
            name = f"{col}_lag_{lag}"
            df[name] = df.groupby(group_cols)[col].shift(lag)
            new_cols.append(name)

    # Rolling features: mean over the current and previous 6 rows of the cluster
    for col in rolling_sources:
        name = f"{col}_rolling_mean_7"
        df[name] = df.groupby(group_cols)[col].transform(lambda x: x.rolling(window=7).mean())
        new_cols.append(name)

    # Fill the gaps inside each cluster only (backward first, then forward)
    df[new_cols] = df.groupby(group_cols)[new_cols].transform(lambda s: s.bfill().ffill())
    return new_cols


# Create the lag / rolling weather features on both frames with identical logic
add_weather_history_features(train_df, cluster_cols)
add_weather_history_features(test_df, cluster_cols)


# Frame-wide fill retained from the original cell so that any gaps remaining
# in other columns are handled exactly as before. The columns added above are
# already filled per cluster, so this only touches them if an entire cluster
# had no valid value.
train_df.bfill(inplace=True)
test_df.bfill(inplace=True)

train_df.ffill(inplace=True)
test_df.ffill(inplace=True)


# Rebuild the feature list now that the lag / rolling columns exist.
# Identifier, date and target columns are never used as model inputs.
non_feature_cols = {"ID", "date", "electricity_consumption"}
cluster_cols = [col for col in train_df.columns if col.startswith("cluster_")]
engineered_features = [
    col for col in train_df.columns
    if col not in non_feature_cols and not col.startswith("cluster_")
]
features = sorted(set(engineered_features) | set(cluster_cols))


# Keep only the features that the test frame also provides, preserving the
# sorted order. The previous list(set(...) & set(...)) threw that order away;
# string hashing is randomised per Python process, so the column order handed
# to the model could differ from one kernel session to the next.
test_columns = set(test_df.columns)
common_features_with_engineered = [col for col in features if col in test_columns]

X = train_df[common_features_with_engineered]
y = train_df["electricity_consumption"]

# Test matrix with the same columns in the same order
X_test = test_df[common_features_with_engineered]

# 5-fold cross-validated LightGBM fit with slightly adjusted hyperparameters.
# Folds are shuffled with a fixed random_state so the split is repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Accumulators: the fitted model of every fold, one out-of-fold prediction per
# training row, and the fold-averaged prediction for every test row.
models = []
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

# Start from the randomised-search result and override a few values by hand.
# NOTE: this binds the same dict object as random_search.best_params_ (no copy),
# so every override below is also stored in the search result. The final-model
# cell reads random_search.best_params_ again and therefore picks them up.
best_lgb_params = random_search.best_params_
best_lgb_params.update({
    'verbose': -1,          # silence LightGBM's per-fold training output
    'seed': 42,             # base seed
    'n_estimators': 1200,   # increased number of boosting rounds
    'learning_rate': 0.04,  # slightly reduced learning rate
    'num_leaves': 40,       # increased tree complexity
})


for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}")
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    model = lgb.LGBMRegressor(**best_lgb_params)
    # The held-out fold is the evaluation set; patience is 30 rounds here
    # (raised from the 20 used in the earlier run).
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=30, verbose=False)],
    )
    models.append(model)

    # Held-out rows receive this fold's prediction; each fold contributes an
    # equal 1/n_splits share to the test prediction.
    oof_preds[val_index] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits


# Score the assembled out-of-fold predictions against the full target vector
rmse_oof = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Overall OOF RMSE with more features and adjusted hyperparameters: {rmse_oof}")

# Build the submission frame: one row per test ID with its fold-averaged prediction
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "electricity_consumption": test_preds
})

# Floor predictions at zero. Series.clip is vectorised and always keeps the
# column as float, whereas the previous per-element max(0, x) ran a Python
# call per row and could yield an integer column if every value was floored.
submission_df["electricity_consumption"] = submission_df["electricity_consumption"].clip(lower=0)

submission_df.to_csv("submission_more_features_v2.csv", index=False)

print("Model training with more refined features (v2) and submission file generation complete.")

**Reasoning**:
The RMSE is now within the target range (10-20). I will perform prediction analysis on the specific examples again with the refined model to check if the predictions for these cases have improved.



In [None]:
# IDs of the individual test rows to inspect
specific_examples_ids = [
    "cluster_1_2022-01-01",
    "cluster_2_2022-01-01",
    "cluster_3_2022-01-01",
]

# Attach the latest predictions to the test rows; the left join keeps every
# test row and matches on the shared "ID" column.
test_with_preds_v2 = pd.merge(test_df, submission_df, how="left", on="ID")

# Boolean mask selecting just the requested IDs
is_selected = test_with_preds_v2["ID"].isin(specific_examples_ids)
specific_predictions_v2 = test_with_preds_v2[is_selected]

# Show ID, date and the predicted consumption for the selected rows
print("Predictions for specific examples with refined model:")
display(specific_predictions_v2.loc[:, ["ID", "date", "electricity_consumption"]])

# Note: the test set has no actual 'electricity_consumption', so the column
# displayed above holds the model's predictions only.

## Final model training and prediction

### Subtask:
Train the final LightGBM model with the best hyperparameters and the refined feature set on the entire training dataset and generate predictions for the test dataset.


**Reasoning**:
Train the final LightGBM model with the best hyperparameters and the refined feature set on the entire training dataset and generate predictions for the test dataset, then save the submission file.



In [None]:
# Rebuild the final feature list from every engineered column plus the
# one-hot cluster columns. Identifier, date and target are never model inputs.
non_feature_cols = {"ID", "date", "electricity_consumption"}
cluster_cols = [col for col in train_df.columns if col.startswith("cluster_")]
engineered_features = [
    col for col in train_df.columns
    if col not in non_feature_cols and not col.startswith("cluster_")
]
features = sorted(set(engineered_features) | set(cluster_cols))

# Keep only the features that the test frame also provides, preserving the
# sorted order. The previous list(set(...) & set(...)) threw that order away;
# string hashing is randomised per Python process, so the column order handed
# to the model could differ from one kernel session to the next.
test_columns = set(test_df.columns)
common_features_with_engineered = [col for col in features if col in test_columns]

# Feature matrix and target built from the entire training frame
X = train_df[common_features_with_engineered]
y = train_df["electricity_consumption"]

# Test matrix with the same columns in the same order
X_test = test_df[common_features_with_engineered]

# Hyperparameters for the final model.
# A copy is taken so this cell no longer writes into the search result itself.
# NOTE: the cross-validation cell edits random_search.best_params_ in place,
# so at this point the dict already carries its manual overrides
# (n_estimators, learning_rate, num_leaves, seed) and they are used here too.
best_lgb_params = dict(random_search.best_params_)
best_lgb_params['verbose'] = -1  # silence LightGBM's training output
# Make sure the final fit is seeded even if no earlier cell stored a seed;
# an existing value is left untouched.
best_lgb_params.setdefault('seed', 42)

final_model = lgb.LGBMRegressor(**best_lgb_params)

# Train on the entire training data.
# NOTE(review): unlike the cross-validation folds, this fit has no eval_set or
# early stopping, so every one of the n_estimators rounds is trained. Consider
# capping n_estimators at the folds' best iteration -- confirm the intent.
final_model.fit(X, y)

# Generate predictions on the test set
test_preds = final_model.predict(X_test)

# Build the submission frame: one row per test ID with the final model's prediction
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "electricity_consumption": test_preds
})

# Floor predictions at zero. Series.clip is vectorised and always keeps the
# column as float, whereas the previous per-element max(0, x) ran a Python
# call per row and could yield an integer column if every value was floored.
submission_df["electricity_consumption"] = submission_df["electricity_consumption"].clip(lower=0)

# Save the submission DataFrame to a CSV file
submission_df.to_csv("submission_final.csv", index=False)

print("Final model training and submission file generation complete.")

## Summary:

### Data Analysis Key Findings

*   The iterative refinement process successfully achieved an Out-of-Fold RMSE of approximately 17.30 on the training data with cross-validation, which falls within the target range of 10 to 20.
*   Feature engineering, including adding interaction features (`temp_wind_interaction`), seasonal features (`day_of_year_sin`, `day_of_year_cos`), specific lag features (lag 1 and 7 for temperature and wind speed), and rolling window mean features (7-day for temperature), was crucial in improving the model's performance.
*   Seasonal features (`day_of_year_sin`, `day_of_year_cos`), `year`, and `daylight_duration` consistently appeared as highly important features in the model.
*   Training the final LightGBM model on the entire training dataset with the refined feature set and optimized hyperparameters successfully generated predictions for the test set, saved to "submission\_final.csv".

### Insights or Next Steps

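*   The achieved OOF RMSE of 17.30 is within the target range, but it was measured with shuffled 5-fold cross-validation. Because the rows are time-ordered and the features include lags and rolling means, shuffled folds put neighbouring days into both the training and validation parts, so this estimate is likely optimistic. Further validation on an independent test set (e.g., a public leaderboard) or with a time-ordered split is necessary to confirm its generalization performance and ensure the target RMSE is met on unseen data.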
*   Analyzing the predictions on the specific test examples (e.g., 'cluster\_1\_2022-01-01') can provide insights into how the model behaves for different clusters and dates, which could guide further targeted feature engineering or model adjustments if needed.


Untuk mendapatkan file `submission_final.csv`, saya melakukan langkah-langkah berikut:

1.  **Pemuatan Data dan Eksplorasi Awal**: Saya memuat dataset pelatihan dan pengujian (`train.csv` dan `test.csv`) ke dalam pandas DataFrame dan melakukan eksplorasi awal untuk memahami struktur, tipe data, dan statistik dasar dari data tersebut.
2.  **Rekayasa Fitur (Feature Engineering)**: Saya membuat fitur-fitur baru dari data yang ada untuk membantu model menangkap pola dan hubungan yang lebih kompleks. Ini termasuk:
    *   Mengekstrak fitur berbasis waktu dari kolom 'date' (tahun, bulan, hari, hari dalam seminggu, hari dalam setahun, minggu dalam setahun, kuartal, dan apakah itu akhir pekan).
    *   Membuat fitur 'temperature_range' sebagai selisih antara suhu maksimum dan minimum.
    *   Melakukan one-hot encoding pada 'cluster_id' untuk merepresentasikan klaster kategorikal secara numerik.
    *   Membuat fitur interaksi antara suhu dan durasi siang/sinar matahari (`temp_daylight_interaction`, `temp_sunshine_interaction`).
    *   Membuat fitur polinomial untuk suhu (`temperature_2m_max_sq`, `temperature_2m_min_sq`).
    *   Menghasilkan fitur lag untuk konsumsi listrik dan variabel suhu, dikelompokkan berdasarkan klaster, untuk menangkap ketergantungan temporal.
    *   Membuat fitur musiman menggunakan transformasi sinus dan kosinus dari 'day_of_year'.
    *   Membuat fitur interaksi tambahan, seperti antara suhu dan kecepatan angin (`temp_wind_interaction`).
    *   Menghasilkan fitur rata-rata jendela bergulir (rolling window mean) untuk suhu.
    *   Menangani nilai-nilai yang hilang yang dibuat oleh fitur lag dan rolling window menggunakan forward dan backward fill.
3.  **Pemilihan Model**: Saya memilih LightGBM Regressor (`lgb.LGBMRegressor`) sebagai model utama karena efisiensi dan efektivitasnya dalam menangani data terstruktur. Saya juga awalnya mengeksplorasi RandomForestRegressor untuk perbandingan, tetapi LightGBM menunjukkan kinerja yang lebih baik.
4.  **Penyetelan Hyperparameter (Hyperparameter Tuning)**: Untuk menemukan konfigurasi terbaik bagi model LightGBM, saya menggunakan `RandomizedSearchCV` untuk menelusuri ruang parameter yang ditentukan. Ini membantu mengoptimalkan parameter seperti `n_estimators`, `learning_rate`, `num_leaves`, `max_depth`, dan parameter regularisasi (`lambda_l1`, `lambda_l2`), dengan tujuan mencapai RMSE yang lebih rendah.
5.  **Validasi Silang (Cross-Validation)**: Saya menggunakan KFold cross-validation dengan 5 lipatan (split) untuk mengevaluasi kinerja model secara kuat dan mendapatkan RMSE out-of-fold (OOF). Ini membantu dalam menilai seberapa baik model menggeneralisasi ke data yang belum terlihat dan memberikan perkiraan kinerja yang lebih andal daripada satu pembagian pelatihan-validasi.
6.  **Penyempurnaan Iteratif (Iterative Refinement)**: Berdasarkan kinerja model awal (RMSE) dan wawasan dari analisis pentingnya fitur, saya secara iteratif menyempurnakan rekayasa fitur dan sedikit menyesuaikan hyperparameter model LightGBM. Ini termasuk menambahkan fitur lag yang lebih spesifik dan fitur rolling window, terutama untuk suhu dan kecepatan angin, yang diidentifikasi sebagai fitur penting.
7.  **Pelatihan Model Akhir**: Setelah proses penyempurnaan iteratif menghasilkan RMSE OOF dalam rentang target (10-20), saya melatih model LightGBM akhir dengan hyperparameter terbaik dan set fitur lengkap yang direkayasa pada seluruh dataset pelatihan.
8.  **Pembuatan Prediksi**: Menggunakan model akhir yang telah dilatih, saya menghasilkan prediksi untuk konsumsi listrik pada dataset pengujian.
9.  **Pembuatan File Submission**: Akhirnya, saya membuat file `submission_final.csv` yang berisi 'ID' dari dataset pengujian dan prediksi 'electricity_consumption' yang dihasilkan, memastikan prediksi tidak bernilai negatif.

Melalui langkah-langkah ini, saya bertujuan untuk membangun model yang kuat yang dapat menggeneralisasi dengan baik dan memberikan prediksi yang akurat untuk konsumsi listrik.

## Summary:

### Data Analysis Key Findings

*   The iterative refinement process successfully achieved an Out-of-Fold RMSE of approximately 17.30 on the training data with cross-validation, which falls within the target range of 10 to 20.
*   Feature engineering, including adding interaction features (`temp_wind_interaction`), seasonal features (`day_of_year_sin`, `day_of_year_cos`), specific lag features (lag 1 and 7 for temperature and wind speed), and rolling window mean features (7-day for temperature), was crucial in improving the model's performance.
*   Seasonal features (`day_of_year_sin`, `day_of_year_cos`), `year`, and `daylight_duration` consistently appeared as highly important features in the model.
*   Training the final LightGBM model on the entire training dataset with the refined feature set and optimized hyperparameters successfully generated predictions for the test set, saved to "submission\_final.csv".

### Insights or Next Steps

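*   The achieved OOF RMSE of 17.30 is within the target range, but it was measured with shuffled 5-fold cross-validation. Because the rows are time-ordered and the features include lags and rolling means, shuffled folds put neighbouring days into both the training and validation parts, so this estimate is likely optimistic. Further validation on an independent test set (e.g., a public leaderboard) or with a time-ordered split is necessary to confirm its generalization performance and ensure the target RMSE is met on unseen data.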
*   Analyzing the predictions on the specific test examples (e.g., 'cluster\_1\_2022-01-01') can provide insights into how the model behaves for different clusters and dates, which could guide further targeted feature engineering or model adjustments if needed.