In [4]:
pip install pandas numpy scikit-learn lightgbm catboost xgboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

file_path = "train.csv"
df = pd.read_csv(file_path)

if 'day' in df.columns and df['day'].dtype == 'object':
    df['day'] = df['day'].fillna('Unknown')
    label_encoder = LabelEncoder()
    df['day'] = label_encoder.fit_transform(df['day'])

columns_with_nan = df.columns[df.isnull().any()].tolist()

for column in columns_with_nan:
    missing_rows = df[column].isnull()

    features = df.drop(columns=[column, "uid"])
    target = df[column]

    X_known = features[~missing_rows]
    y_known = target[~missing_rows]
    X_missing = features[missing_rows]

    if not X_missing.empty:
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X_known, y_known)

        df.loc[missing_rows, column] = rf.predict(X_missing)

X = df.drop(columns=["uid", "output_electricity_generation"])
y = df["output_electricity_generation"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results.append([name, mae, rmse, r2])

results_df = pd.DataFrame(results, columns=["Model", "MAE", "RMSE", "R2 Score"])
print(results_df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2807
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 12
[LightGBM] [Info] Start training from score 832.300201
               Model       MAE      RMSE  R2 Score
0      Random Forest  0.394059  0.798361  0.999980
1           LightGBM  1.039938  1.456125  0.999932
2           AdaBoost  7.456409  9.016389  0.997402
3  Gradient Boosting  1.793214  2.556098  0.999791
4           CatBoost  0.954017  1.662795  0.999912
5            XGBoost  0.738227  1.076142  0.999963
