In [1]:
from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import openpyxl
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.neural_network import MLPRegressor
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv('./datasets/train2.csv')
df = pd.read_csv('./datasets/test2.csv')
dataset.drop(columns="time", inplace = True)
df.drop(columns="time", inplace = True)
X_train = dataset.iloc[:, :-1].values
y_train = dataset.iloc[:, -1].values
X_test = df.iloc[:, :].values
X_train.shape,y_train.shape,X_test.shape

((17520, 25), (17520,), (4368, 25))

In [3]:
dataset.columns

Index(['temperature_2m (?E?EC)', 'relative_humidity_2m (%)',
       'dew_point_2m (?E?EC)', 'surface_pressure (hPa)', 'cloud_cover (%)',
       'cloud_cover_low (%)', 'cloud_cover_mid (%)', 'cloud_cover_high (%)',
       'vapour_pressure_deficit (kPa)', 'wind_speed_10m (km/h)',
       'wind_speed_100m (km/h)', 'wind_gusts_10m (km/h)',
       'soil_temperature_0_to_7cm (?E?EC)',
       'soil_temperature_100_to_255cm (?E?EC)',
       'soil_moisture_0_to_7cm (m?/m?)', 'soil_moisture_7_to_28cm (m?/m?)',
       'soil_moisture_28_to_100cm (m?/m?)',
       'soil_moisture_100_to_255cm (m?/m?)', 'shortwave_radiation (W/m?)',
       'direct_radiation (W/m?)', 'diffuse_radiation (W/m?)',
       'direct_normal_irradiance (W/m?)', 'global_tilted_irradiance (W/m?)',
       'terrestrial_radiation (W/m?)', 'is_day ()', 'precipitation (mm)'],
      dtype='object')

In [4]:
df.columns

Index(['temperature_2m (?E?EC)', 'relative_humidity_2m (%)',
       'dew_point_2m (?E?EC)', 'surface_pressure (hPa)', 'cloud_cover (%)',
       'cloud_cover_low (%)', 'cloud_cover_mid (%)', 'cloud_cover_high (%)',
       'vapour_pressure_deficit (kPa)', 'wind_speed_10m (km/h)',
       'wind_speed_100m (km/h)', 'wind_gusts_10m (km/h)',
       'soil_temperature_0_to_7cm (?E?EC)',
       'soil_temperature_100_to_255cm (?E?EC)',
       'soil_moisture_0_to_7cm (m?/m?)', 'soil_moisture_7_to_28cm (m?/m?)',
       'soil_moisture_28_to_100cm (m?/m?)',
       'soil_moisture_100_to_255cm (m?/m?)', 'shortwave_radiation (W/m?)',
       'direct_radiation (W/m?)', 'diffuse_radiation (W/m?)',
       'direct_normal_irradiance (W/m?)', 'global_tilted_irradiance (W/m?)',
       'terrestrial_radiation (W/m?)', 'is_day ()'],
      dtype='object')

In [5]:
k_fold = KFold(n_splits = 15, random_state = 11, shuffle = True)
def cv_rmse(model, X = X_train):
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring = "neg_mean_squared_error", cv = k_fold))
    return rmse

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

### XGBoost

In [6]:
xgb = make_pipeline(RobustScaler(),
                    XGBRegressor(colsample_bytree = 0.5, n_estimators = 6000,
                                 max_depth = 4, learning_rate = 0.01, gamma = 0.45,
                                 subsample = 0.5, random_state = 11, reg_alpha = 0.00006,
                                 reg_lambda = None, nthread = -1))

In [7]:
# get CV score of the xgb model
score = cv_rmse(xgb)
print("Xgboost model's cross validation score: ", score.mean())

Xgboost model's cross validation score:  0.7481506382662773


### LightGBM

In [8]:
lgbm = make_pipeline(RobustScaler(),
                     LGBMRegressor(num_leaves = 6, bagging_fraction = 0.7,
                                   bagging_freq = 4, min_sum_hessian_in_leaf = 11,
                                   learning_rate = 0.01, n_estimators = 7500, max_bin = 200,
                                   random_state = 11))

In [9]:
# get CV score of the lgbm model
score = cv_rmse(lgbm)
print("Light GBM model's cross validation score: ", score.mean())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4191
[LightGBM] [Info] Number of data points in the train set: 16352, number of used features: 25
[LightGBM] [Info] Start training from score 0.394527
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4191
[LightGBM] [Info] Number of data points in the train set: 16352, number of used features: 25
[LightGBM] [Info] Start training from score 0.389679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005818 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4193
[LightGBM] [Info] Number of data points in the train set

### Ridge

In [10]:
ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas = [1e-10, 1e-8, 1e-5, 1e-2, 9e-4,
                                                        5e-4, 3e-4, 1e-4, 1e-3, 1e-2, 0.1,
                                                        0.3, 0.6, 1, 3, 5, 7, 14, 18, 25, 30, 
                                                        45, 50, 70, 90], cv = k_fold))

In [11]:
# get CV score of the ridge model
score = cv_rmse(ridge)
print("Ridge model's cross validation score: ", score.mean())

Ridge model's cross validation score:  0.972790033192454


### MLP

In [12]:
# Define the pipeline
mlp = make_pipeline(
    RobustScaler(),
    MLPRegressor(random_state=42,hidden_layer_sizes= (200,200,200),
                activation='relu')
)

In [13]:
# get CV score of the MLP model
score = cv_rmse(mlp)
print("Ridge model's cross validation score: ", score.mean())

Ridge model's cross validation score:  0.870246723405638


### LASSO

In [14]:
lasso = make_pipeline(RobustScaler(),
                      LassoCV(alphas = [1e-10, 1e-8, 1e-5, 1e-2, 9e-4,
                                                        5e-4, 3e-4, 1e-4, 1e-3, 1e-2, 0.1,
                                                        0.3, 0.6, 1, 3, 5, 7, 14, 18, 25, 30,
                                                        45, 50, 70, 90], n_jobs = -1, cv = k_fold))

In [15]:
# get CV score of the lasso model
score = cv_rmse(lasso)
print("Lasso model's cross validation score: ", score.mean())

Lasso model's cross validation score:  0.972670303867549


### ElasticNet

In [16]:
elasticnet = make_pipeline(RobustScaler(),
                           ElasticNetCV(max_iter=1000, 
                                        alphas=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007], 
                                        cv=k_fold, 
                                        l1_ratio=[0.8, 0.85, 0.9, 0.95, 0.99, 1]))  

In [17]:
# get CV score of the lasso model
score = cv_rmse(elasticnet)
print("Elastic Net model's cross validation score: ", score.mean())

Elastic Net model's cross validation score:  0.9726087454752703


### Support Vector Regression

In [18]:
svr = make_pipeline(RobustScaler(), SVR(C = 30, gamma = 0.0002, epsilon = 0.009))

In [19]:
# get CV score of the svr model
score = cv_rmse(svr)
print("Support vector machines model's cross validation score: ", score.mean())

Support vector machines model's cross validation score:  1.0641357097015764


### Gradient Boosting

In [20]:
gbr = make_pipeline(RobustScaler(),
                    GradientBoostingRegressor(n_estimators = 700, learning_rate = 0.01,
                                              max_depth = 5, min_samples_split = 12, min_samples_leaf = 16,
                                              loss = "huber", max_features = "sqrt", random_state = 11))

In [21]:
# get CV score of the gbr model
score = cv_rmse(gbr)
print("Gradient boosting model's cross validation score: ", score.mean())

Gradient boosting model's cross validation score:  0.7852256224392368


### Random Forest

In [22]:
rf = make_pipeline(RobustScaler(),
                   RandomForestRegressor(n_estimators = 250, max_depth = 15,
                                         min_samples_split = 6, min_samples_leaf = 6,
                                         random_state = 11))

In [23]:
# get CV score of the rf model
score = cv_rmse(rf)
print("Random forest model's cross validation score: ", score.mean())

Random forest model's cross validation score:  0.7486683036262431


### Stacked Model

In [24]:
stacked = StackingCVRegressor(regressors = (xgb, lgbm, ridge, svr, lasso, elasticnet,rf,gbr,mlp),
                              meta_regressor = xgb, use_features_in_secondary = True)

stackedv2 = StackingCVRegressor(regressors = (xgb, lgbm, ridge, svr, lasso, elasticnet,rf,gbr,mlp),
                              meta_regressor = mlp, use_features_in_secondary = True)

stackedv3 = StackingCVRegressor(regressors = (xgb, lgbm, ridge, svr, lasso, elasticnet,rf,gbr,mlp),
                              meta_regressor = lgbm, use_features_in_secondary = True)

## Fit all model

In [25]:
mlp_model = mlp.fit(X_train, y_train)

#RMSLE score of the gbr model on full train data
mlp_score = rmsle(y_train, mlp_model.predict(X_train))
print("RMSLE score of MLP model on full data:", mlp_score)

RMSLE score of MLP model on full data: 0.31043696880703747


In [26]:
gbr_model = gbr.fit(X_train, y_train)

#RMSLE score of the gbr model on full train data
gbr_score = rmsle(y_train, gbr_model.predict(X_train))
print("RMSLE score of xgboost model on full data:", gbr_score)

RMSLE score of xgboost model on full data: 0.6962047565733002


In [27]:
rf_model = rf.fit(X_train, y_train)

#RMSLE score of the rf model on full train data
rf_score = rmsle(y_train, rf_model.predict(X_train))
print("RMSLE score of random forest model on full data:", rf_score)

RMSLE score of random forest model on full data: 0.5294056549041162


In [28]:
en_model = elasticnet.fit(X_train, y_train)

#RMSLE score of the gbr model on full train data
en_score = rmsle(y_train, en_model.predict(X_train))
print("RMSLE score of xgboost model on full data:", en_score)

RMSLE score of xgboost model on full data: 0.9780321649324113


In [29]:
svr_model = svr.fit(X_train, y_train)

#RMSLE score of the svr model on full train data
svr_score = rmsle(y_train, svr_model.predict(X_train))
print("RMSLE score of svr model on full data:", svr_score)

RMSLE score of svr model on full data: 1.072289805196111


In [30]:
xgb_model = xgb.fit(X_train, y_train)

#RMSLE score of the xgb model on full train data
xgb_score = rmsle(y_train, xgb_model.predict(X_train))
print("RMSLE score of xgboost model on full data:", xgb_score)

RMSLE score of xgboost model on full data: 0.3539533233461878


In [31]:
lgbm_model = lgbm.fit(X_train, y_train)

#RMSLE score of the lgbm model on full train data
lgbm_score = rmsle(y_train, lgbm_model.predict(X_train))
print("RMSLE score of lgbm model on full data:", lgbm_score)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4197
[LightGBM] [Info] Number of data points in the train set: 17520, number of used features: 25
[LightGBM] [Info] Start training from score 0.391207
RMSLE score of lgbm model on full data: 0.440737411531584


In [32]:
ridge_model = ridge.fit(X_train, y_train)

#RMSLE score of the ridge model on full train data
ridge_score = rmsle(y_train, ridge_model.predict(X_train))
print("RMSLE score of ridge model on full data:", ridge_score)

RMSLE score of ridge model on full data: 0.978121407411212


In [33]:
lasso_model = lasso.fit(X_train, y_train)

#RMSLE score of the lasso model on full train data
lasso_score = rmsle(y_train, lasso_model.predict(X_train))
print("RMSLE score of lasso model on full data:", lasso_score)

RMSLE score of lasso model on full data: 0.9780462325310353


In [41]:
stacked_model = stacked.fit(np.array(X_train), np.array(y_train))

#RMSLE score of the stacked model on full train data
stacked_score = rmsle(y_train, stacked_model.predict(X_train))
print("RMSLE score of stacked models on full data:", stacked_score)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4182
[LightGBM] [Info] Number of data points in the train set: 14016, number of used features: 25
[LightGBM] [Info] Start training from score 0.391290
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4174
[LightGBM] [Info] Number of data points in the train set: 14016, number of used features: 25
[LightGBM] [Info] Start training from score 0.387375
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4187
[LightGBM] [Info] Number of data points in the train set

In [42]:
stacked_model2 = stackedv2.fit(np.array(X_train), np.array(y_train))

#RMSLE score of the stacked model on full train data
stacked_score = rmsle(y_train, stacked_model2.predict(X_train))
print("RMSLE score of stacked models on full data:", stacked_score)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4171
[LightGBM] [Info] Number of data points in the train set: 14016, number of used features: 25
[LightGBM] [Info] Start training from score 0.393494
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000892 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4173
[LightGBM] [Info] Number of data points in the train set: 14016, number of used features: 25
[LightGBM] [Info] Start training from score 0.394385
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4190
[LightGBM] [Info] Number of data points in the train set

In [43]:
stacked_model3 = stackedv3.fit(np.array(X_train), np.array(y_train))

#RMSLE score of the stacked model on full train data
stacked_score = rmsle(y_train, stacked_model3.predict(X_train))
print("RMSLE score of stacked models on full data:", stacked_score)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4177
[LightGBM] [Info] Number of data points in the train set: 14016, number of used features: 25
[LightGBM] [Info] Start training from score 0.399497
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4180
[LightGBM] [Info] Number of data points in the train set: 14016, number of used features: 25
[LightGBM] [Info] Start training from score 0.390652
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4190
[LightGBM] [Info] Number of data points in the train set

In [44]:
y_pred = stacked_model.predict(X_test)
y_pred2 = stacked_model2.predict(X_test)
y_pred3 = stacked_model3.predict(X_test)



In [50]:
df_sub = pd.read_csv("./datasets/sub.csv")
df_sub["precipitation (mm)"] = np.array(y_pred)
df_sub["electricity_shutdown"] = df_sub["electricity_shutdown"].fillna(0)
df_sub["electricity_shutdown"] = df_sub["electricity_shutdown"].astype(int)

In [51]:
df_sub.to_csv('./datasets/STACK2.csv',index=False)

In [None]:
# df_last = pd.read_csv('analysis.csv')
# df_last = df_last.tail(2208)
# df_last["Pred"] = y_pred2
# df_test = pd.read_csv("dataset/sample_submission.csv")
# a = df_test.copy()
# # Ensure both Timestamp columns are in datetime format
# df_last['Timestamp'] = pd.to_datetime(df_last['Timestamp'])
# df_test['Timestamp'] = pd.to_datetime(df_test['Timestamp'], format='%b %d, %Y %I%p')

# # Filter df_last based on the Timestamps in df_test
# filtered_df_last = df_last[df_last['Timestamp'].isin(df_test['Timestamp'])]

# # Display the filtered dataframe
# pred = filtered_df_last['Pred']
# a['% Baseline'] = list(pred)
# a

In [None]:
# a.to_csv('submit/stack_newv2.csv',index=False)

In [None]:
# df_last = pd.read_csv('analysis.csv')
# df_last = df_last.tail(2208)
# df_last["Pred"] = y_pred3
# df_test = pd.read_csv("dataset/sample_submission.csv")
# a = df_test.copy()
# # Ensure both Timestamp columns are in datetime format
# df_last['Timestamp'] = pd.to_datetime(df_last['Timestamp'])
# df_test['Timestamp'] = pd.to_datetime(df_test['Timestamp'], format='%b %d, %Y %I%p')

# # Filter df_last based on the Timestamps in df_test
# filtered_df_last = df_last[df_last['Timestamp'].isin(df_test['Timestamp'])]

# # Display the filtered dataframe
# pred = filtered_df_last['Pred']
# a['% Baseline'] = list(pred)
# a

In [None]:
# a.to_csv('submit/stack_newv3.csv',index=False)

In [None]:
# import joblib

# # Save the models to disk
# joblib.dump(stacked, 'stacked_metaxgb_model.pkl')
# joblib.dump(stackedv2, 'stackedv2_metamlp_model.pkl')
# joblib.dump(stackedv3, 'stackedv3_metalgb_model.pkl')

In [None]:
# def blend_models_predict(X):
#     return ((0.1 * gbr_model.predict(X)) + \
#             (0.125 * rf_model.predict(X)) + \
#             (0.05 * en_model.predict(X)) + \
#             (0.05 * svr_model.predict(X)) + \
#             (0.1 * xgb_model.predict(X)) + \
#             (0.15 * lgbm_model.predict(X)) + \
#             (0.05 * ridge_model.predict(X)) + \
#             (0.05 * lasso_model.predict(X))+ \
#             (0.125 * mlp_model.predict(X))+ \
#             (0.2 * stacked_model.predict(X)))

In [None]:
# blend_score = rmsle(y_train, blend_models_predict(X_train))
# print("RMSLE score of stacked models on full data:", stacked_score)

In [None]:
# y_pred4 = blend_models_predict(X_test)
# df_test2 = pd.read_csv("dataset/sample_submission.csv")
# df_test2["% Baseline"] = y_pred2

In [None]:
# df_test2.to_csv('submit/stack_newv5.csv',index=False)