# Import/Install Libraries

In [None]:
!pip install feature-engine
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings("ignore")
from scipy import stats
from sklearn.model_selection import (train_test_split,cross_val_score,
                                     GridSearchCV,KFold,learning_curve,
                                     cross_validate,RandomizedSearchCV,
                                    TimeSeriesSplit)
from IPython.display import display
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots
import shap
import missingno as msno
import statsmodels.api as sm
from lightgbm import LGBMRegressor,plot_importance,plot_metric,plot_tree
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import (MinMaxScaler,OneHotEncoder,
                                   RobustScaler,LabelEncoder)

from feature_engine.encoding import (MeanEncoder,CountFrequencyEncoder,
                                    OneHotEncoder,OrdinalEncoder)

from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import lightgbm as lgb
from eli5.sklearn import PermutationImportance
from lightgbm.plotting import create_tree_digraph,plot_split_value_histogram
import shap
from pdpbox import pdp, get_dataset, info_plots
import os

# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)


In [None]:
# Read the two CSV inputs: the energy generation/price table and the weather
# features table (the latter has a city_name column that is filtered on later).
energy_data=pd.read_csv("/kaggle/input/energy-consumption-generation-prices-and-weather/energy_dataset.csv")
weather_data=pd.read_csv("/kaggle/input/energy-consumption-generation-prices-and-weather/weather_features.csv")

# Exploratory Data Analysis and Data Cleaning

In [None]:
weather_data.head()

In [None]:
weather_data.city_name.value_counts()

In [None]:
energy_data.head()

In [None]:
energy_data.shape

In [None]:
energy_data.time.min(),energy_data.time.max()

In [None]:
weather_data.query("city_name == 'Valencia' ").head()

In [None]:
def miss_frame(data):
    """Summarise the columns of ``data`` that contain missing values.

    Returns a DataFrame indexed by column name with three columns:
    ``missing_no`` (number of nulls), ``missing_rate`` (nulls divided by the
    number of rows) and ``bigger_005`` (1 when the rate is at least 0.05,
    otherwise 0). Only columns with a rate above zero are kept, ordered from
    the highest to the lowest rate.
    """
    null_counts=data.isnull().sum()
    summary=pd.DataFrame({"missing_no":null_counts,
                          "missing_rate":null_counts/len(data)})

    has_missing=summary["missing_rate"]>0
    summary=summary.loc[has_missing].sort_values(by="missing_rate",
                                                 ascending=False)

    summary["bigger_005"]=summary["missing_rate"].map(lambda rate:int(rate>=0.05))
    return summary

miss_frame(energy_data)

In [None]:
energy_data.eq(0).sum().to_frame(name="Zero No")

In [None]:
# Columns removed from the energy table before further analysis (presumably the
# ones the missing-value / zero-count checks above showed to carry no
# information -- verify against those outputs).
# NOTE: "eday" is kept exactly as in the original code; it presumably mirrors
# the header spelling in the CSV.
columns_to_drop=["generation fossil coal-derived gas",
                 "generation fossil oil shale",
                 "generation fossil peat",
                 "generation geothermal",
                 "generation marine",
                 "generation wind offshore",
                 "generation hydro pumped storage aggregated",
                 "forecast wind offshore eday ahead"]

# Reassign instead of using inplace=True, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
energy_data=energy_data.drop(columns=columns_to_drop,errors="ignore")

In [None]:
miss_frame(energy_data)

In [None]:
msno.bar(energy_data,figsize=(12,5),
         inline=True,
         sort="descending",
        fontsize=12);

In [None]:
msno.dendrogram(df=energy_data,figsize=(12,5),fontsize=12);

In [None]:
msno.heatmap(df=energy_data);

In [None]:
weather_val=weather_data.query("city_name == 'Valencia'")

weather_val.shape, energy_data.shape

In [None]:
weather_val[weather_val.duplicated(subset="dt_iso",
                                   keep="first")].shape

In [None]:
weather_val=weather_val.rename({"dt_iso":"time"},axis=1)

In [None]:
print(weather_val.time.min(),weather_val.time.max())
print(energy_data.time.min(),energy_data.time.max())

In [None]:
# Keep only the first weather row per timestamp so that the later merge on
# "time" cannot match one energy row to several weather rows.
weather_val=weather_val.drop_duplicates(subset="time",
                                        keep="first")
weather_val.shape

In [None]:
# Join the energy rows with the Valencia weather rows on the shared "time"
# column (pd.merge defaults to an inner join), then check the resulting shape.
full_df=pd.merge(energy_data,weather_val,on="time")
full_df.shape

In [None]:
full_df.head()

In [None]:
full_df.dtypes.to_frame().T

In [None]:
# Split the raw "time" string once into its date part and its clock part
# (assumes values look like "<date> <clock>+<utc offset>" -- TODO confirm).
time_parts = full_df["time"].str.split(" ",n=1,expand=True)
dates = time_parts[0]
times = time_parts[1]

full_df["date"] = dates
# Everything after "+" (the UTC offset) is discarded, so the parsed timestamps
# are naive wall-clock values.
full_df["hours"] = times.str.split("+",n=1,expand=True)[0]

# No explicit format: the previous '%Y-%m-%d %H:%M' does not describe a clock
# part that carries seconds, which strict parsing in recent pandas rejects.
# Letting pandas detect the ISO layout works for both "HH:MM" and "HH:MM:SS".
full_df["full_time"]=pd.to_datetime(full_df["date"] + " " + full_df["hours"])

full_df["full_time"].head()

In [None]:
full_df.head()

In [None]:
def extract_from_date(data,time_column):
    """Add calendar features derived from a datetime column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    time_column : str
        Name of a datetime64 column of ``data``.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the added columns year, quarter, month, week, hour,
        time, dayofweek, dayofmonth, day_name, is_weekend and is_year_start.

    Notes
    -----
    An existing "time" column is overwritten with ``datetime.time`` values.
    """
    stamps=data[time_column]

    data["year"]=stamps.dt.year
    data["quarter"]=stamps.dt.quarter
    data["month"]=stamps.dt.month
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the same ISO week number. It comes back as a
    # nullable UInt32, so cast to the dtype the old accessor produced
    # (int64, or float64 when missing timestamps are present).
    iso_week=stamps.dt.isocalendar().week
    data["week"]=iso_week.astype("float64" if iso_week.isna().any() else "int64")
    data["hour"]=stamps.dt.hour
    data["time"]=stamps.dt.time
    data['dayofweek'] = stamps.dt.dayofweek
    data["dayofmonth"]=stamps.dt.day
    data["day_name"]=stamps.dt.day_name()
    data['is_weekend'] = np.where(data['day_name'].isin(['Sunday', 'Saturday']), 1,0)
    data["is_year_start"]=stamps.dt.is_year_start

    return data

full_df=extract_from_date(full_df,"full_time")

full_df.sample(5)

In [None]:
full_df.dtypes.to_frame().T

In [None]:
# December 2018 is held out as the test period; all other rows form the
# training set.
is_holdout=(full_df["year"]==2018) & (full_df["month"]==12)

train=full_df.loc[~is_holdout]

test=full_df.loc[is_holdout]

In [None]:
# Actual price over the training period together with two rolling means.
fig = make_subplots()

price_train=train["price actual"]

# go.Line is a deprecated alias; go.Scatter with mode="lines" is the supported
# way to draw the same line trace.
fig.add_trace(
    go.Scatter(x=train["full_time"],y=price_train,
               mode="lines",
               name="price actual"))

# Roll over the price Series only. Rolling the whole frame averages every
# column just to keep one, and fails on non-numeric columns in recent pandas.
for window in (24,168):
    fig.add_trace(
        go.Scatter(x=train["full_time"],
                   y=price_train.rolling(window=window).mean(),
                   mode="lines",
                   name=f"rolling window={window}h"))

fig.update_xaxes(rangeslider_visible=True)

fig.show()

In [None]:
# Absolute correlation of every numeric/bool column with the actual price,
# top 20. The numeric columns are selected explicitly because recent pandas no
# longer drops non-numeric columns silently inside corr().
train.select_dtypes(include=["number","bool"]).corr().abs()["price actual"].nlargest(20).to_frame(name="corr")

In [None]:
plt.figure(figsize=(20,10));
# Absolute correlations among the first 21 columns. Non-numeric columns are
# filtered out explicitly because recent pandas raises on them in corr().
sns.heatmap(train.iloc[:,:21].select_dtypes(include=["number","bool"]).corr().abs(),
            annot=True,
            cmap="coolwarm");

In [None]:
plt.figure(figsize=(20,10));
# Absolute correlations among the columns from position 20 onwards. Non-numeric
# columns are filtered out explicitly because recent pandas raises on them in
# corr().
sns.heatmap(train.iloc[:,20:].select_dtypes(include=["number","bool"]).corr().abs(),
            annot=True,
            cmap="coolwarm");

In [None]:
fig = ff.create_distplot([train["price actual"]],
                         group_labels=["distplot"])
fig.show()

fig=px.box(train["price actual"],
           orientation="h",
           title="Actual Price Boxplot")
fig.show()
#fig.update_traces(marker=dict(line=dict(width=2)))

In [None]:
train.isnull().sum().to_frame().T

In [None]:
train.iloc[:,:21].describe().T

In [None]:
train.describe(include="O").T

In [None]:
def diagnostic_plots(df, variable):
    """Show a histogram, a normal Q-Q plot and a boxplot of ``df[variable]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is passed straight to scipy's ``probplot``, so the
        caller is expected to have removed missing values.
    variable : str
        Name of the column to plot.

    The three panels are drawn side by side in one figure, which is shown
    immediately; nothing is returned.
    """

    plt.figure(figsize=(17, 5))

    plt.subplot(1, 3, 1)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the replacement seaborn documents for it.
    sns.histplot(df[variable], kde=True, stat="density")
    plt.title('Histogram')

    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    # Label the axis with the plotted column; it used to be the hard-coded
    # 'RM quantiles', which names a column this data does not have.
    plt.ylabel(f'{variable} quantiles')

    plt.subplot(1, 3, 3)
    sns.boxplot(x=df[variable])
    plt.title('Boxplot')

    plt.show()
    
    
# Drop incomplete rows once instead of on every iteration, then draw the
# diagnostic plots for the first 20 non-object columns.
train_complete=train.dropna()
for col in train.select_dtypes(exclude="O").columns[:20].to_list():
    diagnostic_plots(train_complete,col)

In [None]:
# 2018 training rows: actual price next to temperature and two generation
# series on one interactive chart.
fig = make_subplots()

# Filter the 2018 rows once instead of once per x and y argument.
train_2018=train.loc[train["year"]==2018]

# (column, legend label) pairs. go.Line is a deprecated alias, so the traces
# are built with go.Scatter in line mode.
traces_2018=[("price actual","price actual"),
             ("temp","Temp"),
             ("generation waste","generation waste"),
             ("generation fossil oil","generation fossil oil")]

for column,label in traces_2018:
    fig.add_trace(
        go.Scatter(x=train_2018["full_time"],
                   y=train_2018[column],
                   mode="lines",
                   name=label))

fig.update_xaxes(rangeslider_visible=True)

fig.show()

In [None]:
columns_to_plot=train.select_dtypes(exclude="O").columns[:20]

# matplotlib 3.6 renamed the bundled seaborn styles ('seaborn-whitegrid' became
# 'seaborn-v0_8-whitegrid') and later dropped the old names; use whichever one
# this installation provides.
whitegrid_style=("seaborn-v0_8-whitegrid"
                 if "seaborn-v0_8-whitegrid" in plt.style.available
                 else "seaborn-whitegrid")

# Each span runs from Saturday 00:00 to Monday 00:00 so that both weekend days
# are shaded. Ending the span on the Sunday date stopped it at Sunday 00:00,
# i.e. only Saturday was shaded.
weekend_spans=[("2018-11-03","2018-11-05"),
               ("2018-11-10","2018-11-12"),
               ("2018-11-17","2018-11-19"),
               ("2018-11-24","2018-11-26")]

# Last 720 training rows, labelled "2018-11" in the plots
# (assumes hourly rows, 720 = 30 days x 24 -- TODO confirm).
recent=train.tail(720)

for col in columns_to_plot:

    with plt.style.context(whitegrid_style):

        plt.figure(figsize=(20,5));
        plt.plot(recent["full_time"],
                 recent[col],color="purple");

        plt.axhline(recent[col].mean(),
                    color='black',
                    linestyle='-',
                    linewidth=1,
                    label="Mean for 2018-11"); # mean of the plotted rows

        plt.axhline(train[col].mean(),
                    color='red',
                    linestyle='--',
                    linewidth=1,
                    label="Mean"); # mean over the whole training set

        for span_start,span_end in weekend_spans:
            plt.axvspan(span_start,span_end,color="gray") # weekends

        plt.xlabel("Date")
        plt.ylabel(col)
        plt.title(f"{col} on 2018-11")
        plt.legend()
        plt.show();

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles ('seaborn-whitegrid' became
# 'seaborn-v0_8-whitegrid') and later dropped the old names; use whichever one
# this installation provides.
whitegrid_style=("seaborn-v0_8-whitegrid"
                 if "seaborn-v0_8-whitegrid" in plt.style.available
                 else "seaborn-whitegrid")

# Saturday 00:00 to Monday 00:00, so both weekend days are shaded (a span that
# ends on the Sunday date stops at Sunday 00:00 and covers Saturday only).
weekend_spans=[("2018-11-03","2018-11-05"),
               ("2018-11-10","2018-11-12"),
               ("2018-11-17","2018-11-19"),
               ("2018-11-24","2018-11-26")]

with plt.style.context(whitegrid_style):
    fig, ax = plt.subplots(figsize=(25,8))

    # Last 720 training rows, labelled "2018-11"
    # (assumes hourly rows, 720 = 30 days x 24 -- TODO confirm).
    recent=train.tail(720)

    ax.set_title('Price day ahead and price actual on 2018-11')
    ax = recent.plot.line(x="full_time",y=['price day ahead',"price actual"],ax=ax)
    ax.set_xlabel("date")
    for span_start,span_end in weekend_spans:
        ax.axvspan(span_start,span_end,color="gray") # weekends

    # Means over the plotted rows only.
    ax.axhline(recent["price day ahead"].mean(),
               color='black',
               linestyle='-.',
               linewidth=1,
               label="Price day ahead mean on 2018-11");

    ax.axhline(recent["price actual"].mean(),
               color='blue',
               linestyle='-.',
               linewidth=1,
               label="Price actual mean on 2018-11");

    # Means over the whole training set; the labels no longer claim
    # "on 2018-11", which these two lines are not restricted to.
    ax.axhline(train["price actual"].mean(),
               color='red',
               linestyle='-',
               linewidth=1,
               label="Price actual general mean");

    ax.axhline(train["price day ahead"].mean(),
               color='green',
               linestyle='-',
               linewidth=1,
               label="Price day ahead general mean");
    plt.legend()

    

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles ('seaborn-whitegrid' became
# 'seaborn-v0_8-whitegrid') and later dropped the old names; use whichever one
# this installation provides.
whitegrid_style=("seaborn-v0_8-whitegrid"
                 if "seaborn-v0_8-whitegrid" in plt.style.available
                 else "seaborn-whitegrid")

# Temperature and three generation series over the last 24 training rows.
with plt.style.context(whitegrid_style):
    fig, ax = plt.subplots(figsize=(25,8))

    ax.set_title('Weather Cond. on 2018-11-30')
    ax = train.tail(24).plot.line(x="full_time",
                                  y=["temp","generation waste",
                                     "generation fossil oil",
                                     "generation biomass"],
                                  ax=ax)
 

In [None]:
def plot_actual_price(price_col,other_col,train):
    """Bar chart of the mean of ``price_col`` for each value of ``other_col``.

    A solid red line marks the overall mean of ``price_col`` in ``train``;
    two dotted red lines mark one standard deviation above and below it.
    The figure is shown immediately and nothing is returned.
    """
    prices=train[price_col]
    overall_mean=prices.mean()
    spread=prices.std()

    group_means=train.groupby([other_col])[price_col].mean()
    group_means.plot.bar(figsize=(20,6),color="cornflowerblue");

    # (height, line style) of the three horizontal reference lines.
    reference_lines=[(overall_mean,'-'),
                     (overall_mean + spread,':'),
                     (overall_mean - spread,':')]
    for level,line_style in reference_lines:
        plt.axhline(level,linestyle=line_style,linewidth=2,color="red");

    plt.title(f"{price_col}&{other_col} BarPlot");
    plt.show()


In [None]:
for col in ["year","hour","month","weather_description"]:
    plot_actual_price("price actual",col,train)

In [None]:
fig = px.scatter(train, x="total load actual", y="price actual", trendline="ols")
fig.show()
results = px.get_trendline_results(fig)
results.px_fit_results.iloc[0].summary()

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles ('seaborn-whitegrid' became
# 'seaborn-v0_8-whitegrid') and later dropped the old names; use whichever one
# this installation provides.
whitegrid_style=("seaborn-v0_8-whitegrid"
                 if "seaborn-v0_8-whitegrid" in plt.style.available
                 else "seaborn-whitegrid")

with plt.style.context(whitegrid_style):
    plt.figure(figsize=(20,5));
    # expanding().mean() is the running average of all rows up to and
    # including the current one.
    plt.plot(train["full_time"],train["price actual"].expanding().mean())

In [None]:
missing_df=train[train.isna().any(axis=1)]
missing_df.head()

In [None]:
miss_cols=miss_frame(train).index.to_list()
miss_index=missing_df.index.to_list()
miss_cols

In [None]:
# Work on copies so the original train/test frames stay untouched.
train_new,test_new=train.copy(),test.copy()

# For every column that has missing values in train, add a "<column>_NA" flag
# (1 where the value is missing, 0 otherwise) to both splits.
for var in miss_cols:
    flag_name=var+'_NA'
    train_new[flag_name]=train_new[var].isnull().astype(int)
    test_new[flag_name]=test_new[var].isnull().astype(int)

train_new.head()

In [None]:
# Fill gaps by linear interpolation along the rows. Only the numeric columns are
# passed to interpolate(): the frame still holds object columns at this point,
# and interpolating a frame that contains them is deprecated in recent pandas.
numeric_cols=train_new.select_dtypes(include="number").columns
train_new[numeric_cols]=train_new[numeric_cols].interpolate(method='linear', limit_direction='forward', axis=0)

In [None]:
display(train["price actual"].describe().to_frame().T)
display(train_new["price actual"].describe().to_frame().T)

In [None]:
# miss_index holds index labels (taken from missing_df.index), so select with
# .loc. .iloc only gave the same rows while labels happened to equal positions.
train_new.loc[miss_index].head()

In [None]:
display(train["weather_main"].value_counts(normalize=True).to_frame().T)
display(train["weather_icon"].value_counts(normalize=True).to_frame().T)
display(train["weather_id"].value_counts(normalize=True).to_frame().T)

In [None]:
# Columns removed from both splits before modelling: the raw and derived
# timestamp columns, the city label, day_name, the weather_id/weather_icon
# codes and is_year_start.
unused_columns=["time","city_name","date","hours","full_time","day_name",
                "weather_id","weather_icon","is_year_start"]

train_new=train_new.drop(unused_columns,axis=1)
test_new=test_new.drop(unused_columns,axis=1)

In [None]:
train_new.shape, test_new.shape

In [None]:
train_new.head()

In [None]:
# Separate the target column from the features in both splits.
target_col="price actual"

X_train,y_train=train_new.drop(columns=[target_col]),train_new[target_col]

X_test,y_test=test_new.drop(columns=[target_col]),test_new[target_col]

In [None]:
# Integer-encode the two categorical weather columns with feature-engine's
# OrdinalEncoder. The encoder is fitted on the training split only (together
# with y_train, which the 'ordered' method needs) and then applied to both
# splits.
encoder = OrdinalEncoder(encoding_method='ordered', variables=['weather_main',"weather_description"])
encoder.fit(X_train, y_train)

X_train_t = encoder.transform(X_train)
X_test_t = encoder.transform(X_test)
# Display the learned category -> integer mapping.
encoder.encoder_dict_

In [None]:
X_train_t.head()

In [None]:
def MAPE(y_actual,y_pred):
    """Mean absolute percentage error of ``y_pred`` against ``y_actual``.

    Each error is divided by the corresponding actual value; the mean of the
    absolute ratios is returned multiplied by 100.
    """
    relative_error=(y_actual - y_pred)/y_actual
    return np.mean(np.abs(relative_error))*100

def result_df(y_actual,y_pred):
    """Return a one-row DataFrame of regression metrics.

    Columns, in order: r2, MSE, RMSE (square root of MSE), MAE and MAPE.
    ``y_actual`` must be passed first and ``y_pred`` second.
    """
    mse=mean_squared_error(y_actual,y_pred)

    metrics={"r2":r2_score(y_actual,y_pred),
             "MSE":mse,
             "RMSE":np.sqrt(mse),
             "MAE":mean_absolute_error(y_actual,y_pred),
             "MAPE":MAPE(y_actual,y_pred)}

    return pd.DataFrame([metrics])

# Baseline Model

In [None]:
def run_lgb(train_X, train_y, val_X, val_y):
    """Train the baseline LightGBM regressor and predict on ``val_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and target, wrapped in an ``lgb.Dataset``.
    val_X
        Features to predict on after training.
    val_y
        Kept for call-site compatibility; not used, because no validation set
        is passed to ``lgb.train``.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the predictions for ``val_X`` and the
        trained booster.
    """
    params = {
        "boosting_type":"dart",
        "objective": "regression",
        "metric": ["mape","rmse"],
        "num_iterations":30,
        "reg_lambda":100,
        "learning_rate":0.09
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    # The validation Dataset and the evals_result dict that used to be built
    # here were never handed to lgb.train, so they have been removed.
    model = lgb.train(params, lgtrain)

    pred_test_y = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, model

y_pred,model=run_lgb(X_train_t,y_train,X_test_t,y_test)

In [None]:
model.lower_bound(),model.upper_bound()

In [None]:
model.trees_to_dataframe().head(5)

In [None]:
pred_train=model.predict(X_train_t)
result_df(y_train,pred_train)

In [None]:
result_df(y_test,y_pred)

In [None]:
pred_df=pd.DataFrame({"actual":y_test,"pred":y_pred}).set_index(test["full_time"])

pred_df[["actual","pred"]].plot(figsize=(20,5));

In [None]:
#fig = go.Figure()
#fig.add_trace(go.Scatter(y=pred_df.actual,
 #                   mode='lines',
  #                  name='test'))
#fig.add_trace(go.Scatter(y=pred_df.pred,
 #                   mode='lines',
  #                  name='pred'))
#fig.update_xaxes(rangeslider_visible=True)
#fig.update_layout(title='Prediction Result',
 #                  xaxis_title='Date',
  #                 yaxis_title='Production')
#fig.show()

In [None]:
#plot_metric(evals_result,dpi=100);
#print("Best Iteration :", model.best_iteration)

In [None]:
plot_tree(model,dpi=500,tree_index=0);

In [None]:
plot_importance(booster=model,
                height=0.8,
                figsize=(15,10),
                dpi=100);

In [None]:
#model=LGBMRegressor(random_state=0,
 #                   num_iterations=21
  #                  ).fit(X_train_t,y_train)

#perm = PermutationImportance(model, random_state=0).fit(X_test_t,y_test)
#eli5.show_weights(perm, feature_names = X_test_t.columns.tolist(),top=20)

# TimeSeriesSplit Cross Validation 

In [None]:
params = {
    'boosting_type':'gbdt',
    'objective': 'regression',
    'metric': ['mape','rmse'],
}

tscv=TimeSeriesSplit(n_splits=5)

# Cross-validate on the encoded training features and their target. The fold
# variables below get their own names: this cell used to overwrite full_df,
# train, X_train, y_train, X_test, y_test and model, so it could not be re-run
# and left the notebook state pointing at fold data.
X=X_train_t
y=y_train
scores=[]

for fold_train_idx, fold_val_idx in tscv.split(X):
    # TimeSeriesSplit yields positional indices, so select with .iloc.
    X_fold_train, X_fold_val = X.iloc[fold_train_idx], X.iloc[fold_val_idx]
    y_fold_train, y_fold_val = y.iloc[fold_train_idx], y.iloc[fold_val_idx]

    fold_train = lgb.Dataset(X_fold_train,y_fold_train)
    fold_val = lgb.Dataset(X_fold_val,y_fold_val,reference=fold_train)

    # Early stopping and logging are passed as callbacks; the
    # early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train in LightGBM 4 (log_evaluation needs LightGBM >= 3.3).
    cv_model = lgb.train(params=params,train_set=fold_train,valid_sets=[fold_val],
                         num_boost_round=1000,
                         callbacks=[lgb.early_stopping(stopping_rounds=50),
                                    lgb.log_evaluation(period=10)])

    # Store plain dicts so the printed summary is readable.
    fold_score={name: dict(metrics) for name, metrics in cv_model.best_score.items()}
    scores.append(fold_score)

print("Best Scores: ",scores)

# Feature Engineering and New Model

In [None]:
full_data=pd.concat([train_new,test_new])  
full_data.head()

In [None]:
full_data.shape

In [None]:
#full_data["price(t-1)"]=full_data["price actual"].shift(1)
#full_data["price(t-2)"]=full_data["price actual"].shift(2)
#full_data["price(t-3)"]=full_data["price actual"].shift(3)
#full_data["price(t-4)"]=full_data["price actual"].shift(4)
#full_data["price(t-5)"]=full_data["price actual"].shift(5)
#full_data["price(t-6)"]=full_data["price actual"].shift(6)
#full_data["price(t-12)"]=full_data["price actual"].shift(12)

price=full_data["price actual"]

# Lagged prices. The shift now always matches the column name: previously
# "price(t-24)" was built from shift(36) and "price(t-36)" from shift(48).
# bfill() fills the leading rows that have no history yet with the first
# available value.
for lag in (12,24,36,48,72,168):
    full_data[f"price(t-{lag})"]=price.shift(lag).bfill()

full_data["diff_forecast"]=full_data["total load actual"] - full_data["total load forecast"]

# Rolling statistics are computed on the price shifted by one row, so every
# window ends at the previous row. A window that includes the current row
# contains the value being predicted, which leaks the target into the features.
past_price=price.shift(1)

for window in (4,6,12,24,48):
    full_data[f"roll_price_{window}"]=past_price.rolling(window=window).mean().bfill()

full_data["roll_price_24_min"]=past_price.rolling(window=24).min().bfill()
full_data["roll_price_12_min"]=past_price.rolling(window=12).min().bfill()
full_data["roll_price_24_max"]=past_price.rolling(window=24).max().bfill()
full_data["roll_price_12_max"]=past_price.rolling(window=12).max().bfill()

In [None]:
full_data.shape

In [None]:
full_data.tail()

In [None]:
columns_to_use=["generation biomass",
                "generation fossil brown coal/lignite",
                "generation fossil gas",
               "generation fossil hard coal",
                "generation fossil oil",
                "generation hydro pumped storage consumption",
                "generation hydro run-of-river and poundage",
                "generation hydro water reservoir","generation nuclear",
                "generation other renewable",
                "generation waste",
                "forecast solar day ahead",
                "price day ahead",
                "forecast wind onshore day ahead",
                "total load actual",
                "temp_min",
                "pressure",
                "humidity",
                "wind_speed",
                "wind_deg",
                "rain_1h",
                "rain_3h",
                "snow_3h",
                "clouds_all",
                "weather_main",
                "weather_description",
                "hour",
                "dayofweek",
                "dayofmonth",
                "is_weekend",
                "price(t-12)",
                  'price(t-168)',
                'diff_forecast',
                "week",
                "roll_price_24_min",
                "price(t-72)",
                'roll_price_6',

                "price actual"
               ]
    

In [None]:
# full_data was built as pd.concat([train_new,test_new]), so its first
# len(train_new) rows are the training rows. Deriving the boundary from the data
# replaces the hard-coded row count 34320.
n_train=len(train_new)

train_=full_data[columns_to_use].iloc[:n_train,:]
test_=full_data[columns_to_use].iloc[n_train:,:]

In [None]:
plt.figure(figsize=(25,15));
sns.heatmap(train_.corr().abs(),annot=True,cmap="coolwarm");

In [None]:
# Separate the target from the engineered features in both splits.
X_train=train_.drop(["price actual"],axis=1)
y_train=train_["price actual"]

X_test=test_.drop(["price actual"],axis=1)
y_test=test_["price actual"]


# Integer-encode the two categorical weather columns, this time with the
# 'arbitrary' method. The encoder is fitted on the training split only.
encoder = OrdinalEncoder(encoding_method='arbitrary', variables=['weather_main',"weather_description"])
# Alternative left by the author: one-hot encoding of the same columns.
#encoder=OneHotEncoder(variables=['weather_main',"weather_description"])
encoder.fit(X_train, y_train)

# Replace the raw feature frames with their encoded versions.
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [None]:
params = {
    'boosting_type':'gbdt',
    'objective': 'regression',
    'metric': ['mape','rmse'],
    # Number of boosting rounds. It is set only here: lgb.train gives a
    # num_iterations entry in params precedence over its num_boost_round
    # argument, so the num_boost_round=10 that used to be passed was ignored.
    "num_iterations":26,
    "feature_fraction":0.9,
    "reg_lambda":300,
    # NOTE(review): no bagging_freq is set, so this presumably has no effect --
    # confirm against the LightGBM parameter docs.
    "bagging_fraction":0.8,
    }
full_train=lgb.Dataset(X_train,y_train)
# verbose_eval was dropped from the call: it did nothing without a validation
# set and the keyword no longer exists in LightGBM 4.
model=lgb.train(params,full_train)

pred_train=model.predict(X_train)
pred_test=model.predict(X_test)

# result_df expects (y_actual, y_pred). The arguments used to be swapped, which
# changes r2 and MAPE because neither metric is symmetric.
display(result_df(y_train,pred_train))

display(result_df(y_test,pred_test))

In [None]:
pred_df=pd.DataFrame({"actual":y_test,"pred":pred_test})

pred_df[["actual","pred"]].plot(figsize=(25,8));

In [None]:
pred_df.head(10)

In [None]:
plot_importance(booster=model,
                height=0.8,
                figsize=(15,10),
                dpi=100);

In [None]:
create_tree_digraph(model)

In [None]:
lgb.plot_split_value_histogram(model,feature="hour",figsize=(12,5),dpi=100);

In [None]:
lgb.plot_split_value_histogram(model,feature="roll_price_6",figsize=(12,5),dpi=100);

In [None]:
shap.initjs()

In [None]:
explainer = shap.TreeExplainer(model)

shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test)

In [None]:
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
shap.force_plot(explainer.expected_value, shap_values, X_test)