### Attribute info:

- date time year-month-day hour:minute:second
- Appliances, energy use in Wh (target variable for prediction)
- lights, energy use of light fixtures in the house in Wh
- T1, Temperature in kitchen area, in Celsius
- RH_1, Humidity in kitchen area, in %
- T2, Temperature in living room area, in Celsius
- RH_2, Humidity in living room area, in %
- T3, Temperature in laundry room area, in Celsius
- RH_3, Humidity in laundry room area, in %
- T4, Temperature in office room, in Celsius
- RH_4, Humidity in office room, in %
- T5, Temperature in bathroom, in Celsius
- RH_5, Humidity in bathroom, in %
- T6, Temperature outside the building (north side), in Celsius
- RH_6, Humidity outside the building (north side), in %
- T7, Temperature in ironing room , in Celsius
- RH_7, Humidity in ironing room, in %
- T8, Temperature in teenager room 2, in Celsius
- RH_8, Humidity in teenager room 2, in %
- T9, Temperature in parents room, in Celsius
- RH_9, Humidity in parents room, in %
- T_out, Temperature outside (from Chievres weather station), in Celsius
- Pressure (from Chievres weather station), in mm Hg
- RH_out, Humidity outside (from Chievres weather station), in %
- Wind speed (from Chievres weather station), in m/s
- Visibility (from Chievres weather station), in km
- Tdewpoint (from Chievres weather station), in °C
- rv1, Random variable 1, nondimensional
- rv2, Random variable 2, nondimensional

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the raw measurements from the Kaggle input directory and preview the first rows
csv_path = '/kaggle/input/appliances-energy-prediction-data-set/energydata_complete.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Summary statistics for the numeric columns
df.describe()

# Downcasting

- Downcast to save memory
- Note that there are only numerical features

In [None]:
# Downcast in order to save memory
def _narrowest_dtype(series, candidates, limits, fallback):
    """Return the first dtype in ``candidates`` whose range covers ``series``.

    ``limits`` is ``np.iinfo`` or ``np.finfo``; ``fallback`` is returned when no
    candidate fits (this includes empty or all-NaN columns, whose min/max are NaN).
    """
    lowest, highest = series.min(), series.max()
    for candidate in candidates:
        bounds = limits(candidate)
        # inclusive bounds: a value equal to the dtype limit still fits
        if lowest >= bounds.min and highest <= bounds.max:
            return candidate
    return fallback


def downcast(df):
    """Shrink the columns of ``df`` in place and return it.

    * integer columns become the smallest of int8/int16/int32/int64 that holds
      their minimum and maximum;
    * float columns become float16 or float32 when their range allows it.
      Only the range is checked, so precision can be lost;
    * object/string columns are parsed as datetimes with the format
      ``'%d-%m-%Y %H:%M'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col, t in df.dtypes.items():
        type_name = str(t)
        if 'int' in type_name:
            target = _narrowest_dtype(df[col], (np.int8, np.int16, np.int32), np.iinfo, np.int64)
            df[col] = df[col].astype(target)
        elif 'float' in type_name:
            target = _narrowest_dtype(df[col], (np.float16, np.float32), np.finfo, np.float64)
            df[col] = df[col].astype(target)
        # np.object no longer exists in NumPy >= 1.24; compare against the builtin
        # and also accept pandas' dedicated string dtype
        elif t == object or pd.api.types.is_string_dtype(t):
            df[col] = pd.to_datetime(df[col], format='%d-%m-%Y %H:%M')
    return df
                
# Shrink the numeric dtypes and parse the date column (see downcast above)
df = downcast(df)

# EDA

In [None]:
# Explore on copies so that df itself is never altered by the EDA.
# edadfsmall keeps the first 1/8 of the rows, which makes short-term patterns easier to see:
# 19735 rows / 8 ~= 2467 rows, i.e. about 17 days at one row per 10 minutes.
edadf = df.copy()
edadfsmall = df.iloc[:2467].copy()

# Overview

In [None]:
# Plot every measured column over time. 'date' is the x axis, so it is left out of
# the y columns; this also keeps all plotted columns numeric.
value_columns = [column for column in edadf.columns if column != 'date']
fig = px.line(edadf, x='date', y=value_columns, title='All Features over time')
fig.show()

Here is an overview of the entire timeline and all variables.

- We can see that there are peaks of high appliance usage and periods of low appliance usage, probably indicating daytime and night time.

- Note that there are two large gaps in appliance usage. (potential outliers)
    - between 27-1-2016 and 30-1-2016
    - between 01-4-2016 and 03-04-2016
    
    
I will be using only two weeks of data for the next section of data exploration to better depict patterns and trends

# Appliances and Lights

In [None]:
# Two y axes: Appliances on the primary axis, lights on the secondary one
fig = make_subplots(specs=[[{"secondary_y": True}]])

for column, label, on_secondary in (('Appliances', "Appliances", False),
                                    ('lights', 'lights', True)):
    trace = go.Scatter(x=edadfsmall['date'], y=edadfsmall[column], name=label, mode='lines')
    fig.add_trace(trace, secondary_y=on_secondary)

fig.update_layout(title='Appliance and Light usage over two weeks', xaxis_title="Date")

# one axis title per side, coloured differently
fig.update_yaxes(title_text="Appliance Usage (in Wh)", secondary_y=False, color="blue")
fig.update_yaxes(title_text="Lights Usage (in Wh)", secondary_y=True, color="red")

fig.show()

We can see that there are peaks of high appliance usage and low appliance usage. It seems like this is following a night time and daytime routine.

Light usage matches well with appliance usage.

# Appliances and Temperatures

In [None]:
# Appliance usage (primary axis) against every temperature column (secondary axis)
fig = make_subplots(specs=[[{"secondary_y": True}]])

timeline = edadfsmall['date']
appliance_trace = go.Scatter(x=timeline, y=edadfsmall['Appliances'], name="Appliances", mode='lines')
fig.add_trace(appliance_trace, secondary_y=False)

# T1 ... T9 followed by the weather-station temperature T_out
tempcolumns = [f'T{n}' for n in range(1, 10)] + ['T_out']

for column in tempcolumns:
    temperature_trace = go.Scatter(x=timeline, y=edadfsmall[column], name=column, mode='lines')
    fig.add_trace(temperature_trace, secondary_y=True)

fig.update_layout(title='Appliance usage and Temperature over two weeks', xaxis_title="Date")

fig.update_yaxes(title_text="Appliance Usage (in Wh)", secondary_y=False, color="blue")
fig.update_yaxes(title_text="Temperature (in Celsius)", secondary_y=True)

fig.show()

When T6 and T_out are filtered out, we can see that the temperature of the rest of the rooms spike up when appliance usage peaks.

When Appliances, T6 and T_out are selected only, we can still see temperature spikes that very roughly follow along appliance usage.

# Appliances and Humidity

In [None]:
# Appliance usage (primary axis) against every humidity column (secondary axis)
fig = make_subplots(specs=[[{"secondary_y": True}]])

timeline = edadfsmall['date']
appliance_trace = go.Scatter(x=timeline, y=edadfsmall['Appliances'], name="Appliances", mode='lines')
fig.add_trace(appliance_trace, secondary_y=False)

# RH_1 ... RH_9 followed by the weather-station humidity RH_out
humiditycolumns = [f'RH_{n}' for n in range(1, 10)] + ['RH_out']

for column in humiditycolumns:
    humidity_trace = go.Scatter(x=timeline, y=edadfsmall[column], name=column, mode='lines')
    fig.add_trace(humidity_trace, secondary_y=True)

fig.update_layout(title='Appliance usage and Humidity over two weeks', xaxis_title="Date")

fig.update_yaxes(title_text="Appliance Usage (in Wh)", secondary_y=False, color="blue")
fig.update_yaxes(title_text="humidity (in %)", secondary_y=True)

fig.show()

- RH5 is the bathroom, so it makes sense that the humidity spikes sharply due to the water from showering/bathing.
- RH5 spikes with appliance usage, but not all appliance usage spikes with RH5.
- All other variables except RH_6 and RH_out (both are humidity outside) peak when appliance usage is low or not peaking.

# Appliances and Outside Variables

In [None]:
# Appliance usage (primary axis) against the weather-station variables (secondary axis)
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces

fig.add_trace(
    go.Scatter(x=edadfsmall['date'], y=edadfsmall['Appliances'],
               name="Appliances",
               mode='lines'),
    secondary_y=False,
)

# columns reported by the weather station; they use different units, hence the
# "Arbitrary values" axis title below
outsidecolumns = ['T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint']

for column in outsidecolumns:
    fig.add_trace(
        go.Scatter(x=edadfsmall['date'], y=edadfsmall[column],
                   name=column,
                   mode='lines'),
        secondary_y=True,
    )

fig.update_layout(
    title='Appliance and Outside Variables over two weeks',
    xaxis_title="Date")

fig.update_yaxes(title_text="Appliance Usage (in Wh)", secondary_y=False, color="blue")
fig.update_yaxes(title_text="Arbitrary values", secondary_y=True)

fig.show()

- Outside Temperature and Tdewpoint seem to correlate well
    - The small fluctuations in temperature and Tdewpoint represent daytime and night time. During the daytime, the temperature naturally rises with the sun, and so does Tdewpoint. We can assume that the residents of the home are active during the daytime, so we see a correlation between appliance usage and temperature or Tdewpoint.

- There seems to be no other clear correlation or trend between outside conditions from the weather station and Appliance usage.

# Appliances and Random Variables

In [None]:
# Appliance usage (primary axis) against the two random variables (secondary axis)
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces

fig.add_trace(
    go.Scatter(x=edadfsmall['date'], y=edadfsmall['Appliances'],
               name="Appliances",
               mode='lines'),
    secondary_y=False,
)

# the dataset's two nondimensional random variables
randomcolumns = ['rv1', 'rv2']

for column in randomcolumns:
    fig.add_trace(
        go.Scatter(x=edadfsmall['date'], y=edadfsmall[column],
                   name=column,
                   mode='lines'),
        secondary_y=True,
    )

fig.update_layout(
    title='Appliance usage and Random variables over two weeks',
    xaxis_title="Date")

fig.update_yaxes(title_text="Appliance Usage (in Wh)", secondary_y=False, color="blue")
fig.update_yaxes(title_text="arbitrary value", secondary_y=True)

fig.show()

Can't visually make out any patterns or trends

# Feature Engineering

In [None]:
# Scratch copy of df that the feature-engineering plots add helper columns to
fedf = df.copy()

### Is Daytime
- We can see that during the daytime, the residents are presumably active and awake and so appliance usage peaks during this time.

- choose daytime to be from 7:00am - 12:00am (roughly estimated visually on appliance usage)

In [None]:
# Shade the hours treated as daytime (hour 7 through hour 23) behind the appliance curve.
band_height = 1100  # equals the upper y-axis limit set below, so the band fills the plot
edadfsmall['daytime'] = edadfsmall['date'].dt.hour.between(7, 23) * band_height

fig = make_subplots()

usage_trace = go.Scatter(x=edadfsmall['date'], y=edadfsmall['Appliances'],
                         name="Appliances", mode='lines')
# mode='none' with fill='tozeroy' draws only the filled area, no line or markers
daytime_band = go.Scatter(x=edadfsmall['date'], y=edadfsmall['daytime'],
                          name='daytime (7am - midnight)', mode='none', fill='tozeroy')
for trace in (usage_trace, daytime_band):
    fig.add_trace(trace)

fig.update_layout(
    title='Appliance usage during daytime over two weeks',
    xaxis_title="Date",
    yaxis_range=(0, 1100)
)

fig.update_yaxes(title_text="Appliance Usage (in Wh)", color="blue")
fig.show()

We can see that there is almost no appliance usage during nighttime

In [None]:
# 1 for hours 7-23 (daytime), 0 for hours 0-6 (nighttime)
df['is_daytime'] = df['date'].dt.hour.between(7, 23).astype('int64')

### Weekday

In [None]:
# One shaded band per weekday drawn behind the full appliance series
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekdaysnum = list(range(7))  # pandas dayofweek numbering: Monday=0 ... Sunday=6

day_numbers = fedf['date'].dt.dayofweek
for day_name, day_number in zip(weekdays, weekdaysnum):
    # 1100 on rows that fall on this weekday, 0 elsewhere (1100 is the y-axis limit below)
    fedf[day_name] = (day_numbers == day_number) * 1100

fig = make_subplots()

fig.add_trace(go.Scatter(x=fedf['date'], y=fedf['Appliances'], name="Appliances", mode='lines'))

for day_name in weekdays:
    band = go.Scatter(x=fedf['date'], y=fedf[day_name], name=day_name, mode='none', fill='tozeroy')
    fig.add_trace(band)

fig.update_layout(
    title='Appliance per weekday overtime',
    xaxis_title="Date",
    yaxis_range=(0, 1100)
)
fig.update_yaxes(title_text="Appliance Usage (in Wh)", color="blue")

fig.show()

In [None]:
# Heatmap of total appliance usage for each (ISO week, weekday) pair

# weekday as its name: map the dayofweek number (Monday=0 ... Sunday=6) to the label
number_to_name = dict(zip(weekdaysnum, weekdays))
fedf['weekday'] = fedf['date'].dt.dayofweek.map(number_to_name)

# ISO week number of each row
fedf['week'] = fedf['date'].dt.isocalendar().week

# one row per (week, weekday) holding the summed usage
heatdf = fedf.groupby(['week', 'weekday'], as_index=False)['Appliances'].sum()

fig = make_subplots()

usage_heatmap = go.Heatmap(x=heatdf['week'], y=heatdf['weekday'], z=heatdf['Appliances'],
                           colorbar=dict(title='Appliance Usage (in Wh)'))
fig.add_trace(usage_heatmap)

fig.update_layout(
    title='Total Appliance usage per weekday',
    xaxis_title="Week # in the year",
    yaxis={'categoryarray': weekdays}  # passes the Monday..Sunday order for the y categories
)

fig.update_yaxes(title_text="Weekday")

fig.show()

We don't see any obvious trends or patterns for particular weekdays

In [None]:
# Day of the week as an integer: Monday=0 ... Sunday=6
df['weekday'] = df['date'].dt.dayofweek.astype('int64')

### Is Weekday

In [None]:
# Shaded bands for working days and for weekends behind the full appliance series
weekorweekend = ['weekday', 'weekend']
weeknums = [list(range(5)), [5, 6]]  # dayofweek values: Monday-Friday, then Saturday-Sunday

day_numbers = fedf['date'].dt.dayofweek
for label, members in zip(weekorweekend, weeknums):
    # 1100 on rows whose day belongs to the group, 0 elsewhere
    fedf[label] = day_numbers.isin(members) * 1100

fig = make_subplots()

fig.add_trace(go.Scatter(x=fedf['date'], y=fedf['Appliances'], name="Appliances", mode='lines'))

for label in weekorweekend:
    band = go.Scatter(x=fedf['date'], y=fedf[label], name=label, mode='none', fill='tozeroy')
    fig.add_trace(band)

fig.update_layout(
    title='Appliance usage per weekday and weekend overtime',
    xaxis_title="Date",
    yaxis_range=(0, 1100)
)
fig.update_yaxes(title_text="Appliance Usage (in Wh)", color="blue")
fig.show()

Hard to visually find clear patterns/trends from weekdays or weekends

In [None]:
# Heatmap of scaled appliance usage for working days versus weekends, per ISO week

# label each row as 'Weekday' (Monday-Friday) or 'Weekend' (Saturday-Sunday)
on_working_day = fedf['date'].dt.dayofweek.isin(weeknums[0])
fedf['is_weekday'] = np.where(on_working_day, 'Weekday', 'Weekend')

# ISO week number of each row
fedf['week'] = fedf['date'].dt.isocalendar().week

# Scale usage so that 5 working days and 2 weekend days can be compared:
# working-day rows are multiplied by 2/7 and weekend rows by 5/7.
# Built as a new float column in one step; writing floats into the integer
# Appliances dtype through .loc is rejected or warned about by newer pandas.
scale = np.where(on_working_day, 2 / 7, 5 / 7)
fedf['stdappliances'] = fedf['Appliances'] * scale

# one row per (week, is_weekday) holding the summed scaled usage
heatdf = pd.DataFrame(fedf.groupby(['week', 'is_weekday'])['stdappliances'].sum()).reset_index()

# Create figure
fig = make_subplots()

fig.add_trace(
    go.Heatmap(x=heatdf['week'], y=heatdf['is_weekday'], z=heatdf['stdappliances'],
               colorbar=dict(title='Appliance Usage (in Wh)'
               )))

fig.update_layout(
    title='Total Appliance usage per Weekend and Weekday',
    xaxis_title="Week # in the year"
)

fig.show()

Note that total appliance usages in weekend and weekday are standardized. 
- Weekday was multiplied by 2/7
- Weekend was multiplied by 5/7

Cannot see any clear patterns or trends from the heatmap

In [None]:
# 1 on Monday-Friday, 0 on Saturday/Sunday
df['is_weekday'] = df['date'].dt.dayofweek.isin(weeknums[0]).astype('int64')

### Distribution of Appliances (Target Variable)

In [None]:
# log2(x + 1) transform of the target; it is undone later with 2**y - 1
df['logappliances'] = np.log2(df['Appliances'].astype('float64') + 1)

# raw target on the left, transformed target on the right
fig, ax = plt.subplots(1,2, figsize=(16,6))

panels = (('Appliances', 20, 'Appliance Usage Distribution'),
          ('logappliances', 0.5, 'LogAppliance Usage Distribution'))
for axis, (column, width, heading) in zip(ax, panels):
    sns.histplot(x=column, data=df, binwidth=width, ax=axis)
    axis.set_title(heading)

plt.show()


Better with transformation, but visually still slightly skewed

# Mean encoding Weekday

Mean encoding replaces each category of a feature with the mean of the target variable for that category (the conditional mean of the target given the category). weekday is our only applicable categorical variable.

In [None]:
# Mean-encode weekday using rows from the training period only (day of year < 109, the
# same cut-off the train/validation/test split uses further down). Averaging over every
# row would leak validation and test targets into this feature.
train_period = df['date'].dt.day_of_year < 109
weekday_means = df.loc[train_period].groupby('weekday')['logappliances'].mean()
df['weekday_appliance_avg'] = df['weekday'].map(weekday_means).astype(np.float16)

### Lags

In [None]:
# Introduce lags
# Note every consecutive data point is a 10 min difference
# Lets choose 10min, 30min, 60(1 hour), 180(3 hours), 360(6 hours), 1440(1 day)
lags = [1,3,6,18,36,144]
for lag in lags:
    df['logappliances_lag_'+str(lag)] = df['logappliances'].shift(lag).astype(np.float16)

# The first max(lags) rows have at least one missing lag value, so drop them.
# .copy() makes df an independent frame, so the column assignments that follow
# do not operate on a slice (SettingWithCopyWarning).
df = df.iloc[max(lags):].copy()

### Time Period

In [None]:
# Split the timestamp into calendar parts, then drop the raw date column.
# day_of_year is only used later to split the rows into train/validation/test.
timestamps = df['date']
df = df.assign(
    month=timestamps.dt.month,
    day_of_month=timestamps.dt.day,
    hour=timestamps.dt.hour,
    minute=timestamps.dt.minute,
    day_of_year=timestamps.dt.day_of_year,
).drop(columns='date')

In [None]:
# Check the column dtypes and non-null counts after feature engineering
df.info()

# Modeling

In [None]:
# Modeling frame: the raw target is removed, logappliances takes its place
dfm = df.drop(columns='Appliances')

In [None]:
# Chronological split on day_of_year:
# Training:   day_of_year < 109
# Validation: 109 <= day_of_year < 123
# Test:       day_of_year >= 123
day_number = dfm['day_of_year']
train_rows = day_number < 109
valid_rows = (day_number >= 109) & (day_number < 123)
test_rows = day_number >= 123

# day_of_year only identifies the rows for the split, so it is not kept as a feature
features = dfm.drop(columns=['logappliances', 'day_of_year'])
target = dfm['logappliances']

X_train, y_train = features[train_rows], target[train_rows]
X_valid, y_valid = features[valid_rows], target[valid_rows]
X_test, y_test = features[test_rows], target[test_rows]

In [None]:
# Feature sets without the lag columns, used to measure what the lags contribute

logcolumns = ['logappliances_lag_' + str(lag) for lag in lags]

X_trainnolag = X_train.drop(columns=logcolumns)
X_validnolag = X_valid.drop(columns=logcolumns)
X_testnolag = X_test.drop(columns=logcolumns)

In [None]:
# Empty table that collects one row of scores per (feature set, model) pair

results = pd.DataFrame(columns=['feature', 'model', 'train rmse', 'valid rmse'])

def addresult(feature, model, trainrmse, validrmse):
    """Build one row for ``results``.

    The model is recorded by its class name; the other arguments are stored as given.
    Returns a dict keyed by the column names of ``results``.
    """
    keys = ('feature', 'model', 'train rmse', 'valid rmse')
    values = (feature, type(model).__name__, trainrmse, validrmse)
    return dict(zip(keys, values))

In [None]:
# Train + Validate

# Early stopping and log silencing go through callbacks: the `early_stopping_rounds`
# and `verbose` keywords are no longer accepted by fit() in LightGBM 4.
from lightgbm import early_stopping, log_evaluation


def _fit_lgbm(xtrain, xvalid):
    """Fit an LGBMRegressor on (xtrain, y_train), early-stopping on the validation RMSE.

    Returns the fitted model with its best training and validation RMSE.
    """
    model = LGBMRegressor()
    model.fit(xtrain, y_train,
              eval_set=[(xtrain, y_train), (xvalid, y_valid)],
              eval_names=['train', 'valid'],  # explicit keys for best_score_
              eval_metric='rmse',
              callbacks=[early_stopping(stopping_rounds=20, verbose=False),
                         log_evaluation(period=0)])
    scores = model.best_score_
    return model, scores['train']['rmse'], scores['valid']['rmse']


# DataFrame.append no longer exists in pandas 2, so rows are added with pd.concat
lbgm, trainrmse, validrmse = _fit_lgbm(X_train, X_valid)
results = pd.concat([results, pd.DataFrame([addresult('lag', lbgm, trainrmse, validrmse)])],
                    ignore_index=True)

lbgmnolag, trainrmse, validrmse = _fit_lgbm(X_trainnolag, X_validnolag)
results = pd.concat([results, pd.DataFrame([addresult('no lag', lbgmnolag, trainrmse, validrmse)])],
                    ignore_index=True)


# remaining models, evaluated in the cell's loop below
models = [SVR(), RandomForestRegressor(random_state=1)]

def evalmodel(model, xtrain, ytrain, xvalid, yvalid):
    """Fit ``model`` on the training data and score it on both sets.

    ``model`` is fitted in place. Returns ``(train rmse, valid rmse)``, each
    rounded to 3 decimals.
    """
    model.fit(xtrain, ytrain)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and has been
    # removed from recent scikit-learn releases
    rmsetrain = round(np.sqrt(mean_squared_error(ytrain, model.predict(xtrain))), 3)
    rmsevalid = round(np.sqrt(mean_squared_error(yvalid, model.predict(xvalid))), 3)
    return rmsetrain, rmsevalid

# Score every remaining model on both feature sets. The same estimator object is
# refitted for the second feature set.
for i in models:
    for feature, xtrain, xvalid in (('lag', X_train, X_valid),
                                    ('no lag', X_trainnolag, X_validnolag)):
        rmsetrain, rmsevalid = evalmodel(i, xtrain, y_train, xvalid, y_valid)
        # DataFrame.append no longer exists in pandas 2, so the row is added with pd.concat
        results = pd.concat([results, pd.DataFrame([addresult(feature, i, rmsetrain, rmsevalid)])],
                            ignore_index=True)

In [None]:
# Display the collected train/validation RMSE per feature set and model
results

- RandomForestRegressor is overfitting.
- LGBMRegressor has the lowest validation set RMSE.

# Test set 

In [None]:
# Test-set RMSE (on the log2 scale) for both LightGBM models.
# Computed as sqrt(MSE): the `squared=False` keyword is deprecated and has been
# removed from recent scikit-learn releases.
lagrmse = np.sqrt(mean_squared_error(y_test, lbgm.predict(X_test)))
nolagrmse = np.sqrt(mean_squared_error(y_test, lbgmnolag.predict(X_testnolag)))

print(f'LGBMRegressor \nRMSE with lag feature: {round(lagrmse, 3)}'
      f'\nRMSE with no lag feature: {round(nolagrmse, 3)}')

In [None]:
# Timestamps of the test period (day_of_year >= 123), taken from the EDA copy,
# which still has the date column
edadf['day_of_year'] = edadf['date'].dt.day_of_year
X_eda = edadf.loc[edadf['day_of_year'] >= 123, 'date']

# Predict, then undo the log2(x + 1) transform to get back to Wh
y_pred = 2 ** lbgm.predict(X_test) - 1
y_prednolag = 2 ** lbgmnolag.predict(X_testnolag) - 1

# Pair each timestamp with its prediction, one frame per model
testpreds = pd.DataFrame(list(zip(X_eda, y_pred)), columns=['date', 'testpred'])
testpredsnolag = pd.DataFrame(list(zip(X_eda, y_prednolag)), columns=['date', 'testpred'])

In [None]:
# Two stacked panels: actual usage with the predictions of each model
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=('Predictions with lag feature', 'Predictions without lag feature')
                   )

# Row 1: actual usage and the predictions of the model trained with lag features

fig.add_trace(
    go.Scatter(x=edadf['date'], y=edadf['Appliances'],
               name="Appliances",
               mode='lines'),
               row=1, col=1
)


fig.add_trace(
    go.Scatter(x=testpreds['date'], y=testpreds['testpred'],
               name='prediction',
               mode='lines'),
               row=1, col=1
)

# Row 2: actual usage and the predictions of the model trained without lag features

fig.add_trace(
    go.Scatter(x=edadf['date'], y=edadf['Appliances'],
               name="Appliances",
               mode='lines',
               line=dict(color='blue')),
               row=2, col=1
)


fig.add_trace(
    go.Scatter(x=testpredsnolag['date'], y=testpredsnolag['testpred'],
               name='no lag prediction',
               mode='lines',
               line=dict(color='#d62728')),
               row=2, col=1
)

fig.update_layout(
    title='Appliance usage Predictions'
)
# The range is set through update_yaxes so that it applies to both panels;
# layout.yaxis_range would only affect the first one.
fig.update_yaxes(title_text="Appliance Usage (in Wh)", color="blue", range=[0, 1100])
fig.update_xaxes(title_text="Date")
fig.show()

Lag features seem to contribute significantly to the model's predictions.

# Feature Importance

In [None]:
# Pair each feature name with the importance reported by the fitted LightGBM models
featimportance = pd.DataFrame(list(zip(X_train.columns, lbgm.feature_importances_)),
                              columns=['features', 'importance'])
featimportancenolag = pd.DataFrame(list(zip(X_trainnolag.columns, lbgmnolag.feature_importances_)),
                                   columns=['features', 'importance'])

# Side-by-side bar charts, most important feature first
fig, ax = plt.subplots(1,2, figsize=(20,10))

panels = ((featimportance, 'feature importance with lag'),
          (featimportancenolag, 'feature importance without lag'))
for axis, (table, heading) in zip(ax, panels):
    ranked = table.sort_values(by='importance', ascending=False)
    sns.barplot(x='importance', y='features', data=ranked, ax=axis)
    axis.set_title(heading)

plt.tight_layout()
plt.show()

Thanks for getting to the end of my notebook. Comments and feedback are much appreciated!